xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 3faff0630ddb0e5c928de8ab5c63f2cda2d2edd1)
1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <petscblaslapack.h>
6cac129eeSSatish Balay 
75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
896e086a2SDaniel Kokron #include <immintrin.h>
996e086a2SDaniel Kokron #endif
1096e086a2SDaniel Kokron 
119371c9d4SSatish Balay PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) {
12a3192f15SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
135d0c19d7SBarry Smith   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
145d0c19d7SBarry Smith   const PetscInt *idx;
15690b6cddSBarry Smith   PetscInt        start, end, *ai, *aj, bs, *nidx2;
16f1af5d2fSBarry Smith   PetscBT         table;
17a3192f15SSatish Balay 
183a40ed3dSBarry Smith   PetscFunctionBegin;
19a3192f15SSatish Balay   m  = a->mbs;
20a3192f15SSatish Balay   ai = a->i;
21a3192f15SSatish Balay   aj = a->j;
22d0f46423SBarry Smith   bs = A->rmap->bs;
23a3192f15SSatish Balay 
2408401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
25a3192f15SSatish Balay 
269566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(m, &table));
279566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &nidx));
289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2));
29a3192f15SSatish Balay 
30a3192f15SSatish Balay   for (i = 0; i < is_max; i++) {
31a3192f15SSatish Balay     /* Initialise the two local arrays */
32a3192f15SSatish Balay     isz = 0;
339566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(m, table));
34a3192f15SSatish Balay 
35a3192f15SSatish Balay     /* Extract the indices, assume there can be duplicate entries */
369566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx));
379566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n));
38a3192f15SSatish Balay 
39a3192f15SSatish Balay     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40a3192f15SSatish Balay     for (j = 0; j < n; ++j) {
41218c64b6SSatish Balay       ival = idx[j] / bs; /* convert the indices into block indices */
4208401ef6SPierre Jolivet       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
4326fbe8dcSKarl Rupp       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44a3192f15SSatish Balay     }
459566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx));
469566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
47a3192f15SSatish Balay 
48a3192f15SSatish Balay     k = 0;
49a3192f15SSatish Balay     for (j = 0; j < ov; j++) { /* for each overlap*/
50a3192f15SSatish Balay       n = isz;
51a3192f15SSatish Balay       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52a3192f15SSatish Balay         row   = nidx[k];
53a3192f15SSatish Balay         start = ai[row];
54a3192f15SSatish Balay         end   = ai[row + 1];
55a3192f15SSatish Balay         for (l = start; l < end; l++) {
56a3192f15SSatish Balay           val = aj[l];
5726fbe8dcSKarl Rupp           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58a3192f15SSatish Balay         }
59a3192f15SSatish Balay       }
60a3192f15SSatish Balay     }
61218c64b6SSatish Balay     /* expand the Index Set */
62218c64b6SSatish Balay     for (j = 0; j < isz; j++) {
6326fbe8dcSKarl Rupp       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
64218c64b6SSatish Balay     }
659566063dSJacob Faibussowitsch     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i));
66a3192f15SSatish Balay   }
679566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table));
689566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
699566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx2));
703a40ed3dSBarry Smith   PetscFunctionReturn(0);
71a3192f15SSatish Balay }
721c351548SSatish Balay 
739371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
74736121d4SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
75690b6cddSBarry Smith   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
76690b6cddSBarry Smith   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
775d0c19d7SBarry Smith   const PetscInt *irow, *icol;
785d0c19d7SBarry Smith   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
79690b6cddSBarry Smith   PetscInt       *aj = a->j, *ai = a->i;
803f1db9ecSBarry Smith   MatScalar      *mat_a;
81736121d4SSatish Balay   Mat             C;
826041f1b1SToby Isaac   PetscBool       flag;
83736121d4SSatish Balay 
843a40ed3dSBarry Smith   PetscFunctionBegin;
859566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
869566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
879566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
889566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
89736121d4SSatish Balay 
909566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1 + oldcols, &smap));
91736121d4SSatish Balay   ssmap = smap;
929566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1 + nrows, &lens));
93736121d4SSatish Balay   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
94736121d4SSatish Balay   /* determine lens of each row */
95736121d4SSatish Balay   for (i = 0; i < nrows; i++) {
96736121d4SSatish Balay     kstart  = ai[irow[i]];
97736121d4SSatish Balay     kend    = kstart + a->ilen[irow[i]];
98736121d4SSatish Balay     lens[i] = 0;
99736121d4SSatish Balay     for (k = kstart; k < kend; k++) {
10026fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
101736121d4SSatish Balay     }
102736121d4SSatish Balay   }
103736121d4SSatish Balay   /* Create and fill new matrix */
104736121d4SSatish Balay   if (scall == MAT_REUSE_MATRIX) {
105736121d4SSatish Balay     c = (Mat_SeqBAIJ *)((*B)->data);
106736121d4SSatish Balay 
107aed4548fSBarry Smith     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
1089566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
10928b400f6SJacob Faibussowitsch     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
1109566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(c->ilen, c->mbs));
111736121d4SSatish Balay     C = *B;
1123a40ed3dSBarry Smith   } else {
1139566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1149566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
1159566063dSJacob Faibussowitsch     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1169566063dSJacob Faibussowitsch     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
117736121d4SSatish Balay   }
118736121d4SSatish Balay   c = (Mat_SeqBAIJ *)(C->data);
119736121d4SSatish Balay   for (i = 0; i < nrows; i++) {
120736121d4SSatish Balay     row      = irow[i];
121736121d4SSatish Balay     kstart   = ai[row];
122736121d4SSatish Balay     kend     = kstart + a->ilen[row];
123736121d4SSatish Balay     mat_i    = c->i[i];
124d29f2997SMatthew Woehlke     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
125d29f2997SMatthew Woehlke     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
126736121d4SSatish Balay     mat_ilen = c->ilen + i;
127736121d4SSatish Balay     for (k = kstart; k < kend; k++) {
128736121d4SSatish Balay       if ((tcol = ssmap[a->j[k]])) {
129736121d4SSatish Balay         *mat_j++ = tcol - 1;
1309566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
131549d3d68SSatish Balay         mat_a += bs2;
132736121d4SSatish Balay         (*mat_ilen)++;
133736121d4SSatish Balay       }
134736121d4SSatish Balay     }
135736121d4SSatish Balay   }
136cdc6f3adSToby Isaac   /* sort */
137d29f2997SMatthew Woehlke   if (c->j && c->a) {
138cdc6f3adSToby Isaac     MatScalar *work;
1399566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2, &work));
140cdc6f3adSToby Isaac     for (i = 0; i < nrows; i++) {
141cdc6f3adSToby Isaac       PetscInt ilen;
142cdc6f3adSToby Isaac       mat_i = c->i[i];
143cdc6f3adSToby Isaac       mat_j = c->j + mat_i;
144cdc6f3adSToby Isaac       mat_a = c->a + mat_i * bs2;
145cdc6f3adSToby Isaac       ilen  = c->ilen[i];
1469566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
147cdc6f3adSToby Isaac     }
1489566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
149cdc6f3adSToby Isaac   }
150218c64b6SSatish Balay 
151736121d4SSatish Balay   /* Free work space */
1529566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
1539566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
1549566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
1559566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1569566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
157736121d4SSatish Balay 
1589566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
159736121d4SSatish Balay   *B = C;
1603a40ed3dSBarry Smith   PetscFunctionReturn(0);
161736121d4SSatish Balay }
162736121d4SSatish Balay 
1639371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
164218c64b6SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
165218c64b6SSatish Balay   IS              is1, is2;
166afebec48SHong Zhang   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
1675d0c19d7SBarry Smith   const PetscInt *irow, *icol;
168218c64b6SSatish Balay 
1693a40ed3dSBarry Smith   PetscFunctionBegin;
1709566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
1719566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
1729566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
1739566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
174218c64b6SSatish Balay 
175218c64b6SSatish Balay   /* Verify if the indices corespond to each element in a block
176218c64b6SSatish Balay    and form the IS with compressed IS */
177f8ecb639SStefano Zampini   maxmnbs = PetscMax(a->mbs, a->nbs);
1789566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
1799566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->mbs));
180218c64b6SSatish Balay   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
181ad540459SPierre Jolivet   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
1826041f1b1SToby Isaac   count = 0;
1836041f1b1SToby Isaac   for (i = 0; i < nrows; i++) {
184afebec48SHong Zhang     j = irow[i] / bs;
1856041f1b1SToby Isaac     if ((vary[j]--) == bs) iary[count++] = j;
186218c64b6SSatish Balay   }
1879566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
188218c64b6SSatish Balay 
1899566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->nbs));
190218c64b6SSatish Balay   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
191ad540459SPierre Jolivet   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
1926041f1b1SToby Isaac   count = 0;
1936041f1b1SToby Isaac   for (i = 0; i < ncols; i++) {
194afebec48SHong Zhang     j = icol[i] / bs;
1956041f1b1SToby Isaac     if ((vary[j]--) == bs) iary[count++] = j;
1966041f1b1SToby Isaac   }
1979566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
1989566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
1999566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
2009566063dSJacob Faibussowitsch   PetscCall(PetscFree2(vary, iary));
201218c64b6SSatish Balay 
2029566063dSJacob Faibussowitsch   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
2039566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2049566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
2053a40ed3dSBarry Smith   PetscFunctionReturn(0);
206218c64b6SSatish Balay }
207218c64b6SSatish Balay 
2089371c9d4SSatish Balay PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) {
20916b64355SHong Zhang   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
2105c39f6d9SHong Zhang   Mat_SubSppt *submatj = c->submatis1;
21116b64355SHong Zhang 
21216b64355SHong Zhang   PetscFunctionBegin;
2139566063dSJacob Faibussowitsch   PetscCall((*submatj->destroy)(C));
2149566063dSJacob Faibussowitsch   PetscCall(MatDestroySubMatrix_Private(submatj));
21516b64355SHong Zhang   PetscFunctionReturn(0);
21616b64355SHong Zhang }
21716b64355SHong Zhang 
21889a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
2199371c9d4SSatish Balay PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) {
22086e85357SHong Zhang   PetscInt     i;
22186e85357SHong Zhang   Mat          C;
22286e85357SHong Zhang   Mat_SeqBAIJ *c;
22386e85357SHong Zhang   Mat_SubSppt *submatj;
22486e85357SHong Zhang 
22586e85357SHong Zhang   PetscFunctionBegin;
22686e85357SHong Zhang   for (i = 0; i < n; i++) {
22786e85357SHong Zhang     C       = (*mat)[i];
22886e85357SHong Zhang     c       = (Mat_SeqBAIJ *)C->data;
22986e85357SHong Zhang     submatj = c->submatis1;
23086e85357SHong Zhang     if (submatj) {
2317daefbafSJunchao Zhang       if (--((PetscObject)C)->refct <= 0) {
23226cc229bSBarry Smith         PetscCall(PetscFree(C->factorprefix));
2339566063dSJacob Faibussowitsch         PetscCall((*submatj->destroy)(C));
2349566063dSJacob Faibussowitsch         PetscCall(MatDestroySubMatrix_Private(submatj));
2359566063dSJacob Faibussowitsch         PetscCall(PetscFree(C->defaultvectype));
236*3faff063SStefano Zampini         PetscCall(PetscFree(C->defaultrandtype));
2379566063dSJacob Faibussowitsch         PetscCall(PetscLayoutDestroy(&C->rmap));
2389566063dSJacob Faibussowitsch         PetscCall(PetscLayoutDestroy(&C->cmap));
2399566063dSJacob Faibussowitsch         PetscCall(PetscHeaderDestroy(&C));
2407daefbafSJunchao Zhang       }
24186e85357SHong Zhang     } else {
2429566063dSJacob Faibussowitsch       PetscCall(MatDestroy(&C));
24386e85357SHong Zhang     }
24486e85357SHong Zhang   }
2457daefbafSJunchao Zhang 
2467daefbafSJunchao Zhang   /* Destroy Dummy submatrices created for reuse */
2479566063dSJacob Faibussowitsch   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
2487daefbafSJunchao Zhang 
2499566063dSJacob Faibussowitsch   PetscCall(PetscFree(*mat));
25086e85357SHong Zhang   PetscFunctionReturn(0);
25186e85357SHong Zhang }
25286e85357SHong Zhang 
2539371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) {
254690b6cddSBarry Smith   PetscInt i;
255736121d4SSatish Balay 
2563a40ed3dSBarry Smith   PetscFunctionBegin;
25748a46eb9SPierre Jolivet   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
258736121d4SSatish Balay 
25948a46eb9SPierre Jolivet   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
2603a40ed3dSBarry Smith   PetscFunctionReturn(0);
261736121d4SSatish Balay }
262218c64b6SSatish Balay 
2632d61bbb3SSatish Balay /* -------------------------------------------------------*/
2642d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */
2652d61bbb3SSatish Balay /* -------------------------------------------------------*/
2662d61bbb3SSatish Balay 
2679371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) {
2682d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
269d9fead3dSBarry Smith   PetscScalar       *z, sum;
270d9fead3dSBarry Smith   const PetscScalar *x;
271d9fead3dSBarry Smith   const MatScalar   *v;
2727c565772SBarry Smith   PetscInt           mbs, i, n;
2730298fd71SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
274ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2752d61bbb3SSatish Balay 
2762d61bbb3SSatish Balay   PetscFunctionBegin;
2779566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
2789566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &z));
2792d61bbb3SSatish Balay 
28026e093fcSHong Zhang   if (usecprow) {
28126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
28226e093fcSHong Zhang     ii   = a->compressedrow.i;
2837b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
2849566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(z, a->mbs));
28526e093fcSHong Zhang   } else {
28626e093fcSHong Zhang     mbs = a->mbs;
2872d61bbb3SSatish Balay     ii  = a->i;
28826e093fcSHong Zhang   }
2892d61bbb3SSatish Balay 
2902d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
291ee54c7eeSHong Zhang     n   = ii[1] - ii[0];
292ee54c7eeSHong Zhang     v   = a->a + ii[0];
293ee54c7eeSHong Zhang     idx = a->j + ii[0];
294ee54c7eeSHong Zhang     ii++;
295444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
296444d8c10SJed Brown     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2972d61bbb3SSatish Balay     sum = 0.0;
2982162cab8SBarry Smith     PetscSparseDensePlusDot(sum, x, v, idx, n);
29926e093fcSHong Zhang     if (usecprow) {
3007b2bb3b9SHong Zhang       z[ridx[i]] = sum;
30126e093fcSHong Zhang     } else {
3022d61bbb3SSatish Balay       z[i] = sum;
3032d61bbb3SSatish Balay     }
30426e093fcSHong Zhang   }
3059566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3069566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &z));
3079566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
3082d61bbb3SSatish Balay   PetscFunctionReturn(0);
3092d61bbb3SSatish Balay }
3102d61bbb3SSatish Balay 
3119371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) {
3122d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
313f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, *zarray;
314d9fead3dSBarry Smith   const PetscScalar *x, *xb;
31587828ca2SBarry Smith   PetscScalar        x1, x2;
316d9fead3dSBarry Smith   const MatScalar   *v;
3177c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
318ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
3192d61bbb3SSatish Balay 
3202d61bbb3SSatish Balay   PetscFunctionBegin;
3219566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3229566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
3232d61bbb3SSatish Balay 
3242d61bbb3SSatish Balay   idx = a->j;
3252d61bbb3SSatish Balay   v   = a->a;
32626e093fcSHong Zhang   if (usecprow) {
32726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
32826e093fcSHong Zhang     ii   = a->compressedrow.i;
3297b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
3309566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
33126e093fcSHong Zhang   } else {
33226e093fcSHong Zhang     mbs = a->mbs;
3332d61bbb3SSatish Balay     ii  = a->i;
33426e093fcSHong Zhang     z   = zarray;
33526e093fcSHong Zhang   }
3362d61bbb3SSatish Balay 
3372d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
3389371c9d4SSatish Balay     n = ii[1] - ii[0];
3399371c9d4SSatish Balay     ii++;
3409371c9d4SSatish Balay     sum1 = 0.0;
3419371c9d4SSatish Balay     sum2 = 0.0;
342444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
343444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3442d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
3459371c9d4SSatish Balay       xb = x + 2 * (*idx++);
3469371c9d4SSatish Balay       x1 = xb[0];
3479371c9d4SSatish Balay       x2 = xb[1];
3482d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[2] * x2;
3492d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[3] * x2;
3502d61bbb3SSatish Balay       v += 4;
3512d61bbb3SSatish Balay     }
3527b2bb3b9SHong Zhang     if (usecprow) z = zarray + 2 * ridx[i];
3539371c9d4SSatish Balay     z[0] = sum1;
3549371c9d4SSatish Balay     z[1] = sum2;
35526e093fcSHong Zhang     if (!usecprow) z += 2;
3562d61bbb3SSatish Balay   }
3579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3589566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
3599566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
3602d61bbb3SSatish Balay   PetscFunctionReturn(0);
3612d61bbb3SSatish Balay }
3622d61bbb3SSatish Balay 
3639371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) {
3642d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
365f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
366d9fead3dSBarry Smith   const PetscScalar *x, *xb;
367d9fead3dSBarry Smith   const MatScalar   *v;
3687c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
369ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
37026e093fcSHong Zhang 
371b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
372fee21e36SBarry Smith #pragma disjoint(*v, *z, *xb)
373fee21e36SBarry Smith #endif
374fee21e36SBarry Smith 
3752d61bbb3SSatish Balay   PetscFunctionBegin;
3769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3779566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
3782d61bbb3SSatish Balay 
3792d61bbb3SSatish Balay   idx = a->j;
3802d61bbb3SSatish Balay   v   = a->a;
38126e093fcSHong Zhang   if (usecprow) {
38226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
38326e093fcSHong Zhang     ii   = a->compressedrow.i;
3847b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
3859566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
38626e093fcSHong Zhang   } else {
38726e093fcSHong Zhang     mbs = a->mbs;
3882d61bbb3SSatish Balay     ii  = a->i;
38926e093fcSHong Zhang     z   = zarray;
39026e093fcSHong Zhang   }
3912d61bbb3SSatish Balay 
3922d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
3939371c9d4SSatish Balay     n = ii[1] - ii[0];
3949371c9d4SSatish Balay     ii++;
3959371c9d4SSatish Balay     sum1 = 0.0;
3969371c9d4SSatish Balay     sum2 = 0.0;
3979371c9d4SSatish Balay     sum3 = 0.0;
398444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
399444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4002d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
40126fbe8dcSKarl Rupp       xb = x + 3 * (*idx++);
40226fbe8dcSKarl Rupp       x1 = xb[0];
40326fbe8dcSKarl Rupp       x2 = xb[1];
40426fbe8dcSKarl Rupp       x3 = xb[2];
40526fbe8dcSKarl Rupp 
4062d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
4072d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
4082d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
4092d61bbb3SSatish Balay       v += 9;
4102d61bbb3SSatish Balay     }
4117b2bb3b9SHong Zhang     if (usecprow) z = zarray + 3 * ridx[i];
4129371c9d4SSatish Balay     z[0] = sum1;
4139371c9d4SSatish Balay     z[1] = sum2;
4149371c9d4SSatish Balay     z[2] = sum3;
41526e093fcSHong Zhang     if (!usecprow) z += 3;
4162d61bbb3SSatish Balay   }
4179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
4199566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
4202d61bbb3SSatish Balay   PetscFunctionReturn(0);
4212d61bbb3SSatish Balay }
4222d61bbb3SSatish Balay 
4239371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) {
4242d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
425f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
426d9fead3dSBarry Smith   const PetscScalar *x, *xb;
427d9fead3dSBarry Smith   const MatScalar   *v;
4287c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
429ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
4302d61bbb3SSatish Balay 
4312d61bbb3SSatish Balay   PetscFunctionBegin;
4329566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4339566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
4342d61bbb3SSatish Balay 
4352d61bbb3SSatish Balay   idx = a->j;
4362d61bbb3SSatish Balay   v   = a->a;
43726e093fcSHong Zhang   if (usecprow) {
43826e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
43926e093fcSHong Zhang     ii   = a->compressedrow.i;
4407b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
4419566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
44226e093fcSHong Zhang   } else {
44326e093fcSHong Zhang     mbs = a->mbs;
4442d61bbb3SSatish Balay     ii  = a->i;
44526e093fcSHong Zhang     z   = zarray;
44626e093fcSHong Zhang   }
4472d61bbb3SSatish Balay 
4482d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
44926fbe8dcSKarl Rupp     n = ii[1] - ii[0];
45026fbe8dcSKarl Rupp     ii++;
45126fbe8dcSKarl Rupp     sum1 = 0.0;
45226fbe8dcSKarl Rupp     sum2 = 0.0;
45326fbe8dcSKarl Rupp     sum3 = 0.0;
45426fbe8dcSKarl Rupp     sum4 = 0.0;
45526fbe8dcSKarl Rupp 
456444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
457444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4582d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
4592d61bbb3SSatish Balay       xb = x + 4 * (*idx++);
4609371c9d4SSatish Balay       x1 = xb[0];
4619371c9d4SSatish Balay       x2 = xb[1];
4629371c9d4SSatish Balay       x3 = xb[2];
4639371c9d4SSatish Balay       x4 = xb[3];
4642d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
4652d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
4662d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
4672d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
4682d61bbb3SSatish Balay       v += 16;
4692d61bbb3SSatish Balay     }
4707b2bb3b9SHong Zhang     if (usecprow) z = zarray + 4 * ridx[i];
4719371c9d4SSatish Balay     z[0] = sum1;
4729371c9d4SSatish Balay     z[1] = sum2;
4739371c9d4SSatish Balay     z[2] = sum3;
4749371c9d4SSatish Balay     z[3] = sum4;
47526e093fcSHong Zhang     if (!usecprow) z += 4;
4762d61bbb3SSatish Balay   }
4779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4789566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
4799566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
4802d61bbb3SSatish Balay   PetscFunctionReturn(0);
4812d61bbb3SSatish Balay }
4822d61bbb3SSatish Balay 
4839371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) {
4842d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
485f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
486d9fead3dSBarry Smith   const PetscScalar *xb, *x;
487d9fead3dSBarry Smith   const MatScalar   *v;
4880298fd71SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
4897c565772SBarry Smith   PetscInt           mbs, i, j, n;
490ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
4912d61bbb3SSatish Balay 
492433994e6SBarry Smith   PetscFunctionBegin;
4939566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
4952d61bbb3SSatish Balay 
4962d61bbb3SSatish Balay   idx = a->j;
4972d61bbb3SSatish Balay   v   = a->a;
49826e093fcSHong Zhang   if (usecprow) {
49926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
50026e093fcSHong Zhang     ii   = a->compressedrow.i;
5017b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
5029566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
50326e093fcSHong Zhang   } else {
50426e093fcSHong Zhang     mbs = a->mbs;
5052d61bbb3SSatish Balay     ii  = a->i;
50626e093fcSHong Zhang     z   = zarray;
50726e093fcSHong Zhang   }
5082d61bbb3SSatish Balay 
5092d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
5109371c9d4SSatish Balay     n = ii[1] - ii[0];
5119371c9d4SSatish Balay     ii++;
5129371c9d4SSatish Balay     sum1 = 0.0;
5139371c9d4SSatish Balay     sum2 = 0.0;
5149371c9d4SSatish Balay     sum3 = 0.0;
5159371c9d4SSatish Balay     sum4 = 0.0;
5169371c9d4SSatish Balay     sum5 = 0.0;
517444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
518444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
5192d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
5202d61bbb3SSatish Balay       xb = x + 5 * (*idx++);
5219371c9d4SSatish Balay       x1 = xb[0];
5229371c9d4SSatish Balay       x2 = xb[1];
5239371c9d4SSatish Balay       x3 = xb[2];
5249371c9d4SSatish Balay       x4 = xb[3];
5259371c9d4SSatish Balay       x5 = xb[4];
5262d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
5272d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
5282d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
5292d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
5302d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
5312d61bbb3SSatish Balay       v += 25;
5322d61bbb3SSatish Balay     }
5337b2bb3b9SHong Zhang     if (usecprow) z = zarray + 5 * ridx[i];
5349371c9d4SSatish Balay     z[0] = sum1;
5359371c9d4SSatish Balay     z[1] = sum2;
5369371c9d4SSatish Balay     z[2] = sum3;
5379371c9d4SSatish Balay     z[3] = sum4;
5389371c9d4SSatish Balay     z[4] = sum5;
53926e093fcSHong Zhang     if (!usecprow) z += 5;
5402d61bbb3SSatish Balay   }
5419566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5429566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
5439566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
5442d61bbb3SSatish Balay   PetscFunctionReturn(0);
5452d61bbb3SSatish Balay }
5462d61bbb3SSatish Balay 
5479371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) {
54815091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
549f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
550d9fead3dSBarry Smith   const PetscScalar *x, *xb;
55126e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
552d9fead3dSBarry Smith   const MatScalar   *v;
5537c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
554ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
55515091d37SBarry Smith 
556433994e6SBarry Smith   PetscFunctionBegin;
5579566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5589566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
55915091d37SBarry Smith 
56015091d37SBarry Smith   idx = a->j;
56115091d37SBarry Smith   v   = a->a;
56226e093fcSHong Zhang   if (usecprow) {
56326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
56426e093fcSHong Zhang     ii   = a->compressedrow.i;
5657b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
5669566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
56726e093fcSHong Zhang   } else {
56826e093fcSHong Zhang     mbs = a->mbs;
56915091d37SBarry Smith     ii  = a->i;
57026e093fcSHong Zhang     z   = zarray;
57126e093fcSHong Zhang   }
57215091d37SBarry Smith 
57315091d37SBarry Smith   for (i = 0; i < mbs; i++) {
57426fbe8dcSKarl Rupp     n = ii[1] - ii[0];
57526fbe8dcSKarl Rupp     ii++;
57626fbe8dcSKarl Rupp     sum1 = 0.0;
57726fbe8dcSKarl Rupp     sum2 = 0.0;
57826fbe8dcSKarl Rupp     sum3 = 0.0;
57926fbe8dcSKarl Rupp     sum4 = 0.0;
58026fbe8dcSKarl Rupp     sum5 = 0.0;
58126fbe8dcSKarl Rupp     sum6 = 0.0;
58226fbe8dcSKarl Rupp 
583444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
584444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
58515091d37SBarry Smith     for (j = 0; j < n; j++) {
58615091d37SBarry Smith       xb = x + 6 * (*idx++);
5879371c9d4SSatish Balay       x1 = xb[0];
5889371c9d4SSatish Balay       x2 = xb[1];
5899371c9d4SSatish Balay       x3 = xb[2];
5909371c9d4SSatish Balay       x4 = xb[3];
5919371c9d4SSatish Balay       x5 = xb[4];
5929371c9d4SSatish Balay       x6 = xb[5];
59315091d37SBarry Smith       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
59415091d37SBarry Smith       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
59515091d37SBarry Smith       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
59615091d37SBarry Smith       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
59715091d37SBarry Smith       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
59815091d37SBarry Smith       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
59915091d37SBarry Smith       v += 36;
60015091d37SBarry Smith     }
6017b2bb3b9SHong Zhang     if (usecprow) z = zarray + 6 * ridx[i];
6029371c9d4SSatish Balay     z[0] = sum1;
6039371c9d4SSatish Balay     z[1] = sum2;
6049371c9d4SSatish Balay     z[2] = sum3;
6059371c9d4SSatish Balay     z[3] = sum4;
6069371c9d4SSatish Balay     z[4] = sum5;
6079371c9d4SSatish Balay     z[5] = sum6;
60826e093fcSHong Zhang     if (!usecprow) z += 6;
60915091d37SBarry Smith   }
61015091d37SBarry Smith 
6119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6129566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
6139566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
61415091d37SBarry Smith   PetscFunctionReturn(0);
61515091d37SBarry Smith }
6168ab949d8SShri Abhyankar 
6179371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) {
6182d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
619f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
620d9fead3dSBarry Smith   const PetscScalar *x, *xb;
62126e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
622d9fead3dSBarry Smith   const MatScalar   *v;
6237c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
624ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
6252d61bbb3SSatish Balay 
626433994e6SBarry Smith   PetscFunctionBegin;
6279566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6289566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
6292d61bbb3SSatish Balay 
6302d61bbb3SSatish Balay   idx = a->j;
6312d61bbb3SSatish Balay   v   = a->a;
63226e093fcSHong Zhang   if (usecprow) {
63326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
63426e093fcSHong Zhang     ii   = a->compressedrow.i;
6357b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
6369566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
63726e093fcSHong Zhang   } else {
63826e093fcSHong Zhang     mbs = a->mbs;
6392d61bbb3SSatish Balay     ii  = a->i;
64026e093fcSHong Zhang     z   = zarray;
64126e093fcSHong Zhang   }
6422d61bbb3SSatish Balay 
6432d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
64426fbe8dcSKarl Rupp     n = ii[1] - ii[0];
64526fbe8dcSKarl Rupp     ii++;
64626fbe8dcSKarl Rupp     sum1 = 0.0;
64726fbe8dcSKarl Rupp     sum2 = 0.0;
64826fbe8dcSKarl Rupp     sum3 = 0.0;
64926fbe8dcSKarl Rupp     sum4 = 0.0;
65026fbe8dcSKarl Rupp     sum5 = 0.0;
65126fbe8dcSKarl Rupp     sum6 = 0.0;
65226fbe8dcSKarl Rupp     sum7 = 0.0;
65326fbe8dcSKarl Rupp 
654444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
655444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
6562d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
6572d61bbb3SSatish Balay       xb = x + 7 * (*idx++);
6589371c9d4SSatish Balay       x1 = xb[0];
6599371c9d4SSatish Balay       x2 = xb[1];
6609371c9d4SSatish Balay       x3 = xb[2];
6619371c9d4SSatish Balay       x4 = xb[3];
6629371c9d4SSatish Balay       x5 = xb[4];
6639371c9d4SSatish Balay       x6 = xb[5];
6649371c9d4SSatish Balay       x7 = xb[6];
6652d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
6662d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
6672d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
6682d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
6692d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
6702d61bbb3SSatish Balay       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
6712d61bbb3SSatish Balay       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
6722d61bbb3SSatish Balay       v += 49;
6732d61bbb3SSatish Balay     }
6747b2bb3b9SHong Zhang     if (usecprow) z = zarray + 7 * ridx[i];
6759371c9d4SSatish Balay     z[0] = sum1;
6769371c9d4SSatish Balay     z[1] = sum2;
6779371c9d4SSatish Balay     z[2] = sum3;
6789371c9d4SSatish Balay     z[3] = sum4;
6799371c9d4SSatish Balay     z[4] = sum5;
6809371c9d4SSatish Balay     z[5] = sum6;
6819371c9d4SSatish Balay     z[6] = sum7;
68226e093fcSHong Zhang     if (!usecprow) z += 7;
6832d61bbb3SSatish Balay   }
6842d61bbb3SSatish Balay 
6859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6869566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
6879566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
6882d61bbb3SSatish Balay   PetscFunctionReturn(0);
6892d61bbb3SSatish Balay }
6902d61bbb3SSatish Balay 
6915f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
6929371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) {
69396e086a2SDaniel Kokron   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
694f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
69596e086a2SDaniel Kokron   const PetscScalar *x, *xb;
69696e086a2SDaniel Kokron   const MatScalar   *v;
69796e086a2SDaniel Kokron   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
69896e086a2SDaniel Kokron   const PetscInt    *idx, *ii, *ridx = NULL;
699ce68d72fSJed Brown   PetscInt           k;
70096e086a2SDaniel Kokron   PetscBool          usecprow = a->compressedrow.use;
70196e086a2SDaniel Kokron 
70296e086a2SDaniel Kokron   __m256d a0, a1, a2, a3, a4, a5;
703ce68d72fSJed Brown   __m256d w0, w1, w2, w3;
70496e086a2SDaniel Kokron   __m256d z0, z1, z2;
70596e086a2SDaniel Kokron   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
70696e086a2SDaniel Kokron 
70796e086a2SDaniel Kokron   PetscFunctionBegin;
7089566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
7099566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
71096e086a2SDaniel Kokron 
71196e086a2SDaniel Kokron   idx = a->j;
71296e086a2SDaniel Kokron   v   = a->a;
71396e086a2SDaniel Kokron   if (usecprow) {
71496e086a2SDaniel Kokron     mbs  = a->compressedrow.nrows;
71596e086a2SDaniel Kokron     ii   = a->compressedrow.i;
71696e086a2SDaniel Kokron     ridx = a->compressedrow.rindex;
7179566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
71896e086a2SDaniel Kokron   } else {
71996e086a2SDaniel Kokron     mbs = a->mbs;
72096e086a2SDaniel Kokron     ii  = a->i;
72196e086a2SDaniel Kokron     z   = zarray;
72296e086a2SDaniel Kokron   }
72396e086a2SDaniel Kokron 
72496e086a2SDaniel Kokron   if (!a->mult_work) {
72596e086a2SDaniel Kokron     k = PetscMax(A->rmap->n, A->cmap->n);
7269566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
72796e086a2SDaniel Kokron   }
72896e086a2SDaniel Kokron 
72996e086a2SDaniel Kokron   work = a->mult_work;
73096e086a2SDaniel Kokron   for (i = 0; i < mbs; i++) {
7319371c9d4SSatish Balay     n = ii[1] - ii[0];
7329371c9d4SSatish Balay     ii++;
73396e086a2SDaniel Kokron     workt = work;
73496e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
73596e086a2SDaniel Kokron       xb = x + bs * (*idx++);
73696e086a2SDaniel Kokron       for (k = 0; k < bs; k++) workt[k] = xb[k];
73796e086a2SDaniel Kokron       workt += bs;
73896e086a2SDaniel Kokron     }
73996e086a2SDaniel Kokron     if (usecprow) z = zarray + bs * ridx[i];
74096e086a2SDaniel Kokron 
7419371c9d4SSatish Balay     z0 = _mm256_setzero_pd();
7429371c9d4SSatish Balay     z1 = _mm256_setzero_pd();
7439371c9d4SSatish Balay     z2 = _mm256_setzero_pd();
74496e086a2SDaniel Kokron 
74596e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
746c05b70c4SSatish Balay       /* first column of a */
74796e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9]);
7489371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81]);
7499371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
7509371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
7519371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
7529371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
7539371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
75496e086a2SDaniel Kokron 
755c05b70c4SSatish Balay       /* second column of a */
75696e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 1]);
7579371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
7589371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
7599371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
7609371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
7619371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
7629371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
76396e086a2SDaniel Kokron 
764c05b70c4SSatish Balay       /* third column of a */
76596e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 2]);
7669371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
7679371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w2, z0);
7689371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
7699371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w2, z1);
7709371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
7719371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w2, z2);
77296e086a2SDaniel Kokron 
773c05b70c4SSatish Balay       /* fourth column of a */
77496e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 3]);
7759371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
7769371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w3, z0);
7779371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
7789371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w3, z1);
7799371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
7809371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w3, z2);
78196e086a2SDaniel Kokron 
782c05b70c4SSatish Balay       /* fifth column of a */
78396e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 4]);
7849371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
7859371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w0, z0);
7869371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
7879371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w0, z1);
7889371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
7899371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w0, z2);
79096e086a2SDaniel Kokron 
791c05b70c4SSatish Balay       /* sixth column of a */
79296e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 5]);
7939371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
7949371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
7959371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
7969371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
7979371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
7989371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
79996e086a2SDaniel Kokron 
800c05b70c4SSatish Balay       /* seventh column of a */
80196e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 6]);
8029371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
8039371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w2, z0);
8049371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
8059371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w2, z1);
8069371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
8079371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w2, z2);
80896e086a2SDaniel Kokron 
8096aad120cSJose E. Roman       /* eighth column of a */
81096e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 7]);
8119371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
8129371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w3, z0);
8139371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
8149371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w3, z1);
8159371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
8169371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w3, z2);
81796e086a2SDaniel Kokron 
818c05b70c4SSatish Balay       /* ninth column of a */
81996e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 8]);
8209371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
8219371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
8229371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
8239371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
8249371c9d4SSatish Balay       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
8259371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
82696e086a2SDaniel Kokron     }
82796e086a2SDaniel Kokron 
8289371c9d4SSatish Balay     _mm256_storeu_pd(&z[0], z0);
8299371c9d4SSatish Balay     _mm256_storeu_pd(&z[4], z1);
8309371c9d4SSatish Balay     _mm256_maskstore_pd(&z[8], mask1, z2);
83196e086a2SDaniel Kokron 
83296e086a2SDaniel Kokron     v += n * bs2;
83396e086a2SDaniel Kokron     if (!usecprow) z += bs;
83496e086a2SDaniel Kokron   }
8359566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
8369566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
8379566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
83896e086a2SDaniel Kokron   PetscFunctionReturn(0);
83996e086a2SDaniel Kokron }
84096e086a2SDaniel Kokron #endif
84196e086a2SDaniel Kokron 
8429371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) {
843ebada01fSBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
844f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
845ebada01fSBarry Smith   const PetscScalar *x, *xb;
846ebada01fSBarry Smith   PetscScalar       *zarray, xv;
847ebada01fSBarry Smith   const MatScalar   *v;
848ebada01fSBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
849ebada01fSBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
850ebada01fSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
851ebada01fSBarry Smith 
852ebada01fSBarry Smith   PetscFunctionBegin;
8539566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8549566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
855ebada01fSBarry Smith 
856ebada01fSBarry Smith   v = a->a;
857ebada01fSBarry Smith   if (usecprow) {
858ebada01fSBarry Smith     mbs  = a->compressedrow.nrows;
859ebada01fSBarry Smith     ii   = a->compressedrow.i;
860ebada01fSBarry Smith     ridx = a->compressedrow.rindex;
8619566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
862ebada01fSBarry Smith   } else {
863ebada01fSBarry Smith     mbs = a->mbs;
864ebada01fSBarry Smith     ii  = a->i;
865ebada01fSBarry Smith     z   = zarray;
866ebada01fSBarry Smith   }
867ebada01fSBarry Smith 
868ebada01fSBarry Smith   for (i = 0; i < mbs; i++) {
869ebada01fSBarry Smith     n     = ii[i + 1] - ii[i];
870ebada01fSBarry Smith     idx   = ij + ii[i];
8719371c9d4SSatish Balay     sum1  = 0.0;
8729371c9d4SSatish Balay     sum2  = 0.0;
8739371c9d4SSatish Balay     sum3  = 0.0;
8749371c9d4SSatish Balay     sum4  = 0.0;
8759371c9d4SSatish Balay     sum5  = 0.0;
8769371c9d4SSatish Balay     sum6  = 0.0;
8779371c9d4SSatish Balay     sum7  = 0.0;
8789371c9d4SSatish Balay     sum8  = 0.0;
8799371c9d4SSatish Balay     sum9  = 0.0;
8809371c9d4SSatish Balay     sum10 = 0.0;
8819371c9d4SSatish Balay     sum11 = 0.0;
882ebada01fSBarry Smith 
883ebada01fSBarry Smith     for (j = 0; j < n; j++) {
884ebada01fSBarry Smith       xb = x + 11 * (idx[j]);
885ebada01fSBarry Smith 
886ebada01fSBarry Smith       for (k = 0; k < 11; k++) {
887ebada01fSBarry Smith         xv = xb[k];
888ebada01fSBarry Smith         sum1 += v[0] * xv;
889ebada01fSBarry Smith         sum2 += v[1] * xv;
890ebada01fSBarry Smith         sum3 += v[2] * xv;
891ebada01fSBarry Smith         sum4 += v[3] * xv;
892ebada01fSBarry Smith         sum5 += v[4] * xv;
893ebada01fSBarry Smith         sum6 += v[5] * xv;
894ebada01fSBarry Smith         sum7 += v[6] * xv;
895ebada01fSBarry Smith         sum8 += v[7] * xv;
896ebada01fSBarry Smith         sum9 += v[8] * xv;
897ebada01fSBarry Smith         sum10 += v[9] * xv;
898ebada01fSBarry Smith         sum11 += v[10] * xv;
899ebada01fSBarry Smith         v += 11;
900ebada01fSBarry Smith       }
901ebada01fSBarry Smith     }
902ebada01fSBarry Smith     if (usecprow) z = zarray + 11 * ridx[i];
9039371c9d4SSatish Balay     z[0]  = sum1;
9049371c9d4SSatish Balay     z[1]  = sum2;
9059371c9d4SSatish Balay     z[2]  = sum3;
9069371c9d4SSatish Balay     z[3]  = sum4;
9079371c9d4SSatish Balay     z[4]  = sum5;
9089371c9d4SSatish Balay     z[5]  = sum6;
9099371c9d4SSatish Balay     z[6]  = sum7;
9109371c9d4SSatish Balay     z[7]  = sum8;
9119371c9d4SSatish Balay     z[8]  = sum9;
9129371c9d4SSatish Balay     z[9]  = sum10;
9139371c9d4SSatish Balay     z[10] = sum11;
914ebada01fSBarry Smith 
915ebada01fSBarry Smith     if (!usecprow) z += 11;
916ebada01fSBarry Smith   }
917ebada01fSBarry Smith 
9189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9199566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
9209566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
921ebada01fSBarry Smith   PetscFunctionReturn(0);
922ebada01fSBarry Smith }
923ebada01fSBarry Smith 
9246679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
9259371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) {
9266679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
9276679dcc1SBarry Smith   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
9286679dcc1SBarry Smith   const PetscScalar *x, *xb;
9296679dcc1SBarry Smith   PetscScalar       *zarray, xv;
9306679dcc1SBarry Smith   const MatScalar   *v;
9316679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
9326679dcc1SBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
9336679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
9346679dcc1SBarry Smith 
9356679dcc1SBarry Smith   PetscFunctionBegin;
9369566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9379566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
9386679dcc1SBarry Smith 
9396679dcc1SBarry Smith   v = a->a;
9406679dcc1SBarry Smith   if (usecprow) {
9416679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
9426679dcc1SBarry Smith     ii   = a->compressedrow.i;
9436679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
9449566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
9456679dcc1SBarry Smith   } else {
9466679dcc1SBarry Smith     mbs = a->mbs;
9476679dcc1SBarry Smith     ii  = a->i;
9486679dcc1SBarry Smith     z   = zarray;
9496679dcc1SBarry Smith   }
9506679dcc1SBarry Smith 
9516679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
9526679dcc1SBarry Smith     n     = ii[i + 1] - ii[i];
9536679dcc1SBarry Smith     idx   = ij + ii[i];
9549371c9d4SSatish Balay     sum1  = 0.0;
9559371c9d4SSatish Balay     sum2  = 0.0;
9569371c9d4SSatish Balay     sum3  = 0.0;
9579371c9d4SSatish Balay     sum4  = 0.0;
9589371c9d4SSatish Balay     sum5  = 0.0;
9599371c9d4SSatish Balay     sum6  = 0.0;
9609371c9d4SSatish Balay     sum7  = 0.0;
9619371c9d4SSatish Balay     sum8  = 0.0;
9629371c9d4SSatish Balay     sum9  = 0.0;
9639371c9d4SSatish Balay     sum10 = 0.0;
9649371c9d4SSatish Balay     sum11 = 0.0;
9659371c9d4SSatish Balay     sum12 = 0.0;
9666679dcc1SBarry Smith 
9676679dcc1SBarry Smith     for (j = 0; j < n; j++) {
9686679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
9696679dcc1SBarry Smith 
9706679dcc1SBarry Smith       for (k = 0; k < 12; k++) {
9716679dcc1SBarry Smith         xv = xb[k];
9726679dcc1SBarry Smith         sum1 += v[0] * xv;
9736679dcc1SBarry Smith         sum2 += v[1] * xv;
9746679dcc1SBarry Smith         sum3 += v[2] * xv;
9756679dcc1SBarry Smith         sum4 += v[3] * xv;
9766679dcc1SBarry Smith         sum5 += v[4] * xv;
9776679dcc1SBarry Smith         sum6 += v[5] * xv;
9786679dcc1SBarry Smith         sum7 += v[6] * xv;
9796679dcc1SBarry Smith         sum8 += v[7] * xv;
9806679dcc1SBarry Smith         sum9 += v[8] * xv;
9816679dcc1SBarry Smith         sum10 += v[9] * xv;
9826679dcc1SBarry Smith         sum11 += v[10] * xv;
9836679dcc1SBarry Smith         sum12 += v[11] * xv;
9846679dcc1SBarry Smith         v += 12;
9856679dcc1SBarry Smith       }
9866679dcc1SBarry Smith     }
9876679dcc1SBarry Smith     if (usecprow) z = zarray + 12 * ridx[i];
9889371c9d4SSatish Balay     z[0]  = sum1;
9899371c9d4SSatish Balay     z[1]  = sum2;
9909371c9d4SSatish Balay     z[2]  = sum3;
9919371c9d4SSatish Balay     z[3]  = sum4;
9929371c9d4SSatish Balay     z[4]  = sum5;
9939371c9d4SSatish Balay     z[5]  = sum6;
9949371c9d4SSatish Balay     z[6]  = sum7;
9959371c9d4SSatish Balay     z[7]  = sum8;
9969371c9d4SSatish Balay     z[8]  = sum9;
9979371c9d4SSatish Balay     z[9]  = sum10;
9989371c9d4SSatish Balay     z[10] = sum11;
9999371c9d4SSatish Balay     z[11] = sum12;
10006679dcc1SBarry Smith     if (!usecprow) z += 12;
10016679dcc1SBarry Smith   }
10029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10039566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
10049566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
10056679dcc1SBarry Smith   PetscFunctionReturn(0);
10066679dcc1SBarry Smith }
10076679dcc1SBarry Smith 
10089371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) {
10096679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
10106679dcc1SBarry Smith   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
10116679dcc1SBarry Smith   const PetscScalar *x, *xb;
10126679dcc1SBarry Smith   PetscScalar       *zarray, *yarray, xv;
10136679dcc1SBarry Smith   const MatScalar   *v;
10146679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
10156679dcc1SBarry Smith   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
10166679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
10176679dcc1SBarry Smith 
10186679dcc1SBarry Smith   PetscFunctionBegin;
10199566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10209566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
10216679dcc1SBarry Smith 
10226679dcc1SBarry Smith   v = a->a;
10236679dcc1SBarry Smith   if (usecprow) {
102448a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
10256679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
10266679dcc1SBarry Smith     ii   = a->compressedrow.i;
10276679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
10286679dcc1SBarry Smith   } else {
10296679dcc1SBarry Smith     ii = a->i;
10306679dcc1SBarry Smith     y  = yarray;
10316679dcc1SBarry Smith     z  = zarray;
10326679dcc1SBarry Smith   }
10336679dcc1SBarry Smith 
10346679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
10356679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
10366679dcc1SBarry Smith     idx = ij + ii[i];
10376679dcc1SBarry Smith 
10386679dcc1SBarry Smith     if (usecprow) {
10396679dcc1SBarry Smith       y = yarray + 12 * ridx[i];
10406679dcc1SBarry Smith       z = zarray + 12 * ridx[i];
10416679dcc1SBarry Smith     }
10429371c9d4SSatish Balay     sum1  = y[0];
10439371c9d4SSatish Balay     sum2  = y[1];
10449371c9d4SSatish Balay     sum3  = y[2];
10459371c9d4SSatish Balay     sum4  = y[3];
10469371c9d4SSatish Balay     sum5  = y[4];
10479371c9d4SSatish Balay     sum6  = y[5];
10489371c9d4SSatish Balay     sum7  = y[6];
10499371c9d4SSatish Balay     sum8  = y[7];
10509371c9d4SSatish Balay     sum9  = y[8];
10519371c9d4SSatish Balay     sum10 = y[9];
10529371c9d4SSatish Balay     sum11 = y[10];
10539371c9d4SSatish Balay     sum12 = y[11];
10546679dcc1SBarry Smith 
10556679dcc1SBarry Smith     for (j = 0; j < n; j++) {
10566679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
10576679dcc1SBarry Smith 
10586679dcc1SBarry Smith       for (k = 0; k < 12; k++) {
10596679dcc1SBarry Smith         xv = xb[k];
10606679dcc1SBarry Smith         sum1 += v[0] * xv;
10616679dcc1SBarry Smith         sum2 += v[1] * xv;
10626679dcc1SBarry Smith         sum3 += v[2] * xv;
10636679dcc1SBarry Smith         sum4 += v[3] * xv;
10646679dcc1SBarry Smith         sum5 += v[4] * xv;
10656679dcc1SBarry Smith         sum6 += v[5] * xv;
10666679dcc1SBarry Smith         sum7 += v[6] * xv;
10676679dcc1SBarry Smith         sum8 += v[7] * xv;
10686679dcc1SBarry Smith         sum9 += v[8] * xv;
10696679dcc1SBarry Smith         sum10 += v[9] * xv;
10706679dcc1SBarry Smith         sum11 += v[10] * xv;
10716679dcc1SBarry Smith         sum12 += v[11] * xv;
10726679dcc1SBarry Smith         v += 12;
10736679dcc1SBarry Smith       }
10746679dcc1SBarry Smith     }
10756679dcc1SBarry Smith 
10769371c9d4SSatish Balay     z[0]  = sum1;
10779371c9d4SSatish Balay     z[1]  = sum2;
10789371c9d4SSatish Balay     z[2]  = sum3;
10799371c9d4SSatish Balay     z[3]  = sum4;
10809371c9d4SSatish Balay     z[4]  = sum5;
10819371c9d4SSatish Balay     z[5]  = sum6;
10829371c9d4SSatish Balay     z[6]  = sum7;
10839371c9d4SSatish Balay     z[7]  = sum8;
10849371c9d4SSatish Balay     z[8]  = sum9;
10859371c9d4SSatish Balay     z[9]  = sum10;
10869371c9d4SSatish Balay     z[10] = sum11;
10879371c9d4SSatish Balay     z[11] = sum12;
10886679dcc1SBarry Smith     if (!usecprow) {
10896679dcc1SBarry Smith       y += 12;
10906679dcc1SBarry Smith       z += 12;
10916679dcc1SBarry Smith     }
10926679dcc1SBarry Smith   }
10939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10949566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
10959566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
10966679dcc1SBarry Smith   PetscFunctionReturn(0);
10976679dcc1SBarry Smith }
10986679dcc1SBarry Smith 
10996679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
11009371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) {
11016679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
11026679dcc1SBarry Smith   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
11036679dcc1SBarry Smith   const PetscScalar *x, *xb;
11046679dcc1SBarry Smith   PetscScalar        x1, x2, x3, x4, *zarray;
11056679dcc1SBarry Smith   const MatScalar   *v;
11066679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
11076679dcc1SBarry Smith   PetscInt           mbs, i, j, n;
11086679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
11096679dcc1SBarry Smith 
11106679dcc1SBarry Smith   PetscFunctionBegin;
11119566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
11129566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
11136679dcc1SBarry Smith 
11146679dcc1SBarry Smith   v = a->a;
11156679dcc1SBarry Smith   if (usecprow) {
11166679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
11176679dcc1SBarry Smith     ii   = a->compressedrow.i;
11186679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
11199566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
11206679dcc1SBarry Smith   } else {
11216679dcc1SBarry Smith     mbs = a->mbs;
11226679dcc1SBarry Smith     ii  = a->i;
11236679dcc1SBarry Smith     z   = zarray;
11246679dcc1SBarry Smith   }
11256679dcc1SBarry Smith 
11266679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
11276679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
11286679dcc1SBarry Smith     idx = ij + ii[i];
11296679dcc1SBarry Smith 
11306679dcc1SBarry Smith     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
11316679dcc1SBarry Smith     for (j = 0; j < n; j++) {
11326679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
11339371c9d4SSatish Balay       x1 = xb[0];
11349371c9d4SSatish Balay       x2 = xb[1];
11359371c9d4SSatish Balay       x3 = xb[2];
11369371c9d4SSatish Balay       x4 = xb[3];
11376679dcc1SBarry Smith 
11386679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11396679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11406679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11416679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11426679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11436679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11446679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11456679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11466679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11476679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11486679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11496679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11506679dcc1SBarry Smith       v += 48;
11516679dcc1SBarry Smith 
11529371c9d4SSatish Balay       x1 = xb[4];
11539371c9d4SSatish Balay       x2 = xb[5];
11549371c9d4SSatish Balay       x3 = xb[6];
11559371c9d4SSatish Balay       x4 = xb[7];
11566679dcc1SBarry Smith 
11576679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11586679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11596679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11606679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11616679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11626679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11636679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11646679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11656679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11666679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11676679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11686679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11696679dcc1SBarry Smith       v += 48;
11706679dcc1SBarry Smith 
11719371c9d4SSatish Balay       x1 = xb[8];
11729371c9d4SSatish Balay       x2 = xb[9];
11739371c9d4SSatish Balay       x3 = xb[10];
11749371c9d4SSatish Balay       x4 = xb[11];
11756679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11766679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11776679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11786679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11796679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11806679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11816679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11826679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11836679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11846679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11856679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11866679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11876679dcc1SBarry Smith       v += 48;
11886679dcc1SBarry Smith     }
11896679dcc1SBarry Smith     if (usecprow) z = zarray + 12 * ridx[i];
11909371c9d4SSatish Balay     z[0]  = sum1;
11919371c9d4SSatish Balay     z[1]  = sum2;
11929371c9d4SSatish Balay     z[2]  = sum3;
11939371c9d4SSatish Balay     z[3]  = sum4;
11949371c9d4SSatish Balay     z[4]  = sum5;
11959371c9d4SSatish Balay     z[5]  = sum6;
11969371c9d4SSatish Balay     z[6]  = sum7;
11979371c9d4SSatish Balay     z[7]  = sum8;
11989371c9d4SSatish Balay     z[8]  = sum9;
11999371c9d4SSatish Balay     z[9]  = sum10;
12009371c9d4SSatish Balay     z[10] = sum11;
12019371c9d4SSatish Balay     z[11] = sum12;
12026679dcc1SBarry Smith     if (!usecprow) z += 12;
12036679dcc1SBarry Smith   }
12049566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
12059566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
12069566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
12076679dcc1SBarry Smith   PetscFunctionReturn(0);
12086679dcc1SBarry Smith }
12096679dcc1SBarry Smith 
12106679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
12119371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) {
12126679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
12136679dcc1SBarry Smith   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
12146679dcc1SBarry Smith   const PetscScalar *x, *xb;
12156679dcc1SBarry Smith   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
12166679dcc1SBarry Smith   const MatScalar   *v;
12176679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
12186679dcc1SBarry Smith   PetscInt           mbs      = a->mbs, i, j, n;
12196679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
12206679dcc1SBarry Smith 
12216679dcc1SBarry Smith   PetscFunctionBegin;
12229566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
12239566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
12246679dcc1SBarry Smith 
12256679dcc1SBarry Smith   v = a->a;
12266679dcc1SBarry Smith   if (usecprow) {
122748a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
12286679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
12296679dcc1SBarry Smith     ii   = a->compressedrow.i;
12306679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
12316679dcc1SBarry Smith   } else {
12326679dcc1SBarry Smith     ii = a->i;
12336679dcc1SBarry Smith     y  = yarray;
12346679dcc1SBarry Smith     z  = zarray;
12356679dcc1SBarry Smith   }
12366679dcc1SBarry Smith 
12376679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
12386679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
12396679dcc1SBarry Smith     idx = ij + ii[i];
12406679dcc1SBarry Smith 
12416679dcc1SBarry Smith     if (usecprow) {
12426679dcc1SBarry Smith       y = yarray + 12 * ridx[i];
12436679dcc1SBarry Smith       z = zarray + 12 * ridx[i];
12446679dcc1SBarry Smith     }
12459371c9d4SSatish Balay     sum1  = y[0];
12469371c9d4SSatish Balay     sum2  = y[1];
12479371c9d4SSatish Balay     sum3  = y[2];
12489371c9d4SSatish Balay     sum4  = y[3];
12499371c9d4SSatish Balay     sum5  = y[4];
12509371c9d4SSatish Balay     sum6  = y[5];
12519371c9d4SSatish Balay     sum7  = y[6];
12529371c9d4SSatish Balay     sum8  = y[7];
12539371c9d4SSatish Balay     sum9  = y[8];
12549371c9d4SSatish Balay     sum10 = y[9];
12559371c9d4SSatish Balay     sum11 = y[10];
12569371c9d4SSatish Balay     sum12 = y[11];
12576679dcc1SBarry Smith 
12586679dcc1SBarry Smith     for (j = 0; j < n; j++) {
12596679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
12609371c9d4SSatish Balay       x1 = xb[0];
12619371c9d4SSatish Balay       x2 = xb[1];
12629371c9d4SSatish Balay       x3 = xb[2];
12639371c9d4SSatish Balay       x4 = xb[3];
12646679dcc1SBarry Smith 
12656679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
12666679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
12676679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
12686679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
12696679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
12706679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
12716679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
12726679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
12736679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
12746679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
12756679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
12766679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
12776679dcc1SBarry Smith       v += 48;
12786679dcc1SBarry Smith 
12799371c9d4SSatish Balay       x1 = xb[4];
12809371c9d4SSatish Balay       x2 = xb[5];
12819371c9d4SSatish Balay       x3 = xb[6];
12829371c9d4SSatish Balay       x4 = xb[7];
12836679dcc1SBarry Smith 
12846679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
12856679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
12866679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
12876679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
12886679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
12896679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
12906679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
12916679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
12926679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
12936679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
12946679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
12956679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
12966679dcc1SBarry Smith       v += 48;
12976679dcc1SBarry Smith 
12989371c9d4SSatish Balay       x1 = xb[8];
12999371c9d4SSatish Balay       x2 = xb[9];
13009371c9d4SSatish Balay       x3 = xb[10];
13019371c9d4SSatish Balay       x4 = xb[11];
13026679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
13036679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
13046679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
13056679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
13066679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
13076679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
13086679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
13096679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
13106679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
13116679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
13126679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
13136679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
13146679dcc1SBarry Smith       v += 48;
13156679dcc1SBarry Smith     }
13169371c9d4SSatish Balay     z[0]  = sum1;
13179371c9d4SSatish Balay     z[1]  = sum2;
13189371c9d4SSatish Balay     z[2]  = sum3;
13199371c9d4SSatish Balay     z[3]  = sum4;
13209371c9d4SSatish Balay     z[4]  = sum5;
13219371c9d4SSatish Balay     z[5]  = sum6;
13229371c9d4SSatish Balay     z[6]  = sum7;
13239371c9d4SSatish Balay     z[7]  = sum8;
13249371c9d4SSatish Balay     z[8]  = sum9;
13259371c9d4SSatish Balay     z[9]  = sum10;
13269371c9d4SSatish Balay     z[10] = sum11;
13279371c9d4SSatish Balay     z[11] = sum12;
13286679dcc1SBarry Smith     if (!usecprow) {
13296679dcc1SBarry Smith       y += 12;
13306679dcc1SBarry Smith       z += 12;
13316679dcc1SBarry Smith     }
13326679dcc1SBarry Smith   }
13339566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
13349566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
13359566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
13366679dcc1SBarry Smith   PetscFunctionReturn(0);
13376679dcc1SBarry Smith }
13386679dcc1SBarry Smith 
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
/*
  MatMult_SeqBAIJ_12_AVX2 - computes zz = A * xx for a SeqBAIJ matrix whose blocks are 12 x 12,
  using 256-bit fused multiply-add intrinsics.

  Input:
    A  - the matrix; A->data is read as a Mat_SeqBAIJ
    xx - the vector being multiplied
  Output:
    zz - the product

  The 12 results of a block row live in three __m256d accumulators holding rows 0-3, 4-7 and 8-11.
  Each block column (12 consecutive stored values) is loaded as three unaligned 4-wide pieces and
  multiplied by the broadcast entry of xx that belongs to that column.
*/
PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) {
  Mat_SeqBAIJ       *a        = (Mat_SeqBAIJ *)A->data;
  const PetscBool    usecprow = a->compressedrow.use;
  const PetscInt     bs = 12, bs2 = 144;
  const MatScalar   *blk = a->a;
  const PetscInt    *col = a->j; /* advances by one for every block processed, across all rows */
  const PetscInt    *rowptr, *rowmap = NULL;
  const PetscScalar *x, *xblk;
  PetscScalar       *zarray, *z;
  PetscInt           nrows, nblk, i, j, c;
  __m256d            acc0, acc1, acc2, xc;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  if (usecprow) {
    nrows  = a->compressedrow.nrows;
    rowptr = a->compressedrow.i;
    rowmap = a->compressedrow.rindex;
    /* the loop below stores only to the block rows named in rowmap, so clear all of zz first */
    PetscCall(PetscArrayzero(zarray, bs * a->mbs));
  } else {
    nrows  = a->mbs;
    rowptr = a->i;
  }

  for (i = 0; i < nrows; i++) {
    acc0 = _mm256_setzero_pd();
    acc1 = _mm256_setzero_pd();
    acc2 = _mm256_setzero_pd();

    nblk = rowptr[i + 1] - rowptr[i];
    for (j = 0; j < nblk; j++) {
      xblk = x + bs * (*col++);

      /* one pass per block column, first to twelfth */
      for (c = 0; c < bs; c++) {
        xc   = _mm256_set1_pd(xblk[c]);
        acc0 = _mm256_fmadd_pd(_mm256_loadu_pd(blk + bs * c), xc, acc0);
        acc1 = _mm256_fmadd_pd(_mm256_loadu_pd(blk + bs * c + 4), xc, acc1);
        acc2 = _mm256_fmadd_pd(_mm256_loadu_pd(blk + bs * c + 8), xc, acc2);
      }

      blk += bs2;
    }

    /* compressed rows are scattered through rowmap; otherwise block row i of zz is written directly */
    z = zarray + bs * (usecprow ? rowmap[i] : i);
    _mm256_storeu_pd(&z[0], acc0);
    _mm256_storeu_pd(&z[4], acc1);
    _mm256_storeu_pd(&z[8], acc2);
  }

  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
  PetscFunctionReturn(0);
}
#endif
15016679dcc1SBarry Smith 
15028ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1503832cc040SShri Abhyankar /* Default MatMult for block size 15 */
15049371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) {
15058ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1506f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
15078ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
150853ef36baSBarry Smith   PetscScalar       *zarray, xv;
15098ab949d8SShri Abhyankar   const MatScalar   *v;
15108ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
15117c565772SBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1512ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
15138ab949d8SShri Abhyankar 
15148ab949d8SShri Abhyankar   PetscFunctionBegin;
15159566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
15169566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
15178ab949d8SShri Abhyankar 
15188ab949d8SShri Abhyankar   v = a->a;
15198ab949d8SShri Abhyankar   if (usecprow) {
15208ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
15218ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
15228ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
15239566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
15248ab949d8SShri Abhyankar   } else {
15258ab949d8SShri Abhyankar     mbs = a->mbs;
15268ab949d8SShri Abhyankar     ii  = a->i;
15278ab949d8SShri Abhyankar     z   = zarray;
15288ab949d8SShri Abhyankar   }
15298ab949d8SShri Abhyankar 
15308ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
15318ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
15328ab949d8SShri Abhyankar     idx   = ij + ii[i];
15339371c9d4SSatish Balay     sum1  = 0.0;
15349371c9d4SSatish Balay     sum2  = 0.0;
15359371c9d4SSatish Balay     sum3  = 0.0;
15369371c9d4SSatish Balay     sum4  = 0.0;
15379371c9d4SSatish Balay     sum5  = 0.0;
15389371c9d4SSatish Balay     sum6  = 0.0;
15399371c9d4SSatish Balay     sum7  = 0.0;
15409371c9d4SSatish Balay     sum8  = 0.0;
15419371c9d4SSatish Balay     sum9  = 0.0;
15429371c9d4SSatish Balay     sum10 = 0.0;
15439371c9d4SSatish Balay     sum11 = 0.0;
15449371c9d4SSatish Balay     sum12 = 0.0;
15459371c9d4SSatish Balay     sum13 = 0.0;
15469371c9d4SSatish Balay     sum14 = 0.0;
15479371c9d4SSatish Balay     sum15 = 0.0;
15488ab949d8SShri Abhyankar 
15498ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
15508ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
15518ab949d8SShri Abhyankar 
15528ab949d8SShri Abhyankar       for (k = 0; k < 15; k++) {
155353ef36baSBarry Smith         xv = xb[k];
155453ef36baSBarry Smith         sum1 += v[0] * xv;
155553ef36baSBarry Smith         sum2 += v[1] * xv;
155653ef36baSBarry Smith         sum3 += v[2] * xv;
155753ef36baSBarry Smith         sum4 += v[3] * xv;
155853ef36baSBarry Smith         sum5 += v[4] * xv;
155953ef36baSBarry Smith         sum6 += v[5] * xv;
156053ef36baSBarry Smith         sum7 += v[6] * xv;
156153ef36baSBarry Smith         sum8 += v[7] * xv;
156253ef36baSBarry Smith         sum9 += v[8] * xv;
156353ef36baSBarry Smith         sum10 += v[9] * xv;
156453ef36baSBarry Smith         sum11 += v[10] * xv;
156553ef36baSBarry Smith         sum12 += v[11] * xv;
156653ef36baSBarry Smith         sum13 += v[12] * xv;
156753ef36baSBarry Smith         sum14 += v[13] * xv;
156853ef36baSBarry Smith         sum15 += v[14] * xv;
15698ab949d8SShri Abhyankar         v += 15;
15708ab949d8SShri Abhyankar       }
15718ab949d8SShri Abhyankar     }
15728ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
15739371c9d4SSatish Balay     z[0]  = sum1;
15749371c9d4SSatish Balay     z[1]  = sum2;
15759371c9d4SSatish Balay     z[2]  = sum3;
15769371c9d4SSatish Balay     z[3]  = sum4;
15779371c9d4SSatish Balay     z[4]  = sum5;
15789371c9d4SSatish Balay     z[5]  = sum6;
15799371c9d4SSatish Balay     z[6]  = sum7;
15809371c9d4SSatish Balay     z[7]  = sum8;
15819371c9d4SSatish Balay     z[8]  = sum9;
15829371c9d4SSatish Balay     z[9]  = sum10;
15839371c9d4SSatish Balay     z[10] = sum11;
15849371c9d4SSatish Balay     z[11] = sum12;
15859371c9d4SSatish Balay     z[12] = sum13;
15869371c9d4SSatish Balay     z[13] = sum14;
15879371c9d4SSatish Balay     z[14] = sum15;
15888ab949d8SShri Abhyankar 
15898ab949d8SShri Abhyankar     if (!usecprow) z += 15;
15908ab949d8SShri Abhyankar   }
15918ab949d8SShri Abhyankar 
15929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
15939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
15949566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
15958ab949d8SShri Abhyankar   PetscFunctionReturn(0);
15968ab949d8SShri Abhyankar }
15978ab949d8SShri Abhyankar 
15988ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
15999371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) {
16008ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1601f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
16028ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
16030b8f6341SShri Abhyankar   PetscScalar        x1, x2, x3, x4, *zarray;
16048ab949d8SShri Abhyankar   const MatScalar   *v;
16058ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
16067c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1607ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
16088ab949d8SShri Abhyankar 
16098ab949d8SShri Abhyankar   PetscFunctionBegin;
16109566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
16119566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
16128ab949d8SShri Abhyankar 
16138ab949d8SShri Abhyankar   v = a->a;
16148ab949d8SShri Abhyankar   if (usecprow) {
16158ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
16168ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
16178ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
16189566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
16198ab949d8SShri Abhyankar   } else {
16208ab949d8SShri Abhyankar     mbs = a->mbs;
16218ab949d8SShri Abhyankar     ii  = a->i;
16228ab949d8SShri Abhyankar     z   = zarray;
16238ab949d8SShri Abhyankar   }
16248ab949d8SShri Abhyankar 
16258ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
16268ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
16278ab949d8SShri Abhyankar     idx   = ij + ii[i];
16289371c9d4SSatish Balay     sum1  = 0.0;
16299371c9d4SSatish Balay     sum2  = 0.0;
16309371c9d4SSatish Balay     sum3  = 0.0;
16319371c9d4SSatish Balay     sum4  = 0.0;
16329371c9d4SSatish Balay     sum5  = 0.0;
16339371c9d4SSatish Balay     sum6  = 0.0;
16349371c9d4SSatish Balay     sum7  = 0.0;
16359371c9d4SSatish Balay     sum8  = 0.0;
16369371c9d4SSatish Balay     sum9  = 0.0;
16379371c9d4SSatish Balay     sum10 = 0.0;
16389371c9d4SSatish Balay     sum11 = 0.0;
16399371c9d4SSatish Balay     sum12 = 0.0;
16409371c9d4SSatish Balay     sum13 = 0.0;
16419371c9d4SSatish Balay     sum14 = 0.0;
16429371c9d4SSatish Balay     sum15 = 0.0;
16438ab949d8SShri Abhyankar 
16448ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
16458ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
16469371c9d4SSatish Balay       x1 = xb[0];
16479371c9d4SSatish Balay       x2 = xb[1];
16489371c9d4SSatish Balay       x3 = xb[2];
16499371c9d4SSatish Balay       x4 = xb[3];
16508ab949d8SShri Abhyankar 
16518ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16528ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16538ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16548ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16558ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16568ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
16578ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
16588ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
16598ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
16608ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
16618ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
16628ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
16638ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
16648ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
16658ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
16668ab949d8SShri Abhyankar 
16678ab949d8SShri Abhyankar       v += 60;
16688ab949d8SShri Abhyankar 
16699371c9d4SSatish Balay       x1 = xb[4];
16709371c9d4SSatish Balay       x2 = xb[5];
16719371c9d4SSatish Balay       x3 = xb[6];
16729371c9d4SSatish Balay       x4 = xb[7];
16738ab949d8SShri Abhyankar 
16748ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16758ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16768ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16778ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16788ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16798ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
16808ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
16818ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
16828ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
16838ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
16848ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
16858ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
16868ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
16878ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
16888ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
16898ab949d8SShri Abhyankar       v += 60;
16908ab949d8SShri Abhyankar 
16919371c9d4SSatish Balay       x1 = xb[8];
16929371c9d4SSatish Balay       x2 = xb[9];
16939371c9d4SSatish Balay       x3 = xb[10];
16949371c9d4SSatish Balay       x4 = xb[11];
16950b8f6341SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16960b8f6341SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16970b8f6341SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16980b8f6341SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16990b8f6341SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
17000b8f6341SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
17010b8f6341SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
17020b8f6341SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
17030b8f6341SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
17040b8f6341SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
17050b8f6341SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
17060b8f6341SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
17070b8f6341SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
17080b8f6341SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
17090b8f6341SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
17100b8f6341SShri Abhyankar       v += 60;
17110b8f6341SShri Abhyankar 
17129371c9d4SSatish Balay       x1 = xb[12];
17139371c9d4SSatish Balay       x2 = xb[13];
17149371c9d4SSatish Balay       x3 = xb[14];
17158ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
17168ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
17178ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
17188ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
17198ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
17208ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
17218ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
17228ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
17238ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
17248ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
17258ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
17268ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
17278ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
17288ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
17298ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
17308ab949d8SShri Abhyankar       v += 45;
17318ab949d8SShri Abhyankar     }
17328ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
17339371c9d4SSatish Balay     z[0]  = sum1;
17349371c9d4SSatish Balay     z[1]  = sum2;
17359371c9d4SSatish Balay     z[2]  = sum3;
17369371c9d4SSatish Balay     z[3]  = sum4;
17379371c9d4SSatish Balay     z[4]  = sum5;
17389371c9d4SSatish Balay     z[5]  = sum6;
17399371c9d4SSatish Balay     z[6]  = sum7;
17409371c9d4SSatish Balay     z[7]  = sum8;
17419371c9d4SSatish Balay     z[8]  = sum9;
17429371c9d4SSatish Balay     z[9]  = sum10;
17439371c9d4SSatish Balay     z[10] = sum11;
17449371c9d4SSatish Balay     z[11] = sum12;
17459371c9d4SSatish Balay     z[12] = sum13;
17469371c9d4SSatish Balay     z[13] = sum14;
17479371c9d4SSatish Balay     z[14] = sum15;
17488ab949d8SShri Abhyankar 
17498ab949d8SShri Abhyankar     if (!usecprow) z += 15;
17508ab949d8SShri Abhyankar   }
17518ab949d8SShri Abhyankar 
17529566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
17539566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
17549566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
17558ab949d8SShri Abhyankar   PetscFunctionReturn(0);
17568ab949d8SShri Abhyankar }
17578ab949d8SShri Abhyankar 
17588ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
17599371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) {
17608ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1761f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
17628ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
17630b8f6341SShri Abhyankar   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
17648ab949d8SShri Abhyankar   const MatScalar   *v;
17658ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
17667c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1767ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
17688ab949d8SShri Abhyankar 
17698ab949d8SShri Abhyankar   PetscFunctionBegin;
17709566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
17719566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
17728ab949d8SShri Abhyankar 
17738ab949d8SShri Abhyankar   v = a->a;
17748ab949d8SShri Abhyankar   if (usecprow) {
17758ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
17768ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
17778ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
17789566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
17798ab949d8SShri Abhyankar   } else {
17808ab949d8SShri Abhyankar     mbs = a->mbs;
17818ab949d8SShri Abhyankar     ii  = a->i;
17828ab949d8SShri Abhyankar     z   = zarray;
17838ab949d8SShri Abhyankar   }
17848ab949d8SShri Abhyankar 
17858ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
17868ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
17878ab949d8SShri Abhyankar     idx   = ij + ii[i];
17889371c9d4SSatish Balay     sum1  = 0.0;
17899371c9d4SSatish Balay     sum2  = 0.0;
17909371c9d4SSatish Balay     sum3  = 0.0;
17919371c9d4SSatish Balay     sum4  = 0.0;
17929371c9d4SSatish Balay     sum5  = 0.0;
17939371c9d4SSatish Balay     sum6  = 0.0;
17949371c9d4SSatish Balay     sum7  = 0.0;
17959371c9d4SSatish Balay     sum8  = 0.0;
17969371c9d4SSatish Balay     sum9  = 0.0;
17979371c9d4SSatish Balay     sum10 = 0.0;
17989371c9d4SSatish Balay     sum11 = 0.0;
17999371c9d4SSatish Balay     sum12 = 0.0;
18009371c9d4SSatish Balay     sum13 = 0.0;
18019371c9d4SSatish Balay     sum14 = 0.0;
18029371c9d4SSatish Balay     sum15 = 0.0;
18038ab949d8SShri Abhyankar 
18048ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
18058ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
18069371c9d4SSatish Balay       x1 = xb[0];
18079371c9d4SSatish Balay       x2 = xb[1];
18089371c9d4SSatish Balay       x3 = xb[2];
18099371c9d4SSatish Balay       x4 = xb[3];
18109371c9d4SSatish Balay       x5 = xb[4];
18119371c9d4SSatish Balay       x6 = xb[5];
18129371c9d4SSatish Balay       x7 = xb[6];
18130b8f6341SShri Abhyankar       x8 = xb[7];
18148ab949d8SShri Abhyankar 
18158ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
18168ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
18178ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
18188ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
18198ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
18208ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
18218ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
18228ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
18238ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
18248ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
18258ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
18268ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
18278ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
18288ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
18298ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
18308ab949d8SShri Abhyankar       v += 120;
18318ab949d8SShri Abhyankar 
18329371c9d4SSatish Balay       x1 = xb[8];
18339371c9d4SSatish Balay       x2 = xb[9];
18349371c9d4SSatish Balay       x3 = xb[10];
18359371c9d4SSatish Balay       x4 = xb[11];
18369371c9d4SSatish Balay       x5 = xb[12];
18379371c9d4SSatish Balay       x6 = xb[13];
18389371c9d4SSatish Balay       x7 = xb[14];
18390b8f6341SShri Abhyankar 
18408ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
18418ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
18428ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
18438ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
18448ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
18458ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
18468ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
18478ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
18488ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
18498ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
18508ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
18518ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
18528ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
18538ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
18548ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
18558ab949d8SShri Abhyankar       v += 105;
18568ab949d8SShri Abhyankar     }
18578ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
18589371c9d4SSatish Balay     z[0]  = sum1;
18599371c9d4SSatish Balay     z[1]  = sum2;
18609371c9d4SSatish Balay     z[2]  = sum3;
18619371c9d4SSatish Balay     z[3]  = sum4;
18629371c9d4SSatish Balay     z[4]  = sum5;
18639371c9d4SSatish Balay     z[5]  = sum6;
18649371c9d4SSatish Balay     z[6]  = sum7;
18659371c9d4SSatish Balay     z[7]  = sum8;
18669371c9d4SSatish Balay     z[8]  = sum9;
18679371c9d4SSatish Balay     z[9]  = sum10;
18689371c9d4SSatish Balay     z[10] = sum11;
18699371c9d4SSatish Balay     z[11] = sum12;
18709371c9d4SSatish Balay     z[12] = sum13;
18719371c9d4SSatish Balay     z[13] = sum14;
18729371c9d4SSatish Balay     z[14] = sum15;
18738ab949d8SShri Abhyankar 
18748ab949d8SShri Abhyankar     if (!usecprow) z += 15;
18758ab949d8SShri Abhyankar   }
18768ab949d8SShri Abhyankar 
18779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
18789566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
18799566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
18808ab949d8SShri Abhyankar   PetscFunctionReturn(0);
18818ab949d8SShri Abhyankar }
18828ab949d8SShri Abhyankar 
18838ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
18849371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) {
18858ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1886f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
18878ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
18888ab949d8SShri Abhyankar   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
18898ab949d8SShri Abhyankar   const MatScalar   *v;
18908ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
18917c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1892ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
18938ab949d8SShri Abhyankar 
18948ab949d8SShri Abhyankar   PetscFunctionBegin;
18959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
18969566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
18978ab949d8SShri Abhyankar 
18988ab949d8SShri Abhyankar   v = a->a;
18998ab949d8SShri Abhyankar   if (usecprow) {
19008ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
19018ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
19028ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
19039566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
19048ab949d8SShri Abhyankar   } else {
19058ab949d8SShri Abhyankar     mbs = a->mbs;
19068ab949d8SShri Abhyankar     ii  = a->i;
19078ab949d8SShri Abhyankar     z   = zarray;
19088ab949d8SShri Abhyankar   }
19098ab949d8SShri Abhyankar 
19108ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
19118ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
19128ab949d8SShri Abhyankar     idx   = ij + ii[i];
19139371c9d4SSatish Balay     sum1  = 0.0;
19149371c9d4SSatish Balay     sum2  = 0.0;
19159371c9d4SSatish Balay     sum3  = 0.0;
19169371c9d4SSatish Balay     sum4  = 0.0;
19179371c9d4SSatish Balay     sum5  = 0.0;
19189371c9d4SSatish Balay     sum6  = 0.0;
19199371c9d4SSatish Balay     sum7  = 0.0;
19209371c9d4SSatish Balay     sum8  = 0.0;
19219371c9d4SSatish Balay     sum9  = 0.0;
19229371c9d4SSatish Balay     sum10 = 0.0;
19239371c9d4SSatish Balay     sum11 = 0.0;
19249371c9d4SSatish Balay     sum12 = 0.0;
19259371c9d4SSatish Balay     sum13 = 0.0;
19269371c9d4SSatish Balay     sum14 = 0.0;
19279371c9d4SSatish Balay     sum15 = 0.0;
19288ab949d8SShri Abhyankar 
19298ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
19308ab949d8SShri Abhyankar       xb  = x + 15 * (idx[j]);
19319371c9d4SSatish Balay       x1  = xb[0];
19329371c9d4SSatish Balay       x2  = xb[1];
19339371c9d4SSatish Balay       x3  = xb[2];
19349371c9d4SSatish Balay       x4  = xb[3];
19359371c9d4SSatish Balay       x5  = xb[4];
19369371c9d4SSatish Balay       x6  = xb[5];
19379371c9d4SSatish Balay       x7  = xb[6];
19389371c9d4SSatish Balay       x8  = xb[7];
19399371c9d4SSatish Balay       x9  = xb[8];
19409371c9d4SSatish Balay       x10 = xb[9];
19419371c9d4SSatish Balay       x11 = xb[10];
19429371c9d4SSatish Balay       x12 = xb[11];
19439371c9d4SSatish Balay       x13 = xb[12];
19449371c9d4SSatish Balay       x14 = xb[13];
19459371c9d4SSatish Balay       x15 = xb[14];
19468ab949d8SShri Abhyankar 
19478ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
19488ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
19498ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
19508ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
19518ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
19528ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
19538ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
19548ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
19558ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
19568ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
19578ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
19588ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
19598ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
19608ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
19618ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
19628ab949d8SShri Abhyankar       v += 225;
19638ab949d8SShri Abhyankar     }
19648ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
19659371c9d4SSatish Balay     z[0]  = sum1;
19669371c9d4SSatish Balay     z[1]  = sum2;
19679371c9d4SSatish Balay     z[2]  = sum3;
19689371c9d4SSatish Balay     z[3]  = sum4;
19699371c9d4SSatish Balay     z[4]  = sum5;
19709371c9d4SSatish Balay     z[5]  = sum6;
19719371c9d4SSatish Balay     z[6]  = sum7;
19729371c9d4SSatish Balay     z[7]  = sum8;
19739371c9d4SSatish Balay     z[8]  = sum9;
19749371c9d4SSatish Balay     z[9]  = sum10;
19759371c9d4SSatish Balay     z[10] = sum11;
19769371c9d4SSatish Balay     z[11] = sum12;
19779371c9d4SSatish Balay     z[12] = sum13;
19789371c9d4SSatish Balay     z[13] = sum14;
19799371c9d4SSatish Balay     z[14] = sum15;
19808ab949d8SShri Abhyankar 
19818ab949d8SShri Abhyankar     if (!usecprow) z += 15;
19828ab949d8SShri Abhyankar   }
19838ab949d8SShri Abhyankar 
19849566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
19859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
19869566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
19878ab949d8SShri Abhyankar   PetscFunctionReturn(0);
19888ab949d8SShri Abhyankar }
19898ab949d8SShri Abhyankar 
19903f1db9ecSBarry Smith /*
19913f1db9ecSBarry Smith     This will not work with MatScalar == float because it calls the BLAS
19923f1db9ecSBarry Smith */
19939371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) {
19942d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1995f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
1996d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1997d9ca1df4SBarry Smith   const MatScalar   *v;
1998d9ca1df4SBarry Smith   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1999d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2000d9ca1df4SBarry Smith   PetscInt           ncols, k;
2001ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
20022d61bbb3SSatish Balay 
20032d61bbb3SSatish Balay   PetscFunctionBegin;
20049566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
20059566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
20062d61bbb3SSatish Balay 
20072d61bbb3SSatish Balay   idx = a->j;
20082d61bbb3SSatish Balay   v   = a->a;
200926e093fcSHong Zhang   if (usecprow) {
201026e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
201126e093fcSHong Zhang     ii   = a->compressedrow.i;
20127b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
20139566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
201426e093fcSHong Zhang   } else {
201526e093fcSHong Zhang     mbs = a->mbs;
20162d61bbb3SSatish Balay     ii  = a->i;
201726e093fcSHong Zhang     z   = zarray;
201826e093fcSHong Zhang   }
2019218c64b6SSatish Balay 
20202d61bbb3SSatish Balay   if (!a->mult_work) {
2021d0f46423SBarry Smith     k = PetscMax(A->rmap->n, A->cmap->n);
20229566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
20232d61bbb3SSatish Balay   }
20242d61bbb3SSatish Balay   work = a->mult_work;
20252d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
20269371c9d4SSatish Balay     n = ii[1] - ii[0];
20279371c9d4SSatish Balay     ii++;
20282d61bbb3SSatish Balay     ncols = n * bs;
20292d61bbb3SSatish Balay     workt = work;
20302d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
20312d61bbb3SSatish Balay       xb = x + bs * (*idx++);
20322d61bbb3SSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
20332d61bbb3SSatish Balay       workt += bs;
20342d61bbb3SSatish Balay     }
20357b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs * ridx[i];
203696b95a6bSBarry Smith     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
20372d61bbb3SSatish Balay     v += n * bs2;
203826e093fcSHong Zhang     if (!usecprow) z += bs;
20392d61bbb3SSatish Balay   }
20409566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
20419566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
20429566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
20432d61bbb3SSatish Balay   PetscFunctionReturn(0);
20442d61bbb3SSatish Balay }
20452d61bbb3SSatish Balay 
20469371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) {
20472d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2048122f12eaSBarry Smith   const PetscScalar *x;
2049122f12eaSBarry Smith   PetscScalar       *y, *z, sum;
2050122f12eaSBarry Smith   const MatScalar   *v;
20517c565772SBarry Smith   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2052122f12eaSBarry Smith   const PetscInt    *idx, *ii;
2053ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
20542d61bbb3SSatish Balay 
20552d61bbb3SSatish Balay   PetscFunctionBegin;
20569566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
20579566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
20582d61bbb3SSatish Balay 
20592d61bbb3SSatish Balay   idx = a->j;
20602d61bbb3SSatish Balay   v   = a->a;
206126e093fcSHong Zhang   if (usecprow) {
206248a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
206326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
206426e093fcSHong Zhang     ii   = a->compressedrow.i;
20657b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
206626e093fcSHong Zhang   } else {
20672d61bbb3SSatish Balay     ii = a->i;
206826e093fcSHong Zhang   }
20692d61bbb3SSatish Balay 
20702d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2071122f12eaSBarry Smith     n = ii[1] - ii[0];
2072122f12eaSBarry Smith     ii++;
207326e093fcSHong Zhang     if (!usecprow) {
2074122f12eaSBarry Smith       sum = y[i];
2075122f12eaSBarry Smith     } else {
2076122f12eaSBarry Smith       sum = y[ridx[i]];
2077122f12eaSBarry Smith     }
2078444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2079444d8c10SJed Brown     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2080122f12eaSBarry Smith     PetscSparseDensePlusDot(sum, x, v, idx, n);
2081122f12eaSBarry Smith     v += n;
2082122f12eaSBarry Smith     idx += n;
2083122f12eaSBarry Smith     if (usecprow) {
2084122f12eaSBarry Smith       z[ridx[i]] = sum;
2085122f12eaSBarry Smith     } else {
2086122f12eaSBarry Smith       z[i] = sum;
208726e093fcSHong Zhang     }
20882d61bbb3SSatish Balay   }
20899566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
20909566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
20919566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
20922d61bbb3SSatish Balay   PetscFunctionReturn(0);
20932d61bbb3SSatish Balay }
20942d61bbb3SSatish Balay 
20959371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) {
20962d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2097f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2098d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
209926e093fcSHong Zhang   PetscScalar        x1, x2, *yarray, *zarray;
2100d9ca1df4SBarry Smith   const MatScalar   *v;
2101d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, j;
2102d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2103ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
21042d61bbb3SSatish Balay 
21052d61bbb3SSatish Balay   PetscFunctionBegin;
21069566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
21079566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
21082d61bbb3SSatish Balay 
21092d61bbb3SSatish Balay   idx = a->j;
21102d61bbb3SSatish Balay   v   = a->a;
211126e093fcSHong Zhang   if (usecprow) {
211248a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
211326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
211426e093fcSHong Zhang     ii   = a->compressedrow.i;
21157b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
211626e093fcSHong Zhang   } else {
21172d61bbb3SSatish Balay     ii = a->i;
211826e093fcSHong Zhang     y  = yarray;
211926e093fcSHong Zhang     z  = zarray;
212026e093fcSHong Zhang   }
21212d61bbb3SSatish Balay 
21222d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
21239371c9d4SSatish Balay     n = ii[1] - ii[0];
21249371c9d4SSatish Balay     ii++;
212526e093fcSHong Zhang     if (usecprow) {
21267b2bb3b9SHong Zhang       z = zarray + 2 * ridx[i];
21277b2bb3b9SHong Zhang       y = yarray + 2 * ridx[i];
212826e093fcSHong Zhang     }
21299371c9d4SSatish Balay     sum1 = y[0];
21309371c9d4SSatish Balay     sum2 = y[1];
2131444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2132444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
21332d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
213426fbe8dcSKarl Rupp       xb = x + 2 * (*idx++);
213526fbe8dcSKarl Rupp       x1 = xb[0];
213626fbe8dcSKarl Rupp       x2 = xb[1];
213726fbe8dcSKarl Rupp 
21382d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[2] * x2;
21392d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[3] * x2;
21402d61bbb3SSatish Balay       v += 4;
21412d61bbb3SSatish Balay     }
21429371c9d4SSatish Balay     z[0] = sum1;
21439371c9d4SSatish Balay     z[1] = sum2;
214426e093fcSHong Zhang     if (!usecprow) {
21459371c9d4SSatish Balay       z += 2;
21469371c9d4SSatish Balay       y += 2;
21472d61bbb3SSatish Balay     }
214826e093fcSHong Zhang   }
21499566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
21509566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
21519566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0 * a->nz));
21522d61bbb3SSatish Balay   PetscFunctionReturn(0);
21532d61bbb3SSatish Balay }
21542d61bbb3SSatish Balay 
21559371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) {
21562d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2157f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2158d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2159d9ca1df4SBarry Smith   const MatScalar   *v;
2160d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2161d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2162ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
21632d61bbb3SSatish Balay 
21642d61bbb3SSatish Balay   PetscFunctionBegin;
21659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
21669566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
21672d61bbb3SSatish Balay 
21682d61bbb3SSatish Balay   idx = a->j;
21692d61bbb3SSatish Balay   v   = a->a;
217026e093fcSHong Zhang   if (usecprow) {
217148a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
217226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
217326e093fcSHong Zhang     ii   = a->compressedrow.i;
21747b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
217526e093fcSHong Zhang   } else {
21762d61bbb3SSatish Balay     ii = a->i;
217726e093fcSHong Zhang     y  = yarray;
217826e093fcSHong Zhang     z  = zarray;
217926e093fcSHong Zhang   }
21802d61bbb3SSatish Balay 
21812d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
21829371c9d4SSatish Balay     n = ii[1] - ii[0];
21839371c9d4SSatish Balay     ii++;
218426e093fcSHong Zhang     if (usecprow) {
21857b2bb3b9SHong Zhang       z = zarray + 3 * ridx[i];
21867b2bb3b9SHong Zhang       y = yarray + 3 * ridx[i];
218726e093fcSHong Zhang     }
21889371c9d4SSatish Balay     sum1 = y[0];
21899371c9d4SSatish Balay     sum2 = y[1];
21909371c9d4SSatish Balay     sum3 = y[2];
2191444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2192444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
21932d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
21949371c9d4SSatish Balay       xb = x + 3 * (*idx++);
21959371c9d4SSatish Balay       x1 = xb[0];
21969371c9d4SSatish Balay       x2 = xb[1];
21979371c9d4SSatish Balay       x3 = xb[2];
21982d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
21992d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
22002d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
22012d61bbb3SSatish Balay       v += 9;
22022d61bbb3SSatish Balay     }
22039371c9d4SSatish Balay     z[0] = sum1;
22049371c9d4SSatish Balay     z[1] = sum2;
22059371c9d4SSatish Balay     z[2] = sum3;
220626e093fcSHong Zhang     if (!usecprow) {
22079371c9d4SSatish Balay       z += 3;
22089371c9d4SSatish Balay       y += 3;
22092d61bbb3SSatish Balay     }
221026e093fcSHong Zhang   }
22119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
22129566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
22139566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * a->nz));
22142d61bbb3SSatish Balay   PetscFunctionReturn(0);
22152d61bbb3SSatish Balay }
22162d61bbb3SSatish Balay 
22179371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) {
22182d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2219f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2220d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2221d9ca1df4SBarry Smith   const MatScalar   *v;
2222d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2223d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2224ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
22252d61bbb3SSatish Balay 
22262d61bbb3SSatish Balay   PetscFunctionBegin;
22279566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
22289566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
22292d61bbb3SSatish Balay 
22302d61bbb3SSatish Balay   idx = a->j;
22312d61bbb3SSatish Balay   v   = a->a;
223226e093fcSHong Zhang   if (usecprow) {
223348a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
223426e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
223526e093fcSHong Zhang     ii   = a->compressedrow.i;
22367b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
223726e093fcSHong Zhang   } else {
22382d61bbb3SSatish Balay     ii = a->i;
223926e093fcSHong Zhang     y  = yarray;
224026e093fcSHong Zhang     z  = zarray;
224126e093fcSHong Zhang   }
22422d61bbb3SSatish Balay 
22432d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
22449371c9d4SSatish Balay     n = ii[1] - ii[0];
22459371c9d4SSatish Balay     ii++;
224626e093fcSHong Zhang     if (usecprow) {
22477b2bb3b9SHong Zhang       z = zarray + 4 * ridx[i];
22487b2bb3b9SHong Zhang       y = yarray + 4 * ridx[i];
224926e093fcSHong Zhang     }
22509371c9d4SSatish Balay     sum1 = y[0];
22519371c9d4SSatish Balay     sum2 = y[1];
22529371c9d4SSatish Balay     sum3 = y[2];
22539371c9d4SSatish Balay     sum4 = y[3];
2254444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2255444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
22562d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
22572d61bbb3SSatish Balay       xb = x + 4 * (*idx++);
22589371c9d4SSatish Balay       x1 = xb[0];
22599371c9d4SSatish Balay       x2 = xb[1];
22609371c9d4SSatish Balay       x3 = xb[2];
22619371c9d4SSatish Balay       x4 = xb[3];
22622d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
22632d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
22642d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
22652d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
22662d61bbb3SSatish Balay       v += 16;
22672d61bbb3SSatish Balay     }
22689371c9d4SSatish Balay     z[0] = sum1;
22699371c9d4SSatish Balay     z[1] = sum2;
22709371c9d4SSatish Balay     z[2] = sum3;
22719371c9d4SSatish Balay     z[3] = sum4;
227226e093fcSHong Zhang     if (!usecprow) {
22739371c9d4SSatish Balay       z += 4;
22749371c9d4SSatish Balay       y += 4;
22752d61bbb3SSatish Balay     }
227626e093fcSHong Zhang   }
22779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
22789566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
22799566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * a->nz));
22802d61bbb3SSatish Balay   PetscFunctionReturn(0);
22812d61bbb3SSatish Balay }
22822d61bbb3SSatish Balay 
22839371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) {
22842d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2285f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2286d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
228726e093fcSHong Zhang   PetscScalar       *yarray, *zarray;
2288d9ca1df4SBarry Smith   const MatScalar   *v;
2289d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2290d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2291ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
22922d61bbb3SSatish Balay 
22932d61bbb3SSatish Balay   PetscFunctionBegin;
22949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
22959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
22962d61bbb3SSatish Balay 
22972d61bbb3SSatish Balay   idx = a->j;
22982d61bbb3SSatish Balay   v   = a->a;
229926e093fcSHong Zhang   if (usecprow) {
230048a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
230126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
230226e093fcSHong Zhang     ii   = a->compressedrow.i;
23037b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
230426e093fcSHong Zhang   } else {
23052d61bbb3SSatish Balay     ii = a->i;
230626e093fcSHong Zhang     y  = yarray;
230726e093fcSHong Zhang     z  = zarray;
230826e093fcSHong Zhang   }
23092d61bbb3SSatish Balay 
23102d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
23119371c9d4SSatish Balay     n = ii[1] - ii[0];
23129371c9d4SSatish Balay     ii++;
231326e093fcSHong Zhang     if (usecprow) {
23147b2bb3b9SHong Zhang       z = zarray + 5 * ridx[i];
23157b2bb3b9SHong Zhang       y = yarray + 5 * ridx[i];
231626e093fcSHong Zhang     }
23179371c9d4SSatish Balay     sum1 = y[0];
23189371c9d4SSatish Balay     sum2 = y[1];
23199371c9d4SSatish Balay     sum3 = y[2];
23209371c9d4SSatish Balay     sum4 = y[3];
23219371c9d4SSatish Balay     sum5 = y[4];
2322444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2323444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
23242d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
23252d61bbb3SSatish Balay       xb = x + 5 * (*idx++);
23269371c9d4SSatish Balay       x1 = xb[0];
23279371c9d4SSatish Balay       x2 = xb[1];
23289371c9d4SSatish Balay       x3 = xb[2];
23299371c9d4SSatish Balay       x4 = xb[3];
23309371c9d4SSatish Balay       x5 = xb[4];
23312d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
23322d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
23332d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
23342d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
23352d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
23362d61bbb3SSatish Balay       v += 25;
23372d61bbb3SSatish Balay     }
23389371c9d4SSatish Balay     z[0] = sum1;
23399371c9d4SSatish Balay     z[1] = sum2;
23409371c9d4SSatish Balay     z[2] = sum3;
23419371c9d4SSatish Balay     z[3] = sum4;
23429371c9d4SSatish Balay     z[4] = sum5;
234326e093fcSHong Zhang     if (!usecprow) {
23449371c9d4SSatish Balay       z += 5;
23459371c9d4SSatish Balay       y += 5;
23462d61bbb3SSatish Balay     }
234726e093fcSHong Zhang   }
23489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
23499566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
23509566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * a->nz));
23512d61bbb3SSatish Balay   PetscFunctionReturn(0);
23522d61bbb3SSatish Balay }
2353c2916339SPierre Jolivet 
23549371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) {
235515091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2356f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2357d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
235826e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2359d9ca1df4SBarry Smith   const MatScalar   *v;
2360d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2361d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2362ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
236315091d37SBarry Smith 
236415091d37SBarry Smith   PetscFunctionBegin;
23659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
23669566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
236715091d37SBarry Smith 
236815091d37SBarry Smith   idx = a->j;
236915091d37SBarry Smith   v   = a->a;
237026e093fcSHong Zhang   if (usecprow) {
237148a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
237226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
237326e093fcSHong Zhang     ii   = a->compressedrow.i;
23747b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
237526e093fcSHong Zhang   } else {
237615091d37SBarry Smith     ii = a->i;
237726e093fcSHong Zhang     y  = yarray;
237826e093fcSHong Zhang     z  = zarray;
237926e093fcSHong Zhang   }
238015091d37SBarry Smith 
238115091d37SBarry Smith   for (i = 0; i < mbs; i++) {
23829371c9d4SSatish Balay     n = ii[1] - ii[0];
23839371c9d4SSatish Balay     ii++;
238426e093fcSHong Zhang     if (usecprow) {
23857b2bb3b9SHong Zhang       z = zarray + 6 * ridx[i];
23867b2bb3b9SHong Zhang       y = yarray + 6 * ridx[i];
238726e093fcSHong Zhang     }
23889371c9d4SSatish Balay     sum1 = y[0];
23899371c9d4SSatish Balay     sum2 = y[1];
23909371c9d4SSatish Balay     sum3 = y[2];
23919371c9d4SSatish Balay     sum4 = y[3];
23929371c9d4SSatish Balay     sum5 = y[4];
23939371c9d4SSatish Balay     sum6 = y[5];
2394444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2395444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
239615091d37SBarry Smith     for (j = 0; j < n; j++) {
23973b95cb0eSSatish Balay       xb = x + 6 * (*idx++);
23989371c9d4SSatish Balay       x1 = xb[0];
23999371c9d4SSatish Balay       x2 = xb[1];
24009371c9d4SSatish Balay       x3 = xb[2];
24019371c9d4SSatish Balay       x4 = xb[3];
24029371c9d4SSatish Balay       x5 = xb[4];
24039371c9d4SSatish Balay       x6 = xb[5];
240415091d37SBarry Smith       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
240515091d37SBarry Smith       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
240615091d37SBarry Smith       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
240715091d37SBarry Smith       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
240815091d37SBarry Smith       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
240915091d37SBarry Smith       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
241015091d37SBarry Smith       v += 36;
241115091d37SBarry Smith     }
24129371c9d4SSatish Balay     z[0] = sum1;
24139371c9d4SSatish Balay     z[1] = sum2;
24149371c9d4SSatish Balay     z[2] = sum3;
24159371c9d4SSatish Balay     z[3] = sum4;
24169371c9d4SSatish Balay     z[4] = sum5;
24179371c9d4SSatish Balay     z[5] = sum6;
241826e093fcSHong Zhang     if (!usecprow) {
24199371c9d4SSatish Balay       z += 6;
24209371c9d4SSatish Balay       y += 6;
242115091d37SBarry Smith     }
242226e093fcSHong Zhang   }
24239566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
24249566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
24259566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * a->nz));
242615091d37SBarry Smith   PetscFunctionReturn(0);
242715091d37SBarry Smith }
24282d61bbb3SSatish Balay 
24299371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) {
24302d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2431f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2432d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
243326e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2434d9ca1df4SBarry Smith   const MatScalar   *v;
2435d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2436d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2437ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
24382d61bbb3SSatish Balay 
24392d61bbb3SSatish Balay   PetscFunctionBegin;
24409566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
24419566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
24422d61bbb3SSatish Balay 
24432d61bbb3SSatish Balay   idx = a->j;
24442d61bbb3SSatish Balay   v   = a->a;
244526e093fcSHong Zhang   if (usecprow) {
244648a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
244726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
244826e093fcSHong Zhang     ii   = a->compressedrow.i;
24497b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
245026e093fcSHong Zhang   } else {
24512d61bbb3SSatish Balay     ii = a->i;
245226e093fcSHong Zhang     y  = yarray;
245326e093fcSHong Zhang     z  = zarray;
245426e093fcSHong Zhang   }
24552d61bbb3SSatish Balay 
24562d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
24579371c9d4SSatish Balay     n = ii[1] - ii[0];
24589371c9d4SSatish Balay     ii++;
245926e093fcSHong Zhang     if (usecprow) {
24607b2bb3b9SHong Zhang       z = zarray + 7 * ridx[i];
24617b2bb3b9SHong Zhang       y = yarray + 7 * ridx[i];
246226e093fcSHong Zhang     }
24639371c9d4SSatish Balay     sum1 = y[0];
24649371c9d4SSatish Balay     sum2 = y[1];
24659371c9d4SSatish Balay     sum3 = y[2];
24669371c9d4SSatish Balay     sum4 = y[3];
24679371c9d4SSatish Balay     sum5 = y[4];
24689371c9d4SSatish Balay     sum6 = y[5];
24699371c9d4SSatish Balay     sum7 = y[6];
2470444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2471444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
24722d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
24732d61bbb3SSatish Balay       xb = x + 7 * (*idx++);
24749371c9d4SSatish Balay       x1 = xb[0];
24759371c9d4SSatish Balay       x2 = xb[1];
24769371c9d4SSatish Balay       x3 = xb[2];
24779371c9d4SSatish Balay       x4 = xb[3];
24789371c9d4SSatish Balay       x5 = xb[4];
24799371c9d4SSatish Balay       x6 = xb[5];
24809371c9d4SSatish Balay       x7 = xb[6];
24812d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
24822d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
24832d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
24842d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
24852d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
24862d61bbb3SSatish Balay       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
24872d61bbb3SSatish Balay       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
24882d61bbb3SSatish Balay       v += 49;
24892d61bbb3SSatish Balay     }
24909371c9d4SSatish Balay     z[0] = sum1;
24919371c9d4SSatish Balay     z[1] = sum2;
24929371c9d4SSatish Balay     z[2] = sum3;
24939371c9d4SSatish Balay     z[3] = sum4;
24949371c9d4SSatish Balay     z[4] = sum5;
24959371c9d4SSatish Balay     z[5] = sum6;
24969371c9d4SSatish Balay     z[6] = sum7;
249726e093fcSHong Zhang     if (!usecprow) {
24989371c9d4SSatish Balay       z += 7;
24999371c9d4SSatish Balay       y += 7;
25002d61bbb3SSatish Balay     }
250126e093fcSHong Zhang   }
25029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
25039566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
25049566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * a->nz));
25052d61bbb3SSatish Balay   PetscFunctionReturn(0);
25062d61bbb3SSatish Balay }
2507218c64b6SSatish Balay 
25085f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
25099371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) {
251096e086a2SDaniel Kokron   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2511f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
251296e086a2SDaniel Kokron   const PetscScalar *x, *xb;
251396e086a2SDaniel Kokron   const MatScalar   *v;
25146679dcc1SBarry Smith   PetscInt           mbs, i, j, n;
2515ce68d72fSJed Brown   PetscInt           k;
251696e086a2SDaniel Kokron   PetscBool          usecprow = a->compressedrow.use;
25176679dcc1SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
251896e086a2SDaniel Kokron 
251996e086a2SDaniel Kokron   __m256d a0, a1, a2, a3, a4, a5;
2520ce68d72fSJed Brown   __m256d w0, w1, w2, w3;
252196e086a2SDaniel Kokron   __m256d z0, z1, z2;
252296e086a2SDaniel Kokron   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
252396e086a2SDaniel Kokron 
252496e086a2SDaniel Kokron   PetscFunctionBegin;
25259566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
25269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
25279566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &zarray));
252896e086a2SDaniel Kokron 
252996e086a2SDaniel Kokron   idx = a->j;
253096e086a2SDaniel Kokron   v   = a->a;
253196e086a2SDaniel Kokron   if (usecprow) {
253296e086a2SDaniel Kokron     mbs  = a->compressedrow.nrows;
253396e086a2SDaniel Kokron     ii   = a->compressedrow.i;
253496e086a2SDaniel Kokron     ridx = a->compressedrow.rindex;
253596e086a2SDaniel Kokron   } else {
253696e086a2SDaniel Kokron     mbs = a->mbs;
253796e086a2SDaniel Kokron     ii  = a->i;
253896e086a2SDaniel Kokron     z   = zarray;
253996e086a2SDaniel Kokron   }
254096e086a2SDaniel Kokron 
254196e086a2SDaniel Kokron   if (!a->mult_work) {
254296e086a2SDaniel Kokron     k = PetscMax(A->rmap->n, A->cmap->n);
25439566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
254496e086a2SDaniel Kokron   }
254596e086a2SDaniel Kokron 
254696e086a2SDaniel Kokron   work = a->mult_work;
254796e086a2SDaniel Kokron   for (i = 0; i < mbs; i++) {
25489371c9d4SSatish Balay     n = ii[1] - ii[0];
25499371c9d4SSatish Balay     ii++;
255096e086a2SDaniel Kokron     workt = work;
255196e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
255296e086a2SDaniel Kokron       xb = x + bs * (*idx++);
255396e086a2SDaniel Kokron       for (k = 0; k < bs; k++) workt[k] = xb[k];
255496e086a2SDaniel Kokron       workt += bs;
255596e086a2SDaniel Kokron     }
255696e086a2SDaniel Kokron     if (usecprow) z = zarray + bs * ridx[i];
255796e086a2SDaniel Kokron 
25589371c9d4SSatish Balay     z0 = _mm256_loadu_pd(&z[0]);
25599371c9d4SSatish Balay     z1 = _mm256_loadu_pd(&z[4]);
25609371c9d4SSatish Balay     z2 = _mm256_set1_pd(z[8]);
256196e086a2SDaniel Kokron 
256296e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
2563c05b70c4SSatish Balay       /* first column of a */
256496e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9]);
25659371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81]);
25669371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
25679371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
25689371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
25699371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
25709371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
257196e086a2SDaniel Kokron 
2572c05b70c4SSatish Balay       /* second column of a */
257396e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 1]);
25749371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
25759371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
25769371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
25779371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
25789371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
25799371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
258096e086a2SDaniel Kokron 
2581c05b70c4SSatish Balay       /* third column of a */
258296e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 2]);
25839371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
25849371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w2, z0);
25859371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
25869371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w2, z1);
25879371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
25889371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w2, z2);
258996e086a2SDaniel Kokron 
2590c05b70c4SSatish Balay       /* fourth column of a */
259196e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 3]);
25929371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
25939371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w3, z0);
25949371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
25959371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w3, z1);
25969371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
25979371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w3, z2);
259896e086a2SDaniel Kokron 
2599c05b70c4SSatish Balay       /* fifth column of a */
260096e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 4]);
26019371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
26029371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w0, z0);
26039371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
26049371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w0, z1);
26059371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
26069371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w0, z2);
260796e086a2SDaniel Kokron 
2608c05b70c4SSatish Balay       /* sixth column of a */
260996e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 5]);
26109371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
26119371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
26129371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
26139371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
26149371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
26159371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
261696e086a2SDaniel Kokron 
2617c05b70c4SSatish Balay       /* seventh column of a */
261896e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 6]);
26199371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
26209371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w2, z0);
26219371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
26229371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w2, z1);
26239371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
26249371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w2, z2);
262596e086a2SDaniel Kokron 
26266aad120cSJose E. Roman       /* eighth column of a */
262796e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 7]);
26289371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
26299371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w3, z0);
26309371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
26319371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w3, z1);
26329371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
26339371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w3, z2);
263496e086a2SDaniel Kokron 
2635c05b70c4SSatish Balay       /* ninth column of a */
263696e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 8]);
26379371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
26389371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
26399371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
26409371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
26419371c9d4SSatish Balay       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
26429371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
264396e086a2SDaniel Kokron     }
264496e086a2SDaniel Kokron 
26459371c9d4SSatish Balay     _mm256_storeu_pd(&z[0], z0);
26469371c9d4SSatish Balay     _mm256_storeu_pd(&z[4], z1);
26479371c9d4SSatish Balay     _mm256_maskstore_pd(&z[8], mask1, z2);
264896e086a2SDaniel Kokron 
264996e086a2SDaniel Kokron     v += n * bs2;
265096e086a2SDaniel Kokron     if (!usecprow) z += bs;
265196e086a2SDaniel Kokron   }
26529566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
26539566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &zarray));
26549566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(162.0 * a->nz));
265596e086a2SDaniel Kokron   PetscFunctionReturn(0);
265696e086a2SDaniel Kokron }
265796e086a2SDaniel Kokron #endif
265896e086a2SDaniel Kokron 
26599371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) {
2660ebada01fSBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2661f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2662ebada01fSBarry Smith   const PetscScalar *x, *xb;
2663ebada01fSBarry Smith   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2664ebada01fSBarry Smith   const MatScalar   *v;
2665ebada01fSBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2666ebada01fSBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2667ebada01fSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2668ebada01fSBarry Smith 
2669ebada01fSBarry Smith   PetscFunctionBegin;
26709566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
26719566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2672ebada01fSBarry Smith 
2673ebada01fSBarry Smith   idx = a->j;
2674ebada01fSBarry Smith   v   = a->a;
2675ebada01fSBarry Smith   if (usecprow) {
267648a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2677ebada01fSBarry Smith     mbs  = a->compressedrow.nrows;
2678ebada01fSBarry Smith     ii   = a->compressedrow.i;
2679ebada01fSBarry Smith     ridx = a->compressedrow.rindex;
2680ebada01fSBarry Smith   } else {
2681ebada01fSBarry Smith     ii = a->i;
2682ebada01fSBarry Smith     y  = yarray;
2683ebada01fSBarry Smith     z  = zarray;
2684ebada01fSBarry Smith   }
2685ebada01fSBarry Smith 
2686ebada01fSBarry Smith   for (i = 0; i < mbs; i++) {
26879371c9d4SSatish Balay     n = ii[1] - ii[0];
26889371c9d4SSatish Balay     ii++;
2689ebada01fSBarry Smith     if (usecprow) {
2690ebada01fSBarry Smith       z = zarray + 11 * ridx[i];
2691ebada01fSBarry Smith       y = yarray + 11 * ridx[i];
2692ebada01fSBarry Smith     }
26939371c9d4SSatish Balay     sum1  = y[0];
26949371c9d4SSatish Balay     sum2  = y[1];
26959371c9d4SSatish Balay     sum3  = y[2];
26969371c9d4SSatish Balay     sum4  = y[3];
26979371c9d4SSatish Balay     sum5  = y[4];
26989371c9d4SSatish Balay     sum6  = y[5];
26999371c9d4SSatish Balay     sum7  = y[6];
27009371c9d4SSatish Balay     sum8  = y[7];
27019371c9d4SSatish Balay     sum9  = y[8];
27029371c9d4SSatish Balay     sum10 = y[9];
27039371c9d4SSatish Balay     sum11 = y[10];
2704ebada01fSBarry Smith     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2705ebada01fSBarry Smith     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2706ebada01fSBarry Smith     for (j = 0; j < n; j++) {
2707ebada01fSBarry Smith       xb  = x + 11 * (*idx++);
27089371c9d4SSatish Balay       x1  = xb[0];
27099371c9d4SSatish Balay       x2  = xb[1];
27109371c9d4SSatish Balay       x3  = xb[2];
27119371c9d4SSatish Balay       x4  = xb[3];
27129371c9d4SSatish Balay       x5  = xb[4];
27139371c9d4SSatish Balay       x6  = xb[5];
27149371c9d4SSatish Balay       x7  = xb[6];
27159371c9d4SSatish Balay       x8  = xb[7];
27169371c9d4SSatish Balay       x9  = xb[8];
27179371c9d4SSatish Balay       x10 = xb[9];
27189371c9d4SSatish Balay       x11 = xb[10];
2719ebada01fSBarry Smith       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2720ebada01fSBarry Smith       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2721ebada01fSBarry Smith       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2722ebada01fSBarry Smith       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2723ebada01fSBarry Smith       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2724ebada01fSBarry Smith       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2725ebada01fSBarry Smith       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2726ebada01fSBarry Smith       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2727ebada01fSBarry Smith       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2728ebada01fSBarry Smith       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2729ebada01fSBarry Smith       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2730ebada01fSBarry Smith       v += 121;
2731ebada01fSBarry Smith     }
27329371c9d4SSatish Balay     z[0]  = sum1;
27339371c9d4SSatish Balay     z[1]  = sum2;
27349371c9d4SSatish Balay     z[2]  = sum3;
27359371c9d4SSatish Balay     z[3]  = sum4;
27369371c9d4SSatish Balay     z[4]  = sum5;
27379371c9d4SSatish Balay     z[5]  = sum6;
27389371c9d4SSatish Balay     z[6]  = sum7;
27399371c9d4SSatish Balay     z[7]  = sum8;
27409371c9d4SSatish Balay     z[8]  = sum9;
27419371c9d4SSatish Balay     z[9]  = sum10;
27429371c9d4SSatish Balay     z[10] = sum11;
2743ebada01fSBarry Smith     if (!usecprow) {
27449371c9d4SSatish Balay       z += 11;
27459371c9d4SSatish Balay       y += 11;
2746ebada01fSBarry Smith     }
2747ebada01fSBarry Smith   }
27489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
27499566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
27509566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(242.0 * a->nz));
2751ebada01fSBarry Smith   PetscFunctionReturn(0);
2752ebada01fSBarry Smith }
2753ebada01fSBarry Smith 
27549371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) {
27552d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2756f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
2757d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2758d9ca1df4SBarry Smith   const MatScalar   *v;
2759d9ca1df4SBarry Smith   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2760d9ca1df4SBarry Smith   PetscInt           ncols, k;
2761d9ca1df4SBarry Smith   const PetscInt    *ridx     = NULL, *idx, *ii;
2762ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2763218c64b6SSatish Balay 
27642d61bbb3SSatish Balay   PetscFunctionBegin;
27659566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
27669566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
27679566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &zarray));
27682d61bbb3SSatish Balay 
27692d61bbb3SSatish Balay   idx = a->j;
27702d61bbb3SSatish Balay   v   = a->a;
277126e093fcSHong Zhang   if (usecprow) {
277226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
277326e093fcSHong Zhang     ii   = a->compressedrow.i;
27747b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
277526e093fcSHong Zhang   } else {
277626e093fcSHong Zhang     mbs = a->mbs;
27772d61bbb3SSatish Balay     ii  = a->i;
277826e093fcSHong Zhang     z   = zarray;
277926e093fcSHong Zhang   }
27802d61bbb3SSatish Balay 
27812d61bbb3SSatish Balay   if (!a->mult_work) {
2782d0f46423SBarry Smith     k = PetscMax(A->rmap->n, A->cmap->n);
27839566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
27842d61bbb3SSatish Balay   }
27852d61bbb3SSatish Balay   work = a->mult_work;
27862d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
27879371c9d4SSatish Balay     n = ii[1] - ii[0];
27889371c9d4SSatish Balay     ii++;
27892d61bbb3SSatish Balay     ncols = n * bs;
27902d61bbb3SSatish Balay     workt = work;
27912d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
27922d61bbb3SSatish Balay       xb = x + bs * (*idx++);
27932d61bbb3SSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
27942d61bbb3SSatish Balay       workt += bs;
27952d61bbb3SSatish Balay     }
27967b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs * ridx[i];
279796b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
27982d61bbb3SSatish Balay     v += n * bs2;
279926fbe8dcSKarl Rupp     if (!usecprow) z += bs;
280026e093fcSHong Zhang   }
28019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
28029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &zarray));
28039566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
28042d61bbb3SSatish Balay   PetscFunctionReturn(0);
28052d61bbb3SSatish Balay }
28062d61bbb3SSatish Balay 
28079371c9d4SSatish Balay PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) {
2808547795f9SHong Zhang   PetscScalar zero = 0.0;
2809547795f9SHong Zhang 
2810547795f9SHong Zhang   PetscFunctionBegin;
28119566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
28129566063dSJacob Faibussowitsch   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2813547795f9SHong Zhang   PetscFunctionReturn(0);
2814547795f9SHong Zhang }
2815547795f9SHong Zhang 
28169371c9d4SSatish Balay PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) {
28173447b6efSHong Zhang   PetscScalar zero = 0.0;
28182d61bbb3SSatish Balay 
28192d61bbb3SSatish Balay   PetscFunctionBegin;
28209566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
28219566063dSJacob Faibussowitsch   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
28222d61bbb3SSatish Balay   PetscFunctionReturn(0);
28232d61bbb3SSatish Balay }
28242d61bbb3SSatish Balay 
28259371c9d4SSatish Balay PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) {
2826547795f9SHong Zhang   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2827b8c08b77SHong Zhang   PetscScalar       *z, x1, x2, x3, x4, x5;
2828d9ca1df4SBarry Smith   const PetscScalar *x, *xb = NULL;
2829d9ca1df4SBarry Smith   const MatScalar   *v;
2830b8c08b77SHong Zhang   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2831d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2832547795f9SHong Zhang   Mat_CompressedRow  cprow    = a->compressedrow;
2833ace3abfcSBarry Smith   PetscBool          usecprow = cprow.use;
2834547795f9SHong Zhang 
2835547795f9SHong Zhang   PetscFunctionBegin;
28369566063dSJacob Faibussowitsch   if (yy != zz) PetscCall(VecCopy(yy, zz));
28379566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
28389566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
2839547795f9SHong Zhang 
2840547795f9SHong Zhang   idx = a->j;
2841547795f9SHong Zhang   v   = a->a;
2842547795f9SHong Zhang   if (usecprow) {
2843547795f9SHong Zhang     mbs  = cprow.nrows;
2844547795f9SHong Zhang     ii   = cprow.i;
2845547795f9SHong Zhang     ridx = cprow.rindex;
2846547795f9SHong Zhang   } else {
2847547795f9SHong Zhang     mbs = a->mbs;
2848547795f9SHong Zhang     ii  = a->i;
2849547795f9SHong Zhang     xb  = x;
2850547795f9SHong Zhang   }
2851547795f9SHong Zhang 
2852547795f9SHong Zhang   switch (bs) {
2853547795f9SHong Zhang   case 1:
2854547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2855547795f9SHong Zhang       if (usecprow) xb = x + ridx[i];
2856547795f9SHong Zhang       x1 = xb[0];
2857547795f9SHong Zhang       ib = idx + ii[0];
28589371c9d4SSatish Balay       n  = ii[1] - ii[0];
28599371c9d4SSatish Balay       ii++;
2860547795f9SHong Zhang       for (j = 0; j < n; j++) {
2861547795f9SHong Zhang         rval = ib[j];
2862547795f9SHong Zhang         z[rval] += PetscConj(*v) * x1;
2863547795f9SHong Zhang         v++;
2864547795f9SHong Zhang       }
2865547795f9SHong Zhang       if (!usecprow) xb++;
2866547795f9SHong Zhang     }
2867547795f9SHong Zhang     break;
2868547795f9SHong Zhang   case 2:
2869547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2870547795f9SHong Zhang       if (usecprow) xb = x + 2 * ridx[i];
28719371c9d4SSatish Balay       x1 = xb[0];
28729371c9d4SSatish Balay       x2 = xb[1];
2873547795f9SHong Zhang       ib = idx + ii[0];
28749371c9d4SSatish Balay       n  = ii[1] - ii[0];
28759371c9d4SSatish Balay       ii++;
2876547795f9SHong Zhang       for (j = 0; j < n; j++) {
2877547795f9SHong Zhang         rval = ib[j] * 2;
2878547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2879547795f9SHong Zhang         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2880547795f9SHong Zhang         v += 4;
2881547795f9SHong Zhang       }
2882547795f9SHong Zhang       if (!usecprow) xb += 2;
2883547795f9SHong Zhang     }
2884547795f9SHong Zhang     break;
2885547795f9SHong Zhang   case 3:
2886547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2887547795f9SHong Zhang       if (usecprow) xb = x + 3 * ridx[i];
28889371c9d4SSatish Balay       x1 = xb[0];
28899371c9d4SSatish Balay       x2 = xb[1];
28909371c9d4SSatish Balay       x3 = xb[2];
2891547795f9SHong Zhang       ib = idx + ii[0];
28929371c9d4SSatish Balay       n  = ii[1] - ii[0];
28939371c9d4SSatish Balay       ii++;
2894547795f9SHong Zhang       for (j = 0; j < n; j++) {
2895547795f9SHong Zhang         rval = ib[j] * 3;
2896547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2897547795f9SHong Zhang         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2898547795f9SHong Zhang         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2899547795f9SHong Zhang         v += 9;
2900547795f9SHong Zhang       }
2901547795f9SHong Zhang       if (!usecprow) xb += 3;
2902547795f9SHong Zhang     }
2903547795f9SHong Zhang     break;
2904547795f9SHong Zhang   case 4:
2905547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2906547795f9SHong Zhang       if (usecprow) xb = x + 4 * ridx[i];
29079371c9d4SSatish Balay       x1 = xb[0];
29089371c9d4SSatish Balay       x2 = xb[1];
29099371c9d4SSatish Balay       x3 = xb[2];
29109371c9d4SSatish Balay       x4 = xb[3];
2911547795f9SHong Zhang       ib = idx + ii[0];
29129371c9d4SSatish Balay       n  = ii[1] - ii[0];
29139371c9d4SSatish Balay       ii++;
2914547795f9SHong Zhang       for (j = 0; j < n; j++) {
2915547795f9SHong Zhang         rval = ib[j] * 4;
2916547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2917547795f9SHong Zhang         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2918547795f9SHong Zhang         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2919547795f9SHong Zhang         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2920547795f9SHong Zhang         v += 16;
2921547795f9SHong Zhang       }
2922547795f9SHong Zhang       if (!usecprow) xb += 4;
2923547795f9SHong Zhang     }
2924547795f9SHong Zhang     break;
2925547795f9SHong Zhang   case 5:
2926547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2927547795f9SHong Zhang       if (usecprow) xb = x + 5 * ridx[i];
29289371c9d4SSatish Balay       x1 = xb[0];
29299371c9d4SSatish Balay       x2 = xb[1];
29309371c9d4SSatish Balay       x3 = xb[2];
29319371c9d4SSatish Balay       x4 = xb[3];
29329371c9d4SSatish Balay       x5 = xb[4];
2933547795f9SHong Zhang       ib = idx + ii[0];
29349371c9d4SSatish Balay       n  = ii[1] - ii[0];
29359371c9d4SSatish Balay       ii++;
2936547795f9SHong Zhang       for (j = 0; j < n; j++) {
2937547795f9SHong Zhang         rval = ib[j] * 5;
2938547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2939547795f9SHong Zhang         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2940547795f9SHong Zhang         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2941547795f9SHong Zhang         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2942547795f9SHong Zhang         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2943547795f9SHong Zhang         v += 25;
2944547795f9SHong Zhang       }
2945547795f9SHong Zhang       if (!usecprow) xb += 5;
2946547795f9SHong Zhang     }
2947547795f9SHong Zhang     break;
29489371c9d4SSatish Balay   default: /* block sizes larger than 5 by 5 are handled by BLAS */ SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2949968ae2c8SSatish Balay #if 0
2950968ae2c8SSatish Balay     {
2951b8c08b77SHong Zhang       PetscInt          ncols,k,bs2=a->bs2;
2952b8c08b77SHong Zhang       PetscScalar       *work,*workt,zb;
2953d9ca1df4SBarry Smith       const PetscScalar *xtmp;
2954547795f9SHong Zhang       if (!a->mult_work) {
2955547795f9SHong Zhang         k    = PetscMax(A->rmap->n,A->cmap->n);
29569566063dSJacob Faibussowitsch         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2957547795f9SHong Zhang       }
2958547795f9SHong Zhang       work = a->mult_work;
2959547795f9SHong Zhang       xtmp = x;
2960547795f9SHong Zhang       for (i=0; i<mbs; i++) {
2961547795f9SHong Zhang         n     = ii[1] - ii[0]; ii++;
2962547795f9SHong Zhang         ncols = n*bs;
29639566063dSJacob Faibussowitsch         PetscCall(PetscArrayzero(work,ncols));
296426fbe8dcSKarl Rupp         if (usecprow) xtmp = x + bs*ridx[i];
296596b95a6bSBarry Smith         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2966547795f9SHong Zhang         v += n*bs2;
2967547795f9SHong Zhang         if (!usecprow) xtmp += bs;
2968547795f9SHong Zhang         workt = work;
2969547795f9SHong Zhang         for (j=0; j<n; j++) {
2970547795f9SHong Zhang           zb = z + bs*(*idx++);
2971547795f9SHong Zhang           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2972547795f9SHong Zhang           workt += bs;
2973547795f9SHong Zhang         }
2974547795f9SHong Zhang       }
2975547795f9SHong Zhang     }
2976968ae2c8SSatish Balay #endif
2977547795f9SHong Zhang   }
29789566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
29799566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
29809566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2981547795f9SHong Zhang   PetscFunctionReturn(0);
2982547795f9SHong Zhang }
2983547795f9SHong Zhang 
29849371c9d4SSatish Balay PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) {
29852d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2986d9ca1df4SBarry Smith   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2987f4259b30SLisandro Dalcin   const PetscScalar *x, *xb = NULL;
2988d9ca1df4SBarry Smith   const MatScalar   *v;
2989d9ca1df4SBarry Smith   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2990d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
29913447b6efSHong Zhang   Mat_CompressedRow  cprow    = a->compressedrow;
2992ace3abfcSBarry Smith   PetscBool          usecprow = cprow.use;
29932d61bbb3SSatish Balay 
29942d61bbb3SSatish Balay   PetscFunctionBegin;
29959566063dSJacob Faibussowitsch   if (yy != zz) PetscCall(VecCopy(yy, zz));
29969566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
29979566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
29982d61bbb3SSatish Balay 
29992d61bbb3SSatish Balay   idx = a->j;
30002d61bbb3SSatish Balay   v   = a->a;
30013447b6efSHong Zhang   if (usecprow) {
30023447b6efSHong Zhang     mbs  = cprow.nrows;
30033447b6efSHong Zhang     ii   = cprow.i;
30047b2bb3b9SHong Zhang     ridx = cprow.rindex;
30053447b6efSHong Zhang   } else {
30063447b6efSHong Zhang     mbs = a->mbs;
30072d61bbb3SSatish Balay     ii  = a->i;
3008f1af5d2fSBarry Smith     xb  = x;
30093447b6efSHong Zhang   }
30102d61bbb3SSatish Balay 
30112d61bbb3SSatish Balay   switch (bs) {
30122d61bbb3SSatish Balay   case 1:
30132d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30147b2bb3b9SHong Zhang       if (usecprow) xb = x + ridx[i];
3015f1af5d2fSBarry Smith       x1 = xb[0];
30163447b6efSHong Zhang       ib = idx + ii[0];
30179371c9d4SSatish Balay       n  = ii[1] - ii[0];
30189371c9d4SSatish Balay       ii++;
30192d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30202d61bbb3SSatish Balay         rval = ib[j];
3021f1af5d2fSBarry Smith         z[rval] += *v * x1;
3022f1af5d2fSBarry Smith         v++;
30232d61bbb3SSatish Balay       }
30243447b6efSHong Zhang       if (!usecprow) xb++;
30252d61bbb3SSatish Balay     }
30262d61bbb3SSatish Balay     break;
30272d61bbb3SSatish Balay   case 2:
30282d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30297b2bb3b9SHong Zhang       if (usecprow) xb = x + 2 * ridx[i];
30309371c9d4SSatish Balay       x1 = xb[0];
30319371c9d4SSatish Balay       x2 = xb[1];
30323447b6efSHong Zhang       ib = idx + ii[0];
30339371c9d4SSatish Balay       n  = ii[1] - ii[0];
30349371c9d4SSatish Balay       ii++;
30352d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30362d61bbb3SSatish Balay         rval = ib[j] * 2;
30372d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2;
30382d61bbb3SSatish Balay         z[rval++] += v[2] * x1 + v[3] * x2;
30392d61bbb3SSatish Balay         v += 4;
30402d61bbb3SSatish Balay       }
30413447b6efSHong Zhang       if (!usecprow) xb += 2;
30422d61bbb3SSatish Balay     }
30432d61bbb3SSatish Balay     break;
30442d61bbb3SSatish Balay   case 3:
30452d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30467b2bb3b9SHong Zhang       if (usecprow) xb = x + 3 * ridx[i];
30479371c9d4SSatish Balay       x1 = xb[0];
30489371c9d4SSatish Balay       x2 = xb[1];
30499371c9d4SSatish Balay       x3 = xb[2];
30503447b6efSHong Zhang       ib = idx + ii[0];
30519371c9d4SSatish Balay       n  = ii[1] - ii[0];
30529371c9d4SSatish Balay       ii++;
30532d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30542d61bbb3SSatish Balay         rval = ib[j] * 3;
30552d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
30562d61bbb3SSatish Balay         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
30572d61bbb3SSatish Balay         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
30582d61bbb3SSatish Balay         v += 9;
30592d61bbb3SSatish Balay       }
30603447b6efSHong Zhang       if (!usecprow) xb += 3;
30612d61bbb3SSatish Balay     }
30622d61bbb3SSatish Balay     break;
30632d61bbb3SSatish Balay   case 4:
30642d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30657b2bb3b9SHong Zhang       if (usecprow) xb = x + 4 * ridx[i];
30669371c9d4SSatish Balay       x1 = xb[0];
30679371c9d4SSatish Balay       x2 = xb[1];
30689371c9d4SSatish Balay       x3 = xb[2];
30699371c9d4SSatish Balay       x4 = xb[3];
30703447b6efSHong Zhang       ib = idx + ii[0];
30719371c9d4SSatish Balay       n  = ii[1] - ii[0];
30729371c9d4SSatish Balay       ii++;
30732d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30742d61bbb3SSatish Balay         rval = ib[j] * 4;
30752d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
30762d61bbb3SSatish Balay         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
30772d61bbb3SSatish Balay         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
30782d61bbb3SSatish Balay         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
30792d61bbb3SSatish Balay         v += 16;
30802d61bbb3SSatish Balay       }
30813447b6efSHong Zhang       if (!usecprow) xb += 4;
30822d61bbb3SSatish Balay     }
30832d61bbb3SSatish Balay     break;
30842d61bbb3SSatish Balay   case 5:
30852d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30867b2bb3b9SHong Zhang       if (usecprow) xb = x + 5 * ridx[i];
30879371c9d4SSatish Balay       x1 = xb[0];
30889371c9d4SSatish Balay       x2 = xb[1];
30899371c9d4SSatish Balay       x3 = xb[2];
30909371c9d4SSatish Balay       x4 = xb[3];
30919371c9d4SSatish Balay       x5 = xb[4];
30923447b6efSHong Zhang       ib = idx + ii[0];
30939371c9d4SSatish Balay       n  = ii[1] - ii[0];
30949371c9d4SSatish Balay       ii++;
30952d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30962d61bbb3SSatish Balay         rval = ib[j] * 5;
30972d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
30982d61bbb3SSatish Balay         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
30992d61bbb3SSatish Balay         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
31002d61bbb3SSatish Balay         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
31012d61bbb3SSatish Balay         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
31022d61bbb3SSatish Balay         v += 25;
31032d61bbb3SSatish Balay       }
31043447b6efSHong Zhang       if (!usecprow) xb += 5;
31052d61bbb3SSatish Balay     }
31062d61bbb3SSatish Balay     break;
3107f1af5d2fSBarry Smith   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3108690b6cddSBarry Smith     PetscInt           ncols, k;
3109d9ca1df4SBarry Smith     PetscScalar       *work, *workt;
3110d9ca1df4SBarry Smith     const PetscScalar *xtmp;
31112d61bbb3SSatish Balay     if (!a->mult_work) {
3112d0f46423SBarry Smith       k = PetscMax(A->rmap->n, A->cmap->n);
31139566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
31142d61bbb3SSatish Balay     }
31152d61bbb3SSatish Balay     work = a->mult_work;
31163447b6efSHong Zhang     xtmp = x;
31172d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
31189371c9d4SSatish Balay       n = ii[1] - ii[0];
31199371c9d4SSatish Balay       ii++;
31202d61bbb3SSatish Balay       ncols = n * bs;
31219566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(work, ncols));
312226fbe8dcSKarl Rupp       if (usecprow) xtmp = x + bs * ridx[i];
312396b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
31242d61bbb3SSatish Balay       v += n * bs2;
31253447b6efSHong Zhang       if (!usecprow) xtmp += bs;
31262d61bbb3SSatish Balay       workt = work;
31272d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
31282d61bbb3SSatish Balay         zb = z + bs * (*idx++);
31292d61bbb3SSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
31302d61bbb3SSatish Balay         workt += bs;
31312d61bbb3SSatish Balay       }
31322d61bbb3SSatish Balay     }
31332d61bbb3SSatish Balay   }
31342d61bbb3SSatish Balay   }
31359566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
31369566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
31379566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
31382d61bbb3SSatish Balay   PetscFunctionReturn(0);
31392d61bbb3SSatish Balay }
31402d61bbb3SSatish Balay 
31419371c9d4SSatish Balay PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) {
31422d61bbb3SSatish Balay   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3143690b6cddSBarry Smith   PetscInt     totalnz = a->bs2 * a->nz;
3144f4df32b1SMatthew Knepley   PetscScalar  oalpha  = alpha;
3145c5df96a5SBarry Smith   PetscBLASInt one     = 1, tnz;
31462d61bbb3SSatish Balay 
31472d61bbb3SSatish Balay   PetscFunctionBegin;
31489566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3149792fecdfSBarry Smith   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
31509566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
31512d61bbb3SSatish Balay   PetscFunctionReturn(0);
31522d61bbb3SSatish Balay }
31532d61bbb3SSatish Balay 
31549371c9d4SSatish Balay PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) {
31552d61bbb3SSatish Balay   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
31563f1db9ecSBarry Smith   MatScalar   *v   = a->a;
3157329f5518SBarry Smith   PetscReal    sum = 0.0;
3158d0f46423SBarry Smith   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
31592d61bbb3SSatish Balay 
31602d61bbb3SSatish Balay   PetscFunctionBegin;
31612d61bbb3SSatish Balay   if (type == NORM_FROBENIUS) {
3162570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16)
3163570b7f6dSBarry Smith     PetscBLASInt one = 1, cnt = bs2 * nz;
3164792fecdfSBarry Smith     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3165570b7f6dSBarry Smith #else
31662d61bbb3SSatish Balay     for (i = 0; i < bs2 * nz; i++) {
31679371c9d4SSatish Balay       sum += PetscRealPart(PetscConj(*v) * (*v));
31689371c9d4SSatish Balay       v++;
31692d61bbb3SSatish Balay     }
3170570b7f6dSBarry Smith #endif
31718f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum);
31729566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
31738a62d963SHong Zhang   } else if (type == NORM_1) { /* maximum column sum */
31748a62d963SHong Zhang     PetscReal *tmp;
31758a62d963SHong Zhang     PetscInt  *bcol = a->j;
31769566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
31778a62d963SHong Zhang     for (i = 0; i < nz; i++) {
31788a62d963SHong Zhang       for (j = 0; j < bs; j++) {
31798a62d963SHong Zhang         k1 = bs * (*bcol) + j; /* column index */
31808a62d963SHong Zhang         for (k = 0; k < bs; k++) {
31819371c9d4SSatish Balay           tmp[k1] += PetscAbsScalar(*v);
31829371c9d4SSatish Balay           v++;
31838a62d963SHong Zhang         }
31848a62d963SHong Zhang       }
31858a62d963SHong Zhang       bcol++;
31868a62d963SHong Zhang     }
31878a62d963SHong Zhang     *norm = 0.0;
3188d0f46423SBarry Smith     for (j = 0; j < A->cmap->n; j++) {
31898a62d963SHong Zhang       if (tmp[j] > *norm) *norm = tmp[j];
31908a62d963SHong Zhang     }
31919566063dSJacob Faibussowitsch     PetscCall(PetscFree(tmp));
31929566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3193596552b5SBarry Smith   } else if (type == NORM_INFINITY) { /* maximum row sum */
3194596552b5SBarry Smith     *norm = 0.0;
3195596552b5SBarry Smith     for (k = 0; k < bs; k++) {
319674f84c7bSSatish Balay       for (j = 0; j < a->mbs; j++) {
3197596552b5SBarry Smith         v   = a->a + bs2 * a->i[j] + k;
3198596552b5SBarry Smith         sum = 0.0;
3199596552b5SBarry Smith         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
32000e90e235SBarry Smith           for (k1 = 0; k1 < bs; k1++) {
3201596552b5SBarry Smith             sum += PetscAbsScalar(*v);
3202596552b5SBarry Smith             v += bs;
32032d61bbb3SSatish Balay           }
32040e90e235SBarry Smith         }
3205596552b5SBarry Smith         if (sum > *norm) *norm = sum;
3206596552b5SBarry Smith       }
3207596552b5SBarry Smith     }
32089566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3209e7e72b3dSBarry Smith   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
32102d61bbb3SSatish Balay   PetscFunctionReturn(0);
32112d61bbb3SSatish Balay }
32122d61bbb3SSatish Balay 
32139371c9d4SSatish Balay PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) {
32142d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
32152d61bbb3SSatish Balay 
32162d61bbb3SSatish Balay   PetscFunctionBegin;
32172d61bbb3SSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3218d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3219273d9f13SBarry Smith     *flg = PETSC_FALSE;
3220273d9f13SBarry Smith     PetscFunctionReturn(0);
32212d61bbb3SSatish Balay   }
32222d61bbb3SSatish Balay 
32232d61bbb3SSatish Balay   /* if the a->i are the same */
32249566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
322526fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
32262d61bbb3SSatish Balay 
32272d61bbb3SSatish Balay   /* if a->j are the same */
32289566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
322926fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
323026fbe8dcSKarl Rupp 
32312d61bbb3SSatish Balay   /* if a->a are the same */
32329566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
32332d61bbb3SSatish Balay   PetscFunctionReturn(0);
32342d61bbb3SSatish Balay }
32352d61bbb3SSatish Balay 
32369371c9d4SSatish Balay PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) {
32372d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3238690b6cddSBarry Smith   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
323987828ca2SBarry Smith   PetscScalar *x, zero = 0.0;
32403f1db9ecSBarry Smith   MatScalar   *aa, *aa_j;
32412d61bbb3SSatish Balay 
32422d61bbb3SSatish Balay   PetscFunctionBegin;
324328b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3244d0f46423SBarry Smith   bs   = A->rmap->bs;
32452d61bbb3SSatish Balay   aa   = a->a;
32462d61bbb3SSatish Balay   ai   = a->i;
32472d61bbb3SSatish Balay   aj   = a->j;
32482d61bbb3SSatish Balay   ambs = a->mbs;
32492d61bbb3SSatish Balay   bs2  = a->bs2;
32502d61bbb3SSatish Balay 
32519566063dSJacob Faibussowitsch   PetscCall(VecSet(v, zero));
32529566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
32539566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v, &n));
325408401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
32552d61bbb3SSatish Balay   for (i = 0; i < ambs; i++) {
32562d61bbb3SSatish Balay     for (j = ai[i]; j < ai[i + 1]; j++) {
32572d61bbb3SSatish Balay       if (aj[j] == i) {
32582d61bbb3SSatish Balay         row  = i * bs;
32592d61bbb3SSatish Balay         aa_j = aa + j * bs2;
32602d61bbb3SSatish Balay         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
32612d61bbb3SSatish Balay         break;
32622d61bbb3SSatish Balay       }
32632d61bbb3SSatish Balay     }
32642d61bbb3SSatish Balay   }
32659566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
32662d61bbb3SSatish Balay   PetscFunctionReturn(0);
32672d61bbb3SSatish Balay }
32682d61bbb3SSatish Balay 
32699371c9d4SSatish Balay PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) {
32702d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
327153ef36baSBarry Smith   const PetscScalar *l, *r, *li, *ri;
327253ef36baSBarry Smith   PetscScalar        x;
32733f1db9ecSBarry Smith   MatScalar         *aa, *v;
327453ef36baSBarry Smith   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
327553ef36baSBarry Smith   const PetscInt    *ai, *aj;
32762d61bbb3SSatish Balay 
32772d61bbb3SSatish Balay   PetscFunctionBegin;
32782d61bbb3SSatish Balay   ai  = a->i;
32792d61bbb3SSatish Balay   aj  = a->j;
32802d61bbb3SSatish Balay   aa  = a->a;
3281d0f46423SBarry Smith   m   = A->rmap->n;
3282d0f46423SBarry Smith   n   = A->cmap->n;
3283d0f46423SBarry Smith   bs  = A->rmap->bs;
32842d61bbb3SSatish Balay   mbs = a->mbs;
32852d61bbb3SSatish Balay   bs2 = a->bs2;
32862d61bbb3SSatish Balay   if (ll) {
32879566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(ll, &l));
32889566063dSJacob Faibussowitsch     PetscCall(VecGetLocalSize(ll, &lm));
328908401ef6SPierre Jolivet     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
32902d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) { /* for each block row */
32912d61bbb3SSatish Balay       M  = ai[i + 1] - ai[i];
32922d61bbb3SSatish Balay       li = l + i * bs;
32932d61bbb3SSatish Balay       v  = aa + bs2 * ai[i];
32942d61bbb3SSatish Balay       for (j = 0; j < M; j++) { /* for each block */
3295ad540459SPierre Jolivet         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
32962d61bbb3SSatish Balay       }
32972d61bbb3SSatish Balay     }
32989566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(ll, &l));
32999566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
33002d61bbb3SSatish Balay   }
33012d61bbb3SSatish Balay 
33022d61bbb3SSatish Balay   if (rr) {
33039566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(rr, &r));
33049566063dSJacob Faibussowitsch     PetscCall(VecGetLocalSize(rr, &rn));
330508401ef6SPierre Jolivet     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
33062d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) { /* for each block row */
330753ef36baSBarry Smith       iai = ai[i];
330853ef36baSBarry Smith       M   = ai[i + 1] - iai;
330953ef36baSBarry Smith       v   = aa + bs2 * iai;
33102d61bbb3SSatish Balay       for (j = 0; j < M; j++) { /* for each block */
331153ef36baSBarry Smith         ri = r + bs * aj[iai + j];
33122d61bbb3SSatish Balay         for (k = 0; k < bs; k++) {
33132d61bbb3SSatish Balay           x = ri[k];
331453ef36baSBarry Smith           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
331553ef36baSBarry Smith           v += bs;
33162d61bbb3SSatish Balay         }
33172d61bbb3SSatish Balay       }
33182d61bbb3SSatish Balay     }
33199566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(rr, &r));
33209566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
33212d61bbb3SSatish Balay   }
33222d61bbb3SSatish Balay   PetscFunctionReturn(0);
33232d61bbb3SSatish Balay }
33242d61bbb3SSatish Balay 
33259371c9d4SSatish Balay PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) {
33262d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33272d61bbb3SSatish Balay 
33282d61bbb3SSatish Balay   PetscFunctionBegin;
33292d61bbb3SSatish Balay   info->block_size   = a->bs2;
3330ceed8ce5SJed Brown   info->nz_allocated = a->bs2 * a->maxnz;
33312d61bbb3SSatish Balay   info->nz_used      = a->bs2 * a->nz;
33323966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
33332d61bbb3SSatish Balay   info->assemblies   = A->num_ass;
33348e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
33357adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
3336d5f3da31SBarry Smith   if (A->factortype) {
33372d61bbb3SSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
33382d61bbb3SSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
33392d61bbb3SSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
33402d61bbb3SSatish Balay   } else {
33412d61bbb3SSatish Balay     info->fill_ratio_given  = 0;
33422d61bbb3SSatish Balay     info->fill_ratio_needed = 0;
33432d61bbb3SSatish Balay     info->factor_mallocs    = 0;
33442d61bbb3SSatish Balay   }
33452d61bbb3SSatish Balay   PetscFunctionReturn(0);
33462d61bbb3SSatish Balay }
33472d61bbb3SSatish Balay 
33489371c9d4SSatish Balay PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) {
33492d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33502d61bbb3SSatish Balay 
33512d61bbb3SSatish Balay   PetscFunctionBegin;
33529566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
33532d61bbb3SSatish Balay   PetscFunctionReturn(0);
33542d61bbb3SSatish Balay }
3355a001520aSPierre Jolivet 
33569371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) {
3357a001520aSPierre Jolivet   PetscFunctionBegin;
33589566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
33594222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3360a001520aSPierre Jolivet   PetscFunctionReturn(0);
3361a001520aSPierre Jolivet }
3362a001520aSPierre Jolivet 
33639371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
336474eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3365f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1;
3366bcf10a7aSPierre Jolivet   const PetscScalar *xb;
336774eeabc5SPierre Jolivet   PetscScalar        x1;
336874eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
336974eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
337074eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
337174eeabc5SPierre Jolivet 
337274eeabc5SPierre Jolivet   PetscFunctionBegin;
337374eeabc5SPierre Jolivet   idx = a->j;
337474eeabc5SPierre Jolivet   v   = a->a;
337574eeabc5SPierre Jolivet   if (usecprow) {
337674eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
337774eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
337874eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
337974eeabc5SPierre Jolivet   } else {
338074eeabc5SPierre Jolivet     mbs = a->mbs;
338174eeabc5SPierre Jolivet     ii  = a->i;
338274eeabc5SPierre Jolivet     z   = c;
338374eeabc5SPierre Jolivet   }
338474eeabc5SPierre Jolivet 
338574eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
33869371c9d4SSatish Balay     n = ii[1] - ii[0];
33879371c9d4SSatish Balay     ii++;
338874eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
338974eeabc5SPierre Jolivet     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
339074eeabc5SPierre Jolivet     if (usecprow) z = c + ridx[i];
339174eeabc5SPierre Jolivet     jj = idx;
339274eeabc5SPierre Jolivet     vv = v;
339374eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
339474eeabc5SPierre Jolivet       idx  = jj;
339574eeabc5SPierre Jolivet       v    = vv;
339674eeabc5SPierre Jolivet       sum1 = 0.0;
339774eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
33989371c9d4SSatish Balay         xb = b + (*idx++);
33999371c9d4SSatish Balay         x1 = xb[0 + k * bm];
340074eeabc5SPierre Jolivet         sum1 += v[0] * x1;
340174eeabc5SPierre Jolivet         v += 1;
340274eeabc5SPierre Jolivet       }
3403feb237baSPierre Jolivet       z[0 + k * cm] = sum1;
340474eeabc5SPierre Jolivet     }
340574eeabc5SPierre Jolivet     if (!usecprow) z += 1;
340674eeabc5SPierre Jolivet   }
340774eeabc5SPierre Jolivet   PetscFunctionReturn(0);
340874eeabc5SPierre Jolivet }
340974eeabc5SPierre Jolivet 
34109371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
34114b7054f4SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3412f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2;
3413bcf10a7aSPierre Jolivet   const PetscScalar *xb;
34144b7054f4SPierre Jolivet   PetscScalar        x1, x2;
34154b7054f4SPierre Jolivet   const MatScalar   *v, *vv;
34164b7054f4SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
34174b7054f4SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
34184b7054f4SPierre Jolivet 
34194b7054f4SPierre Jolivet   PetscFunctionBegin;
34204b7054f4SPierre Jolivet   idx = a->j;
34214b7054f4SPierre Jolivet   v   = a->a;
34224b7054f4SPierre Jolivet   if (usecprow) {
34234b7054f4SPierre Jolivet     mbs  = a->compressedrow.nrows;
34244b7054f4SPierre Jolivet     ii   = a->compressedrow.i;
34254b7054f4SPierre Jolivet     ridx = a->compressedrow.rindex;
34264b7054f4SPierre Jolivet   } else {
34274b7054f4SPierre Jolivet     mbs = a->mbs;
34284b7054f4SPierre Jolivet     ii  = a->i;
34294b7054f4SPierre Jolivet     z   = c;
34304b7054f4SPierre Jolivet   }
34314b7054f4SPierre Jolivet 
34324b7054f4SPierre Jolivet   for (i = 0; i < mbs; i++) {
34339371c9d4SSatish Balay     n = ii[1] - ii[0];
34349371c9d4SSatish Balay     ii++;
34354b7054f4SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
34364b7054f4SPierre Jolivet     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
34374b7054f4SPierre Jolivet     if (usecprow) z = c + 2 * ridx[i];
34384b7054f4SPierre Jolivet     jj = idx;
34394b7054f4SPierre Jolivet     vv = v;
34404b7054f4SPierre Jolivet     for (k = 0; k < cn; k++) {
34414b7054f4SPierre Jolivet       idx  = jj;
34424b7054f4SPierre Jolivet       v    = vv;
34439371c9d4SSatish Balay       sum1 = 0.0;
34449371c9d4SSatish Balay       sum2 = 0.0;
34454b7054f4SPierre Jolivet       for (j = 0; j < n; j++) {
34469371c9d4SSatish Balay         xb = b + 2 * (*idx++);
34479371c9d4SSatish Balay         x1 = xb[0 + k * bm];
34489371c9d4SSatish Balay         x2 = xb[1 + k * bm];
34494b7054f4SPierre Jolivet         sum1 += v[0] * x1 + v[2] * x2;
34504b7054f4SPierre Jolivet         sum2 += v[1] * x1 + v[3] * x2;
34514b7054f4SPierre Jolivet         v += 4;
34524b7054f4SPierre Jolivet       }
34539371c9d4SSatish Balay       z[0 + k * cm] = sum1;
34549371c9d4SSatish Balay       z[1 + k * cm] = sum2;
34554b7054f4SPierre Jolivet     }
34564b7054f4SPierre Jolivet     if (!usecprow) z += 2;
34574b7054f4SPierre Jolivet   }
34584b7054f4SPierre Jolivet   PetscFunctionReturn(0);
34594b7054f4SPierre Jolivet }
34604b7054f4SPierre Jolivet 
34619371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
346274eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3463f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3;
3464bcf10a7aSPierre Jolivet   const PetscScalar *xb;
346574eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3;
346674eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
346774eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
346874eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
346974eeabc5SPierre Jolivet 
347074eeabc5SPierre Jolivet   PetscFunctionBegin;
347174eeabc5SPierre Jolivet   idx = a->j;
347274eeabc5SPierre Jolivet   v   = a->a;
347374eeabc5SPierre Jolivet   if (usecprow) {
347474eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
347574eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
347674eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
347774eeabc5SPierre Jolivet   } else {
347874eeabc5SPierre Jolivet     mbs = a->mbs;
347974eeabc5SPierre Jolivet     ii  = a->i;
348074eeabc5SPierre Jolivet     z   = c;
348174eeabc5SPierre Jolivet   }
348274eeabc5SPierre Jolivet 
348374eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
34849371c9d4SSatish Balay     n = ii[1] - ii[0];
34859371c9d4SSatish Balay     ii++;
348674eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
348774eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
348874eeabc5SPierre Jolivet     if (usecprow) z = c + 3 * ridx[i];
348974eeabc5SPierre Jolivet     jj = idx;
349074eeabc5SPierre Jolivet     vv = v;
349174eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
349274eeabc5SPierre Jolivet       idx  = jj;
349374eeabc5SPierre Jolivet       v    = vv;
34949371c9d4SSatish Balay       sum1 = 0.0;
34959371c9d4SSatish Balay       sum2 = 0.0;
34969371c9d4SSatish Balay       sum3 = 0.0;
349774eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
34989371c9d4SSatish Balay         xb = b + 3 * (*idx++);
34999371c9d4SSatish Balay         x1 = xb[0 + k * bm];
35009371c9d4SSatish Balay         x2 = xb[1 + k * bm];
35019371c9d4SSatish Balay         x3 = xb[2 + k * bm];
350274eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
350374eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
350474eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
350574eeabc5SPierre Jolivet         v += 9;
350674eeabc5SPierre Jolivet       }
35079371c9d4SSatish Balay       z[0 + k * cm] = sum1;
35089371c9d4SSatish Balay       z[1 + k * cm] = sum2;
35099371c9d4SSatish Balay       z[2 + k * cm] = sum3;
351074eeabc5SPierre Jolivet     }
351174eeabc5SPierre Jolivet     if (!usecprow) z += 3;
351274eeabc5SPierre Jolivet   }
351374eeabc5SPierre Jolivet   PetscFunctionReturn(0);
351474eeabc5SPierre Jolivet }
351574eeabc5SPierre Jolivet 
35169371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
351774eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3518f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3519bcf10a7aSPierre Jolivet   const PetscScalar *xb;
352074eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3, x4;
352174eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
352274eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
352374eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
352474eeabc5SPierre Jolivet 
352574eeabc5SPierre Jolivet   PetscFunctionBegin;
352674eeabc5SPierre Jolivet   idx = a->j;
352774eeabc5SPierre Jolivet   v   = a->a;
352874eeabc5SPierre Jolivet   if (usecprow) {
352974eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
353074eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
353174eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
353274eeabc5SPierre Jolivet   } else {
353374eeabc5SPierre Jolivet     mbs = a->mbs;
353474eeabc5SPierre Jolivet     ii  = a->i;
353574eeabc5SPierre Jolivet     z   = c;
353674eeabc5SPierre Jolivet   }
353774eeabc5SPierre Jolivet 
353874eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
35399371c9d4SSatish Balay     n = ii[1] - ii[0];
35409371c9d4SSatish Balay     ii++;
354174eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
354274eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
354374eeabc5SPierre Jolivet     if (usecprow) z = c + 4 * ridx[i];
354474eeabc5SPierre Jolivet     jj = idx;
354574eeabc5SPierre Jolivet     vv = v;
354674eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
354774eeabc5SPierre Jolivet       idx  = jj;
354874eeabc5SPierre Jolivet       v    = vv;
35499371c9d4SSatish Balay       sum1 = 0.0;
35509371c9d4SSatish Balay       sum2 = 0.0;
35519371c9d4SSatish Balay       sum3 = 0.0;
35529371c9d4SSatish Balay       sum4 = 0.0;
355374eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
35549371c9d4SSatish Balay         xb = b + 4 * (*idx++);
35559371c9d4SSatish Balay         x1 = xb[0 + k * bm];
35569371c9d4SSatish Balay         x2 = xb[1 + k * bm];
35579371c9d4SSatish Balay         x3 = xb[2 + k * bm];
35589371c9d4SSatish Balay         x4 = xb[3 + k * bm];
355974eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
356074eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
356174eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
356274eeabc5SPierre Jolivet         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
356374eeabc5SPierre Jolivet         v += 16;
356474eeabc5SPierre Jolivet       }
35659371c9d4SSatish Balay       z[0 + k * cm] = sum1;
35669371c9d4SSatish Balay       z[1 + k * cm] = sum2;
35679371c9d4SSatish Balay       z[2 + k * cm] = sum3;
35689371c9d4SSatish Balay       z[3 + k * cm] = sum4;
356974eeabc5SPierre Jolivet     }
357074eeabc5SPierre Jolivet     if (!usecprow) z += 4;
357174eeabc5SPierre Jolivet   }
357274eeabc5SPierre Jolivet   PetscFunctionReturn(0);
357374eeabc5SPierre Jolivet }
357474eeabc5SPierre Jolivet 
35759371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
357674eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3577f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3578bcf10a7aSPierre Jolivet   const PetscScalar *xb;
357974eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3, x4, x5;
358074eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
358174eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
358274eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
358374eeabc5SPierre Jolivet 
358474eeabc5SPierre Jolivet   PetscFunctionBegin;
358574eeabc5SPierre Jolivet   idx = a->j;
358674eeabc5SPierre Jolivet   v   = a->a;
358774eeabc5SPierre Jolivet   if (usecprow) {
358874eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
358974eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
359074eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
359174eeabc5SPierre Jolivet   } else {
359274eeabc5SPierre Jolivet     mbs = a->mbs;
359374eeabc5SPierre Jolivet     ii  = a->i;
359474eeabc5SPierre Jolivet     z   = c;
359574eeabc5SPierre Jolivet   }
359674eeabc5SPierre Jolivet 
359774eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
35989371c9d4SSatish Balay     n = ii[1] - ii[0];
35999371c9d4SSatish Balay     ii++;
360074eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
360174eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
360274eeabc5SPierre Jolivet     if (usecprow) z = c + 5 * ridx[i];
360374eeabc5SPierre Jolivet     jj = idx;
360474eeabc5SPierre Jolivet     vv = v;
360574eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
360674eeabc5SPierre Jolivet       idx  = jj;
360774eeabc5SPierre Jolivet       v    = vv;
36089371c9d4SSatish Balay       sum1 = 0.0;
36099371c9d4SSatish Balay       sum2 = 0.0;
36109371c9d4SSatish Balay       sum3 = 0.0;
36119371c9d4SSatish Balay       sum4 = 0.0;
36129371c9d4SSatish Balay       sum5 = 0.0;
361374eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
36149371c9d4SSatish Balay         xb = b + 5 * (*idx++);
36159371c9d4SSatish Balay         x1 = xb[0 + k * bm];
36169371c9d4SSatish Balay         x2 = xb[1 + k * bm];
36179371c9d4SSatish Balay         x3 = xb[2 + k * bm];
36189371c9d4SSatish Balay         x4 = xb[3 + k * bm];
36199371c9d4SSatish Balay         x5 = xb[4 + k * bm];
362074eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
362174eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
362274eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
362374eeabc5SPierre Jolivet         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
362474eeabc5SPierre Jolivet         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
362574eeabc5SPierre Jolivet         v += 25;
362674eeabc5SPierre Jolivet       }
36279371c9d4SSatish Balay       z[0 + k * cm] = sum1;
36289371c9d4SSatish Balay       z[1 + k * cm] = sum2;
36299371c9d4SSatish Balay       z[2 + k * cm] = sum3;
36309371c9d4SSatish Balay       z[3 + k * cm] = sum4;
36319371c9d4SSatish Balay       z[4 + k * cm] = sum5;
363274eeabc5SPierre Jolivet     }
363374eeabc5SPierre Jolivet     if (!usecprow) z += 5;
363474eeabc5SPierre Jolivet   }
363574eeabc5SPierre Jolivet   PetscFunctionReturn(0);
363674eeabc5SPierre Jolivet }
363774eeabc5SPierre Jolivet 
36389371c9d4SSatish Balay PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) {
3639a001520aSPierre Jolivet   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3640a001520aSPierre Jolivet   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3641910cf402Sprj-   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3642bcf10a7aSPierre Jolivet   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3643a001520aSPierre Jolivet   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3644a001520aSPierre Jolivet   PetscBLASInt     bbs, bcn, bbm, bcm;
3645f4259b30SLisandro Dalcin   PetscScalar     *z = NULL;
3646a001520aSPierre Jolivet   PetscScalar     *c, *b;
3647a001520aSPierre Jolivet   const MatScalar *v;
3648a001520aSPierre Jolivet   const PetscInt  *idx, *ii, *ridx = NULL;
36494b7054f4SPierre Jolivet   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3650a001520aSPierre Jolivet   PetscBool        usecprow = a->compressedrow.use;
3651a001520aSPierre Jolivet 
3652a001520aSPierre Jolivet   PetscFunctionBegin;
3653a001520aSPierre Jolivet   if (!cm || !cn) PetscFunctionReturn(0);
365408401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
365508401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
365608401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3657a001520aSPierre Jolivet   b = bd->v;
365848a46eb9SPierre Jolivet   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
36599566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C, &c));
366074eeabc5SPierre Jolivet   switch (bs) {
36619371c9d4SSatish Balay   case 1: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); break;
36629371c9d4SSatish Balay   case 2: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); break;
36639371c9d4SSatish Balay   case 3: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); break;
36649371c9d4SSatish Balay   case 4: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); break;
36659371c9d4SSatish Balay   case 5: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); break;
366674eeabc5SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
36679566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs, &bbs));
36689566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn, &bcn));
36699566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm, &bbm));
36709566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm, &bcm));
3671a001520aSPierre Jolivet     idx = a->j;
3672a001520aSPierre Jolivet     v   = a->a;
3673a001520aSPierre Jolivet     if (usecprow) {
3674a001520aSPierre Jolivet       mbs  = a->compressedrow.nrows;
3675a001520aSPierre Jolivet       ii   = a->compressedrow.i;
3676a001520aSPierre Jolivet       ridx = a->compressedrow.rindex;
3677a001520aSPierre Jolivet     } else {
3678a001520aSPierre Jolivet       mbs = a->mbs;
3679a001520aSPierre Jolivet       ii  = a->i;
3680a001520aSPierre Jolivet       z   = c;
3681a001520aSPierre Jolivet     }
3682a001520aSPierre Jolivet     for (i = 0; i < mbs; i++) {
36839371c9d4SSatish Balay       n = ii[1] - ii[0];
36849371c9d4SSatish Balay       ii++;
3685a001520aSPierre Jolivet       if (usecprow) z = c + bs * ridx[i];
36864b7054f4SPierre Jolivet       if (n) {
3687792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
36884b7054f4SPierre Jolivet         v += bs2;
36894b7054f4SPierre Jolivet       }
36904b7054f4SPierre Jolivet       for (j = 1; j < n; j++) {
3691792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3692a001520aSPierre Jolivet         v += bs2;
3693a001520aSPierre Jolivet       }
3694a001520aSPierre Jolivet       if (!usecprow) z += bs;
3695a001520aSPierre Jolivet     }
36964b7054f4SPierre Jolivet   }
36979566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C, &c));
36989566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3699a001520aSPierre Jolivet   PetscFunctionReturn(0);
3700a001520aSPierre Jolivet }
3701