xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <petscblaslapack.h>
6cac129eeSSatish Balay 
75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
896e086a2SDaniel Kokron   #include <immintrin.h>
996e086a2SDaniel Kokron #endif
1096e086a2SDaniel Kokron 
11*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
12*d71ae5a4SJacob Faibussowitsch {
13a3192f15SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
145d0c19d7SBarry Smith   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
155d0c19d7SBarry Smith   const PetscInt *idx;
16690b6cddSBarry Smith   PetscInt        start, end, *ai, *aj, bs, *nidx2;
17f1af5d2fSBarry Smith   PetscBT         table;
18a3192f15SSatish Balay 
193a40ed3dSBarry Smith   PetscFunctionBegin;
20a3192f15SSatish Balay   m  = a->mbs;
21a3192f15SSatish Balay   ai = a->i;
22a3192f15SSatish Balay   aj = a->j;
23d0f46423SBarry Smith   bs = A->rmap->bs;
24a3192f15SSatish Balay 
2508401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
26a3192f15SSatish Balay 
279566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(m, &table));
289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &nidx));
299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2));
30a3192f15SSatish Balay 
31a3192f15SSatish Balay   for (i = 0; i < is_max; i++) {
32a3192f15SSatish Balay     /* Initialise the two local arrays */
33a3192f15SSatish Balay     isz = 0;
349566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(m, table));
35a3192f15SSatish Balay 
36a3192f15SSatish Balay     /* Extract the indices, assume there can be duplicate entries */
379566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx));
389566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n));
39a3192f15SSatish Balay 
40a3192f15SSatish Balay     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41a3192f15SSatish Balay     for (j = 0; j < n; ++j) {
42218c64b6SSatish Balay       ival = idx[j] / bs; /* convert the indices into block indices */
4308401ef6SPierre Jolivet       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
4426fbe8dcSKarl Rupp       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
45a3192f15SSatish Balay     }
469566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx));
479566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
48a3192f15SSatish Balay 
49a3192f15SSatish Balay     k = 0;
50a3192f15SSatish Balay     for (j = 0; j < ov; j++) { /* for each overlap*/
51a3192f15SSatish Balay       n = isz;
52a3192f15SSatish Balay       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
53a3192f15SSatish Balay         row   = nidx[k];
54a3192f15SSatish Balay         start = ai[row];
55a3192f15SSatish Balay         end   = ai[row + 1];
56a3192f15SSatish Balay         for (l = start; l < end; l++) {
57a3192f15SSatish Balay           val = aj[l];
5826fbe8dcSKarl Rupp           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
59a3192f15SSatish Balay         }
60a3192f15SSatish Balay       }
61a3192f15SSatish Balay     }
62218c64b6SSatish Balay     /* expand the Index Set */
63218c64b6SSatish Balay     for (j = 0; j < isz; j++) {
6426fbe8dcSKarl Rupp       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
65218c64b6SSatish Balay     }
669566063dSJacob Faibussowitsch     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i));
67a3192f15SSatish Balay   }
689566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table));
699566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
709566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx2));
713a40ed3dSBarry Smith   PetscFunctionReturn(0);
72a3192f15SSatish Balay }
731c351548SSatish Balay 
74*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
75*d71ae5a4SJacob Faibussowitsch {
76736121d4SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
77690b6cddSBarry Smith   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
78690b6cddSBarry Smith   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
795d0c19d7SBarry Smith   const PetscInt *irow, *icol;
805d0c19d7SBarry Smith   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
81690b6cddSBarry Smith   PetscInt       *aj = a->j, *ai = a->i;
823f1db9ecSBarry Smith   MatScalar      *mat_a;
83736121d4SSatish Balay   Mat             C;
846041f1b1SToby Isaac   PetscBool       flag;
85736121d4SSatish Balay 
863a40ed3dSBarry Smith   PetscFunctionBegin;
879566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
889566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
899566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
909566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
91736121d4SSatish Balay 
929566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1 + oldcols, &smap));
93736121d4SSatish Balay   ssmap = smap;
949566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1 + nrows, &lens));
95736121d4SSatish Balay   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
96736121d4SSatish Balay   /* determine lens of each row */
97736121d4SSatish Balay   for (i = 0; i < nrows; i++) {
98736121d4SSatish Balay     kstart  = ai[irow[i]];
99736121d4SSatish Balay     kend    = kstart + a->ilen[irow[i]];
100736121d4SSatish Balay     lens[i] = 0;
101736121d4SSatish Balay     for (k = kstart; k < kend; k++) {
10226fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
103736121d4SSatish Balay     }
104736121d4SSatish Balay   }
105736121d4SSatish Balay   /* Create and fill new matrix */
106736121d4SSatish Balay   if (scall == MAT_REUSE_MATRIX) {
107736121d4SSatish Balay     c = (Mat_SeqBAIJ *)((*B)->data);
108736121d4SSatish Balay 
109aed4548fSBarry Smith     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
1109566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
11128b400f6SJacob Faibussowitsch     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
1129566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(c->ilen, c->mbs));
113736121d4SSatish Balay     C = *B;
1143a40ed3dSBarry Smith   } else {
1159566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1169566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
1179566063dSJacob Faibussowitsch     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1189566063dSJacob Faibussowitsch     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
119736121d4SSatish Balay   }
120736121d4SSatish Balay   c = (Mat_SeqBAIJ *)(C->data);
121736121d4SSatish Balay   for (i = 0; i < nrows; i++) {
122736121d4SSatish Balay     row      = irow[i];
123736121d4SSatish Balay     kstart   = ai[row];
124736121d4SSatish Balay     kend     = kstart + a->ilen[row];
125736121d4SSatish Balay     mat_i    = c->i[i];
126d29f2997SMatthew Woehlke     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
127d29f2997SMatthew Woehlke     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
128736121d4SSatish Balay     mat_ilen = c->ilen + i;
129736121d4SSatish Balay     for (k = kstart; k < kend; k++) {
130736121d4SSatish Balay       if ((tcol = ssmap[a->j[k]])) {
131736121d4SSatish Balay         *mat_j++ = tcol - 1;
1329566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
133549d3d68SSatish Balay         mat_a += bs2;
134736121d4SSatish Balay         (*mat_ilen)++;
135736121d4SSatish Balay       }
136736121d4SSatish Balay     }
137736121d4SSatish Balay   }
138cdc6f3adSToby Isaac   /* sort */
139d29f2997SMatthew Woehlke   if (c->j && c->a) {
140cdc6f3adSToby Isaac     MatScalar *work;
1419566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2, &work));
142cdc6f3adSToby Isaac     for (i = 0; i < nrows; i++) {
143cdc6f3adSToby Isaac       PetscInt ilen;
144cdc6f3adSToby Isaac       mat_i = c->i[i];
145cdc6f3adSToby Isaac       mat_j = c->j + mat_i;
146cdc6f3adSToby Isaac       mat_a = c->a + mat_i * bs2;
147cdc6f3adSToby Isaac       ilen  = c->ilen[i];
1489566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
149cdc6f3adSToby Isaac     }
1509566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
151cdc6f3adSToby Isaac   }
152218c64b6SSatish Balay 
153736121d4SSatish Balay   /* Free work space */
1549566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
1559566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
1569566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
1579566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1589566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
159736121d4SSatish Balay 
1609566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
161736121d4SSatish Balay   *B = C;
1623a40ed3dSBarry Smith   PetscFunctionReturn(0);
163736121d4SSatish Balay }
164736121d4SSatish Balay 
165*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
166*d71ae5a4SJacob Faibussowitsch {
167218c64b6SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
168218c64b6SSatish Balay   IS              is1, is2;
169afebec48SHong Zhang   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
1705d0c19d7SBarry Smith   const PetscInt *irow, *icol;
171218c64b6SSatish Balay 
1723a40ed3dSBarry Smith   PetscFunctionBegin;
1739566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
1749566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
1759566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
1769566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
177218c64b6SSatish Balay 
178218c64b6SSatish Balay   /* Verify if the indices corespond to each element in a block
179218c64b6SSatish Balay    and form the IS with compressed IS */
180f8ecb639SStefano Zampini   maxmnbs = PetscMax(a->mbs, a->nbs);
1819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
1829566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->mbs));
183218c64b6SSatish Balay   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
184ad540459SPierre Jolivet   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
1856041f1b1SToby Isaac   count = 0;
1866041f1b1SToby Isaac   for (i = 0; i < nrows; i++) {
187afebec48SHong Zhang     j = irow[i] / bs;
1886041f1b1SToby Isaac     if ((vary[j]--) == bs) iary[count++] = j;
189218c64b6SSatish Balay   }
1909566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
191218c64b6SSatish Balay 
1929566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->nbs));
193218c64b6SSatish Balay   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
194ad540459SPierre Jolivet   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
1956041f1b1SToby Isaac   count = 0;
1966041f1b1SToby Isaac   for (i = 0; i < ncols; i++) {
197afebec48SHong Zhang     j = icol[i] / bs;
1986041f1b1SToby Isaac     if ((vary[j]--) == bs) iary[count++] = j;
1996041f1b1SToby Isaac   }
2009566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
2019566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
2029566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
2039566063dSJacob Faibussowitsch   PetscCall(PetscFree2(vary, iary));
204218c64b6SSatish Balay 
2059566063dSJacob Faibussowitsch   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
2069566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2079566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
2083a40ed3dSBarry Smith   PetscFunctionReturn(0);
209218c64b6SSatish Balay }
210218c64b6SSatish Balay 
211*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
212*d71ae5a4SJacob Faibussowitsch {
21316b64355SHong Zhang   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
2145c39f6d9SHong Zhang   Mat_SubSppt *submatj = c->submatis1;
21516b64355SHong Zhang 
21616b64355SHong Zhang   PetscFunctionBegin;
2179566063dSJacob Faibussowitsch   PetscCall((*submatj->destroy)(C));
2189566063dSJacob Faibussowitsch   PetscCall(MatDestroySubMatrix_Private(submatj));
21916b64355SHong Zhang   PetscFunctionReturn(0);
22016b64355SHong Zhang }
22116b64355SHong Zhang 
22289a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
223*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
224*d71ae5a4SJacob Faibussowitsch {
22586e85357SHong Zhang   PetscInt     i;
22686e85357SHong Zhang   Mat          C;
22786e85357SHong Zhang   Mat_SeqBAIJ *c;
22886e85357SHong Zhang   Mat_SubSppt *submatj;
22986e85357SHong Zhang 
23086e85357SHong Zhang   PetscFunctionBegin;
23186e85357SHong Zhang   for (i = 0; i < n; i++) {
23286e85357SHong Zhang     C       = (*mat)[i];
23386e85357SHong Zhang     c       = (Mat_SeqBAIJ *)C->data;
23486e85357SHong Zhang     submatj = c->submatis1;
23586e85357SHong Zhang     if (submatj) {
2367daefbafSJunchao Zhang       if (--((PetscObject)C)->refct <= 0) {
23726cc229bSBarry Smith         PetscCall(PetscFree(C->factorprefix));
2389566063dSJacob Faibussowitsch         PetscCall((*submatj->destroy)(C));
2399566063dSJacob Faibussowitsch         PetscCall(MatDestroySubMatrix_Private(submatj));
2409566063dSJacob Faibussowitsch         PetscCall(PetscFree(C->defaultvectype));
2413faff063SStefano Zampini         PetscCall(PetscFree(C->defaultrandtype));
2429566063dSJacob Faibussowitsch         PetscCall(PetscLayoutDestroy(&C->rmap));
2439566063dSJacob Faibussowitsch         PetscCall(PetscLayoutDestroy(&C->cmap));
2449566063dSJacob Faibussowitsch         PetscCall(PetscHeaderDestroy(&C));
2457daefbafSJunchao Zhang       }
24686e85357SHong Zhang     } else {
2479566063dSJacob Faibussowitsch       PetscCall(MatDestroy(&C));
24886e85357SHong Zhang     }
24986e85357SHong Zhang   }
2507daefbafSJunchao Zhang 
2517daefbafSJunchao Zhang   /* Destroy Dummy submatrices created for reuse */
2529566063dSJacob Faibussowitsch   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
2537daefbafSJunchao Zhang 
2549566063dSJacob Faibussowitsch   PetscCall(PetscFree(*mat));
25586e85357SHong Zhang   PetscFunctionReturn(0);
25686e85357SHong Zhang }
25786e85357SHong Zhang 
258*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
259*d71ae5a4SJacob Faibussowitsch {
260690b6cddSBarry Smith   PetscInt i;
261736121d4SSatish Balay 
2623a40ed3dSBarry Smith   PetscFunctionBegin;
26348a46eb9SPierre Jolivet   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
264736121d4SSatish Balay 
26548a46eb9SPierre Jolivet   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
2663a40ed3dSBarry Smith   PetscFunctionReturn(0);
267736121d4SSatish Balay }
268218c64b6SSatish Balay 
2692d61bbb3SSatish Balay /* -------------------------------------------------------*/
2702d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */
2712d61bbb3SSatish Balay /* -------------------------------------------------------*/
2722d61bbb3SSatish Balay 
273*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
274*d71ae5a4SJacob Faibussowitsch {
2752d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
276d9fead3dSBarry Smith   PetscScalar       *z, sum;
277d9fead3dSBarry Smith   const PetscScalar *x;
278d9fead3dSBarry Smith   const MatScalar   *v;
2797c565772SBarry Smith   PetscInt           mbs, i, n;
2800298fd71SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
281ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2822d61bbb3SSatish Balay 
2832d61bbb3SSatish Balay   PetscFunctionBegin;
2849566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
2859566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &z));
2862d61bbb3SSatish Balay 
28726e093fcSHong Zhang   if (usecprow) {
28826e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
28926e093fcSHong Zhang     ii   = a->compressedrow.i;
2907b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
2919566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(z, a->mbs));
29226e093fcSHong Zhang   } else {
29326e093fcSHong Zhang     mbs = a->mbs;
2942d61bbb3SSatish Balay     ii  = a->i;
29526e093fcSHong Zhang   }
2962d61bbb3SSatish Balay 
2972d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
298ee54c7eeSHong Zhang     n   = ii[1] - ii[0];
299ee54c7eeSHong Zhang     v   = a->a + ii[0];
300ee54c7eeSHong Zhang     idx = a->j + ii[0];
301ee54c7eeSHong Zhang     ii++;
302444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
303444d8c10SJed Brown     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3042d61bbb3SSatish Balay     sum = 0.0;
3052162cab8SBarry Smith     PetscSparseDensePlusDot(sum, x, v, idx, n);
30626e093fcSHong Zhang     if (usecprow) {
3077b2bb3b9SHong Zhang       z[ridx[i]] = sum;
30826e093fcSHong Zhang     } else {
3092d61bbb3SSatish Balay       z[i] = sum;
3102d61bbb3SSatish Balay     }
31126e093fcSHong Zhang   }
3129566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3139566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &z));
3149566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
3152d61bbb3SSatish Balay   PetscFunctionReturn(0);
3162d61bbb3SSatish Balay }
3172d61bbb3SSatish Balay 
318*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
319*d71ae5a4SJacob Faibussowitsch {
3202d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
321f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, *zarray;
322d9fead3dSBarry Smith   const PetscScalar *x, *xb;
32387828ca2SBarry Smith   PetscScalar        x1, x2;
324d9fead3dSBarry Smith   const MatScalar   *v;
3257c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
326ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
3272d61bbb3SSatish Balay 
3282d61bbb3SSatish Balay   PetscFunctionBegin;
3299566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3309566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
3312d61bbb3SSatish Balay 
3322d61bbb3SSatish Balay   idx = a->j;
3332d61bbb3SSatish Balay   v   = a->a;
33426e093fcSHong Zhang   if (usecprow) {
33526e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
33626e093fcSHong Zhang     ii   = a->compressedrow.i;
3377b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
3389566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
33926e093fcSHong Zhang   } else {
34026e093fcSHong Zhang     mbs = a->mbs;
3412d61bbb3SSatish Balay     ii  = a->i;
34226e093fcSHong Zhang     z   = zarray;
34326e093fcSHong Zhang   }
3442d61bbb3SSatish Balay 
3452d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
3469371c9d4SSatish Balay     n = ii[1] - ii[0];
3479371c9d4SSatish Balay     ii++;
3489371c9d4SSatish Balay     sum1 = 0.0;
3499371c9d4SSatish Balay     sum2 = 0.0;
350444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
351444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3522d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
3539371c9d4SSatish Balay       xb = x + 2 * (*idx++);
3549371c9d4SSatish Balay       x1 = xb[0];
3559371c9d4SSatish Balay       x2 = xb[1];
3562d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[2] * x2;
3572d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[3] * x2;
3582d61bbb3SSatish Balay       v += 4;
3592d61bbb3SSatish Balay     }
3607b2bb3b9SHong Zhang     if (usecprow) z = zarray + 2 * ridx[i];
3619371c9d4SSatish Balay     z[0] = sum1;
3629371c9d4SSatish Balay     z[1] = sum2;
36326e093fcSHong Zhang     if (!usecprow) z += 2;
3642d61bbb3SSatish Balay   }
3659566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3669566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
3679566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
3682d61bbb3SSatish Balay   PetscFunctionReturn(0);
3692d61bbb3SSatish Balay }
3702d61bbb3SSatish Balay 
371*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
372*d71ae5a4SJacob Faibussowitsch {
3732d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
374f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
375d9fead3dSBarry Smith   const PetscScalar *x, *xb;
376d9fead3dSBarry Smith   const MatScalar   *v;
3777c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
378ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
37926e093fcSHong Zhang 
380b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
381fee21e36SBarry Smith   #pragma disjoint(*v, *z, *xb)
382fee21e36SBarry Smith #endif
383fee21e36SBarry Smith 
3842d61bbb3SSatish Balay   PetscFunctionBegin;
3859566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3869566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
3872d61bbb3SSatish Balay 
3882d61bbb3SSatish Balay   idx = a->j;
3892d61bbb3SSatish Balay   v   = a->a;
39026e093fcSHong Zhang   if (usecprow) {
39126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
39226e093fcSHong Zhang     ii   = a->compressedrow.i;
3937b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
3949566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
39526e093fcSHong Zhang   } else {
39626e093fcSHong Zhang     mbs = a->mbs;
3972d61bbb3SSatish Balay     ii  = a->i;
39826e093fcSHong Zhang     z   = zarray;
39926e093fcSHong Zhang   }
4002d61bbb3SSatish Balay 
4012d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
4029371c9d4SSatish Balay     n = ii[1] - ii[0];
4039371c9d4SSatish Balay     ii++;
4049371c9d4SSatish Balay     sum1 = 0.0;
4059371c9d4SSatish Balay     sum2 = 0.0;
4069371c9d4SSatish Balay     sum3 = 0.0;
407444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
408444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4092d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
41026fbe8dcSKarl Rupp       xb = x + 3 * (*idx++);
41126fbe8dcSKarl Rupp       x1 = xb[0];
41226fbe8dcSKarl Rupp       x2 = xb[1];
41326fbe8dcSKarl Rupp       x3 = xb[2];
41426fbe8dcSKarl Rupp 
4152d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
4162d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
4172d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
4182d61bbb3SSatish Balay       v += 9;
4192d61bbb3SSatish Balay     }
4207b2bb3b9SHong Zhang     if (usecprow) z = zarray + 3 * ridx[i];
4219371c9d4SSatish Balay     z[0] = sum1;
4229371c9d4SSatish Balay     z[1] = sum2;
4239371c9d4SSatish Balay     z[2] = sum3;
42426e093fcSHong Zhang     if (!usecprow) z += 3;
4252d61bbb3SSatish Balay   }
4269566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4279566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
4289566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
4292d61bbb3SSatish Balay   PetscFunctionReturn(0);
4302d61bbb3SSatish Balay }
4312d61bbb3SSatish Balay 
432*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
433*d71ae5a4SJacob Faibussowitsch {
4342d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
435f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
436d9fead3dSBarry Smith   const PetscScalar *x, *xb;
437d9fead3dSBarry Smith   const MatScalar   *v;
4387c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
439ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
4402d61bbb3SSatish Balay 
4412d61bbb3SSatish Balay   PetscFunctionBegin;
4429566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4439566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
4442d61bbb3SSatish Balay 
4452d61bbb3SSatish Balay   idx = a->j;
4462d61bbb3SSatish Balay   v   = a->a;
44726e093fcSHong Zhang   if (usecprow) {
44826e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
44926e093fcSHong Zhang     ii   = a->compressedrow.i;
4507b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
4519566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
45226e093fcSHong Zhang   } else {
45326e093fcSHong Zhang     mbs = a->mbs;
4542d61bbb3SSatish Balay     ii  = a->i;
45526e093fcSHong Zhang     z   = zarray;
45626e093fcSHong Zhang   }
4572d61bbb3SSatish Balay 
4582d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
45926fbe8dcSKarl Rupp     n = ii[1] - ii[0];
46026fbe8dcSKarl Rupp     ii++;
46126fbe8dcSKarl Rupp     sum1 = 0.0;
46226fbe8dcSKarl Rupp     sum2 = 0.0;
46326fbe8dcSKarl Rupp     sum3 = 0.0;
46426fbe8dcSKarl Rupp     sum4 = 0.0;
46526fbe8dcSKarl Rupp 
466444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
467444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4682d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
4692d61bbb3SSatish Balay       xb = x + 4 * (*idx++);
4709371c9d4SSatish Balay       x1 = xb[0];
4719371c9d4SSatish Balay       x2 = xb[1];
4729371c9d4SSatish Balay       x3 = xb[2];
4739371c9d4SSatish Balay       x4 = xb[3];
4742d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
4752d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
4762d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
4772d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
4782d61bbb3SSatish Balay       v += 16;
4792d61bbb3SSatish Balay     }
4807b2bb3b9SHong Zhang     if (usecprow) z = zarray + 4 * ridx[i];
4819371c9d4SSatish Balay     z[0] = sum1;
4829371c9d4SSatish Balay     z[1] = sum2;
4839371c9d4SSatish Balay     z[2] = sum3;
4849371c9d4SSatish Balay     z[3] = sum4;
48526e093fcSHong Zhang     if (!usecprow) z += 4;
4862d61bbb3SSatish Balay   }
4879566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4889566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
4899566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
4902d61bbb3SSatish Balay   PetscFunctionReturn(0);
4912d61bbb3SSatish Balay }
4922d61bbb3SSatish Balay 
493*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
494*d71ae5a4SJacob Faibussowitsch {
4952d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
496f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
497d9fead3dSBarry Smith   const PetscScalar *xb, *x;
498d9fead3dSBarry Smith   const MatScalar   *v;
4990298fd71SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
5007c565772SBarry Smith   PetscInt           mbs, i, j, n;
501ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
5022d61bbb3SSatish Balay 
503433994e6SBarry Smith   PetscFunctionBegin;
5049566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5059566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
5062d61bbb3SSatish Balay 
5072d61bbb3SSatish Balay   idx = a->j;
5082d61bbb3SSatish Balay   v   = a->a;
50926e093fcSHong Zhang   if (usecprow) {
51026e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
51126e093fcSHong Zhang     ii   = a->compressedrow.i;
5127b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
5139566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
51426e093fcSHong Zhang   } else {
51526e093fcSHong Zhang     mbs = a->mbs;
5162d61bbb3SSatish Balay     ii  = a->i;
51726e093fcSHong Zhang     z   = zarray;
51826e093fcSHong Zhang   }
5192d61bbb3SSatish Balay 
5202d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
5219371c9d4SSatish Balay     n = ii[1] - ii[0];
5229371c9d4SSatish Balay     ii++;
5239371c9d4SSatish Balay     sum1 = 0.0;
5249371c9d4SSatish Balay     sum2 = 0.0;
5259371c9d4SSatish Balay     sum3 = 0.0;
5269371c9d4SSatish Balay     sum4 = 0.0;
5279371c9d4SSatish Balay     sum5 = 0.0;
528444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
529444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
5302d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
5312d61bbb3SSatish Balay       xb = x + 5 * (*idx++);
5329371c9d4SSatish Balay       x1 = xb[0];
5339371c9d4SSatish Balay       x2 = xb[1];
5349371c9d4SSatish Balay       x3 = xb[2];
5359371c9d4SSatish Balay       x4 = xb[3];
5369371c9d4SSatish Balay       x5 = xb[4];
5372d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
5382d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
5392d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
5402d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
5412d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
5422d61bbb3SSatish Balay       v += 25;
5432d61bbb3SSatish Balay     }
5447b2bb3b9SHong Zhang     if (usecprow) z = zarray + 5 * ridx[i];
5459371c9d4SSatish Balay     z[0] = sum1;
5469371c9d4SSatish Balay     z[1] = sum2;
5479371c9d4SSatish Balay     z[2] = sum3;
5489371c9d4SSatish Balay     z[3] = sum4;
5499371c9d4SSatish Balay     z[4] = sum5;
55026e093fcSHong Zhang     if (!usecprow) z += 5;
5512d61bbb3SSatish Balay   }
5529566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5539566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
5549566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
5552d61bbb3SSatish Balay   PetscFunctionReturn(0);
5562d61bbb3SSatish Balay }
5572d61bbb3SSatish Balay 
558*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
559*d71ae5a4SJacob Faibussowitsch {
56015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
561f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
562d9fead3dSBarry Smith   const PetscScalar *x, *xb;
56326e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
564d9fead3dSBarry Smith   const MatScalar   *v;
5657c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
566ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
56715091d37SBarry Smith 
568433994e6SBarry Smith   PetscFunctionBegin;
5699566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5709566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
57115091d37SBarry Smith 
57215091d37SBarry Smith   idx = a->j;
57315091d37SBarry Smith   v   = a->a;
57426e093fcSHong Zhang   if (usecprow) {
57526e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
57626e093fcSHong Zhang     ii   = a->compressedrow.i;
5777b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
5789566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
57926e093fcSHong Zhang   } else {
58026e093fcSHong Zhang     mbs = a->mbs;
58115091d37SBarry Smith     ii  = a->i;
58226e093fcSHong Zhang     z   = zarray;
58326e093fcSHong Zhang   }
58415091d37SBarry Smith 
58515091d37SBarry Smith   for (i = 0; i < mbs; i++) {
58626fbe8dcSKarl Rupp     n = ii[1] - ii[0];
58726fbe8dcSKarl Rupp     ii++;
58826fbe8dcSKarl Rupp     sum1 = 0.0;
58926fbe8dcSKarl Rupp     sum2 = 0.0;
59026fbe8dcSKarl Rupp     sum3 = 0.0;
59126fbe8dcSKarl Rupp     sum4 = 0.0;
59226fbe8dcSKarl Rupp     sum5 = 0.0;
59326fbe8dcSKarl Rupp     sum6 = 0.0;
59426fbe8dcSKarl Rupp 
595444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
596444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
59715091d37SBarry Smith     for (j = 0; j < n; j++) {
59815091d37SBarry Smith       xb = x + 6 * (*idx++);
5999371c9d4SSatish Balay       x1 = xb[0];
6009371c9d4SSatish Balay       x2 = xb[1];
6019371c9d4SSatish Balay       x3 = xb[2];
6029371c9d4SSatish Balay       x4 = xb[3];
6039371c9d4SSatish Balay       x5 = xb[4];
6049371c9d4SSatish Balay       x6 = xb[5];
60515091d37SBarry Smith       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
60615091d37SBarry Smith       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
60715091d37SBarry Smith       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
60815091d37SBarry Smith       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
60915091d37SBarry Smith       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
61015091d37SBarry Smith       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
61115091d37SBarry Smith       v += 36;
61215091d37SBarry Smith     }
6137b2bb3b9SHong Zhang     if (usecprow) z = zarray + 6 * ridx[i];
6149371c9d4SSatish Balay     z[0] = sum1;
6159371c9d4SSatish Balay     z[1] = sum2;
6169371c9d4SSatish Balay     z[2] = sum3;
6179371c9d4SSatish Balay     z[3] = sum4;
6189371c9d4SSatish Balay     z[4] = sum5;
6199371c9d4SSatish Balay     z[5] = sum6;
62026e093fcSHong Zhang     if (!usecprow) z += 6;
62115091d37SBarry Smith   }
62215091d37SBarry Smith 
6239566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6249566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
6259566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
62615091d37SBarry Smith   PetscFunctionReturn(0);
62715091d37SBarry Smith }
6288ab949d8SShri Abhyankar 
629*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
630*d71ae5a4SJacob Faibussowitsch {
6312d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
632f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
633d9fead3dSBarry Smith   const PetscScalar *x, *xb;
63426e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
635d9fead3dSBarry Smith   const MatScalar   *v;
6367c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
637ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
6382d61bbb3SSatish Balay 
639433994e6SBarry Smith   PetscFunctionBegin;
6409566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6419566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
6422d61bbb3SSatish Balay 
6432d61bbb3SSatish Balay   idx = a->j;
6442d61bbb3SSatish Balay   v   = a->a;
64526e093fcSHong Zhang   if (usecprow) {
64626e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
64726e093fcSHong Zhang     ii   = a->compressedrow.i;
6487b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
6499566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
65026e093fcSHong Zhang   } else {
65126e093fcSHong Zhang     mbs = a->mbs;
6522d61bbb3SSatish Balay     ii  = a->i;
65326e093fcSHong Zhang     z   = zarray;
65426e093fcSHong Zhang   }
6552d61bbb3SSatish Balay 
6562d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
65726fbe8dcSKarl Rupp     n = ii[1] - ii[0];
65826fbe8dcSKarl Rupp     ii++;
65926fbe8dcSKarl Rupp     sum1 = 0.0;
66026fbe8dcSKarl Rupp     sum2 = 0.0;
66126fbe8dcSKarl Rupp     sum3 = 0.0;
66226fbe8dcSKarl Rupp     sum4 = 0.0;
66326fbe8dcSKarl Rupp     sum5 = 0.0;
66426fbe8dcSKarl Rupp     sum6 = 0.0;
66526fbe8dcSKarl Rupp     sum7 = 0.0;
66626fbe8dcSKarl Rupp 
667444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
668444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
6692d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
6702d61bbb3SSatish Balay       xb = x + 7 * (*idx++);
6719371c9d4SSatish Balay       x1 = xb[0];
6729371c9d4SSatish Balay       x2 = xb[1];
6739371c9d4SSatish Balay       x3 = xb[2];
6749371c9d4SSatish Balay       x4 = xb[3];
6759371c9d4SSatish Balay       x5 = xb[4];
6769371c9d4SSatish Balay       x6 = xb[5];
6779371c9d4SSatish Balay       x7 = xb[6];
6782d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
6792d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
6802d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
6812d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
6822d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
6832d61bbb3SSatish Balay       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
6842d61bbb3SSatish Balay       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
6852d61bbb3SSatish Balay       v += 49;
6862d61bbb3SSatish Balay     }
6877b2bb3b9SHong Zhang     if (usecprow) z = zarray + 7 * ridx[i];
6889371c9d4SSatish Balay     z[0] = sum1;
6899371c9d4SSatish Balay     z[1] = sum2;
6909371c9d4SSatish Balay     z[2] = sum3;
6919371c9d4SSatish Balay     z[3] = sum4;
6929371c9d4SSatish Balay     z[4] = sum5;
6939371c9d4SSatish Balay     z[5] = sum6;
6949371c9d4SSatish Balay     z[6] = sum7;
69526e093fcSHong Zhang     if (!usecprow) z += 7;
6962d61bbb3SSatish Balay   }
6972d61bbb3SSatish Balay 
6989566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6999566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
7009566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
7012d61bbb3SSatish Balay   PetscFunctionReturn(0);
7022d61bbb3SSatish Balay }
7032d61bbb3SSatish Balay 
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
/*
  MatMult_SeqBAIJ_9_AVX2 - Computes zz = A * xx for a SeqBAIJ matrix with 9x9 blocks using AVX2/FMA intrinsics.

  Input Parameters:
+ A  - the matrix; A->data is read as a Mat_SeqBAIJ
- xx - the vector being multiplied

  Output Parameter:
. zz - the vector receiving the product

  Notes:
  For every block row the needed pieces of xx are first gathered contiguously into a->mult_work
  (allocated here on first use). The nine results of a block row are then accumulated in three
  256-bit registers: z0 holds entries 0-3, z1 entries 4-7, and lane 0 of z2 holds entry 8.
  Lanes 1-3 of z2 pick up values belonging to the following column of the block and are
  discarded by the masked store.

  With compressed row storage enabled, zz is zeroed first and only the block rows listed in
  compressedrow.rindex are written afterwards.
*/
PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *z = NULL, *work, *workt, *zarray;
  const PetscScalar *x, *xb;
  const MatScalar   *v;
  PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
  const PetscInt    *idx, *ii, *ridx = NULL;
  PetscInt           k;
  PetscBool          usecprow = a->compressedrow.use;

  __m256d a0, a1, a2, a3, a4, a5;
  __m256d w0, w1, w2, w3;
  __m256d z0, z1, z2;
  /*
    Selects lane 0 only. The masked load/store test just the most significant bit of each
    64-bit lane, so an all-ones lane works; this avoids 1LL << 63, which shifts into the
    sign bit of a signed type and is undefined behavior in C.
  */
  __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, -1LL);

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
    /* block rows that are not stored are never visited below, so clear the whole output first */
    PetscCall(PetscArrayzero(zarray, bs * a->mbs));
  } else {
    mbs = a->mbs;
    ii  = a->i;
    z   = zarray;
  }

  if (!a->mult_work) {
    k = PetscMax(A->rmap->n, A->cmap->n);
    PetscCall(PetscMalloc1(k + 1, &a->mult_work));
  }

  work = a->mult_work;
  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;
    /* gather the bs entries of x for every block column of this row into contiguous storage */
    workt = work;
    for (j = 0; j < n; j++) {
      xb = x + bs * (*idx++);
      for (k = 0; k < bs; k++) workt[k] = xb[k];
      workt += bs;
    }
    if (usecprow) z = zarray + bs * ridx[i];

    z0 = _mm256_setzero_pd();
    z1 = _mm256_setzero_pd();
    z2 = _mm256_setzero_pd();

    for (j = 0; j < n; j++) {
      /* first column of a */
      w0 = _mm256_set1_pd(work[j * 9]);
      a0 = _mm256_loadu_pd(&v[j * 81]);
      z0 = _mm256_fmadd_pd(a0, w0, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
      z1 = _mm256_fmadd_pd(a1, w0, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
      z2 = _mm256_fmadd_pd(a2, w0, z2);

      /* second column of a */
      w1 = _mm256_set1_pd(work[j * 9 + 1]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
      z0 = _mm256_fmadd_pd(a0, w1, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
      z1 = _mm256_fmadd_pd(a1, w1, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
      z2 = _mm256_fmadd_pd(a2, w1, z2);

      /* third column of a */
      w2 = _mm256_set1_pd(work[j * 9 + 2]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
      z0 = _mm256_fmadd_pd(a3, w2, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
      z1 = _mm256_fmadd_pd(a4, w2, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
      z2 = _mm256_fmadd_pd(a5, w2, z2);

      /* fourth column of a */
      w3 = _mm256_set1_pd(work[j * 9 + 3]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
      z0 = _mm256_fmadd_pd(a0, w3, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
      z1 = _mm256_fmadd_pd(a1, w3, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
      z2 = _mm256_fmadd_pd(a2, w3, z2);

      /* fifth column of a */
      w0 = _mm256_set1_pd(work[j * 9 + 4]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
      z0 = _mm256_fmadd_pd(a3, w0, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
      z1 = _mm256_fmadd_pd(a4, w0, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
      z2 = _mm256_fmadd_pd(a5, w0, z2);

      /* sixth column of a */
      w1 = _mm256_set1_pd(work[j * 9 + 5]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
      z0 = _mm256_fmadd_pd(a0, w1, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
      z1 = _mm256_fmadd_pd(a1, w1, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
      z2 = _mm256_fmadd_pd(a2, w1, z2);

      /* seventh column of a */
      w2 = _mm256_set1_pd(work[j * 9 + 6]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
      z0 = _mm256_fmadd_pd(a0, w2, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
      z1 = _mm256_fmadd_pd(a1, w2, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
      z2 = _mm256_fmadd_pd(a2, w2, z2);

      /* eighth column of a */
      w3 = _mm256_set1_pd(work[j * 9 + 7]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
      z0 = _mm256_fmadd_pd(a3, w3, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
      z1 = _mm256_fmadd_pd(a4, w3, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
      z2 = _mm256_fmadd_pd(a5, w3, z2);

      /* ninth column of a */
      w0 = _mm256_set1_pd(work[j * 9 + 8]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
      z0 = _mm256_fmadd_pd(a0, w0, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
      z1 = _mm256_fmadd_pd(a1, w0, z1);
      /* element 80 is the last of this block: a full-width load would read three values past it */
      a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
      z2 = _mm256_fmadd_pd(a2, w0, z2);
    }

    _mm256_storeu_pd(&z[0], z0);
    _mm256_storeu_pd(&z[4], z1);
    /* only lane 0 of z2 is a result; the masked store leaves z[9] and beyond untouched */
    _mm256_maskstore_pd(&z[8], mask1, z2);

    v += n * bs2;
    if (!usecprow) z += bs;
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
  PetscFunctionReturn(0);
}
#endif
85596e086a2SDaniel Kokron 
856*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
857*d71ae5a4SJacob Faibussowitsch {
858ebada01fSBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
859f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
860ebada01fSBarry Smith   const PetscScalar *x, *xb;
861ebada01fSBarry Smith   PetscScalar       *zarray, xv;
862ebada01fSBarry Smith   const MatScalar   *v;
863ebada01fSBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
864ebada01fSBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
865ebada01fSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
866ebada01fSBarry Smith 
867ebada01fSBarry Smith   PetscFunctionBegin;
8689566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8699566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
870ebada01fSBarry Smith 
871ebada01fSBarry Smith   v = a->a;
872ebada01fSBarry Smith   if (usecprow) {
873ebada01fSBarry Smith     mbs  = a->compressedrow.nrows;
874ebada01fSBarry Smith     ii   = a->compressedrow.i;
875ebada01fSBarry Smith     ridx = a->compressedrow.rindex;
8769566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
877ebada01fSBarry Smith   } else {
878ebada01fSBarry Smith     mbs = a->mbs;
879ebada01fSBarry Smith     ii  = a->i;
880ebada01fSBarry Smith     z   = zarray;
881ebada01fSBarry Smith   }
882ebada01fSBarry Smith 
883ebada01fSBarry Smith   for (i = 0; i < mbs; i++) {
884ebada01fSBarry Smith     n     = ii[i + 1] - ii[i];
885ebada01fSBarry Smith     idx   = ij + ii[i];
8869371c9d4SSatish Balay     sum1  = 0.0;
8879371c9d4SSatish Balay     sum2  = 0.0;
8889371c9d4SSatish Balay     sum3  = 0.0;
8899371c9d4SSatish Balay     sum4  = 0.0;
8909371c9d4SSatish Balay     sum5  = 0.0;
8919371c9d4SSatish Balay     sum6  = 0.0;
8929371c9d4SSatish Balay     sum7  = 0.0;
8939371c9d4SSatish Balay     sum8  = 0.0;
8949371c9d4SSatish Balay     sum9  = 0.0;
8959371c9d4SSatish Balay     sum10 = 0.0;
8969371c9d4SSatish Balay     sum11 = 0.0;
897ebada01fSBarry Smith 
898ebada01fSBarry Smith     for (j = 0; j < n; j++) {
899ebada01fSBarry Smith       xb = x + 11 * (idx[j]);
900ebada01fSBarry Smith 
901ebada01fSBarry Smith       for (k = 0; k < 11; k++) {
902ebada01fSBarry Smith         xv = xb[k];
903ebada01fSBarry Smith         sum1 += v[0] * xv;
904ebada01fSBarry Smith         sum2 += v[1] * xv;
905ebada01fSBarry Smith         sum3 += v[2] * xv;
906ebada01fSBarry Smith         sum4 += v[3] * xv;
907ebada01fSBarry Smith         sum5 += v[4] * xv;
908ebada01fSBarry Smith         sum6 += v[5] * xv;
909ebada01fSBarry Smith         sum7 += v[6] * xv;
910ebada01fSBarry Smith         sum8 += v[7] * xv;
911ebada01fSBarry Smith         sum9 += v[8] * xv;
912ebada01fSBarry Smith         sum10 += v[9] * xv;
913ebada01fSBarry Smith         sum11 += v[10] * xv;
914ebada01fSBarry Smith         v += 11;
915ebada01fSBarry Smith       }
916ebada01fSBarry Smith     }
917ebada01fSBarry Smith     if (usecprow) z = zarray + 11 * ridx[i];
9189371c9d4SSatish Balay     z[0]  = sum1;
9199371c9d4SSatish Balay     z[1]  = sum2;
9209371c9d4SSatish Balay     z[2]  = sum3;
9219371c9d4SSatish Balay     z[3]  = sum4;
9229371c9d4SSatish Balay     z[4]  = sum5;
9239371c9d4SSatish Balay     z[5]  = sum6;
9249371c9d4SSatish Balay     z[6]  = sum7;
9259371c9d4SSatish Balay     z[7]  = sum8;
9269371c9d4SSatish Balay     z[8]  = sum9;
9279371c9d4SSatish Balay     z[9]  = sum10;
9289371c9d4SSatish Balay     z[10] = sum11;
929ebada01fSBarry Smith 
930ebada01fSBarry Smith     if (!usecprow) z += 11;
931ebada01fSBarry Smith   }
932ebada01fSBarry Smith 
9339566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9349566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
9359566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
936ebada01fSBarry Smith   PetscFunctionReturn(0);
937ebada01fSBarry Smith }
938ebada01fSBarry Smith 
9396679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
940*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
941*d71ae5a4SJacob Faibussowitsch {
9426679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
9436679dcc1SBarry Smith   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
9446679dcc1SBarry Smith   const PetscScalar *x, *xb;
9456679dcc1SBarry Smith   PetscScalar       *zarray, xv;
9466679dcc1SBarry Smith   const MatScalar   *v;
9476679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
9486679dcc1SBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
9496679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
9506679dcc1SBarry Smith 
9516679dcc1SBarry Smith   PetscFunctionBegin;
9529566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9539566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
9546679dcc1SBarry Smith 
9556679dcc1SBarry Smith   v = a->a;
9566679dcc1SBarry Smith   if (usecprow) {
9576679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
9586679dcc1SBarry Smith     ii   = a->compressedrow.i;
9596679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
9609566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
9616679dcc1SBarry Smith   } else {
9626679dcc1SBarry Smith     mbs = a->mbs;
9636679dcc1SBarry Smith     ii  = a->i;
9646679dcc1SBarry Smith     z   = zarray;
9656679dcc1SBarry Smith   }
9666679dcc1SBarry Smith 
9676679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
9686679dcc1SBarry Smith     n     = ii[i + 1] - ii[i];
9696679dcc1SBarry Smith     idx   = ij + ii[i];
9709371c9d4SSatish Balay     sum1  = 0.0;
9719371c9d4SSatish Balay     sum2  = 0.0;
9729371c9d4SSatish Balay     sum3  = 0.0;
9739371c9d4SSatish Balay     sum4  = 0.0;
9749371c9d4SSatish Balay     sum5  = 0.0;
9759371c9d4SSatish Balay     sum6  = 0.0;
9769371c9d4SSatish Balay     sum7  = 0.0;
9779371c9d4SSatish Balay     sum8  = 0.0;
9789371c9d4SSatish Balay     sum9  = 0.0;
9799371c9d4SSatish Balay     sum10 = 0.0;
9809371c9d4SSatish Balay     sum11 = 0.0;
9819371c9d4SSatish Balay     sum12 = 0.0;
9826679dcc1SBarry Smith 
9836679dcc1SBarry Smith     for (j = 0; j < n; j++) {
9846679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
9856679dcc1SBarry Smith 
9866679dcc1SBarry Smith       for (k = 0; k < 12; k++) {
9876679dcc1SBarry Smith         xv = xb[k];
9886679dcc1SBarry Smith         sum1 += v[0] * xv;
9896679dcc1SBarry Smith         sum2 += v[1] * xv;
9906679dcc1SBarry Smith         sum3 += v[2] * xv;
9916679dcc1SBarry Smith         sum4 += v[3] * xv;
9926679dcc1SBarry Smith         sum5 += v[4] * xv;
9936679dcc1SBarry Smith         sum6 += v[5] * xv;
9946679dcc1SBarry Smith         sum7 += v[6] * xv;
9956679dcc1SBarry Smith         sum8 += v[7] * xv;
9966679dcc1SBarry Smith         sum9 += v[8] * xv;
9976679dcc1SBarry Smith         sum10 += v[9] * xv;
9986679dcc1SBarry Smith         sum11 += v[10] * xv;
9996679dcc1SBarry Smith         sum12 += v[11] * xv;
10006679dcc1SBarry Smith         v += 12;
10016679dcc1SBarry Smith       }
10026679dcc1SBarry Smith     }
10036679dcc1SBarry Smith     if (usecprow) z = zarray + 12 * ridx[i];
10049371c9d4SSatish Balay     z[0]  = sum1;
10059371c9d4SSatish Balay     z[1]  = sum2;
10069371c9d4SSatish Balay     z[2]  = sum3;
10079371c9d4SSatish Balay     z[3]  = sum4;
10089371c9d4SSatish Balay     z[4]  = sum5;
10099371c9d4SSatish Balay     z[5]  = sum6;
10109371c9d4SSatish Balay     z[6]  = sum7;
10119371c9d4SSatish Balay     z[7]  = sum8;
10129371c9d4SSatish Balay     z[8]  = sum9;
10139371c9d4SSatish Balay     z[9]  = sum10;
10149371c9d4SSatish Balay     z[10] = sum11;
10159371c9d4SSatish Balay     z[11] = sum12;
10166679dcc1SBarry Smith     if (!usecprow) z += 12;
10176679dcc1SBarry Smith   }
10189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10199566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
10209566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
10216679dcc1SBarry Smith   PetscFunctionReturn(0);
10226679dcc1SBarry Smith }
10236679dcc1SBarry Smith 
1024*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1025*d71ae5a4SJacob Faibussowitsch {
10266679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
10276679dcc1SBarry Smith   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
10286679dcc1SBarry Smith   const PetscScalar *x, *xb;
10296679dcc1SBarry Smith   PetscScalar       *zarray, *yarray, xv;
10306679dcc1SBarry Smith   const MatScalar   *v;
10316679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
10326679dcc1SBarry Smith   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
10336679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
10346679dcc1SBarry Smith 
10356679dcc1SBarry Smith   PetscFunctionBegin;
10369566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10379566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
10386679dcc1SBarry Smith 
10396679dcc1SBarry Smith   v = a->a;
10406679dcc1SBarry Smith   if (usecprow) {
104148a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
10426679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
10436679dcc1SBarry Smith     ii   = a->compressedrow.i;
10446679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
10456679dcc1SBarry Smith   } else {
10466679dcc1SBarry Smith     ii = a->i;
10476679dcc1SBarry Smith     y  = yarray;
10486679dcc1SBarry Smith     z  = zarray;
10496679dcc1SBarry Smith   }
10506679dcc1SBarry Smith 
10516679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
10526679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
10536679dcc1SBarry Smith     idx = ij + ii[i];
10546679dcc1SBarry Smith 
10556679dcc1SBarry Smith     if (usecprow) {
10566679dcc1SBarry Smith       y = yarray + 12 * ridx[i];
10576679dcc1SBarry Smith       z = zarray + 12 * ridx[i];
10586679dcc1SBarry Smith     }
10599371c9d4SSatish Balay     sum1  = y[0];
10609371c9d4SSatish Balay     sum2  = y[1];
10619371c9d4SSatish Balay     sum3  = y[2];
10629371c9d4SSatish Balay     sum4  = y[3];
10639371c9d4SSatish Balay     sum5  = y[4];
10649371c9d4SSatish Balay     sum6  = y[5];
10659371c9d4SSatish Balay     sum7  = y[6];
10669371c9d4SSatish Balay     sum8  = y[7];
10679371c9d4SSatish Balay     sum9  = y[8];
10689371c9d4SSatish Balay     sum10 = y[9];
10699371c9d4SSatish Balay     sum11 = y[10];
10709371c9d4SSatish Balay     sum12 = y[11];
10716679dcc1SBarry Smith 
10726679dcc1SBarry Smith     for (j = 0; j < n; j++) {
10736679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
10746679dcc1SBarry Smith 
10756679dcc1SBarry Smith       for (k = 0; k < 12; k++) {
10766679dcc1SBarry Smith         xv = xb[k];
10776679dcc1SBarry Smith         sum1 += v[0] * xv;
10786679dcc1SBarry Smith         sum2 += v[1] * xv;
10796679dcc1SBarry Smith         sum3 += v[2] * xv;
10806679dcc1SBarry Smith         sum4 += v[3] * xv;
10816679dcc1SBarry Smith         sum5 += v[4] * xv;
10826679dcc1SBarry Smith         sum6 += v[5] * xv;
10836679dcc1SBarry Smith         sum7 += v[6] * xv;
10846679dcc1SBarry Smith         sum8 += v[7] * xv;
10856679dcc1SBarry Smith         sum9 += v[8] * xv;
10866679dcc1SBarry Smith         sum10 += v[9] * xv;
10876679dcc1SBarry Smith         sum11 += v[10] * xv;
10886679dcc1SBarry Smith         sum12 += v[11] * xv;
10896679dcc1SBarry Smith         v += 12;
10906679dcc1SBarry Smith       }
10916679dcc1SBarry Smith     }
10926679dcc1SBarry Smith 
10939371c9d4SSatish Balay     z[0]  = sum1;
10949371c9d4SSatish Balay     z[1]  = sum2;
10959371c9d4SSatish Balay     z[2]  = sum3;
10969371c9d4SSatish Balay     z[3]  = sum4;
10979371c9d4SSatish Balay     z[4]  = sum5;
10989371c9d4SSatish Balay     z[5]  = sum6;
10999371c9d4SSatish Balay     z[6]  = sum7;
11009371c9d4SSatish Balay     z[7]  = sum8;
11019371c9d4SSatish Balay     z[8]  = sum9;
11029371c9d4SSatish Balay     z[9]  = sum10;
11039371c9d4SSatish Balay     z[10] = sum11;
11049371c9d4SSatish Balay     z[11] = sum12;
11056679dcc1SBarry Smith     if (!usecprow) {
11066679dcc1SBarry Smith       y += 12;
11076679dcc1SBarry Smith       z += 12;
11086679dcc1SBarry Smith     }
11096679dcc1SBarry Smith   }
11109566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
11119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
11129566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
11136679dcc1SBarry Smith   PetscFunctionReturn(0);
11146679dcc1SBarry Smith }
11156679dcc1SBarry Smith 
11166679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1117*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1118*d71ae5a4SJacob Faibussowitsch {
11196679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
11206679dcc1SBarry Smith   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
11216679dcc1SBarry Smith   const PetscScalar *x, *xb;
11226679dcc1SBarry Smith   PetscScalar        x1, x2, x3, x4, *zarray;
11236679dcc1SBarry Smith   const MatScalar   *v;
11246679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
11256679dcc1SBarry Smith   PetscInt           mbs, i, j, n;
11266679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
11276679dcc1SBarry Smith 
11286679dcc1SBarry Smith   PetscFunctionBegin;
11299566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
11309566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
11316679dcc1SBarry Smith 
11326679dcc1SBarry Smith   v = a->a;
11336679dcc1SBarry Smith   if (usecprow) {
11346679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
11356679dcc1SBarry Smith     ii   = a->compressedrow.i;
11366679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
11379566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
11386679dcc1SBarry Smith   } else {
11396679dcc1SBarry Smith     mbs = a->mbs;
11406679dcc1SBarry Smith     ii  = a->i;
11416679dcc1SBarry Smith     z   = zarray;
11426679dcc1SBarry Smith   }
11436679dcc1SBarry Smith 
11446679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
11456679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
11466679dcc1SBarry Smith     idx = ij + ii[i];
11476679dcc1SBarry Smith 
11486679dcc1SBarry Smith     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
11496679dcc1SBarry Smith     for (j = 0; j < n; j++) {
11506679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
11519371c9d4SSatish Balay       x1 = xb[0];
11529371c9d4SSatish Balay       x2 = xb[1];
11539371c9d4SSatish Balay       x3 = xb[2];
11549371c9d4SSatish Balay       x4 = xb[3];
11556679dcc1SBarry Smith 
11566679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11576679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11586679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11596679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11606679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11616679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11626679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11636679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11646679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11656679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11666679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11676679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11686679dcc1SBarry Smith       v += 48;
11696679dcc1SBarry Smith 
11709371c9d4SSatish Balay       x1 = xb[4];
11719371c9d4SSatish Balay       x2 = xb[5];
11729371c9d4SSatish Balay       x3 = xb[6];
11739371c9d4SSatish Balay       x4 = xb[7];
11746679dcc1SBarry Smith 
11756679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11766679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11776679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11786679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11796679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11806679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11816679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11826679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11836679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11846679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11856679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11866679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11876679dcc1SBarry Smith       v += 48;
11886679dcc1SBarry Smith 
11899371c9d4SSatish Balay       x1 = xb[8];
11909371c9d4SSatish Balay       x2 = xb[9];
11919371c9d4SSatish Balay       x3 = xb[10];
11929371c9d4SSatish Balay       x4 = xb[11];
11936679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11946679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11956679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11966679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11976679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11986679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11996679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
12006679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
12016679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
12026679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
12036679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
12046679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
12056679dcc1SBarry Smith       v += 48;
12066679dcc1SBarry Smith     }
12076679dcc1SBarry Smith     if (usecprow) z = zarray + 12 * ridx[i];
12089371c9d4SSatish Balay     z[0]  = sum1;
12099371c9d4SSatish Balay     z[1]  = sum2;
12109371c9d4SSatish Balay     z[2]  = sum3;
12119371c9d4SSatish Balay     z[3]  = sum4;
12129371c9d4SSatish Balay     z[4]  = sum5;
12139371c9d4SSatish Balay     z[5]  = sum6;
12149371c9d4SSatish Balay     z[6]  = sum7;
12159371c9d4SSatish Balay     z[7]  = sum8;
12169371c9d4SSatish Balay     z[8]  = sum9;
12179371c9d4SSatish Balay     z[9]  = sum10;
12189371c9d4SSatish Balay     z[10] = sum11;
12199371c9d4SSatish Balay     z[11] = sum12;
12206679dcc1SBarry Smith     if (!usecprow) z += 12;
12216679dcc1SBarry Smith   }
12229566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
12239566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
12249566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
12256679dcc1SBarry Smith   PetscFunctionReturn(0);
12266679dcc1SBarry Smith }
12276679dcc1SBarry Smith 
12286679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1229*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1230*d71ae5a4SJacob Faibussowitsch {
12316679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
12326679dcc1SBarry Smith   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
12336679dcc1SBarry Smith   const PetscScalar *x, *xb;
12346679dcc1SBarry Smith   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
12356679dcc1SBarry Smith   const MatScalar   *v;
12366679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
12376679dcc1SBarry Smith   PetscInt           mbs      = a->mbs, i, j, n;
12386679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
12396679dcc1SBarry Smith 
12406679dcc1SBarry Smith   PetscFunctionBegin;
12419566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
12429566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
12436679dcc1SBarry Smith 
12446679dcc1SBarry Smith   v = a->a;
12456679dcc1SBarry Smith   if (usecprow) {
124648a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
12476679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
12486679dcc1SBarry Smith     ii   = a->compressedrow.i;
12496679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
12506679dcc1SBarry Smith   } else {
12516679dcc1SBarry Smith     ii = a->i;
12526679dcc1SBarry Smith     y  = yarray;
12536679dcc1SBarry Smith     z  = zarray;
12546679dcc1SBarry Smith   }
12556679dcc1SBarry Smith 
12566679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
12576679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
12586679dcc1SBarry Smith     idx = ij + ii[i];
12596679dcc1SBarry Smith 
12606679dcc1SBarry Smith     if (usecprow) {
12616679dcc1SBarry Smith       y = yarray + 12 * ridx[i];
12626679dcc1SBarry Smith       z = zarray + 12 * ridx[i];
12636679dcc1SBarry Smith     }
12649371c9d4SSatish Balay     sum1  = y[0];
12659371c9d4SSatish Balay     sum2  = y[1];
12669371c9d4SSatish Balay     sum3  = y[2];
12679371c9d4SSatish Balay     sum4  = y[3];
12689371c9d4SSatish Balay     sum5  = y[4];
12699371c9d4SSatish Balay     sum6  = y[5];
12709371c9d4SSatish Balay     sum7  = y[6];
12719371c9d4SSatish Balay     sum8  = y[7];
12729371c9d4SSatish Balay     sum9  = y[8];
12739371c9d4SSatish Balay     sum10 = y[9];
12749371c9d4SSatish Balay     sum11 = y[10];
12759371c9d4SSatish Balay     sum12 = y[11];
12766679dcc1SBarry Smith 
12776679dcc1SBarry Smith     for (j = 0; j < n; j++) {
12786679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
12799371c9d4SSatish Balay       x1 = xb[0];
12809371c9d4SSatish Balay       x2 = xb[1];
12819371c9d4SSatish Balay       x3 = xb[2];
12829371c9d4SSatish Balay       x4 = xb[3];
12836679dcc1SBarry Smith 
12846679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
12856679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
12866679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
12876679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
12886679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
12896679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
12906679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
12916679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
12926679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
12936679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
12946679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
12956679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
12966679dcc1SBarry Smith       v += 48;
12976679dcc1SBarry Smith 
12989371c9d4SSatish Balay       x1 = xb[4];
12999371c9d4SSatish Balay       x2 = xb[5];
13009371c9d4SSatish Balay       x3 = xb[6];
13019371c9d4SSatish Balay       x4 = xb[7];
13026679dcc1SBarry Smith 
13036679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
13046679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
13056679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
13066679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
13076679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
13086679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
13096679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
13106679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
13116679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
13126679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
13136679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
13146679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
13156679dcc1SBarry Smith       v += 48;
13166679dcc1SBarry Smith 
13179371c9d4SSatish Balay       x1 = xb[8];
13189371c9d4SSatish Balay       x2 = xb[9];
13199371c9d4SSatish Balay       x3 = xb[10];
13209371c9d4SSatish Balay       x4 = xb[11];
13216679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
13226679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
13236679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
13246679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
13256679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
13266679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
13276679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
13286679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
13296679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
13306679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
13316679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
13326679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
13336679dcc1SBarry Smith       v += 48;
13346679dcc1SBarry Smith     }
13359371c9d4SSatish Balay     z[0]  = sum1;
13369371c9d4SSatish Balay     z[1]  = sum2;
13379371c9d4SSatish Balay     z[2]  = sum3;
13389371c9d4SSatish Balay     z[3]  = sum4;
13399371c9d4SSatish Balay     z[4]  = sum5;
13409371c9d4SSatish Balay     z[5]  = sum6;
13419371c9d4SSatish Balay     z[6]  = sum7;
13429371c9d4SSatish Balay     z[7]  = sum8;
13439371c9d4SSatish Balay     z[8]  = sum9;
13449371c9d4SSatish Balay     z[9]  = sum10;
13459371c9d4SSatish Balay     z[10] = sum11;
13469371c9d4SSatish Balay     z[11] = sum12;
13476679dcc1SBarry Smith     if (!usecprow) {
13486679dcc1SBarry Smith       y += 12;
13496679dcc1SBarry Smith       z += 12;
13506679dcc1SBarry Smith     }
13516679dcc1SBarry Smith   }
13529566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
13539566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
13549566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
13556679dcc1SBarry Smith   PetscFunctionReturn(0);
13566679dcc1SBarry Smith }
13576679dcc1SBarry Smith 
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
/*
  MatMult_SeqBAIJ_12_AVX2 - Computes zz = A * xx for a SeqBAIJ matrix with 12x12 blocks
  using 256-bit FMA intrinsics.

  The 12 entries of a block row of the result are held in three __m256d accumulators
  (entries 0-3, 4-7, 8-11). For every block, each of its 12 columns is loaded as three
  unaligned 4-wide vectors and multiplied by the broadcast matching entry of xx.

  Input:
    A  - the matrix (A->data is a Mat_SeqBAIJ)
    xx - the vector to multiply

  Output:
    zz - the product; fully overwritten
*/
PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
{
  Mat_SeqBAIJ       *a        = (Mat_SeqBAIJ *)A->data;
  const PetscBool    usecprow = a->compressedrow.use;
  const PetscInt     bs = 12, bs2 = 144;
  const PetscInt    *idx = a->j, *ii, *ridx = NULL; /* idx advances across all rows in storage order */
  const MatScalar   *v   = a->a, *col;              /* v advances one block (bs2 values) at a time */
  const PetscScalar *x, *work;
  PetscScalar       *zarray, *z;
  PetscInt           mbs, i, j, c, n;
  __m256d            acc0, acc1, acc2, xk;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  if (usecprow) {
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
    /* only the block rows listed in ridx are stored below; everything else stays zero */
    PetscCall(PetscArrayzero(zarray, bs * a->mbs));
  } else {
    mbs = a->mbs;
    ii  = a->i;
  }

  for (i = 0; i < mbs; i++) {
    acc0 = _mm256_setzero_pd();
    acc1 = _mm256_setzero_pd();
    acc2 = _mm256_setzero_pd();

    n = ii[i + 1] - ii[i];
    for (j = 0; j < n; j++) {
      work = x + bs * (*idx++);

      /* column c of the block starts at v + bs*c and is scaled by work[c] */
      for (c = 0; c < bs; c++) {
        col  = v + bs * c;
        xk   = _mm256_set1_pd(work[c]);
        acc0 = _mm256_fmadd_pd(_mm256_loadu_pd(col), xk, acc0);
        acc1 = _mm256_fmadd_pd(_mm256_loadu_pd(col + 4), xk, acc1);
        acc2 = _mm256_fmadd_pd(_mm256_loadu_pd(col + 8), xk, acc2);
      }
      v += bs2;
    }

    /* compressed storage scatters through ridx; otherwise block row i lands at offset bs*i */
    z = zarray + bs * (usecprow ? ridx[i] : i);
    _mm256_storeu_pd(&z[0], acc0);
    _mm256_storeu_pd(&z[4], acc1);
    _mm256_storeu_pd(&z[8], acc2);
  }

  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
  PetscFunctionReturn(0);
}
#endif
15216679dcc1SBarry Smith 
15228ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1523832cc040SShri Abhyankar /* Default MatMult for block size 15 */
1524*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1525*d71ae5a4SJacob Faibussowitsch {
15268ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1527f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
15288ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
152953ef36baSBarry Smith   PetscScalar       *zarray, xv;
15308ab949d8SShri Abhyankar   const MatScalar   *v;
15318ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
15327c565772SBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1533ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
15348ab949d8SShri Abhyankar 
15358ab949d8SShri Abhyankar   PetscFunctionBegin;
15369566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
15379566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
15388ab949d8SShri Abhyankar 
15398ab949d8SShri Abhyankar   v = a->a;
15408ab949d8SShri Abhyankar   if (usecprow) {
15418ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
15428ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
15438ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
15449566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
15458ab949d8SShri Abhyankar   } else {
15468ab949d8SShri Abhyankar     mbs = a->mbs;
15478ab949d8SShri Abhyankar     ii  = a->i;
15488ab949d8SShri Abhyankar     z   = zarray;
15498ab949d8SShri Abhyankar   }
15508ab949d8SShri Abhyankar 
15518ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
15528ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
15538ab949d8SShri Abhyankar     idx   = ij + ii[i];
15549371c9d4SSatish Balay     sum1  = 0.0;
15559371c9d4SSatish Balay     sum2  = 0.0;
15569371c9d4SSatish Balay     sum3  = 0.0;
15579371c9d4SSatish Balay     sum4  = 0.0;
15589371c9d4SSatish Balay     sum5  = 0.0;
15599371c9d4SSatish Balay     sum6  = 0.0;
15609371c9d4SSatish Balay     sum7  = 0.0;
15619371c9d4SSatish Balay     sum8  = 0.0;
15629371c9d4SSatish Balay     sum9  = 0.0;
15639371c9d4SSatish Balay     sum10 = 0.0;
15649371c9d4SSatish Balay     sum11 = 0.0;
15659371c9d4SSatish Balay     sum12 = 0.0;
15669371c9d4SSatish Balay     sum13 = 0.0;
15679371c9d4SSatish Balay     sum14 = 0.0;
15689371c9d4SSatish Balay     sum15 = 0.0;
15698ab949d8SShri Abhyankar 
15708ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
15718ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
15728ab949d8SShri Abhyankar 
15738ab949d8SShri Abhyankar       for (k = 0; k < 15; k++) {
157453ef36baSBarry Smith         xv = xb[k];
157553ef36baSBarry Smith         sum1 += v[0] * xv;
157653ef36baSBarry Smith         sum2 += v[1] * xv;
157753ef36baSBarry Smith         sum3 += v[2] * xv;
157853ef36baSBarry Smith         sum4 += v[3] * xv;
157953ef36baSBarry Smith         sum5 += v[4] * xv;
158053ef36baSBarry Smith         sum6 += v[5] * xv;
158153ef36baSBarry Smith         sum7 += v[6] * xv;
158253ef36baSBarry Smith         sum8 += v[7] * xv;
158353ef36baSBarry Smith         sum9 += v[8] * xv;
158453ef36baSBarry Smith         sum10 += v[9] * xv;
158553ef36baSBarry Smith         sum11 += v[10] * xv;
158653ef36baSBarry Smith         sum12 += v[11] * xv;
158753ef36baSBarry Smith         sum13 += v[12] * xv;
158853ef36baSBarry Smith         sum14 += v[13] * xv;
158953ef36baSBarry Smith         sum15 += v[14] * xv;
15908ab949d8SShri Abhyankar         v += 15;
15918ab949d8SShri Abhyankar       }
15928ab949d8SShri Abhyankar     }
15938ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
15949371c9d4SSatish Balay     z[0]  = sum1;
15959371c9d4SSatish Balay     z[1]  = sum2;
15969371c9d4SSatish Balay     z[2]  = sum3;
15979371c9d4SSatish Balay     z[3]  = sum4;
15989371c9d4SSatish Balay     z[4]  = sum5;
15999371c9d4SSatish Balay     z[5]  = sum6;
16009371c9d4SSatish Balay     z[6]  = sum7;
16019371c9d4SSatish Balay     z[7]  = sum8;
16029371c9d4SSatish Balay     z[8]  = sum9;
16039371c9d4SSatish Balay     z[9]  = sum10;
16049371c9d4SSatish Balay     z[10] = sum11;
16059371c9d4SSatish Balay     z[11] = sum12;
16069371c9d4SSatish Balay     z[12] = sum13;
16079371c9d4SSatish Balay     z[13] = sum14;
16089371c9d4SSatish Balay     z[14] = sum15;
16098ab949d8SShri Abhyankar 
16108ab949d8SShri Abhyankar     if (!usecprow) z += 15;
16118ab949d8SShri Abhyankar   }
16128ab949d8SShri Abhyankar 
16139566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
16149566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
16159566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
16168ab949d8SShri Abhyankar   PetscFunctionReturn(0);
16178ab949d8SShri Abhyankar }
16188ab949d8SShri Abhyankar 
16198ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1620*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1621*d71ae5a4SJacob Faibussowitsch {
16228ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1623f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
16248ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
16250b8f6341SShri Abhyankar   PetscScalar        x1, x2, x3, x4, *zarray;
16268ab949d8SShri Abhyankar   const MatScalar   *v;
16278ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
16287c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1629ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
16308ab949d8SShri Abhyankar 
16318ab949d8SShri Abhyankar   PetscFunctionBegin;
16329566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
16339566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
16348ab949d8SShri Abhyankar 
16358ab949d8SShri Abhyankar   v = a->a;
16368ab949d8SShri Abhyankar   if (usecprow) {
16378ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
16388ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
16398ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
16409566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
16418ab949d8SShri Abhyankar   } else {
16428ab949d8SShri Abhyankar     mbs = a->mbs;
16438ab949d8SShri Abhyankar     ii  = a->i;
16448ab949d8SShri Abhyankar     z   = zarray;
16458ab949d8SShri Abhyankar   }
16468ab949d8SShri Abhyankar 
16478ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
16488ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
16498ab949d8SShri Abhyankar     idx   = ij + ii[i];
16509371c9d4SSatish Balay     sum1  = 0.0;
16519371c9d4SSatish Balay     sum2  = 0.0;
16529371c9d4SSatish Balay     sum3  = 0.0;
16539371c9d4SSatish Balay     sum4  = 0.0;
16549371c9d4SSatish Balay     sum5  = 0.0;
16559371c9d4SSatish Balay     sum6  = 0.0;
16569371c9d4SSatish Balay     sum7  = 0.0;
16579371c9d4SSatish Balay     sum8  = 0.0;
16589371c9d4SSatish Balay     sum9  = 0.0;
16599371c9d4SSatish Balay     sum10 = 0.0;
16609371c9d4SSatish Balay     sum11 = 0.0;
16619371c9d4SSatish Balay     sum12 = 0.0;
16629371c9d4SSatish Balay     sum13 = 0.0;
16639371c9d4SSatish Balay     sum14 = 0.0;
16649371c9d4SSatish Balay     sum15 = 0.0;
16658ab949d8SShri Abhyankar 
16668ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
16678ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
16689371c9d4SSatish Balay       x1 = xb[0];
16699371c9d4SSatish Balay       x2 = xb[1];
16709371c9d4SSatish Balay       x3 = xb[2];
16719371c9d4SSatish Balay       x4 = xb[3];
16728ab949d8SShri Abhyankar 
16738ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16748ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16758ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16768ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16778ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16788ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
16798ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
16808ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
16818ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
16828ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
16838ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
16848ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
16858ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
16868ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
16878ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
16888ab949d8SShri Abhyankar 
16898ab949d8SShri Abhyankar       v += 60;
16908ab949d8SShri Abhyankar 
16919371c9d4SSatish Balay       x1 = xb[4];
16929371c9d4SSatish Balay       x2 = xb[5];
16939371c9d4SSatish Balay       x3 = xb[6];
16949371c9d4SSatish Balay       x4 = xb[7];
16958ab949d8SShri Abhyankar 
16968ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16978ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16988ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16998ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
17008ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
17018ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
17028ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
17038ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
17048ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
17058ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
17068ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
17078ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
17088ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
17098ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
17108ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
17118ab949d8SShri Abhyankar       v += 60;
17128ab949d8SShri Abhyankar 
17139371c9d4SSatish Balay       x1 = xb[8];
17149371c9d4SSatish Balay       x2 = xb[9];
17159371c9d4SSatish Balay       x3 = xb[10];
17169371c9d4SSatish Balay       x4 = xb[11];
17170b8f6341SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
17180b8f6341SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
17190b8f6341SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
17200b8f6341SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
17210b8f6341SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
17220b8f6341SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
17230b8f6341SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
17240b8f6341SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
17250b8f6341SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
17260b8f6341SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
17270b8f6341SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
17280b8f6341SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
17290b8f6341SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
17300b8f6341SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
17310b8f6341SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
17320b8f6341SShri Abhyankar       v += 60;
17330b8f6341SShri Abhyankar 
17349371c9d4SSatish Balay       x1 = xb[12];
17359371c9d4SSatish Balay       x2 = xb[13];
17369371c9d4SSatish Balay       x3 = xb[14];
17378ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
17388ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
17398ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
17408ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
17418ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
17428ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
17438ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
17448ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
17458ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
17468ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
17478ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
17488ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
17498ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
17508ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
17518ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
17528ab949d8SShri Abhyankar       v += 45;
17538ab949d8SShri Abhyankar     }
17548ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
17559371c9d4SSatish Balay     z[0]  = sum1;
17569371c9d4SSatish Balay     z[1]  = sum2;
17579371c9d4SSatish Balay     z[2]  = sum3;
17589371c9d4SSatish Balay     z[3]  = sum4;
17599371c9d4SSatish Balay     z[4]  = sum5;
17609371c9d4SSatish Balay     z[5]  = sum6;
17619371c9d4SSatish Balay     z[6]  = sum7;
17629371c9d4SSatish Balay     z[7]  = sum8;
17639371c9d4SSatish Balay     z[8]  = sum9;
17649371c9d4SSatish Balay     z[9]  = sum10;
17659371c9d4SSatish Balay     z[10] = sum11;
17669371c9d4SSatish Balay     z[11] = sum12;
17679371c9d4SSatish Balay     z[12] = sum13;
17689371c9d4SSatish Balay     z[13] = sum14;
17699371c9d4SSatish Balay     z[14] = sum15;
17708ab949d8SShri Abhyankar 
17718ab949d8SShri Abhyankar     if (!usecprow) z += 15;
17728ab949d8SShri Abhyankar   }
17738ab949d8SShri Abhyankar 
17749566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
17759566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
17769566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
17778ab949d8SShri Abhyankar   PetscFunctionReturn(0);
17788ab949d8SShri Abhyankar }
17798ab949d8SShri Abhyankar 
17808ab949d8SShri Abhyankar /*
   MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7

   Computes zz = A * xx with a hard-coded block size of 15: every stored block
   supplies 225 consecutive values of a->a, and xx / zz are addressed in strides
   of 15 scalars.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ
     xx - vector multiplied by A (accessed read-only)

   Output:
     zz - result vector (accessed write-only)

   Returns 0 through PetscFunctionReturn(); failures of the PETSc calls are
   propagated by PetscCall().
*/
1781*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1782*d71ae5a4SJacob Faibussowitsch {
17838ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1784f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
17858ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
17860b8f6341SShri Abhyankar   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
17878ab949d8SShri Abhyankar   const MatScalar   *v;
17888ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
17897c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1790ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
17918ab949d8SShri Abhyankar 
17928ab949d8SShri Abhyankar   PetscFunctionBegin;
17939566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
17949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
17958ab949d8SShri Abhyankar 
  /* v walks the block values sequentially; it is never reset inside the row loop */
17968ab949d8SShri Abhyankar   v = a->a;
17978ab949d8SShri Abhyankar   if (usecprow) {
    /* compressed-row path: only the block rows named by ridx[] are written in the loop below,
       so the whole output (15 * a->mbs scalars) is zeroed first */
17988ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
17998ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
18008ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
18019566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
18028ab949d8SShri Abhyankar   } else {
    /* regular path: z advances by 15 after every block row */
18038ab949d8SShri Abhyankar     mbs = a->mbs;
18048ab949d8SShri Abhyankar     ii  = a->i;
18058ab949d8SShri Abhyankar     z   = zarray;
18068ab949d8SShri Abhyankar   }
18078ab949d8SShri Abhyankar 
18088ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
    /* n = number of blocks in this block row, idx = their block-column indices */
18098ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
18108ab949d8SShri Abhyankar     idx   = ij + ii[i];
18119371c9d4SSatish Balay     sum1  = 0.0;
18129371c9d4SSatish Balay     sum2  = 0.0;
18139371c9d4SSatish Balay     sum3  = 0.0;
18149371c9d4SSatish Balay     sum4  = 0.0;
18159371c9d4SSatish Balay     sum5  = 0.0;
18169371c9d4SSatish Balay     sum6  = 0.0;
18179371c9d4SSatish Balay     sum7  = 0.0;
18189371c9d4SSatish Balay     sum8  = 0.0;
18199371c9d4SSatish Balay     sum9  = 0.0;
18209371c9d4SSatish Balay     sum10 = 0.0;
18219371c9d4SSatish Balay     sum11 = 0.0;
18229371c9d4SSatish Balay     sum12 = 0.0;
18239371c9d4SSatish Balay     sum13 = 0.0;
18249371c9d4SSatish Balay     sum14 = 0.0;
18259371c9d4SSatish Balay     sum15 = 0.0;
18268ab949d8SShri Abhyankar 
18278ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
      /* xb = the 15 entries of x that belong to block column idx[j] */
18288ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
      /* first set: block columns 0..7 */
18299371c9d4SSatish Balay       x1 = xb[0];
18309371c9d4SSatish Balay       x2 = xb[1];
18319371c9d4SSatish Balay       x3 = xb[2];
18329371c9d4SSatish Balay       x4 = xb[3];
18339371c9d4SSatish Balay       x5 = xb[4];
18349371c9d4SSatish Balay       x6 = xb[5];
18359371c9d4SSatish Balay       x7 = xb[6];
18360b8f6341SShri Abhyankar       x8 = xb[7];
18378ab949d8SShri Abhyankar 
      /* block entry (row r, column c) is read at v[15 * c + r]; sum<r+1> accumulates row r */
18388ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
18398ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
18408ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
18418ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
18428ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
18438ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
18448ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
18458ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
18468ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
18478ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
18488ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
18498ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
18508ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
18518ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
18528ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
      /* step over the 8 columns (8 * 15 values) consumed above */
18538ab949d8SShri Abhyankar       v += 120;
18548ab949d8SShri Abhyankar 
      /* second set: block columns 8..14; x1..x7 are reused for them */
18559371c9d4SSatish Balay       x1 = xb[8];
18569371c9d4SSatish Balay       x2 = xb[9];
18579371c9d4SSatish Balay       x3 = xb[10];
18589371c9d4SSatish Balay       x4 = xb[11];
18599371c9d4SSatish Balay       x5 = xb[12];
18609371c9d4SSatish Balay       x6 = xb[13];
18619371c9d4SSatish Balay       x7 = xb[14];
18620b8f6341SShri Abhyankar 
18638ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
18648ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
18658ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
18668ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
18678ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
18688ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
18698ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
18708ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
18718ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
18728ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
18738ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
18748ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
18758ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
18768ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
18778ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
      /* 7 * 15 values; together with the 120 above v has moved one full 225-value block */
18788ab949d8SShri Abhyankar       v += 105;
18798ab949d8SShri Abhyankar     }
    /* compressed-row path: ridx[i] gives the block row of zz that this compressed row maps to */
18808ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
18819371c9d4SSatish Balay     z[0]  = sum1;
18829371c9d4SSatish Balay     z[1]  = sum2;
18839371c9d4SSatish Balay     z[2]  = sum3;
18849371c9d4SSatish Balay     z[3]  = sum4;
18859371c9d4SSatish Balay     z[4]  = sum5;
18869371c9d4SSatish Balay     z[5]  = sum6;
18879371c9d4SSatish Balay     z[6]  = sum7;
18889371c9d4SSatish Balay     z[7]  = sum8;
18899371c9d4SSatish Balay     z[8]  = sum9;
18909371c9d4SSatish Balay     z[9]  = sum10;
18919371c9d4SSatish Balay     z[10] = sum11;
18929371c9d4SSatish Balay     z[11] = sum12;
18939371c9d4SSatish Balay     z[12] = sum13;
18949371c9d4SSatish Balay     z[13] = sum14;
18959371c9d4SSatish Balay     z[14] = sum15;
18968ab949d8SShri Abhyankar 
18978ab949d8SShri Abhyankar     if (!usecprow) z += 15;
18988ab949d8SShri Abhyankar   }
18998ab949d8SShri Abhyankar 
19009566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
19019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
  /* 450 = 2 * 15 * 15 per block; NOTE(review): presumably a->nz counts stored blocks - confirm */
19029566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
19038ab949d8SShri Abhyankar   PetscFunctionReturn(0);
19048ab949d8SShri Abhyankar }
19058ab949d8SShri Abhyankar 
19068ab949d8SShri Abhyankar /*
   MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once

   Computes zz = A * xx with a hard-coded block size of 15. For every stored
   block all 15 entries of the matching piece of xx are loaded into scalars and
   each of the 15 row sums is updated by one 15-term expression.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ
     xx - vector multiplied by A (accessed read-only)

   Output:
     zz - result vector (accessed write-only)

   Returns 0 through PetscFunctionReturn(); failures of the PETSc calls are
   propagated by PetscCall().
*/
1907*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1908*d71ae5a4SJacob Faibussowitsch {
19098ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1910f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
19118ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
19128ab949d8SShri Abhyankar   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
19138ab949d8SShri Abhyankar   const MatScalar   *v;
19148ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
19157c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1916ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
19178ab949d8SShri Abhyankar 
19188ab949d8SShri Abhyankar   PetscFunctionBegin;
19199566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
19209566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
19218ab949d8SShri Abhyankar 
  /* v walks the block values sequentially; it is never reset inside the row loop */
19228ab949d8SShri Abhyankar   v = a->a;
19238ab949d8SShri Abhyankar   if (usecprow) {
    /* compressed-row path: only the block rows named by ridx[] are written in the loop below,
       so the whole output (15 * a->mbs scalars) is zeroed first */
19248ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
19258ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
19268ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
19279566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
19288ab949d8SShri Abhyankar   } else {
    /* regular path: z advances by 15 after every block row */
19298ab949d8SShri Abhyankar     mbs = a->mbs;
19308ab949d8SShri Abhyankar     ii  = a->i;
19318ab949d8SShri Abhyankar     z   = zarray;
19328ab949d8SShri Abhyankar   }
19338ab949d8SShri Abhyankar 
19348ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
    /* n = number of blocks in this block row, idx = their block-column indices */
19358ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
19368ab949d8SShri Abhyankar     idx   = ij + ii[i];
19379371c9d4SSatish Balay     sum1  = 0.0;
19389371c9d4SSatish Balay     sum2  = 0.0;
19399371c9d4SSatish Balay     sum3  = 0.0;
19409371c9d4SSatish Balay     sum4  = 0.0;
19419371c9d4SSatish Balay     sum5  = 0.0;
19429371c9d4SSatish Balay     sum6  = 0.0;
19439371c9d4SSatish Balay     sum7  = 0.0;
19449371c9d4SSatish Balay     sum8  = 0.0;
19459371c9d4SSatish Balay     sum9  = 0.0;
19469371c9d4SSatish Balay     sum10 = 0.0;
19479371c9d4SSatish Balay     sum11 = 0.0;
19489371c9d4SSatish Balay     sum12 = 0.0;
19499371c9d4SSatish Balay     sum13 = 0.0;
19509371c9d4SSatish Balay     sum14 = 0.0;
19519371c9d4SSatish Balay     sum15 = 0.0;
19528ab949d8SShri Abhyankar 
19538ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
      /* xb = the 15 entries of x that belong to block column idx[j] */
19548ab949d8SShri Abhyankar       xb  = x + 15 * (idx[j]);
19559371c9d4SSatish Balay       x1  = xb[0];
19569371c9d4SSatish Balay       x2  = xb[1];
19579371c9d4SSatish Balay       x3  = xb[2];
19589371c9d4SSatish Balay       x4  = xb[3];
19599371c9d4SSatish Balay       x5  = xb[4];
19609371c9d4SSatish Balay       x6  = xb[5];
19619371c9d4SSatish Balay       x7  = xb[6];
19629371c9d4SSatish Balay       x8  = xb[7];
19639371c9d4SSatish Balay       x9  = xb[8];
19649371c9d4SSatish Balay       x10 = xb[9];
19659371c9d4SSatish Balay       x11 = xb[10];
19669371c9d4SSatish Balay       x12 = xb[11];
19679371c9d4SSatish Balay       x13 = xb[12];
19689371c9d4SSatish Balay       x14 = xb[13];
19699371c9d4SSatish Balay       x15 = xb[14];
19708ab949d8SShri Abhyankar 
      /* block entry (row r, column c) is read at v[15 * c + r]; sum<r+1> accumulates row r over all 15 columns */
19718ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
19728ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
19738ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
19748ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
19758ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
19768ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
19778ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
19788ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
19798ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
19808ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
19818ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
19828ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
19838ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
19848ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
19858ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
      /* one full 15 x 15 block consumed */
19868ab949d8SShri Abhyankar       v += 225;
19878ab949d8SShri Abhyankar     }
    /* compressed-row path: ridx[i] gives the block row of zz that this compressed row maps to */
19888ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
19899371c9d4SSatish Balay     z[0]  = sum1;
19909371c9d4SSatish Balay     z[1]  = sum2;
19919371c9d4SSatish Balay     z[2]  = sum3;
19929371c9d4SSatish Balay     z[3]  = sum4;
19939371c9d4SSatish Balay     z[4]  = sum5;
19949371c9d4SSatish Balay     z[5]  = sum6;
19959371c9d4SSatish Balay     z[6]  = sum7;
19969371c9d4SSatish Balay     z[7]  = sum8;
19979371c9d4SSatish Balay     z[8]  = sum9;
19989371c9d4SSatish Balay     z[9]  = sum10;
19999371c9d4SSatish Balay     z[10] = sum11;
20009371c9d4SSatish Balay     z[11] = sum12;
20019371c9d4SSatish Balay     z[12] = sum13;
20029371c9d4SSatish Balay     z[13] = sum14;
20039371c9d4SSatish Balay     z[14] = sum15;
20048ab949d8SShri Abhyankar 
20058ab949d8SShri Abhyankar     if (!usecprow) z += 15;
20068ab949d8SShri Abhyankar   }
20078ab949d8SShri Abhyankar 
20089566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
20099566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
  /* 450 = 2 * 15 * 15 per block; NOTE(review): presumably a->nz counts stored blocks - confirm */
20109566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
20118ab949d8SShri Abhyankar   PetscFunctionReturn(0);
20128ab949d8SShri Abhyankar }
20138ab949d8SShri Abhyankar 
20143f1db9ecSBarry Smith /*
20153f1db9ecSBarry Smith     This will not work with MatScalar == float because it calls the BLAS
20163f1db9ecSBarry Smith */
2017*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
2018*d71ae5a4SJacob Faibussowitsch {
20192d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2020f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
2021d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2022d9ca1df4SBarry Smith   const MatScalar   *v;
2023d9ca1df4SBarry Smith   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2024d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2025d9ca1df4SBarry Smith   PetscInt           ncols, k;
2026ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
20272d61bbb3SSatish Balay 
20282d61bbb3SSatish Balay   PetscFunctionBegin;
20299566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
20309566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
20312d61bbb3SSatish Balay 
20322d61bbb3SSatish Balay   idx = a->j;
20332d61bbb3SSatish Balay   v   = a->a;
203426e093fcSHong Zhang   if (usecprow) {
203526e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
203626e093fcSHong Zhang     ii   = a->compressedrow.i;
20377b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
20389566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
203926e093fcSHong Zhang   } else {
204026e093fcSHong Zhang     mbs = a->mbs;
20412d61bbb3SSatish Balay     ii  = a->i;
204226e093fcSHong Zhang     z   = zarray;
204326e093fcSHong Zhang   }
2044218c64b6SSatish Balay 
20452d61bbb3SSatish Balay   if (!a->mult_work) {
2046d0f46423SBarry Smith     k = PetscMax(A->rmap->n, A->cmap->n);
20479566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
20482d61bbb3SSatish Balay   }
20492d61bbb3SSatish Balay   work = a->mult_work;
20502d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
20519371c9d4SSatish Balay     n = ii[1] - ii[0];
20529371c9d4SSatish Balay     ii++;
20532d61bbb3SSatish Balay     ncols = n * bs;
20542d61bbb3SSatish Balay     workt = work;
20552d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
20562d61bbb3SSatish Balay       xb = x + bs * (*idx++);
20572d61bbb3SSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
20582d61bbb3SSatish Balay       workt += bs;
20592d61bbb3SSatish Balay     }
20607b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs * ridx[i];
206196b95a6bSBarry Smith     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
20622d61bbb3SSatish Balay     v += n * bs2;
206326e093fcSHong Zhang     if (!usecprow) z += bs;
20642d61bbb3SSatish Balay   }
20659566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
20669566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
20679566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
20682d61bbb3SSatish Balay   PetscFunctionReturn(0);
20692d61bbb3SSatish Balay }
20702d61bbb3SSatish Balay 
2071*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2072*d71ae5a4SJacob Faibussowitsch {
20732d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2074122f12eaSBarry Smith   const PetscScalar *x;
2075122f12eaSBarry Smith   PetscScalar       *y, *z, sum;
2076122f12eaSBarry Smith   const MatScalar   *v;
20777c565772SBarry Smith   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2078122f12eaSBarry Smith   const PetscInt    *idx, *ii;
2079ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
20802d61bbb3SSatish Balay 
20812d61bbb3SSatish Balay   PetscFunctionBegin;
20829566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
20839566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
20842d61bbb3SSatish Balay 
20852d61bbb3SSatish Balay   idx = a->j;
20862d61bbb3SSatish Balay   v   = a->a;
208726e093fcSHong Zhang   if (usecprow) {
208848a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
208926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
209026e093fcSHong Zhang     ii   = a->compressedrow.i;
20917b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
209226e093fcSHong Zhang   } else {
20932d61bbb3SSatish Balay     ii = a->i;
209426e093fcSHong Zhang   }
20952d61bbb3SSatish Balay 
20962d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2097122f12eaSBarry Smith     n = ii[1] - ii[0];
2098122f12eaSBarry Smith     ii++;
209926e093fcSHong Zhang     if (!usecprow) {
2100122f12eaSBarry Smith       sum = y[i];
2101122f12eaSBarry Smith     } else {
2102122f12eaSBarry Smith       sum = y[ridx[i]];
2103122f12eaSBarry Smith     }
2104444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2105444d8c10SJed Brown     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2106122f12eaSBarry Smith     PetscSparseDensePlusDot(sum, x, v, idx, n);
2107122f12eaSBarry Smith     v += n;
2108122f12eaSBarry Smith     idx += n;
2109122f12eaSBarry Smith     if (usecprow) {
2110122f12eaSBarry Smith       z[ridx[i]] = sum;
2111122f12eaSBarry Smith     } else {
2112122f12eaSBarry Smith       z[i] = sum;
211326e093fcSHong Zhang     }
21142d61bbb3SSatish Balay   }
21159566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
21169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
21179566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
21182d61bbb3SSatish Balay   PetscFunctionReturn(0);
21192d61bbb3SSatish Balay }
21202d61bbb3SSatish Balay 
2121*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2122*d71ae5a4SJacob Faibussowitsch {
21232d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2124f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2125d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
212626e093fcSHong Zhang   PetscScalar        x1, x2, *yarray, *zarray;
2127d9ca1df4SBarry Smith   const MatScalar   *v;
2128d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, j;
2129d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2130ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
21312d61bbb3SSatish Balay 
21322d61bbb3SSatish Balay   PetscFunctionBegin;
21339566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
21349566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
21352d61bbb3SSatish Balay 
21362d61bbb3SSatish Balay   idx = a->j;
21372d61bbb3SSatish Balay   v   = a->a;
213826e093fcSHong Zhang   if (usecprow) {
213948a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
214026e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
214126e093fcSHong Zhang     ii   = a->compressedrow.i;
21427b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
214326e093fcSHong Zhang   } else {
21442d61bbb3SSatish Balay     ii = a->i;
214526e093fcSHong Zhang     y  = yarray;
214626e093fcSHong Zhang     z  = zarray;
214726e093fcSHong Zhang   }
21482d61bbb3SSatish Balay 
21492d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
21509371c9d4SSatish Balay     n = ii[1] - ii[0];
21519371c9d4SSatish Balay     ii++;
215226e093fcSHong Zhang     if (usecprow) {
21537b2bb3b9SHong Zhang       z = zarray + 2 * ridx[i];
21547b2bb3b9SHong Zhang       y = yarray + 2 * ridx[i];
215526e093fcSHong Zhang     }
21569371c9d4SSatish Balay     sum1 = y[0];
21579371c9d4SSatish Balay     sum2 = y[1];
2158444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2159444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
21602d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
216126fbe8dcSKarl Rupp       xb = x + 2 * (*idx++);
216226fbe8dcSKarl Rupp       x1 = xb[0];
216326fbe8dcSKarl Rupp       x2 = xb[1];
216426fbe8dcSKarl Rupp 
21652d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[2] * x2;
21662d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[3] * x2;
21672d61bbb3SSatish Balay       v += 4;
21682d61bbb3SSatish Balay     }
21699371c9d4SSatish Balay     z[0] = sum1;
21709371c9d4SSatish Balay     z[1] = sum2;
217126e093fcSHong Zhang     if (!usecprow) {
21729371c9d4SSatish Balay       z += 2;
21739371c9d4SSatish Balay       y += 2;
21742d61bbb3SSatish Balay     }
217526e093fcSHong Zhang   }
21769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
21779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
21789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0 * a->nz));
21792d61bbb3SSatish Balay   PetscFunctionReturn(0);
21802d61bbb3SSatish Balay }
21812d61bbb3SSatish Balay 
2182*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2183*d71ae5a4SJacob Faibussowitsch {
21842d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2185f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2186d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2187d9ca1df4SBarry Smith   const MatScalar   *v;
2188d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2189d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2190ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
21912d61bbb3SSatish Balay 
21922d61bbb3SSatish Balay   PetscFunctionBegin;
21939566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
21949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
21952d61bbb3SSatish Balay 
21962d61bbb3SSatish Balay   idx = a->j;
21972d61bbb3SSatish Balay   v   = a->a;
219826e093fcSHong Zhang   if (usecprow) {
219948a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
220026e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
220126e093fcSHong Zhang     ii   = a->compressedrow.i;
22027b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
220326e093fcSHong Zhang   } else {
22042d61bbb3SSatish Balay     ii = a->i;
220526e093fcSHong Zhang     y  = yarray;
220626e093fcSHong Zhang     z  = zarray;
220726e093fcSHong Zhang   }
22082d61bbb3SSatish Balay 
22092d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
22109371c9d4SSatish Balay     n = ii[1] - ii[0];
22119371c9d4SSatish Balay     ii++;
221226e093fcSHong Zhang     if (usecprow) {
22137b2bb3b9SHong Zhang       z = zarray + 3 * ridx[i];
22147b2bb3b9SHong Zhang       y = yarray + 3 * ridx[i];
221526e093fcSHong Zhang     }
22169371c9d4SSatish Balay     sum1 = y[0];
22179371c9d4SSatish Balay     sum2 = y[1];
22189371c9d4SSatish Balay     sum3 = y[2];
2219444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2220444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
22212d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
22229371c9d4SSatish Balay       xb = x + 3 * (*idx++);
22239371c9d4SSatish Balay       x1 = xb[0];
22249371c9d4SSatish Balay       x2 = xb[1];
22259371c9d4SSatish Balay       x3 = xb[2];
22262d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
22272d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
22282d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
22292d61bbb3SSatish Balay       v += 9;
22302d61bbb3SSatish Balay     }
22319371c9d4SSatish Balay     z[0] = sum1;
22329371c9d4SSatish Balay     z[1] = sum2;
22339371c9d4SSatish Balay     z[2] = sum3;
223426e093fcSHong Zhang     if (!usecprow) {
22359371c9d4SSatish Balay       z += 3;
22369371c9d4SSatish Balay       y += 3;
22372d61bbb3SSatish Balay     }
223826e093fcSHong Zhang   }
22399566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
22409566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
22419566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * a->nz));
22422d61bbb3SSatish Balay   PetscFunctionReturn(0);
22432d61bbb3SSatish Balay }
22442d61bbb3SSatish Balay 
2245*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2246*d71ae5a4SJacob Faibussowitsch {
22472d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2248f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2249d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2250d9ca1df4SBarry Smith   const MatScalar   *v;
2251d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2252d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2253ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
22542d61bbb3SSatish Balay 
22552d61bbb3SSatish Balay   PetscFunctionBegin;
22569566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
22579566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
22582d61bbb3SSatish Balay 
22592d61bbb3SSatish Balay   idx = a->j;
22602d61bbb3SSatish Balay   v   = a->a;
226126e093fcSHong Zhang   if (usecprow) {
226248a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
226326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
226426e093fcSHong Zhang     ii   = a->compressedrow.i;
22657b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
226626e093fcSHong Zhang   } else {
22672d61bbb3SSatish Balay     ii = a->i;
226826e093fcSHong Zhang     y  = yarray;
226926e093fcSHong Zhang     z  = zarray;
227026e093fcSHong Zhang   }
22712d61bbb3SSatish Balay 
22722d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
22739371c9d4SSatish Balay     n = ii[1] - ii[0];
22749371c9d4SSatish Balay     ii++;
227526e093fcSHong Zhang     if (usecprow) {
22767b2bb3b9SHong Zhang       z = zarray + 4 * ridx[i];
22777b2bb3b9SHong Zhang       y = yarray + 4 * ridx[i];
227826e093fcSHong Zhang     }
22799371c9d4SSatish Balay     sum1 = y[0];
22809371c9d4SSatish Balay     sum2 = y[1];
22819371c9d4SSatish Balay     sum3 = y[2];
22829371c9d4SSatish Balay     sum4 = y[3];
2283444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2284444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
22852d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
22862d61bbb3SSatish Balay       xb = x + 4 * (*idx++);
22879371c9d4SSatish Balay       x1 = xb[0];
22889371c9d4SSatish Balay       x2 = xb[1];
22899371c9d4SSatish Balay       x3 = xb[2];
22909371c9d4SSatish Balay       x4 = xb[3];
22912d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
22922d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
22932d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
22942d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
22952d61bbb3SSatish Balay       v += 16;
22962d61bbb3SSatish Balay     }
22979371c9d4SSatish Balay     z[0] = sum1;
22989371c9d4SSatish Balay     z[1] = sum2;
22999371c9d4SSatish Balay     z[2] = sum3;
23009371c9d4SSatish Balay     z[3] = sum4;
230126e093fcSHong Zhang     if (!usecprow) {
23029371c9d4SSatish Balay       z += 4;
23039371c9d4SSatish Balay       y += 4;
23042d61bbb3SSatish Balay     }
230526e093fcSHong Zhang   }
23069566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
23079566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
23089566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * a->nz));
23092d61bbb3SSatish Balay   PetscFunctionReturn(0);
23102d61bbb3SSatish Balay }
23112d61bbb3SSatish Balay 
2312*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2313*d71ae5a4SJacob Faibussowitsch {
23142d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2315f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2316d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
231726e093fcSHong Zhang   PetscScalar       *yarray, *zarray;
2318d9ca1df4SBarry Smith   const MatScalar   *v;
2319d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2320d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2321ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
23222d61bbb3SSatish Balay 
23232d61bbb3SSatish Balay   PetscFunctionBegin;
23249566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
23259566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
23262d61bbb3SSatish Balay 
23272d61bbb3SSatish Balay   idx = a->j;
23282d61bbb3SSatish Balay   v   = a->a;
232926e093fcSHong Zhang   if (usecprow) {
233048a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
233126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
233226e093fcSHong Zhang     ii   = a->compressedrow.i;
23337b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
233426e093fcSHong Zhang   } else {
23352d61bbb3SSatish Balay     ii = a->i;
233626e093fcSHong Zhang     y  = yarray;
233726e093fcSHong Zhang     z  = zarray;
233826e093fcSHong Zhang   }
23392d61bbb3SSatish Balay 
23402d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
23419371c9d4SSatish Balay     n = ii[1] - ii[0];
23429371c9d4SSatish Balay     ii++;
234326e093fcSHong Zhang     if (usecprow) {
23447b2bb3b9SHong Zhang       z = zarray + 5 * ridx[i];
23457b2bb3b9SHong Zhang       y = yarray + 5 * ridx[i];
234626e093fcSHong Zhang     }
23479371c9d4SSatish Balay     sum1 = y[0];
23489371c9d4SSatish Balay     sum2 = y[1];
23499371c9d4SSatish Balay     sum3 = y[2];
23509371c9d4SSatish Balay     sum4 = y[3];
23519371c9d4SSatish Balay     sum5 = y[4];
2352444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2353444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
23542d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
23552d61bbb3SSatish Balay       xb = x + 5 * (*idx++);
23569371c9d4SSatish Balay       x1 = xb[0];
23579371c9d4SSatish Balay       x2 = xb[1];
23589371c9d4SSatish Balay       x3 = xb[2];
23599371c9d4SSatish Balay       x4 = xb[3];
23609371c9d4SSatish Balay       x5 = xb[4];
23612d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
23622d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
23632d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
23642d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
23652d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
23662d61bbb3SSatish Balay       v += 25;
23672d61bbb3SSatish Balay     }
23689371c9d4SSatish Balay     z[0] = sum1;
23699371c9d4SSatish Balay     z[1] = sum2;
23709371c9d4SSatish Balay     z[2] = sum3;
23719371c9d4SSatish Balay     z[3] = sum4;
23729371c9d4SSatish Balay     z[4] = sum5;
237326e093fcSHong Zhang     if (!usecprow) {
23749371c9d4SSatish Balay       z += 5;
23759371c9d4SSatish Balay       y += 5;
23762d61bbb3SSatish Balay     }
237726e093fcSHong Zhang   }
23789566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
23799566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
23809566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * a->nz));
23812d61bbb3SSatish Balay   PetscFunctionReturn(0);
23822d61bbb3SSatish Balay }
2383c2916339SPierre Jolivet 
2384*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2385*d71ae5a4SJacob Faibussowitsch {
238615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2387f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2388d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
238926e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2390d9ca1df4SBarry Smith   const MatScalar   *v;
2391d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2392d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2393ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
239415091d37SBarry Smith 
239515091d37SBarry Smith   PetscFunctionBegin;
23969566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
23979566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
239815091d37SBarry Smith 
239915091d37SBarry Smith   idx = a->j;
240015091d37SBarry Smith   v   = a->a;
240126e093fcSHong Zhang   if (usecprow) {
240248a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
240326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
240426e093fcSHong Zhang     ii   = a->compressedrow.i;
24057b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
240626e093fcSHong Zhang   } else {
240715091d37SBarry Smith     ii = a->i;
240826e093fcSHong Zhang     y  = yarray;
240926e093fcSHong Zhang     z  = zarray;
241026e093fcSHong Zhang   }
241115091d37SBarry Smith 
241215091d37SBarry Smith   for (i = 0; i < mbs; i++) {
24139371c9d4SSatish Balay     n = ii[1] - ii[0];
24149371c9d4SSatish Balay     ii++;
241526e093fcSHong Zhang     if (usecprow) {
24167b2bb3b9SHong Zhang       z = zarray + 6 * ridx[i];
24177b2bb3b9SHong Zhang       y = yarray + 6 * ridx[i];
241826e093fcSHong Zhang     }
24199371c9d4SSatish Balay     sum1 = y[0];
24209371c9d4SSatish Balay     sum2 = y[1];
24219371c9d4SSatish Balay     sum3 = y[2];
24229371c9d4SSatish Balay     sum4 = y[3];
24239371c9d4SSatish Balay     sum5 = y[4];
24249371c9d4SSatish Balay     sum6 = y[5];
2425444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2426444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
242715091d37SBarry Smith     for (j = 0; j < n; j++) {
24283b95cb0eSSatish Balay       xb = x + 6 * (*idx++);
24299371c9d4SSatish Balay       x1 = xb[0];
24309371c9d4SSatish Balay       x2 = xb[1];
24319371c9d4SSatish Balay       x3 = xb[2];
24329371c9d4SSatish Balay       x4 = xb[3];
24339371c9d4SSatish Balay       x5 = xb[4];
24349371c9d4SSatish Balay       x6 = xb[5];
243515091d37SBarry Smith       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
243615091d37SBarry Smith       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
243715091d37SBarry Smith       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
243815091d37SBarry Smith       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
243915091d37SBarry Smith       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
244015091d37SBarry Smith       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
244115091d37SBarry Smith       v += 36;
244215091d37SBarry Smith     }
24439371c9d4SSatish Balay     z[0] = sum1;
24449371c9d4SSatish Balay     z[1] = sum2;
24459371c9d4SSatish Balay     z[2] = sum3;
24469371c9d4SSatish Balay     z[3] = sum4;
24479371c9d4SSatish Balay     z[4] = sum5;
24489371c9d4SSatish Balay     z[5] = sum6;
244926e093fcSHong Zhang     if (!usecprow) {
24509371c9d4SSatish Balay       z += 6;
24519371c9d4SSatish Balay       y += 6;
245215091d37SBarry Smith     }
245326e093fcSHong Zhang   }
24549566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
24559566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
24569566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * a->nz));
245715091d37SBarry Smith   PetscFunctionReturn(0);
245815091d37SBarry Smith }
24592d61bbb3SSatish Balay 
2460*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2461*d71ae5a4SJacob Faibussowitsch {
24622d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2463f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2464d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
246526e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2466d9ca1df4SBarry Smith   const MatScalar   *v;
2467d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2468d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2469ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
24702d61bbb3SSatish Balay 
24712d61bbb3SSatish Balay   PetscFunctionBegin;
24729566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
24739566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
24742d61bbb3SSatish Balay 
24752d61bbb3SSatish Balay   idx = a->j;
24762d61bbb3SSatish Balay   v   = a->a;
247726e093fcSHong Zhang   if (usecprow) {
247848a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
247926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
248026e093fcSHong Zhang     ii   = a->compressedrow.i;
24817b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
248226e093fcSHong Zhang   } else {
24832d61bbb3SSatish Balay     ii = a->i;
248426e093fcSHong Zhang     y  = yarray;
248526e093fcSHong Zhang     z  = zarray;
248626e093fcSHong Zhang   }
24872d61bbb3SSatish Balay 
24882d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
24899371c9d4SSatish Balay     n = ii[1] - ii[0];
24909371c9d4SSatish Balay     ii++;
249126e093fcSHong Zhang     if (usecprow) {
24927b2bb3b9SHong Zhang       z = zarray + 7 * ridx[i];
24937b2bb3b9SHong Zhang       y = yarray + 7 * ridx[i];
249426e093fcSHong Zhang     }
24959371c9d4SSatish Balay     sum1 = y[0];
24969371c9d4SSatish Balay     sum2 = y[1];
24979371c9d4SSatish Balay     sum3 = y[2];
24989371c9d4SSatish Balay     sum4 = y[3];
24999371c9d4SSatish Balay     sum5 = y[4];
25009371c9d4SSatish Balay     sum6 = y[5];
25019371c9d4SSatish Balay     sum7 = y[6];
2502444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2503444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
25042d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
25052d61bbb3SSatish Balay       xb = x + 7 * (*idx++);
25069371c9d4SSatish Balay       x1 = xb[0];
25079371c9d4SSatish Balay       x2 = xb[1];
25089371c9d4SSatish Balay       x3 = xb[2];
25099371c9d4SSatish Balay       x4 = xb[3];
25109371c9d4SSatish Balay       x5 = xb[4];
25119371c9d4SSatish Balay       x6 = xb[5];
25129371c9d4SSatish Balay       x7 = xb[6];
25132d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
25142d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
25152d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
25162d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
25172d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
25182d61bbb3SSatish Balay       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
25192d61bbb3SSatish Balay       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
25202d61bbb3SSatish Balay       v += 49;
25212d61bbb3SSatish Balay     }
25229371c9d4SSatish Balay     z[0] = sum1;
25239371c9d4SSatish Balay     z[1] = sum2;
25249371c9d4SSatish Balay     z[2] = sum3;
25259371c9d4SSatish Balay     z[3] = sum4;
25269371c9d4SSatish Balay     z[4] = sum5;
25279371c9d4SSatish Balay     z[5] = sum6;
25289371c9d4SSatish Balay     z[6] = sum7;
252926e093fcSHong Zhang     if (!usecprow) {
25309371c9d4SSatish Balay       z += 7;
25319371c9d4SSatish Balay       y += 7;
25322d61bbb3SSatish Balay     }
253326e093fcSHong Zhang   }
25349566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
25359566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
25369566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * a->nz));
25372d61bbb3SSatish Balay   PetscFunctionReturn(0);
25382d61bbb3SSatish Balay }
2539218c64b6SSatish Balay 
25405f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2541*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2542*d71ae5a4SJacob Faibussowitsch {
254396e086a2SDaniel Kokron   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2544f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
254596e086a2SDaniel Kokron   const PetscScalar *x, *xb;
254696e086a2SDaniel Kokron   const MatScalar   *v;
25476679dcc1SBarry Smith   PetscInt           mbs, i, j, n;
2548ce68d72fSJed Brown   PetscInt           k;
254996e086a2SDaniel Kokron   PetscBool          usecprow = a->compressedrow.use;
25506679dcc1SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
255196e086a2SDaniel Kokron 
255296e086a2SDaniel Kokron   __m256d a0, a1, a2, a3, a4, a5;
2553ce68d72fSJed Brown   __m256d w0, w1, w2, w3;
255496e086a2SDaniel Kokron   __m256d z0, z1, z2;
255596e086a2SDaniel Kokron   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
255696e086a2SDaniel Kokron 
255796e086a2SDaniel Kokron   PetscFunctionBegin;
25589566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
25599566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
25609566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &zarray));
256196e086a2SDaniel Kokron 
256296e086a2SDaniel Kokron   idx = a->j;
256396e086a2SDaniel Kokron   v   = a->a;
256496e086a2SDaniel Kokron   if (usecprow) {
256596e086a2SDaniel Kokron     mbs  = a->compressedrow.nrows;
256696e086a2SDaniel Kokron     ii   = a->compressedrow.i;
256796e086a2SDaniel Kokron     ridx = a->compressedrow.rindex;
256896e086a2SDaniel Kokron   } else {
256996e086a2SDaniel Kokron     mbs = a->mbs;
257096e086a2SDaniel Kokron     ii  = a->i;
257196e086a2SDaniel Kokron     z   = zarray;
257296e086a2SDaniel Kokron   }
257396e086a2SDaniel Kokron 
257496e086a2SDaniel Kokron   if (!a->mult_work) {
257596e086a2SDaniel Kokron     k = PetscMax(A->rmap->n, A->cmap->n);
25769566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
257796e086a2SDaniel Kokron   }
257896e086a2SDaniel Kokron 
257996e086a2SDaniel Kokron   work = a->mult_work;
258096e086a2SDaniel Kokron   for (i = 0; i < mbs; i++) {
25819371c9d4SSatish Balay     n = ii[1] - ii[0];
25829371c9d4SSatish Balay     ii++;
258396e086a2SDaniel Kokron     workt = work;
258496e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
258596e086a2SDaniel Kokron       xb = x + bs * (*idx++);
258696e086a2SDaniel Kokron       for (k = 0; k < bs; k++) workt[k] = xb[k];
258796e086a2SDaniel Kokron       workt += bs;
258896e086a2SDaniel Kokron     }
258996e086a2SDaniel Kokron     if (usecprow) z = zarray + bs * ridx[i];
259096e086a2SDaniel Kokron 
25919371c9d4SSatish Balay     z0 = _mm256_loadu_pd(&z[0]);
25929371c9d4SSatish Balay     z1 = _mm256_loadu_pd(&z[4]);
25939371c9d4SSatish Balay     z2 = _mm256_set1_pd(z[8]);
259496e086a2SDaniel Kokron 
259596e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
2596c05b70c4SSatish Balay       /* first column of a */
259796e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9]);
25989371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81]);
25999371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
26009371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
26019371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
26029371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
26039371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
260496e086a2SDaniel Kokron 
2605c05b70c4SSatish Balay       /* second column of a */
260696e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 1]);
26079371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
26089371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
26099371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
26109371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
26119371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
26129371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
261396e086a2SDaniel Kokron 
2614c05b70c4SSatish Balay       /* third column of a */
261596e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 2]);
26169371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
26179371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w2, z0);
26189371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
26199371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w2, z1);
26209371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
26219371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w2, z2);
262296e086a2SDaniel Kokron 
2623c05b70c4SSatish Balay       /* fourth column of a */
262496e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 3]);
26259371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
26269371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w3, z0);
26279371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
26289371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w3, z1);
26299371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
26309371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w3, z2);
263196e086a2SDaniel Kokron 
2632c05b70c4SSatish Balay       /* fifth column of a */
263396e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 4]);
26349371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
26359371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w0, z0);
26369371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
26379371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w0, z1);
26389371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
26399371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w0, z2);
264096e086a2SDaniel Kokron 
2641c05b70c4SSatish Balay       /* sixth column of a */
264296e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 5]);
26439371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
26449371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
26459371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
26469371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
26479371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
26489371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
264996e086a2SDaniel Kokron 
2650c05b70c4SSatish Balay       /* seventh column of a */
265196e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 6]);
26529371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
26539371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w2, z0);
26549371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
26559371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w2, z1);
26569371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
26579371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w2, z2);
265896e086a2SDaniel Kokron 
26596aad120cSJose E. Roman       /* eighth column of a */
266096e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 7]);
26619371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
26629371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w3, z0);
26639371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
26649371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w3, z1);
26659371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
26669371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w3, z2);
266796e086a2SDaniel Kokron 
2668c05b70c4SSatish Balay       /* ninth column of a */
266996e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 8]);
26709371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
26719371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
26729371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
26739371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
26749371c9d4SSatish Balay       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
26759371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
267696e086a2SDaniel Kokron     }
267796e086a2SDaniel Kokron 
26789371c9d4SSatish Balay     _mm256_storeu_pd(&z[0], z0);
26799371c9d4SSatish Balay     _mm256_storeu_pd(&z[4], z1);
26809371c9d4SSatish Balay     _mm256_maskstore_pd(&z[8], mask1, z2);
268196e086a2SDaniel Kokron 
268296e086a2SDaniel Kokron     v += n * bs2;
268396e086a2SDaniel Kokron     if (!usecprow) z += bs;
268496e086a2SDaniel Kokron   }
26859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
26869566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &zarray));
26879566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(162.0 * a->nz));
268896e086a2SDaniel Kokron   PetscFunctionReturn(0);
268996e086a2SDaniel Kokron }
269096e086a2SDaniel Kokron #endif
269196e086a2SDaniel Kokron 
2692*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2693*d71ae5a4SJacob Faibussowitsch {
2694ebada01fSBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2695f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2696ebada01fSBarry Smith   const PetscScalar *x, *xb;
2697ebada01fSBarry Smith   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2698ebada01fSBarry Smith   const MatScalar   *v;
2699ebada01fSBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2700ebada01fSBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2701ebada01fSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2702ebada01fSBarry Smith 
2703ebada01fSBarry Smith   PetscFunctionBegin;
27049566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
27059566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2706ebada01fSBarry Smith 
2707ebada01fSBarry Smith   idx = a->j;
2708ebada01fSBarry Smith   v   = a->a;
2709ebada01fSBarry Smith   if (usecprow) {
271048a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2711ebada01fSBarry Smith     mbs  = a->compressedrow.nrows;
2712ebada01fSBarry Smith     ii   = a->compressedrow.i;
2713ebada01fSBarry Smith     ridx = a->compressedrow.rindex;
2714ebada01fSBarry Smith   } else {
2715ebada01fSBarry Smith     ii = a->i;
2716ebada01fSBarry Smith     y  = yarray;
2717ebada01fSBarry Smith     z  = zarray;
2718ebada01fSBarry Smith   }
2719ebada01fSBarry Smith 
2720ebada01fSBarry Smith   for (i = 0; i < mbs; i++) {
27219371c9d4SSatish Balay     n = ii[1] - ii[0];
27229371c9d4SSatish Balay     ii++;
2723ebada01fSBarry Smith     if (usecprow) {
2724ebada01fSBarry Smith       z = zarray + 11 * ridx[i];
2725ebada01fSBarry Smith       y = yarray + 11 * ridx[i];
2726ebada01fSBarry Smith     }
27279371c9d4SSatish Balay     sum1  = y[0];
27289371c9d4SSatish Balay     sum2  = y[1];
27299371c9d4SSatish Balay     sum3  = y[2];
27309371c9d4SSatish Balay     sum4  = y[3];
27319371c9d4SSatish Balay     sum5  = y[4];
27329371c9d4SSatish Balay     sum6  = y[5];
27339371c9d4SSatish Balay     sum7  = y[6];
27349371c9d4SSatish Balay     sum8  = y[7];
27359371c9d4SSatish Balay     sum9  = y[8];
27369371c9d4SSatish Balay     sum10 = y[9];
27379371c9d4SSatish Balay     sum11 = y[10];
2738ebada01fSBarry Smith     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2739ebada01fSBarry Smith     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2740ebada01fSBarry Smith     for (j = 0; j < n; j++) {
2741ebada01fSBarry Smith       xb  = x + 11 * (*idx++);
27429371c9d4SSatish Balay       x1  = xb[0];
27439371c9d4SSatish Balay       x2  = xb[1];
27449371c9d4SSatish Balay       x3  = xb[2];
27459371c9d4SSatish Balay       x4  = xb[3];
27469371c9d4SSatish Balay       x5  = xb[4];
27479371c9d4SSatish Balay       x6  = xb[5];
27489371c9d4SSatish Balay       x7  = xb[6];
27499371c9d4SSatish Balay       x8  = xb[7];
27509371c9d4SSatish Balay       x9  = xb[8];
27519371c9d4SSatish Balay       x10 = xb[9];
27529371c9d4SSatish Balay       x11 = xb[10];
2753ebada01fSBarry Smith       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2754ebada01fSBarry Smith       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2755ebada01fSBarry Smith       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2756ebada01fSBarry Smith       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2757ebada01fSBarry Smith       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2758ebada01fSBarry Smith       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2759ebada01fSBarry Smith       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2760ebada01fSBarry Smith       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2761ebada01fSBarry Smith       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2762ebada01fSBarry Smith       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2763ebada01fSBarry Smith       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2764ebada01fSBarry Smith       v += 121;
2765ebada01fSBarry Smith     }
27669371c9d4SSatish Balay     z[0]  = sum1;
27679371c9d4SSatish Balay     z[1]  = sum2;
27689371c9d4SSatish Balay     z[2]  = sum3;
27699371c9d4SSatish Balay     z[3]  = sum4;
27709371c9d4SSatish Balay     z[4]  = sum5;
27719371c9d4SSatish Balay     z[5]  = sum6;
27729371c9d4SSatish Balay     z[6]  = sum7;
27739371c9d4SSatish Balay     z[7]  = sum8;
27749371c9d4SSatish Balay     z[8]  = sum9;
27759371c9d4SSatish Balay     z[9]  = sum10;
27769371c9d4SSatish Balay     z[10] = sum11;
2777ebada01fSBarry Smith     if (!usecprow) {
27789371c9d4SSatish Balay       z += 11;
27799371c9d4SSatish Balay       y += 11;
2780ebada01fSBarry Smith     }
2781ebada01fSBarry Smith   }
27829566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
27839566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
27849566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(242.0 * a->nz));
2785ebada01fSBarry Smith   PetscFunctionReturn(0);
2786ebada01fSBarry Smith }
2787ebada01fSBarry Smith 
2788*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2789*d71ae5a4SJacob Faibussowitsch {
27902d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2791f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
2792d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2793d9ca1df4SBarry Smith   const MatScalar   *v;
2794d9ca1df4SBarry Smith   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2795d9ca1df4SBarry Smith   PetscInt           ncols, k;
2796d9ca1df4SBarry Smith   const PetscInt    *ridx     = NULL, *idx, *ii;
2797ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2798218c64b6SSatish Balay 
27992d61bbb3SSatish Balay   PetscFunctionBegin;
28009566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
28019566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
28029566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &zarray));
28032d61bbb3SSatish Balay 
28042d61bbb3SSatish Balay   idx = a->j;
28052d61bbb3SSatish Balay   v   = a->a;
280626e093fcSHong Zhang   if (usecprow) {
280726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
280826e093fcSHong Zhang     ii   = a->compressedrow.i;
28097b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
281026e093fcSHong Zhang   } else {
281126e093fcSHong Zhang     mbs = a->mbs;
28122d61bbb3SSatish Balay     ii  = a->i;
281326e093fcSHong Zhang     z   = zarray;
281426e093fcSHong Zhang   }
28152d61bbb3SSatish Balay 
28162d61bbb3SSatish Balay   if (!a->mult_work) {
2817d0f46423SBarry Smith     k = PetscMax(A->rmap->n, A->cmap->n);
28189566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
28192d61bbb3SSatish Balay   }
28202d61bbb3SSatish Balay   work = a->mult_work;
28212d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
28229371c9d4SSatish Balay     n = ii[1] - ii[0];
28239371c9d4SSatish Balay     ii++;
28242d61bbb3SSatish Balay     ncols = n * bs;
28252d61bbb3SSatish Balay     workt = work;
28262d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
28272d61bbb3SSatish Balay       xb = x + bs * (*idx++);
28282d61bbb3SSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
28292d61bbb3SSatish Balay       workt += bs;
28302d61bbb3SSatish Balay     }
28317b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs * ridx[i];
283296b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
28332d61bbb3SSatish Balay     v += n * bs2;
283426fbe8dcSKarl Rupp     if (!usecprow) z += bs;
283526e093fcSHong Zhang   }
28369566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
28379566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &zarray));
28389566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
28392d61bbb3SSatish Balay   PetscFunctionReturn(0);
28402d61bbb3SSatish Balay }
28412d61bbb3SSatish Balay 
2842*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2843*d71ae5a4SJacob Faibussowitsch {
2844547795f9SHong Zhang   PetscScalar zero = 0.0;
2845547795f9SHong Zhang 
2846547795f9SHong Zhang   PetscFunctionBegin;
28479566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
28489566063dSJacob Faibussowitsch   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2849547795f9SHong Zhang   PetscFunctionReturn(0);
2850547795f9SHong Zhang }
2851547795f9SHong Zhang 
2852*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2853*d71ae5a4SJacob Faibussowitsch {
28543447b6efSHong Zhang   PetscScalar zero = 0.0;
28552d61bbb3SSatish Balay 
28562d61bbb3SSatish Balay   PetscFunctionBegin;
28579566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
28589566063dSJacob Faibussowitsch   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
28592d61bbb3SSatish Balay   PetscFunctionReturn(0);
28602d61bbb3SSatish Balay }
28612d61bbb3SSatish Balay 
2862*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2863*d71ae5a4SJacob Faibussowitsch {
2864547795f9SHong Zhang   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2865b8c08b77SHong Zhang   PetscScalar       *z, x1, x2, x3, x4, x5;
2866d9ca1df4SBarry Smith   const PetscScalar *x, *xb = NULL;
2867d9ca1df4SBarry Smith   const MatScalar   *v;
2868b8c08b77SHong Zhang   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2869d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2870547795f9SHong Zhang   Mat_CompressedRow  cprow    = a->compressedrow;
2871ace3abfcSBarry Smith   PetscBool          usecprow = cprow.use;
2872547795f9SHong Zhang 
2873547795f9SHong Zhang   PetscFunctionBegin;
28749566063dSJacob Faibussowitsch   if (yy != zz) PetscCall(VecCopy(yy, zz));
28759566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
28769566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
2877547795f9SHong Zhang 
2878547795f9SHong Zhang   idx = a->j;
2879547795f9SHong Zhang   v   = a->a;
2880547795f9SHong Zhang   if (usecprow) {
2881547795f9SHong Zhang     mbs  = cprow.nrows;
2882547795f9SHong Zhang     ii   = cprow.i;
2883547795f9SHong Zhang     ridx = cprow.rindex;
2884547795f9SHong Zhang   } else {
2885547795f9SHong Zhang     mbs = a->mbs;
2886547795f9SHong Zhang     ii  = a->i;
2887547795f9SHong Zhang     xb  = x;
2888547795f9SHong Zhang   }
2889547795f9SHong Zhang 
2890547795f9SHong Zhang   switch (bs) {
2891547795f9SHong Zhang   case 1:
2892547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2893547795f9SHong Zhang       if (usecprow) xb = x + ridx[i];
2894547795f9SHong Zhang       x1 = xb[0];
2895547795f9SHong Zhang       ib = idx + ii[0];
28969371c9d4SSatish Balay       n  = ii[1] - ii[0];
28979371c9d4SSatish Balay       ii++;
2898547795f9SHong Zhang       for (j = 0; j < n; j++) {
2899547795f9SHong Zhang         rval = ib[j];
2900547795f9SHong Zhang         z[rval] += PetscConj(*v) * x1;
2901547795f9SHong Zhang         v++;
2902547795f9SHong Zhang       }
2903547795f9SHong Zhang       if (!usecprow) xb++;
2904547795f9SHong Zhang     }
2905547795f9SHong Zhang     break;
2906547795f9SHong Zhang   case 2:
2907547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2908547795f9SHong Zhang       if (usecprow) xb = x + 2 * ridx[i];
29099371c9d4SSatish Balay       x1 = xb[0];
29109371c9d4SSatish Balay       x2 = xb[1];
2911547795f9SHong Zhang       ib = idx + ii[0];
29129371c9d4SSatish Balay       n  = ii[1] - ii[0];
29139371c9d4SSatish Balay       ii++;
2914547795f9SHong Zhang       for (j = 0; j < n; j++) {
2915547795f9SHong Zhang         rval = ib[j] * 2;
2916547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2917547795f9SHong Zhang         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2918547795f9SHong Zhang         v += 4;
2919547795f9SHong Zhang       }
2920547795f9SHong Zhang       if (!usecprow) xb += 2;
2921547795f9SHong Zhang     }
2922547795f9SHong Zhang     break;
2923547795f9SHong Zhang   case 3:
2924547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2925547795f9SHong Zhang       if (usecprow) xb = x + 3 * ridx[i];
29269371c9d4SSatish Balay       x1 = xb[0];
29279371c9d4SSatish Balay       x2 = xb[1];
29289371c9d4SSatish Balay       x3 = xb[2];
2929547795f9SHong Zhang       ib = idx + ii[0];
29309371c9d4SSatish Balay       n  = ii[1] - ii[0];
29319371c9d4SSatish Balay       ii++;
2932547795f9SHong Zhang       for (j = 0; j < n; j++) {
2933547795f9SHong Zhang         rval = ib[j] * 3;
2934547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2935547795f9SHong Zhang         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2936547795f9SHong Zhang         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2937547795f9SHong Zhang         v += 9;
2938547795f9SHong Zhang       }
2939547795f9SHong Zhang       if (!usecprow) xb += 3;
2940547795f9SHong Zhang     }
2941547795f9SHong Zhang     break;
2942547795f9SHong Zhang   case 4:
2943547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2944547795f9SHong Zhang       if (usecprow) xb = x + 4 * ridx[i];
29459371c9d4SSatish Balay       x1 = xb[0];
29469371c9d4SSatish Balay       x2 = xb[1];
29479371c9d4SSatish Balay       x3 = xb[2];
29489371c9d4SSatish Balay       x4 = xb[3];
2949547795f9SHong Zhang       ib = idx + ii[0];
29509371c9d4SSatish Balay       n  = ii[1] - ii[0];
29519371c9d4SSatish Balay       ii++;
2952547795f9SHong Zhang       for (j = 0; j < n; j++) {
2953547795f9SHong Zhang         rval = ib[j] * 4;
2954547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2955547795f9SHong Zhang         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2956547795f9SHong Zhang         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2957547795f9SHong Zhang         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2958547795f9SHong Zhang         v += 16;
2959547795f9SHong Zhang       }
2960547795f9SHong Zhang       if (!usecprow) xb += 4;
2961547795f9SHong Zhang     }
2962547795f9SHong Zhang     break;
2963547795f9SHong Zhang   case 5:
2964547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2965547795f9SHong Zhang       if (usecprow) xb = x + 5 * ridx[i];
29669371c9d4SSatish Balay       x1 = xb[0];
29679371c9d4SSatish Balay       x2 = xb[1];
29689371c9d4SSatish Balay       x3 = xb[2];
29699371c9d4SSatish Balay       x4 = xb[3];
29709371c9d4SSatish Balay       x5 = xb[4];
2971547795f9SHong Zhang       ib = idx + ii[0];
29729371c9d4SSatish Balay       n  = ii[1] - ii[0];
29739371c9d4SSatish Balay       ii++;
2974547795f9SHong Zhang       for (j = 0; j < n; j++) {
2975547795f9SHong Zhang         rval = ib[j] * 5;
2976547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2977547795f9SHong Zhang         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2978547795f9SHong Zhang         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2979547795f9SHong Zhang         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2980547795f9SHong Zhang         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2981547795f9SHong Zhang         v += 25;
2982547795f9SHong Zhang       }
2983547795f9SHong Zhang       if (!usecprow) xb += 5;
2984547795f9SHong Zhang     }
2985547795f9SHong Zhang     break;
2986*d71ae5a4SJacob Faibussowitsch   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2987*d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2988968ae2c8SSatish Balay #if 0
2989968ae2c8SSatish Balay     {
2990b8c08b77SHong Zhang       PetscInt          ncols,k,bs2=a->bs2;
2991b8c08b77SHong Zhang       PetscScalar       *work,*workt,zb;
2992d9ca1df4SBarry Smith       const PetscScalar *xtmp;
2993547795f9SHong Zhang       if (!a->mult_work) {
2994547795f9SHong Zhang         k    = PetscMax(A->rmap->n,A->cmap->n);
29959566063dSJacob Faibussowitsch         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2996547795f9SHong Zhang       }
2997547795f9SHong Zhang       work = a->mult_work;
2998547795f9SHong Zhang       xtmp = x;
2999547795f9SHong Zhang       for (i=0; i<mbs; i++) {
3000547795f9SHong Zhang         n     = ii[1] - ii[0]; ii++;
3001547795f9SHong Zhang         ncols = n*bs;
30029566063dSJacob Faibussowitsch         PetscCall(PetscArrayzero(work,ncols));
300326fbe8dcSKarl Rupp         if (usecprow) xtmp = x + bs*ridx[i];
300496b95a6bSBarry Smith         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
3005547795f9SHong Zhang         v += n*bs2;
3006547795f9SHong Zhang         if (!usecprow) xtmp += bs;
3007547795f9SHong Zhang         workt = work;
3008547795f9SHong Zhang         for (j=0; j<n; j++) {
3009547795f9SHong Zhang           zb = z + bs*(*idx++);
3010547795f9SHong Zhang           for (k=0; k<bs; k++) zb[k] += workt[k] ;
3011547795f9SHong Zhang           workt += bs;
3012547795f9SHong Zhang         }
3013547795f9SHong Zhang       }
3014547795f9SHong Zhang     }
3015968ae2c8SSatish Balay #endif
3016547795f9SHong Zhang   }
30179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
30189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
30199566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3020547795f9SHong Zhang   PetscFunctionReturn(0);
3021547795f9SHong Zhang }
3022547795f9SHong Zhang 
3023*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
3024*d71ae5a4SJacob Faibussowitsch {
30252d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3026d9ca1df4SBarry Smith   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
3027f4259b30SLisandro Dalcin   const PetscScalar *x, *xb = NULL;
3028d9ca1df4SBarry Smith   const MatScalar   *v;
3029d9ca1df4SBarry Smith   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3030d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
30313447b6efSHong Zhang   Mat_CompressedRow  cprow    = a->compressedrow;
3032ace3abfcSBarry Smith   PetscBool          usecprow = cprow.use;
30332d61bbb3SSatish Balay 
30342d61bbb3SSatish Balay   PetscFunctionBegin;
30359566063dSJacob Faibussowitsch   if (yy != zz) PetscCall(VecCopy(yy, zz));
30369566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
30379566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
30382d61bbb3SSatish Balay 
30392d61bbb3SSatish Balay   idx = a->j;
30402d61bbb3SSatish Balay   v   = a->a;
30413447b6efSHong Zhang   if (usecprow) {
30423447b6efSHong Zhang     mbs  = cprow.nrows;
30433447b6efSHong Zhang     ii   = cprow.i;
30447b2bb3b9SHong Zhang     ridx = cprow.rindex;
30453447b6efSHong Zhang   } else {
30463447b6efSHong Zhang     mbs = a->mbs;
30472d61bbb3SSatish Balay     ii  = a->i;
3048f1af5d2fSBarry Smith     xb  = x;
30493447b6efSHong Zhang   }
30502d61bbb3SSatish Balay 
30512d61bbb3SSatish Balay   switch (bs) {
30522d61bbb3SSatish Balay   case 1:
30532d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30547b2bb3b9SHong Zhang       if (usecprow) xb = x + ridx[i];
3055f1af5d2fSBarry Smith       x1 = xb[0];
30563447b6efSHong Zhang       ib = idx + ii[0];
30579371c9d4SSatish Balay       n  = ii[1] - ii[0];
30589371c9d4SSatish Balay       ii++;
30592d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30602d61bbb3SSatish Balay         rval = ib[j];
3061f1af5d2fSBarry Smith         z[rval] += *v * x1;
3062f1af5d2fSBarry Smith         v++;
30632d61bbb3SSatish Balay       }
30643447b6efSHong Zhang       if (!usecprow) xb++;
30652d61bbb3SSatish Balay     }
30662d61bbb3SSatish Balay     break;
30672d61bbb3SSatish Balay   case 2:
30682d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30697b2bb3b9SHong Zhang       if (usecprow) xb = x + 2 * ridx[i];
30709371c9d4SSatish Balay       x1 = xb[0];
30719371c9d4SSatish Balay       x2 = xb[1];
30723447b6efSHong Zhang       ib = idx + ii[0];
30739371c9d4SSatish Balay       n  = ii[1] - ii[0];
30749371c9d4SSatish Balay       ii++;
30752d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30762d61bbb3SSatish Balay         rval = ib[j] * 2;
30772d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2;
30782d61bbb3SSatish Balay         z[rval++] += v[2] * x1 + v[3] * x2;
30792d61bbb3SSatish Balay         v += 4;
30802d61bbb3SSatish Balay       }
30813447b6efSHong Zhang       if (!usecprow) xb += 2;
30822d61bbb3SSatish Balay     }
30832d61bbb3SSatish Balay     break;
30842d61bbb3SSatish Balay   case 3:
30852d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30867b2bb3b9SHong Zhang       if (usecprow) xb = x + 3 * ridx[i];
30879371c9d4SSatish Balay       x1 = xb[0];
30889371c9d4SSatish Balay       x2 = xb[1];
30899371c9d4SSatish Balay       x3 = xb[2];
30903447b6efSHong Zhang       ib = idx + ii[0];
30919371c9d4SSatish Balay       n  = ii[1] - ii[0];
30929371c9d4SSatish Balay       ii++;
30932d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30942d61bbb3SSatish Balay         rval = ib[j] * 3;
30952d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
30962d61bbb3SSatish Balay         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
30972d61bbb3SSatish Balay         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
30982d61bbb3SSatish Balay         v += 9;
30992d61bbb3SSatish Balay       }
31003447b6efSHong Zhang       if (!usecprow) xb += 3;
31012d61bbb3SSatish Balay     }
31022d61bbb3SSatish Balay     break;
31032d61bbb3SSatish Balay   case 4:
31042d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
31057b2bb3b9SHong Zhang       if (usecprow) xb = x + 4 * ridx[i];
31069371c9d4SSatish Balay       x1 = xb[0];
31079371c9d4SSatish Balay       x2 = xb[1];
31089371c9d4SSatish Balay       x3 = xb[2];
31099371c9d4SSatish Balay       x4 = xb[3];
31103447b6efSHong Zhang       ib = idx + ii[0];
31119371c9d4SSatish Balay       n  = ii[1] - ii[0];
31129371c9d4SSatish Balay       ii++;
31132d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
31142d61bbb3SSatish Balay         rval = ib[j] * 4;
31152d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
31162d61bbb3SSatish Balay         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
31172d61bbb3SSatish Balay         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
31182d61bbb3SSatish Balay         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
31192d61bbb3SSatish Balay         v += 16;
31202d61bbb3SSatish Balay       }
31213447b6efSHong Zhang       if (!usecprow) xb += 4;
31222d61bbb3SSatish Balay     }
31232d61bbb3SSatish Balay     break;
31242d61bbb3SSatish Balay   case 5:
31252d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
31267b2bb3b9SHong Zhang       if (usecprow) xb = x + 5 * ridx[i];
31279371c9d4SSatish Balay       x1 = xb[0];
31289371c9d4SSatish Balay       x2 = xb[1];
31299371c9d4SSatish Balay       x3 = xb[2];
31309371c9d4SSatish Balay       x4 = xb[3];
31319371c9d4SSatish Balay       x5 = xb[4];
31323447b6efSHong Zhang       ib = idx + ii[0];
31339371c9d4SSatish Balay       n  = ii[1] - ii[0];
31349371c9d4SSatish Balay       ii++;
31352d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
31362d61bbb3SSatish Balay         rval = ib[j] * 5;
31372d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
31382d61bbb3SSatish Balay         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
31392d61bbb3SSatish Balay         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
31402d61bbb3SSatish Balay         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
31412d61bbb3SSatish Balay         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
31422d61bbb3SSatish Balay         v += 25;
31432d61bbb3SSatish Balay       }
31443447b6efSHong Zhang       if (!usecprow) xb += 5;
31452d61bbb3SSatish Balay     }
31462d61bbb3SSatish Balay     break;
3147f1af5d2fSBarry Smith   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3148690b6cddSBarry Smith     PetscInt           ncols, k;
3149d9ca1df4SBarry Smith     PetscScalar       *work, *workt;
3150d9ca1df4SBarry Smith     const PetscScalar *xtmp;
31512d61bbb3SSatish Balay     if (!a->mult_work) {
3152d0f46423SBarry Smith       k = PetscMax(A->rmap->n, A->cmap->n);
31539566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
31542d61bbb3SSatish Balay     }
31552d61bbb3SSatish Balay     work = a->mult_work;
31563447b6efSHong Zhang     xtmp = x;
31572d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
31589371c9d4SSatish Balay       n = ii[1] - ii[0];
31599371c9d4SSatish Balay       ii++;
31602d61bbb3SSatish Balay       ncols = n * bs;
31619566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(work, ncols));
316226fbe8dcSKarl Rupp       if (usecprow) xtmp = x + bs * ridx[i];
316396b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
31642d61bbb3SSatish Balay       v += n * bs2;
31653447b6efSHong Zhang       if (!usecprow) xtmp += bs;
31662d61bbb3SSatish Balay       workt = work;
31672d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
31682d61bbb3SSatish Balay         zb = z + bs * (*idx++);
31692d61bbb3SSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
31702d61bbb3SSatish Balay         workt += bs;
31712d61bbb3SSatish Balay       }
31722d61bbb3SSatish Balay     }
31732d61bbb3SSatish Balay   }
31742d61bbb3SSatish Balay   }
31759566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
31769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
31779566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
31782d61bbb3SSatish Balay   PetscFunctionReturn(0);
31792d61bbb3SSatish Balay }
31802d61bbb3SSatish Balay 
3181*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3182*d71ae5a4SJacob Faibussowitsch {
31832d61bbb3SSatish Balay   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3184690b6cddSBarry Smith   PetscInt     totalnz = a->bs2 * a->nz;
3185f4df32b1SMatthew Knepley   PetscScalar  oalpha  = alpha;
3186c5df96a5SBarry Smith   PetscBLASInt one     = 1, tnz;
31872d61bbb3SSatish Balay 
31882d61bbb3SSatish Balay   PetscFunctionBegin;
31899566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3190792fecdfSBarry Smith   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
31919566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
31922d61bbb3SSatish Balay   PetscFunctionReturn(0);
31932d61bbb3SSatish Balay }
31942d61bbb3SSatish Balay 
3195*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3196*d71ae5a4SJacob Faibussowitsch {
31972d61bbb3SSatish Balay   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
31983f1db9ecSBarry Smith   MatScalar   *v   = a->a;
3199329f5518SBarry Smith   PetscReal    sum = 0.0;
3200d0f46423SBarry Smith   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
32012d61bbb3SSatish Balay 
32022d61bbb3SSatish Balay   PetscFunctionBegin;
32032d61bbb3SSatish Balay   if (type == NORM_FROBENIUS) {
3204570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16)
3205570b7f6dSBarry Smith     PetscBLASInt one = 1, cnt = bs2 * nz;
3206792fecdfSBarry Smith     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3207570b7f6dSBarry Smith #else
32082d61bbb3SSatish Balay     for (i = 0; i < bs2 * nz; i++) {
32099371c9d4SSatish Balay       sum += PetscRealPart(PetscConj(*v) * (*v));
32109371c9d4SSatish Balay       v++;
32112d61bbb3SSatish Balay     }
3212570b7f6dSBarry Smith #endif
32138f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum);
32149566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
32158a62d963SHong Zhang   } else if (type == NORM_1) { /* maximum column sum */
32168a62d963SHong Zhang     PetscReal *tmp;
32178a62d963SHong Zhang     PetscInt  *bcol = a->j;
32189566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
32198a62d963SHong Zhang     for (i = 0; i < nz; i++) {
32208a62d963SHong Zhang       for (j = 0; j < bs; j++) {
32218a62d963SHong Zhang         k1 = bs * (*bcol) + j; /* column index */
32228a62d963SHong Zhang         for (k = 0; k < bs; k++) {
32239371c9d4SSatish Balay           tmp[k1] += PetscAbsScalar(*v);
32249371c9d4SSatish Balay           v++;
32258a62d963SHong Zhang         }
32268a62d963SHong Zhang       }
32278a62d963SHong Zhang       bcol++;
32288a62d963SHong Zhang     }
32298a62d963SHong Zhang     *norm = 0.0;
3230d0f46423SBarry Smith     for (j = 0; j < A->cmap->n; j++) {
32318a62d963SHong Zhang       if (tmp[j] > *norm) *norm = tmp[j];
32328a62d963SHong Zhang     }
32339566063dSJacob Faibussowitsch     PetscCall(PetscFree(tmp));
32349566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3235596552b5SBarry Smith   } else if (type == NORM_INFINITY) { /* maximum row sum */
3236596552b5SBarry Smith     *norm = 0.0;
3237596552b5SBarry Smith     for (k = 0; k < bs; k++) {
323874f84c7bSSatish Balay       for (j = 0; j < a->mbs; j++) {
3239596552b5SBarry Smith         v   = a->a + bs2 * a->i[j] + k;
3240596552b5SBarry Smith         sum = 0.0;
3241596552b5SBarry Smith         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
32420e90e235SBarry Smith           for (k1 = 0; k1 < bs; k1++) {
3243596552b5SBarry Smith             sum += PetscAbsScalar(*v);
3244596552b5SBarry Smith             v += bs;
32452d61bbb3SSatish Balay           }
32460e90e235SBarry Smith         }
3247596552b5SBarry Smith         if (sum > *norm) *norm = sum;
3248596552b5SBarry Smith       }
3249596552b5SBarry Smith     }
32509566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3251e7e72b3dSBarry Smith   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
32522d61bbb3SSatish Balay   PetscFunctionReturn(0);
32532d61bbb3SSatish Balay }
32542d61bbb3SSatish Balay 
3255*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3256*d71ae5a4SJacob Faibussowitsch {
32572d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
32582d61bbb3SSatish Balay 
32592d61bbb3SSatish Balay   PetscFunctionBegin;
32602d61bbb3SSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3261d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3262273d9f13SBarry Smith     *flg = PETSC_FALSE;
3263273d9f13SBarry Smith     PetscFunctionReturn(0);
32642d61bbb3SSatish Balay   }
32652d61bbb3SSatish Balay 
32662d61bbb3SSatish Balay   /* if the a->i are the same */
32679566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
326826fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
32692d61bbb3SSatish Balay 
32702d61bbb3SSatish Balay   /* if a->j are the same */
32719566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
327226fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
327326fbe8dcSKarl Rupp 
32742d61bbb3SSatish Balay   /* if a->a are the same */
32759566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
32762d61bbb3SSatish Balay   PetscFunctionReturn(0);
32772d61bbb3SSatish Balay }
32782d61bbb3SSatish Balay 
3279*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3280*d71ae5a4SJacob Faibussowitsch {
32812d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3282690b6cddSBarry Smith   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
328387828ca2SBarry Smith   PetscScalar *x, zero = 0.0;
32843f1db9ecSBarry Smith   MatScalar   *aa, *aa_j;
32852d61bbb3SSatish Balay 
32862d61bbb3SSatish Balay   PetscFunctionBegin;
328728b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3288d0f46423SBarry Smith   bs   = A->rmap->bs;
32892d61bbb3SSatish Balay   aa   = a->a;
32902d61bbb3SSatish Balay   ai   = a->i;
32912d61bbb3SSatish Balay   aj   = a->j;
32922d61bbb3SSatish Balay   ambs = a->mbs;
32932d61bbb3SSatish Balay   bs2  = a->bs2;
32942d61bbb3SSatish Balay 
32959566063dSJacob Faibussowitsch   PetscCall(VecSet(v, zero));
32969566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
32979566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v, &n));
329808401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
32992d61bbb3SSatish Balay   for (i = 0; i < ambs; i++) {
33002d61bbb3SSatish Balay     for (j = ai[i]; j < ai[i + 1]; j++) {
33012d61bbb3SSatish Balay       if (aj[j] == i) {
33022d61bbb3SSatish Balay         row  = i * bs;
33032d61bbb3SSatish Balay         aa_j = aa + j * bs2;
33042d61bbb3SSatish Balay         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
33052d61bbb3SSatish Balay         break;
33062d61bbb3SSatish Balay       }
33072d61bbb3SSatish Balay     }
33082d61bbb3SSatish Balay   }
33099566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
33102d61bbb3SSatish Balay   PetscFunctionReturn(0);
33112d61bbb3SSatish Balay }
33122d61bbb3SSatish Balay 
3313*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3314*d71ae5a4SJacob Faibussowitsch {
33152d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
331653ef36baSBarry Smith   const PetscScalar *l, *r, *li, *ri;
331753ef36baSBarry Smith   PetscScalar        x;
33183f1db9ecSBarry Smith   MatScalar         *aa, *v;
331953ef36baSBarry Smith   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
332053ef36baSBarry Smith   const PetscInt    *ai, *aj;
33212d61bbb3SSatish Balay 
33222d61bbb3SSatish Balay   PetscFunctionBegin;
33232d61bbb3SSatish Balay   ai  = a->i;
33242d61bbb3SSatish Balay   aj  = a->j;
33252d61bbb3SSatish Balay   aa  = a->a;
3326d0f46423SBarry Smith   m   = A->rmap->n;
3327d0f46423SBarry Smith   n   = A->cmap->n;
3328d0f46423SBarry Smith   bs  = A->rmap->bs;
33292d61bbb3SSatish Balay   mbs = a->mbs;
33302d61bbb3SSatish Balay   bs2 = a->bs2;
33312d61bbb3SSatish Balay   if (ll) {
33329566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(ll, &l));
33339566063dSJacob Faibussowitsch     PetscCall(VecGetLocalSize(ll, &lm));
333408401ef6SPierre Jolivet     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
33352d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) { /* for each block row */
33362d61bbb3SSatish Balay       M  = ai[i + 1] - ai[i];
33372d61bbb3SSatish Balay       li = l + i * bs;
33382d61bbb3SSatish Balay       v  = aa + bs2 * ai[i];
33392d61bbb3SSatish Balay       for (j = 0; j < M; j++) { /* for each block */
3340ad540459SPierre Jolivet         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
33412d61bbb3SSatish Balay       }
33422d61bbb3SSatish Balay     }
33439566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(ll, &l));
33449566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
33452d61bbb3SSatish Balay   }
33462d61bbb3SSatish Balay 
33472d61bbb3SSatish Balay   if (rr) {
33489566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(rr, &r));
33499566063dSJacob Faibussowitsch     PetscCall(VecGetLocalSize(rr, &rn));
335008401ef6SPierre Jolivet     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
33512d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) { /* for each block row */
335253ef36baSBarry Smith       iai = ai[i];
335353ef36baSBarry Smith       M   = ai[i + 1] - iai;
335453ef36baSBarry Smith       v   = aa + bs2 * iai;
33552d61bbb3SSatish Balay       for (j = 0; j < M; j++) { /* for each block */
335653ef36baSBarry Smith         ri = r + bs * aj[iai + j];
33572d61bbb3SSatish Balay         for (k = 0; k < bs; k++) {
33582d61bbb3SSatish Balay           x = ri[k];
335953ef36baSBarry Smith           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
336053ef36baSBarry Smith           v += bs;
33612d61bbb3SSatish Balay         }
33622d61bbb3SSatish Balay       }
33632d61bbb3SSatish Balay     }
33649566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(rr, &r));
33659566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
33662d61bbb3SSatish Balay   }
33672d61bbb3SSatish Balay   PetscFunctionReturn(0);
33682d61bbb3SSatish Balay }
33692d61bbb3SSatish Balay 
3370*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3371*d71ae5a4SJacob Faibussowitsch {
33722d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33732d61bbb3SSatish Balay 
33742d61bbb3SSatish Balay   PetscFunctionBegin;
33752d61bbb3SSatish Balay   info->block_size   = a->bs2;
3376ceed8ce5SJed Brown   info->nz_allocated = a->bs2 * a->maxnz;
33772d61bbb3SSatish Balay   info->nz_used      = a->bs2 * a->nz;
33783966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
33792d61bbb3SSatish Balay   info->assemblies   = A->num_ass;
33808e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
33814dfa11a4SJacob Faibussowitsch   info->memory       = 0; /* REVIEW ME */
3382d5f3da31SBarry Smith   if (A->factortype) {
33832d61bbb3SSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
33842d61bbb3SSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
33852d61bbb3SSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
33862d61bbb3SSatish Balay   } else {
33872d61bbb3SSatish Balay     info->fill_ratio_given  = 0;
33882d61bbb3SSatish Balay     info->fill_ratio_needed = 0;
33892d61bbb3SSatish Balay     info->factor_mallocs    = 0;
33902d61bbb3SSatish Balay   }
33912d61bbb3SSatish Balay   PetscFunctionReturn(0);
33922d61bbb3SSatish Balay }
33932d61bbb3SSatish Balay 
3394*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3395*d71ae5a4SJacob Faibussowitsch {
33962d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33972d61bbb3SSatish Balay 
33982d61bbb3SSatish Balay   PetscFunctionBegin;
33999566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
34002d61bbb3SSatish Balay   PetscFunctionReturn(0);
34012d61bbb3SSatish Balay }
3402a001520aSPierre Jolivet 
3403*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3404*d71ae5a4SJacob Faibussowitsch {
3405a001520aSPierre Jolivet   PetscFunctionBegin;
34069566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
34074222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3408a001520aSPierre Jolivet   PetscFunctionReturn(0);
3409a001520aSPierre Jolivet }
3410a001520aSPierre Jolivet 
3411*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3412*d71ae5a4SJacob Faibussowitsch {
341374eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3414f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1;
3415bcf10a7aSPierre Jolivet   const PetscScalar *xb;
341674eeabc5SPierre Jolivet   PetscScalar        x1;
341774eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
341874eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
341974eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
342074eeabc5SPierre Jolivet 
342174eeabc5SPierre Jolivet   PetscFunctionBegin;
342274eeabc5SPierre Jolivet   idx = a->j;
342374eeabc5SPierre Jolivet   v   = a->a;
342474eeabc5SPierre Jolivet   if (usecprow) {
342574eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
342674eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
342774eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
342874eeabc5SPierre Jolivet   } else {
342974eeabc5SPierre Jolivet     mbs = a->mbs;
343074eeabc5SPierre Jolivet     ii  = a->i;
343174eeabc5SPierre Jolivet     z   = c;
343274eeabc5SPierre Jolivet   }
343374eeabc5SPierre Jolivet 
343474eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
34359371c9d4SSatish Balay     n = ii[1] - ii[0];
34369371c9d4SSatish Balay     ii++;
343774eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
343874eeabc5SPierre Jolivet     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
343974eeabc5SPierre Jolivet     if (usecprow) z = c + ridx[i];
344074eeabc5SPierre Jolivet     jj = idx;
344174eeabc5SPierre Jolivet     vv = v;
344274eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
344374eeabc5SPierre Jolivet       idx  = jj;
344474eeabc5SPierre Jolivet       v    = vv;
344574eeabc5SPierre Jolivet       sum1 = 0.0;
344674eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
34479371c9d4SSatish Balay         xb = b + (*idx++);
34489371c9d4SSatish Balay         x1 = xb[0 + k * bm];
344974eeabc5SPierre Jolivet         sum1 += v[0] * x1;
345074eeabc5SPierre Jolivet         v += 1;
345174eeabc5SPierre Jolivet       }
3452feb237baSPierre Jolivet       z[0 + k * cm] = sum1;
345374eeabc5SPierre Jolivet     }
345474eeabc5SPierre Jolivet     if (!usecprow) z += 1;
345574eeabc5SPierre Jolivet   }
345674eeabc5SPierre Jolivet   PetscFunctionReturn(0);
345774eeabc5SPierre Jolivet }
345874eeabc5SPierre Jolivet 
3459*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3460*d71ae5a4SJacob Faibussowitsch {
34614b7054f4SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3462f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2;
3463bcf10a7aSPierre Jolivet   const PetscScalar *xb;
34644b7054f4SPierre Jolivet   PetscScalar        x1, x2;
34654b7054f4SPierre Jolivet   const MatScalar   *v, *vv;
34664b7054f4SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
34674b7054f4SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
34684b7054f4SPierre Jolivet 
34694b7054f4SPierre Jolivet   PetscFunctionBegin;
34704b7054f4SPierre Jolivet   idx = a->j;
34714b7054f4SPierre Jolivet   v   = a->a;
34724b7054f4SPierre Jolivet   if (usecprow) {
34734b7054f4SPierre Jolivet     mbs  = a->compressedrow.nrows;
34744b7054f4SPierre Jolivet     ii   = a->compressedrow.i;
34754b7054f4SPierre Jolivet     ridx = a->compressedrow.rindex;
34764b7054f4SPierre Jolivet   } else {
34774b7054f4SPierre Jolivet     mbs = a->mbs;
34784b7054f4SPierre Jolivet     ii  = a->i;
34794b7054f4SPierre Jolivet     z   = c;
34804b7054f4SPierre Jolivet   }
34814b7054f4SPierre Jolivet 
34824b7054f4SPierre Jolivet   for (i = 0; i < mbs; i++) {
34839371c9d4SSatish Balay     n = ii[1] - ii[0];
34849371c9d4SSatish Balay     ii++;
34854b7054f4SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
34864b7054f4SPierre Jolivet     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
34874b7054f4SPierre Jolivet     if (usecprow) z = c + 2 * ridx[i];
34884b7054f4SPierre Jolivet     jj = idx;
34894b7054f4SPierre Jolivet     vv = v;
34904b7054f4SPierre Jolivet     for (k = 0; k < cn; k++) {
34914b7054f4SPierre Jolivet       idx  = jj;
34924b7054f4SPierre Jolivet       v    = vv;
34939371c9d4SSatish Balay       sum1 = 0.0;
34949371c9d4SSatish Balay       sum2 = 0.0;
34954b7054f4SPierre Jolivet       for (j = 0; j < n; j++) {
34969371c9d4SSatish Balay         xb = b + 2 * (*idx++);
34979371c9d4SSatish Balay         x1 = xb[0 + k * bm];
34989371c9d4SSatish Balay         x2 = xb[1 + k * bm];
34994b7054f4SPierre Jolivet         sum1 += v[0] * x1 + v[2] * x2;
35004b7054f4SPierre Jolivet         sum2 += v[1] * x1 + v[3] * x2;
35014b7054f4SPierre Jolivet         v += 4;
35024b7054f4SPierre Jolivet       }
35039371c9d4SSatish Balay       z[0 + k * cm] = sum1;
35049371c9d4SSatish Balay       z[1 + k * cm] = sum2;
35054b7054f4SPierre Jolivet     }
35064b7054f4SPierre Jolivet     if (!usecprow) z += 2;
35074b7054f4SPierre Jolivet   }
35084b7054f4SPierre Jolivet   PetscFunctionReturn(0);
35094b7054f4SPierre Jolivet }
35104b7054f4SPierre Jolivet 
3511*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3512*d71ae5a4SJacob Faibussowitsch {
351374eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3514f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3;
3515bcf10a7aSPierre Jolivet   const PetscScalar *xb;
351674eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3;
351774eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
351874eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
351974eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
352074eeabc5SPierre Jolivet 
352174eeabc5SPierre Jolivet   PetscFunctionBegin;
352274eeabc5SPierre Jolivet   idx = a->j;
352374eeabc5SPierre Jolivet   v   = a->a;
352474eeabc5SPierre Jolivet   if (usecprow) {
352574eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
352674eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
352774eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
352874eeabc5SPierre Jolivet   } else {
352974eeabc5SPierre Jolivet     mbs = a->mbs;
353074eeabc5SPierre Jolivet     ii  = a->i;
353174eeabc5SPierre Jolivet     z   = c;
353274eeabc5SPierre Jolivet   }
353374eeabc5SPierre Jolivet 
353474eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
35359371c9d4SSatish Balay     n = ii[1] - ii[0];
35369371c9d4SSatish Balay     ii++;
353774eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
353874eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
353974eeabc5SPierre Jolivet     if (usecprow) z = c + 3 * ridx[i];
354074eeabc5SPierre Jolivet     jj = idx;
354174eeabc5SPierre Jolivet     vv = v;
354274eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
354374eeabc5SPierre Jolivet       idx  = jj;
354474eeabc5SPierre Jolivet       v    = vv;
35459371c9d4SSatish Balay       sum1 = 0.0;
35469371c9d4SSatish Balay       sum2 = 0.0;
35479371c9d4SSatish Balay       sum3 = 0.0;
354874eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
35499371c9d4SSatish Balay         xb = b + 3 * (*idx++);
35509371c9d4SSatish Balay         x1 = xb[0 + k * bm];
35519371c9d4SSatish Balay         x2 = xb[1 + k * bm];
35529371c9d4SSatish Balay         x3 = xb[2 + k * bm];
355374eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
355474eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
355574eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
355674eeabc5SPierre Jolivet         v += 9;
355774eeabc5SPierre Jolivet       }
35589371c9d4SSatish Balay       z[0 + k * cm] = sum1;
35599371c9d4SSatish Balay       z[1 + k * cm] = sum2;
35609371c9d4SSatish Balay       z[2 + k * cm] = sum3;
356174eeabc5SPierre Jolivet     }
356274eeabc5SPierre Jolivet     if (!usecprow) z += 3;
356374eeabc5SPierre Jolivet   }
356474eeabc5SPierre Jolivet   PetscFunctionReturn(0);
356574eeabc5SPierre Jolivet }
356674eeabc5SPierre Jolivet 
3567*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3568*d71ae5a4SJacob Faibussowitsch {
356974eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3570f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3571bcf10a7aSPierre Jolivet   const PetscScalar *xb;
357274eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3, x4;
357374eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
357474eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
357574eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
357674eeabc5SPierre Jolivet 
357774eeabc5SPierre Jolivet   PetscFunctionBegin;
357874eeabc5SPierre Jolivet   idx = a->j;
357974eeabc5SPierre Jolivet   v   = a->a;
358074eeabc5SPierre Jolivet   if (usecprow) {
358174eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
358274eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
358374eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
358474eeabc5SPierre Jolivet   } else {
358574eeabc5SPierre Jolivet     mbs = a->mbs;
358674eeabc5SPierre Jolivet     ii  = a->i;
358774eeabc5SPierre Jolivet     z   = c;
358874eeabc5SPierre Jolivet   }
358974eeabc5SPierre Jolivet 
359074eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
35919371c9d4SSatish Balay     n = ii[1] - ii[0];
35929371c9d4SSatish Balay     ii++;
359374eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
359474eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
359574eeabc5SPierre Jolivet     if (usecprow) z = c + 4 * ridx[i];
359674eeabc5SPierre Jolivet     jj = idx;
359774eeabc5SPierre Jolivet     vv = v;
359874eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
359974eeabc5SPierre Jolivet       idx  = jj;
360074eeabc5SPierre Jolivet       v    = vv;
36019371c9d4SSatish Balay       sum1 = 0.0;
36029371c9d4SSatish Balay       sum2 = 0.0;
36039371c9d4SSatish Balay       sum3 = 0.0;
36049371c9d4SSatish Balay       sum4 = 0.0;
360574eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
36069371c9d4SSatish Balay         xb = b + 4 * (*idx++);
36079371c9d4SSatish Balay         x1 = xb[0 + k * bm];
36089371c9d4SSatish Balay         x2 = xb[1 + k * bm];
36099371c9d4SSatish Balay         x3 = xb[2 + k * bm];
36109371c9d4SSatish Balay         x4 = xb[3 + k * bm];
361174eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
361274eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
361374eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
361474eeabc5SPierre Jolivet         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
361574eeabc5SPierre Jolivet         v += 16;
361674eeabc5SPierre Jolivet       }
36179371c9d4SSatish Balay       z[0 + k * cm] = sum1;
36189371c9d4SSatish Balay       z[1 + k * cm] = sum2;
36199371c9d4SSatish Balay       z[2 + k * cm] = sum3;
36209371c9d4SSatish Balay       z[3 + k * cm] = sum4;
362174eeabc5SPierre Jolivet     }
362274eeabc5SPierre Jolivet     if (!usecprow) z += 4;
362374eeabc5SPierre Jolivet   }
362474eeabc5SPierre Jolivet   PetscFunctionReturn(0);
362574eeabc5SPierre Jolivet }
362674eeabc5SPierre Jolivet 
3627*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3628*d71ae5a4SJacob Faibussowitsch {
362974eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3630f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3631bcf10a7aSPierre Jolivet   const PetscScalar *xb;
363274eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3, x4, x5;
363374eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
363474eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
363574eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
363674eeabc5SPierre Jolivet 
363774eeabc5SPierre Jolivet   PetscFunctionBegin;
363874eeabc5SPierre Jolivet   idx = a->j;
363974eeabc5SPierre Jolivet   v   = a->a;
364074eeabc5SPierre Jolivet   if (usecprow) {
364174eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
364274eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
364374eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
364474eeabc5SPierre Jolivet   } else {
364574eeabc5SPierre Jolivet     mbs = a->mbs;
364674eeabc5SPierre Jolivet     ii  = a->i;
364774eeabc5SPierre Jolivet     z   = c;
364874eeabc5SPierre Jolivet   }
364974eeabc5SPierre Jolivet 
365074eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
36519371c9d4SSatish Balay     n = ii[1] - ii[0];
36529371c9d4SSatish Balay     ii++;
365374eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
365474eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
365574eeabc5SPierre Jolivet     if (usecprow) z = c + 5 * ridx[i];
365674eeabc5SPierre Jolivet     jj = idx;
365774eeabc5SPierre Jolivet     vv = v;
365874eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
365974eeabc5SPierre Jolivet       idx  = jj;
366074eeabc5SPierre Jolivet       v    = vv;
36619371c9d4SSatish Balay       sum1 = 0.0;
36629371c9d4SSatish Balay       sum2 = 0.0;
36639371c9d4SSatish Balay       sum3 = 0.0;
36649371c9d4SSatish Balay       sum4 = 0.0;
36659371c9d4SSatish Balay       sum5 = 0.0;
366674eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
36679371c9d4SSatish Balay         xb = b + 5 * (*idx++);
36689371c9d4SSatish Balay         x1 = xb[0 + k * bm];
36699371c9d4SSatish Balay         x2 = xb[1 + k * bm];
36709371c9d4SSatish Balay         x3 = xb[2 + k * bm];
36719371c9d4SSatish Balay         x4 = xb[3 + k * bm];
36729371c9d4SSatish Balay         x5 = xb[4 + k * bm];
367374eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
367474eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
367574eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
367674eeabc5SPierre Jolivet         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
367774eeabc5SPierre Jolivet         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
367874eeabc5SPierre Jolivet         v += 25;
367974eeabc5SPierre Jolivet       }
36809371c9d4SSatish Balay       z[0 + k * cm] = sum1;
36819371c9d4SSatish Balay       z[1 + k * cm] = sum2;
36829371c9d4SSatish Balay       z[2 + k * cm] = sum3;
36839371c9d4SSatish Balay       z[3 + k * cm] = sum4;
36849371c9d4SSatish Balay       z[4 + k * cm] = sum5;
368574eeabc5SPierre Jolivet     }
368674eeabc5SPierre Jolivet     if (!usecprow) z += 5;
368774eeabc5SPierre Jolivet   }
368874eeabc5SPierre Jolivet   PetscFunctionReturn(0);
368974eeabc5SPierre Jolivet }
369074eeabc5SPierre Jolivet 
3691*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3692*d71ae5a4SJacob Faibussowitsch {
3693a001520aSPierre Jolivet   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3694a001520aSPierre Jolivet   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3695910cf402Sprj-   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3696bcf10a7aSPierre Jolivet   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3697a001520aSPierre Jolivet   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3698a001520aSPierre Jolivet   PetscBLASInt     bbs, bcn, bbm, bcm;
3699f4259b30SLisandro Dalcin   PetscScalar     *z = NULL;
3700a001520aSPierre Jolivet   PetscScalar     *c, *b;
3701a001520aSPierre Jolivet   const MatScalar *v;
3702a001520aSPierre Jolivet   const PetscInt  *idx, *ii, *ridx = NULL;
37034b7054f4SPierre Jolivet   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3704a001520aSPierre Jolivet   PetscBool        usecprow = a->compressedrow.use;
3705a001520aSPierre Jolivet 
3706a001520aSPierre Jolivet   PetscFunctionBegin;
3707a001520aSPierre Jolivet   if (!cm || !cn) PetscFunctionReturn(0);
370808401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
370908401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
371008401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3711a001520aSPierre Jolivet   b = bd->v;
371248a46eb9SPierre Jolivet   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
37139566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C, &c));
371474eeabc5SPierre Jolivet   switch (bs) {
3715*d71ae5a4SJacob Faibussowitsch   case 1:
3716*d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3717*d71ae5a4SJacob Faibussowitsch     break;
3718*d71ae5a4SJacob Faibussowitsch   case 2:
3719*d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3720*d71ae5a4SJacob Faibussowitsch     break;
3721*d71ae5a4SJacob Faibussowitsch   case 3:
3722*d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3723*d71ae5a4SJacob Faibussowitsch     break;
3724*d71ae5a4SJacob Faibussowitsch   case 4:
3725*d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3726*d71ae5a4SJacob Faibussowitsch     break;
3727*d71ae5a4SJacob Faibussowitsch   case 5:
3728*d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3729*d71ae5a4SJacob Faibussowitsch     break;
373074eeabc5SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
37319566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs, &bbs));
37329566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn, &bcn));
37339566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm, &bbm));
37349566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm, &bcm));
3735a001520aSPierre Jolivet     idx = a->j;
3736a001520aSPierre Jolivet     v   = a->a;
3737a001520aSPierre Jolivet     if (usecprow) {
3738a001520aSPierre Jolivet       mbs  = a->compressedrow.nrows;
3739a001520aSPierre Jolivet       ii   = a->compressedrow.i;
3740a001520aSPierre Jolivet       ridx = a->compressedrow.rindex;
3741a001520aSPierre Jolivet     } else {
3742a001520aSPierre Jolivet       mbs = a->mbs;
3743a001520aSPierre Jolivet       ii  = a->i;
3744a001520aSPierre Jolivet       z   = c;
3745a001520aSPierre Jolivet     }
3746a001520aSPierre Jolivet     for (i = 0; i < mbs; i++) {
37479371c9d4SSatish Balay       n = ii[1] - ii[0];
37489371c9d4SSatish Balay       ii++;
3749a001520aSPierre Jolivet       if (usecprow) z = c + bs * ridx[i];
37504b7054f4SPierre Jolivet       if (n) {
3751792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
37524b7054f4SPierre Jolivet         v += bs2;
37534b7054f4SPierre Jolivet       }
37544b7054f4SPierre Jolivet       for (j = 1; j < n; j++) {
3755792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3756a001520aSPierre Jolivet         v += bs2;
3757a001520aSPierre Jolivet       }
3758a001520aSPierre Jolivet       if (!usecprow) z += bs;
3759a001520aSPierre Jolivet     }
37604b7054f4SPierre Jolivet   }
37619566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C, &c));
37629566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3763a001520aSPierre Jolivet   PetscFunctionReturn(0);
3764a001520aSPierre Jolivet }
3765