xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 3ba1676111f5c958fe6c2729b46ca4d523958bb3)
1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <petscblaslapack.h>
6cac129eeSSatish Balay 
75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
896e086a2SDaniel Kokron   #include <immintrin.h>
996e086a2SDaniel Kokron #endif
1096e086a2SDaniel Kokron 
11d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
12d71ae5a4SJacob Faibussowitsch {
13a3192f15SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
145d0c19d7SBarry Smith   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
155d0c19d7SBarry Smith   const PetscInt *idx;
167bede89fSBarry Smith   PetscInt        start, end, *ai, *aj, bs;
17f1af5d2fSBarry Smith   PetscBT         table;
18a3192f15SSatish Balay 
193a40ed3dSBarry Smith   PetscFunctionBegin;
20a3192f15SSatish Balay   m  = a->mbs;
21a3192f15SSatish Balay   ai = a->i;
22a3192f15SSatish Balay   aj = a->j;
23d0f46423SBarry Smith   bs = A->rmap->bs;
24a3192f15SSatish Balay 
2508401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
26a3192f15SSatish Balay 
279566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(m, &table));
289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &nidx));
29a3192f15SSatish Balay 
30a3192f15SSatish Balay   for (i = 0; i < is_max; i++) {
31a3192f15SSatish Balay     /* Initialise the two local arrays */
32a3192f15SSatish Balay     isz = 0;
339566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(m, table));
34a3192f15SSatish Balay 
35a3192f15SSatish Balay     /* Extract the indices, assume there can be duplicate entries */
369566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx));
379566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n));
38a3192f15SSatish Balay 
39a3192f15SSatish Balay     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40a3192f15SSatish Balay     for (j = 0; j < n; ++j) {
41218c64b6SSatish Balay       ival = idx[j] / bs; /* convert the indices into block indices */
4208401ef6SPierre Jolivet       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
4326fbe8dcSKarl Rupp       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44a3192f15SSatish Balay     }
459566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx));
469566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
47a3192f15SSatish Balay 
48a3192f15SSatish Balay     k = 0;
49a3192f15SSatish Balay     for (j = 0; j < ov; j++) { /* for each overlap*/
50a3192f15SSatish Balay       n = isz;
51a3192f15SSatish Balay       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52a3192f15SSatish Balay         row   = nidx[k];
53a3192f15SSatish Balay         start = ai[row];
54a3192f15SSatish Balay         end   = ai[row + 1];
55a3192f15SSatish Balay         for (l = start; l < end; l++) {
56a3192f15SSatish Balay           val = aj[l];
5726fbe8dcSKarl Rupp           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58a3192f15SSatish Balay         }
59a3192f15SSatish Balay       }
60a3192f15SSatish Balay     }
617bede89fSBarry Smith     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
62a3192f15SSatish Balay   }
639566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table));
649566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
65*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
66a3192f15SSatish Balay }
671c351548SSatish Balay 
68d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
69d71ae5a4SJacob Faibussowitsch {
70736121d4SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
71690b6cddSBarry Smith   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
72690b6cddSBarry Smith   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
735d0c19d7SBarry Smith   const PetscInt *irow, *icol;
745d0c19d7SBarry Smith   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
75690b6cddSBarry Smith   PetscInt       *aj = a->j, *ai = a->i;
763f1db9ecSBarry Smith   MatScalar      *mat_a;
77736121d4SSatish Balay   Mat             C;
786041f1b1SToby Isaac   PetscBool       flag;
79736121d4SSatish Balay 
803a40ed3dSBarry Smith   PetscFunctionBegin;
819566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
829566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
839566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
849566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
85736121d4SSatish Balay 
869566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1 + oldcols, &smap));
87736121d4SSatish Balay   ssmap = smap;
889566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1 + nrows, &lens));
89736121d4SSatish Balay   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
90736121d4SSatish Balay   /* determine lens of each row */
91736121d4SSatish Balay   for (i = 0; i < nrows; i++) {
92736121d4SSatish Balay     kstart  = ai[irow[i]];
93736121d4SSatish Balay     kend    = kstart + a->ilen[irow[i]];
94736121d4SSatish Balay     lens[i] = 0;
95736121d4SSatish Balay     for (k = kstart; k < kend; k++) {
9626fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
97736121d4SSatish Balay     }
98736121d4SSatish Balay   }
99736121d4SSatish Balay   /* Create and fill new matrix */
100736121d4SSatish Balay   if (scall == MAT_REUSE_MATRIX) {
101736121d4SSatish Balay     c = (Mat_SeqBAIJ *)((*B)->data);
102736121d4SSatish Balay 
103aed4548fSBarry Smith     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
1049566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
10528b400f6SJacob Faibussowitsch     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
1069566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(c->ilen, c->mbs));
107736121d4SSatish Balay     C = *B;
1083a40ed3dSBarry Smith   } else {
1099566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1109566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
1119566063dSJacob Faibussowitsch     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1129566063dSJacob Faibussowitsch     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
113736121d4SSatish Balay   }
114736121d4SSatish Balay   c = (Mat_SeqBAIJ *)(C->data);
115736121d4SSatish Balay   for (i = 0; i < nrows; i++) {
116736121d4SSatish Balay     row      = irow[i];
117736121d4SSatish Balay     kstart   = ai[row];
118736121d4SSatish Balay     kend     = kstart + a->ilen[row];
119736121d4SSatish Balay     mat_i    = c->i[i];
120d29f2997SMatthew Woehlke     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
121d29f2997SMatthew Woehlke     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
122736121d4SSatish Balay     mat_ilen = c->ilen + i;
123736121d4SSatish Balay     for (k = kstart; k < kend; k++) {
124736121d4SSatish Balay       if ((tcol = ssmap[a->j[k]])) {
125736121d4SSatish Balay         *mat_j++ = tcol - 1;
1269566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
127549d3d68SSatish Balay         mat_a += bs2;
128736121d4SSatish Balay         (*mat_ilen)++;
129736121d4SSatish Balay       }
130736121d4SSatish Balay     }
131736121d4SSatish Balay   }
132cdc6f3adSToby Isaac   /* sort */
133d29f2997SMatthew Woehlke   if (c->j && c->a) {
134cdc6f3adSToby Isaac     MatScalar *work;
1359566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2, &work));
136cdc6f3adSToby Isaac     for (i = 0; i < nrows; i++) {
137cdc6f3adSToby Isaac       PetscInt ilen;
138cdc6f3adSToby Isaac       mat_i = c->i[i];
139cdc6f3adSToby Isaac       mat_j = c->j + mat_i;
140cdc6f3adSToby Isaac       mat_a = c->a + mat_i * bs2;
141cdc6f3adSToby Isaac       ilen  = c->ilen[i];
1429566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
143cdc6f3adSToby Isaac     }
1449566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
145cdc6f3adSToby Isaac   }
146218c64b6SSatish Balay 
147736121d4SSatish Balay   /* Free work space */
1489566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
1499566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
1509566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
1519566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1529566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
153736121d4SSatish Balay 
1549566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
155736121d4SSatish Balay   *B = C;
156*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
157736121d4SSatish Balay }
158736121d4SSatish Balay 
159d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
160d71ae5a4SJacob Faibussowitsch {
161218c64b6SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
162218c64b6SSatish Balay   IS              is1, is2;
163afebec48SHong Zhang   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
1645d0c19d7SBarry Smith   const PetscInt *irow, *icol;
165218c64b6SSatish Balay 
1663a40ed3dSBarry Smith   PetscFunctionBegin;
1679566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
1689566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
1699566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
1709566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
171218c64b6SSatish Balay 
172218c64b6SSatish Balay   /* Verify if the indices corespond to each element in a block
173218c64b6SSatish Balay    and form the IS with compressed IS */
174f8ecb639SStefano Zampini   maxmnbs = PetscMax(a->mbs, a->nbs);
1759566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
1769566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->mbs));
177218c64b6SSatish Balay   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
178ad540459SPierre Jolivet   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
1796041f1b1SToby Isaac   count = 0;
1806041f1b1SToby Isaac   for (i = 0; i < nrows; i++) {
181afebec48SHong Zhang     j = irow[i] / bs;
1826041f1b1SToby Isaac     if ((vary[j]--) == bs) iary[count++] = j;
183218c64b6SSatish Balay   }
1849566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
185218c64b6SSatish Balay 
1869566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary, a->nbs));
187218c64b6SSatish Balay   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
188ad540459SPierre Jolivet   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
1896041f1b1SToby Isaac   count = 0;
1906041f1b1SToby Isaac   for (i = 0; i < ncols; i++) {
191afebec48SHong Zhang     j = icol[i] / bs;
1926041f1b1SToby Isaac     if ((vary[j]--) == bs) iary[count++] = j;
1936041f1b1SToby Isaac   }
1949566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
1959566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
1969566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
1979566063dSJacob Faibussowitsch   PetscCall(PetscFree2(vary, iary));
198218c64b6SSatish Balay 
1999566063dSJacob Faibussowitsch   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
2009566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2019566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
202*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
203218c64b6SSatish Balay }
204218c64b6SSatish Balay 
205d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
206d71ae5a4SJacob Faibussowitsch {
20716b64355SHong Zhang   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
2085c39f6d9SHong Zhang   Mat_SubSppt *submatj = c->submatis1;
20916b64355SHong Zhang 
21016b64355SHong Zhang   PetscFunctionBegin;
2119566063dSJacob Faibussowitsch   PetscCall((*submatj->destroy)(C));
2129566063dSJacob Faibussowitsch   PetscCall(MatDestroySubMatrix_Private(submatj));
213*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21416b64355SHong Zhang }
21516b64355SHong Zhang 
21689a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
217d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
218d71ae5a4SJacob Faibussowitsch {
21986e85357SHong Zhang   PetscInt     i;
22086e85357SHong Zhang   Mat          C;
22186e85357SHong Zhang   Mat_SeqBAIJ *c;
22286e85357SHong Zhang   Mat_SubSppt *submatj;
22386e85357SHong Zhang 
22486e85357SHong Zhang   PetscFunctionBegin;
22586e85357SHong Zhang   for (i = 0; i < n; i++) {
22686e85357SHong Zhang     C       = (*mat)[i];
22786e85357SHong Zhang     c       = (Mat_SeqBAIJ *)C->data;
22886e85357SHong Zhang     submatj = c->submatis1;
22986e85357SHong Zhang     if (submatj) {
2307daefbafSJunchao Zhang       if (--((PetscObject)C)->refct <= 0) {
23126cc229bSBarry Smith         PetscCall(PetscFree(C->factorprefix));
2329566063dSJacob Faibussowitsch         PetscCall((*submatj->destroy)(C));
2339566063dSJacob Faibussowitsch         PetscCall(MatDestroySubMatrix_Private(submatj));
2349566063dSJacob Faibussowitsch         PetscCall(PetscFree(C->defaultvectype));
2353faff063SStefano Zampini         PetscCall(PetscFree(C->defaultrandtype));
2369566063dSJacob Faibussowitsch         PetscCall(PetscLayoutDestroy(&C->rmap));
2379566063dSJacob Faibussowitsch         PetscCall(PetscLayoutDestroy(&C->cmap));
2389566063dSJacob Faibussowitsch         PetscCall(PetscHeaderDestroy(&C));
2397daefbafSJunchao Zhang       }
24086e85357SHong Zhang     } else {
2419566063dSJacob Faibussowitsch       PetscCall(MatDestroy(&C));
24286e85357SHong Zhang     }
24386e85357SHong Zhang   }
2447daefbafSJunchao Zhang 
2457daefbafSJunchao Zhang   /* Destroy Dummy submatrices created for reuse */
2469566063dSJacob Faibussowitsch   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
2477daefbafSJunchao Zhang 
2489566063dSJacob Faibussowitsch   PetscCall(PetscFree(*mat));
249*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
25086e85357SHong Zhang }
25186e85357SHong Zhang 
252d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
253d71ae5a4SJacob Faibussowitsch {
254690b6cddSBarry Smith   PetscInt i;
255736121d4SSatish Balay 
2563a40ed3dSBarry Smith   PetscFunctionBegin;
25748a46eb9SPierre Jolivet   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
258736121d4SSatish Balay 
25948a46eb9SPierre Jolivet   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
260*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
261736121d4SSatish Balay }
262218c64b6SSatish Balay 
2632d61bbb3SSatish Balay /* -------------------------------------------------------*/
2642d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */
2652d61bbb3SSatish Balay /* -------------------------------------------------------*/
2662d61bbb3SSatish Balay 
267d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
268d71ae5a4SJacob Faibussowitsch {
2692d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
270d9fead3dSBarry Smith   PetscScalar       *z, sum;
271d9fead3dSBarry Smith   const PetscScalar *x;
272d9fead3dSBarry Smith   const MatScalar   *v;
2737c565772SBarry Smith   PetscInt           mbs, i, n;
2740298fd71SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
275ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2762d61bbb3SSatish Balay 
2772d61bbb3SSatish Balay   PetscFunctionBegin;
2789566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
2799566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &z));
2802d61bbb3SSatish Balay 
28126e093fcSHong Zhang   if (usecprow) {
28226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
28326e093fcSHong Zhang     ii   = a->compressedrow.i;
2847b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
2859566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(z, a->mbs));
28626e093fcSHong Zhang   } else {
28726e093fcSHong Zhang     mbs = a->mbs;
2882d61bbb3SSatish Balay     ii  = a->i;
28926e093fcSHong Zhang   }
2902d61bbb3SSatish Balay 
2912d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
292ee54c7eeSHong Zhang     n   = ii[1] - ii[0];
293ee54c7eeSHong Zhang     v   = a->a + ii[0];
294ee54c7eeSHong Zhang     idx = a->j + ii[0];
295ee54c7eeSHong Zhang     ii++;
296444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
297444d8c10SJed Brown     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2982d61bbb3SSatish Balay     sum = 0.0;
2992162cab8SBarry Smith     PetscSparseDensePlusDot(sum, x, v, idx, n);
30026e093fcSHong Zhang     if (usecprow) {
3017b2bb3b9SHong Zhang       z[ridx[i]] = sum;
30226e093fcSHong Zhang     } else {
3032d61bbb3SSatish Balay       z[i] = sum;
3042d61bbb3SSatish Balay     }
30526e093fcSHong Zhang   }
3069566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3079566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &z));
3089566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
309*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3102d61bbb3SSatish Balay }
3112d61bbb3SSatish Balay 
312d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
313d71ae5a4SJacob Faibussowitsch {
3142d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
315f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, *zarray;
316d9fead3dSBarry Smith   const PetscScalar *x, *xb;
31787828ca2SBarry Smith   PetscScalar        x1, x2;
318d9fead3dSBarry Smith   const MatScalar   *v;
3197c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
320ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
3212d61bbb3SSatish Balay 
3222d61bbb3SSatish Balay   PetscFunctionBegin;
3239566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3249566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
3252d61bbb3SSatish Balay 
3262d61bbb3SSatish Balay   idx = a->j;
3272d61bbb3SSatish Balay   v   = a->a;
32826e093fcSHong Zhang   if (usecprow) {
32926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
33026e093fcSHong Zhang     ii   = a->compressedrow.i;
3317b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
3329566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
33326e093fcSHong Zhang   } else {
33426e093fcSHong Zhang     mbs = a->mbs;
3352d61bbb3SSatish Balay     ii  = a->i;
33626e093fcSHong Zhang     z   = zarray;
33726e093fcSHong Zhang   }
3382d61bbb3SSatish Balay 
3392d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
3409371c9d4SSatish Balay     n = ii[1] - ii[0];
3419371c9d4SSatish Balay     ii++;
3429371c9d4SSatish Balay     sum1 = 0.0;
3439371c9d4SSatish Balay     sum2 = 0.0;
344444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
345444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3462d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
3479371c9d4SSatish Balay       xb = x + 2 * (*idx++);
3489371c9d4SSatish Balay       x1 = xb[0];
3499371c9d4SSatish Balay       x2 = xb[1];
3502d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[2] * x2;
3512d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[3] * x2;
3522d61bbb3SSatish Balay       v += 4;
3532d61bbb3SSatish Balay     }
3547b2bb3b9SHong Zhang     if (usecprow) z = zarray + 2 * ridx[i];
3559371c9d4SSatish Balay     z[0] = sum1;
3569371c9d4SSatish Balay     z[1] = sum2;
35726e093fcSHong Zhang     if (!usecprow) z += 2;
3582d61bbb3SSatish Balay   }
3599566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3609566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
3619566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
362*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3632d61bbb3SSatish Balay }
3642d61bbb3SSatish Balay 
365d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
366d71ae5a4SJacob Faibussowitsch {
3672d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
368f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
369d9fead3dSBarry Smith   const PetscScalar *x, *xb;
370d9fead3dSBarry Smith   const MatScalar   *v;
3717c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
372ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
37326e093fcSHong Zhang 
374b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
375fee21e36SBarry Smith   #pragma disjoint(*v, *z, *xb)
376fee21e36SBarry Smith #endif
377fee21e36SBarry Smith 
3782d61bbb3SSatish Balay   PetscFunctionBegin;
3799566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3809566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
3812d61bbb3SSatish Balay 
3822d61bbb3SSatish Balay   idx = a->j;
3832d61bbb3SSatish Balay   v   = a->a;
38426e093fcSHong Zhang   if (usecprow) {
38526e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
38626e093fcSHong Zhang     ii   = a->compressedrow.i;
3877b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
3889566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
38926e093fcSHong Zhang   } else {
39026e093fcSHong Zhang     mbs = a->mbs;
3912d61bbb3SSatish Balay     ii  = a->i;
39226e093fcSHong Zhang     z   = zarray;
39326e093fcSHong Zhang   }
3942d61bbb3SSatish Balay 
3952d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
3969371c9d4SSatish Balay     n = ii[1] - ii[0];
3979371c9d4SSatish Balay     ii++;
3989371c9d4SSatish Balay     sum1 = 0.0;
3999371c9d4SSatish Balay     sum2 = 0.0;
4009371c9d4SSatish Balay     sum3 = 0.0;
401444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
402444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4032d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
40426fbe8dcSKarl Rupp       xb = x + 3 * (*idx++);
40526fbe8dcSKarl Rupp       x1 = xb[0];
40626fbe8dcSKarl Rupp       x2 = xb[1];
40726fbe8dcSKarl Rupp       x3 = xb[2];
40826fbe8dcSKarl Rupp 
4092d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
4102d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
4112d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
4122d61bbb3SSatish Balay       v += 9;
4132d61bbb3SSatish Balay     }
4147b2bb3b9SHong Zhang     if (usecprow) z = zarray + 3 * ridx[i];
4159371c9d4SSatish Balay     z[0] = sum1;
4169371c9d4SSatish Balay     z[1] = sum2;
4179371c9d4SSatish Balay     z[2] = sum3;
41826e093fcSHong Zhang     if (!usecprow) z += 3;
4192d61bbb3SSatish Balay   }
4209566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4219566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
4229566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
423*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4242d61bbb3SSatish Balay }
4252d61bbb3SSatish Balay 
426d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
427d71ae5a4SJacob Faibussowitsch {
4282d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
429f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
430d9fead3dSBarry Smith   const PetscScalar *x, *xb;
431d9fead3dSBarry Smith   const MatScalar   *v;
4327c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
433ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
4342d61bbb3SSatish Balay 
4352d61bbb3SSatish Balay   PetscFunctionBegin;
4369566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4379566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
4382d61bbb3SSatish Balay 
4392d61bbb3SSatish Balay   idx = a->j;
4402d61bbb3SSatish Balay   v   = a->a;
44126e093fcSHong Zhang   if (usecprow) {
44226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
44326e093fcSHong Zhang     ii   = a->compressedrow.i;
4447b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
4459566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
44626e093fcSHong Zhang   } else {
44726e093fcSHong Zhang     mbs = a->mbs;
4482d61bbb3SSatish Balay     ii  = a->i;
44926e093fcSHong Zhang     z   = zarray;
45026e093fcSHong Zhang   }
4512d61bbb3SSatish Balay 
4522d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
45326fbe8dcSKarl Rupp     n = ii[1] - ii[0];
45426fbe8dcSKarl Rupp     ii++;
45526fbe8dcSKarl Rupp     sum1 = 0.0;
45626fbe8dcSKarl Rupp     sum2 = 0.0;
45726fbe8dcSKarl Rupp     sum3 = 0.0;
45826fbe8dcSKarl Rupp     sum4 = 0.0;
45926fbe8dcSKarl Rupp 
460444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
461444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4622d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
4632d61bbb3SSatish Balay       xb = x + 4 * (*idx++);
4649371c9d4SSatish Balay       x1 = xb[0];
4659371c9d4SSatish Balay       x2 = xb[1];
4669371c9d4SSatish Balay       x3 = xb[2];
4679371c9d4SSatish Balay       x4 = xb[3];
4682d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
4692d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
4702d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
4712d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
4722d61bbb3SSatish Balay       v += 16;
4732d61bbb3SSatish Balay     }
4747b2bb3b9SHong Zhang     if (usecprow) z = zarray + 4 * ridx[i];
4759371c9d4SSatish Balay     z[0] = sum1;
4769371c9d4SSatish Balay     z[1] = sum2;
4779371c9d4SSatish Balay     z[2] = sum3;
4789371c9d4SSatish Balay     z[3] = sum4;
47926e093fcSHong Zhang     if (!usecprow) z += 4;
4802d61bbb3SSatish Balay   }
4819566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4829566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
4839566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
484*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4852d61bbb3SSatish Balay }
4862d61bbb3SSatish Balay 
487d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
488d71ae5a4SJacob Faibussowitsch {
4892d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
490f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
491d9fead3dSBarry Smith   const PetscScalar *xb, *x;
492d9fead3dSBarry Smith   const MatScalar   *v;
4930298fd71SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
4947c565772SBarry Smith   PetscInt           mbs, i, j, n;
495ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
4962d61bbb3SSatish Balay 
497433994e6SBarry Smith   PetscFunctionBegin;
4989566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4999566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
5002d61bbb3SSatish Balay 
5012d61bbb3SSatish Balay   idx = a->j;
5022d61bbb3SSatish Balay   v   = a->a;
50326e093fcSHong Zhang   if (usecprow) {
50426e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
50526e093fcSHong Zhang     ii   = a->compressedrow.i;
5067b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
5079566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
50826e093fcSHong Zhang   } else {
50926e093fcSHong Zhang     mbs = a->mbs;
5102d61bbb3SSatish Balay     ii  = a->i;
51126e093fcSHong Zhang     z   = zarray;
51226e093fcSHong Zhang   }
5132d61bbb3SSatish Balay 
5142d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
5159371c9d4SSatish Balay     n = ii[1] - ii[0];
5169371c9d4SSatish Balay     ii++;
5179371c9d4SSatish Balay     sum1 = 0.0;
5189371c9d4SSatish Balay     sum2 = 0.0;
5199371c9d4SSatish Balay     sum3 = 0.0;
5209371c9d4SSatish Balay     sum4 = 0.0;
5219371c9d4SSatish Balay     sum5 = 0.0;
522444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
523444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
5242d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
5252d61bbb3SSatish Balay       xb = x + 5 * (*idx++);
5269371c9d4SSatish Balay       x1 = xb[0];
5279371c9d4SSatish Balay       x2 = xb[1];
5289371c9d4SSatish Balay       x3 = xb[2];
5299371c9d4SSatish Balay       x4 = xb[3];
5309371c9d4SSatish Balay       x5 = xb[4];
5312d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
5322d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
5332d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
5342d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
5352d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
5362d61bbb3SSatish Balay       v += 25;
5372d61bbb3SSatish Balay     }
5387b2bb3b9SHong Zhang     if (usecprow) z = zarray + 5 * ridx[i];
5399371c9d4SSatish Balay     z[0] = sum1;
5409371c9d4SSatish Balay     z[1] = sum2;
5419371c9d4SSatish Balay     z[2] = sum3;
5429371c9d4SSatish Balay     z[3] = sum4;
5439371c9d4SSatish Balay     z[4] = sum5;
54426e093fcSHong Zhang     if (!usecprow) z += 5;
5452d61bbb3SSatish Balay   }
5469566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5479566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
5489566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
549*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5502d61bbb3SSatish Balay }
5512d61bbb3SSatish Balay 
552d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
553d71ae5a4SJacob Faibussowitsch {
55415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
555f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
556d9fead3dSBarry Smith   const PetscScalar *x, *xb;
55726e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
558d9fead3dSBarry Smith   const MatScalar   *v;
5597c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
560ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
56115091d37SBarry Smith 
562433994e6SBarry Smith   PetscFunctionBegin;
5639566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5649566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
56515091d37SBarry Smith 
56615091d37SBarry Smith   idx = a->j;
56715091d37SBarry Smith   v   = a->a;
56826e093fcSHong Zhang   if (usecprow) {
56926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
57026e093fcSHong Zhang     ii   = a->compressedrow.i;
5717b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
5729566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
57326e093fcSHong Zhang   } else {
57426e093fcSHong Zhang     mbs = a->mbs;
57515091d37SBarry Smith     ii  = a->i;
57626e093fcSHong Zhang     z   = zarray;
57726e093fcSHong Zhang   }
57815091d37SBarry Smith 
57915091d37SBarry Smith   for (i = 0; i < mbs; i++) {
58026fbe8dcSKarl Rupp     n = ii[1] - ii[0];
58126fbe8dcSKarl Rupp     ii++;
58226fbe8dcSKarl Rupp     sum1 = 0.0;
58326fbe8dcSKarl Rupp     sum2 = 0.0;
58426fbe8dcSKarl Rupp     sum3 = 0.0;
58526fbe8dcSKarl Rupp     sum4 = 0.0;
58626fbe8dcSKarl Rupp     sum5 = 0.0;
58726fbe8dcSKarl Rupp     sum6 = 0.0;
58826fbe8dcSKarl Rupp 
589444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
590444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
59115091d37SBarry Smith     for (j = 0; j < n; j++) {
59215091d37SBarry Smith       xb = x + 6 * (*idx++);
5939371c9d4SSatish Balay       x1 = xb[0];
5949371c9d4SSatish Balay       x2 = xb[1];
5959371c9d4SSatish Balay       x3 = xb[2];
5969371c9d4SSatish Balay       x4 = xb[3];
5979371c9d4SSatish Balay       x5 = xb[4];
5989371c9d4SSatish Balay       x6 = xb[5];
59915091d37SBarry Smith       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
60015091d37SBarry Smith       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
60115091d37SBarry Smith       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
60215091d37SBarry Smith       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
60315091d37SBarry Smith       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
60415091d37SBarry Smith       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
60515091d37SBarry Smith       v += 36;
60615091d37SBarry Smith     }
6077b2bb3b9SHong Zhang     if (usecprow) z = zarray + 6 * ridx[i];
6089371c9d4SSatish Balay     z[0] = sum1;
6099371c9d4SSatish Balay     z[1] = sum2;
6109371c9d4SSatish Balay     z[2] = sum3;
6119371c9d4SSatish Balay     z[3] = sum4;
6129371c9d4SSatish Balay     z[4] = sum5;
6139371c9d4SSatish Balay     z[5] = sum6;
61426e093fcSHong Zhang     if (!usecprow) z += 6;
61515091d37SBarry Smith   }
61615091d37SBarry Smith 
6179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
6199566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
620*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
62115091d37SBarry Smith }
6228ab949d8SShri Abhyankar 
623d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
624d71ae5a4SJacob Faibussowitsch {
6252d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
626f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
627d9fead3dSBarry Smith   const PetscScalar *x, *xb;
62826e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
629d9fead3dSBarry Smith   const MatScalar   *v;
6307c565772SBarry Smith   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
631ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
6322d61bbb3SSatish Balay 
633433994e6SBarry Smith   PetscFunctionBegin;
6349566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6359566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
6362d61bbb3SSatish Balay 
6372d61bbb3SSatish Balay   idx = a->j;
6382d61bbb3SSatish Balay   v   = a->a;
63926e093fcSHong Zhang   if (usecprow) {
64026e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
64126e093fcSHong Zhang     ii   = a->compressedrow.i;
6427b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
6439566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
64426e093fcSHong Zhang   } else {
64526e093fcSHong Zhang     mbs = a->mbs;
6462d61bbb3SSatish Balay     ii  = a->i;
64726e093fcSHong Zhang     z   = zarray;
64826e093fcSHong Zhang   }
6492d61bbb3SSatish Balay 
6502d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
65126fbe8dcSKarl Rupp     n = ii[1] - ii[0];
65226fbe8dcSKarl Rupp     ii++;
65326fbe8dcSKarl Rupp     sum1 = 0.0;
65426fbe8dcSKarl Rupp     sum2 = 0.0;
65526fbe8dcSKarl Rupp     sum3 = 0.0;
65626fbe8dcSKarl Rupp     sum4 = 0.0;
65726fbe8dcSKarl Rupp     sum5 = 0.0;
65826fbe8dcSKarl Rupp     sum6 = 0.0;
65926fbe8dcSKarl Rupp     sum7 = 0.0;
66026fbe8dcSKarl Rupp 
661444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
662444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
6632d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
6642d61bbb3SSatish Balay       xb = x + 7 * (*idx++);
6659371c9d4SSatish Balay       x1 = xb[0];
6669371c9d4SSatish Balay       x2 = xb[1];
6679371c9d4SSatish Balay       x3 = xb[2];
6689371c9d4SSatish Balay       x4 = xb[3];
6699371c9d4SSatish Balay       x5 = xb[4];
6709371c9d4SSatish Balay       x6 = xb[5];
6719371c9d4SSatish Balay       x7 = xb[6];
6722d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
6732d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
6742d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
6752d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
6762d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
6772d61bbb3SSatish Balay       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
6782d61bbb3SSatish Balay       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
6792d61bbb3SSatish Balay       v += 49;
6802d61bbb3SSatish Balay     }
6817b2bb3b9SHong Zhang     if (usecprow) z = zarray + 7 * ridx[i];
6829371c9d4SSatish Balay     z[0] = sum1;
6839371c9d4SSatish Balay     z[1] = sum2;
6849371c9d4SSatish Balay     z[2] = sum3;
6859371c9d4SSatish Balay     z[3] = sum4;
6869371c9d4SSatish Balay     z[4] = sum5;
6879371c9d4SSatish Balay     z[5] = sum6;
6889371c9d4SSatish Balay     z[6] = sum7;
68926e093fcSHong Zhang     if (!usecprow) z += 7;
6902d61bbb3SSatish Balay   }
6912d61bbb3SSatish Balay 
6929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
6949566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
695*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6962d61bbb3SSatish Balay }
6972d61bbb3SSatish Balay 
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
/*
  MatMult_SeqBAIJ_9_AVX2 - Computes zz = A * xx for a SeqBAIJ matrix using AVX2/FMA intrinsics.

  Input Parameters:
+ A  - the matrix; A->data is read as a Mat_SeqBAIJ
- xx - the vector to multiply

  Output Parameter:
. zz - the product; its array is obtained write-only

  Notes:
  The kernel hard-codes strides of 9 (x sub-vector) and 81 (block) while the gather loop, the
  output addressing and the flop count use bs and bs2.
  NOTE(review): this presumes bs == 9 and bs2 == 81 - confirm at the call site.

  Each block is read as 9 columns of 9 consecutive values. A column is consumed with three
  4-wide loads at offsets 0, 4 and 8; the third load reaches 3 values past the end of the column,
  so lanes 1-3 of the z2 accumulator hold unrelated products and only lane 0 is stored. For the
  last column of a block that third load is masked to lane 0 so nothing past the block is read.

  a->mult_work is allocated on first use with max(A->rmap->n, A->cmap->n) + 1 entries and holds
  the gathered x sub-vectors of the current block row.

  When compressed-row storage is in use, only the block rows listed in compressedrow.rindex are
  computed; the whole output array is zeroed first.
*/
PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *z = NULL, *work, *workt, *zarray;
  const PetscScalar *x, *xb;
  const MatScalar   *v;
  PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
  const PetscInt    *idx, *ii, *ridx = NULL;
  PetscInt           k;
  PetscBool          usecprow = a->compressedrow.use;

  __m256d a0, a1, a2, a3, a4, a5;
  __m256d w0, w1, w2, w3;
  __m256d z0, z1, z2;
  /*
    Select lane 0 only. The masked load/store test just the most significant bit of each 64-bit
    lane, so an all-ones lane is equivalent to a lone sign bit; writing it as -1LL avoids
    1LL << 63, which shifts into the sign bit of a signed type and is undefined behavior in C.
  */
  __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, -1LL);

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
    PetscCall(PetscArrayzero(zarray, bs * a->mbs));
  } else {
    mbs = a->mbs;
    ii  = a->i;
    z   = zarray;
  }

  /* lazily create the gather buffer shared across calls */
  if (!a->mult_work) {
    k = PetscMax(A->rmap->n, A->cmap->n);
    PetscCall(PetscMalloc1(k + 1, &a->mult_work));
  }

  work = a->mult_work;
  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;
    /* gather the x sub-vectors of this row's block columns back to back into work */
    workt = work;
    for (j = 0; j < n; j++) {
      xb = x + bs * (*idx++);
      for (k = 0; k < bs; k++) workt[k] = xb[k];
      workt += bs;
    }
    if (usecprow) z = zarray + bs * ridx[i];

    /* z0 accumulates rows 0-3, z1 rows 4-7, lane 0 of z2 row 8 */
    z0 = _mm256_setzero_pd();
    z1 = _mm256_setzero_pd();
    z2 = _mm256_setzero_pd();

    for (j = 0; j < n; j++) {
      /* first column of a */
      w0 = _mm256_set1_pd(work[j * 9]);
      a0 = _mm256_loadu_pd(&v[j * 81]);
      z0 = _mm256_fmadd_pd(a0, w0, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
      z1 = _mm256_fmadd_pd(a1, w0, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
      z2 = _mm256_fmadd_pd(a2, w0, z2);

      /* second column of a */
      w1 = _mm256_set1_pd(work[j * 9 + 1]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
      z0 = _mm256_fmadd_pd(a0, w1, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
      z1 = _mm256_fmadd_pd(a1, w1, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
      z2 = _mm256_fmadd_pd(a2, w1, z2);

      /* third column of a */
      w2 = _mm256_set1_pd(work[j * 9 + 2]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
      z0 = _mm256_fmadd_pd(a3, w2, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
      z1 = _mm256_fmadd_pd(a4, w2, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
      z2 = _mm256_fmadd_pd(a5, w2, z2);

      /* fourth column of a */
      w3 = _mm256_set1_pd(work[j * 9 + 3]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
      z0 = _mm256_fmadd_pd(a0, w3, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
      z1 = _mm256_fmadd_pd(a1, w3, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
      z2 = _mm256_fmadd_pd(a2, w3, z2);

      /* fifth column of a */
      w0 = _mm256_set1_pd(work[j * 9 + 4]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
      z0 = _mm256_fmadd_pd(a3, w0, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
      z1 = _mm256_fmadd_pd(a4, w0, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
      z2 = _mm256_fmadd_pd(a5, w0, z2);

      /* sixth column of a */
      w1 = _mm256_set1_pd(work[j * 9 + 5]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
      z0 = _mm256_fmadd_pd(a0, w1, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
      z1 = _mm256_fmadd_pd(a1, w1, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
      z2 = _mm256_fmadd_pd(a2, w1, z2);

      /* seventh column of a */
      w2 = _mm256_set1_pd(work[j * 9 + 6]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
      z0 = _mm256_fmadd_pd(a0, w2, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
      z1 = _mm256_fmadd_pd(a1, w2, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
      z2 = _mm256_fmadd_pd(a2, w2, z2);

      /* eighth column of a */
      w3 = _mm256_set1_pd(work[j * 9 + 7]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
      z0 = _mm256_fmadd_pd(a3, w3, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
      z1 = _mm256_fmadd_pd(a4, w3, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
      z2 = _mm256_fmadd_pd(a5, w3, z2);

      /* ninth column of a; the final load is masked so nothing beyond entry 80 of the block is touched */
      w0 = _mm256_set1_pd(work[j * 9 + 8]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
      z0 = _mm256_fmadd_pd(a0, w0, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
      z1 = _mm256_fmadd_pd(a1, w0, z1);
      a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
      z2 = _mm256_fmadd_pd(a2, w0, z2);
    }

    /* write rows 0-7 in full and only lane 0 of z2 as row 8 */
    _mm256_storeu_pd(&z[0], z0);
    _mm256_storeu_pd(&z[4], z1);
    _mm256_maskstore_pd(&z[8], mask1, z2);

    v += n * bs2;
    if (!usecprow) z += bs;
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
84996e086a2SDaniel Kokron 
850d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
851d71ae5a4SJacob Faibussowitsch {
852ebada01fSBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
853f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
854ebada01fSBarry Smith   const PetscScalar *x, *xb;
855ebada01fSBarry Smith   PetscScalar       *zarray, xv;
856ebada01fSBarry Smith   const MatScalar   *v;
857ebada01fSBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
858ebada01fSBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
859ebada01fSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
860ebada01fSBarry Smith 
861ebada01fSBarry Smith   PetscFunctionBegin;
8629566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8639566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
864ebada01fSBarry Smith 
865ebada01fSBarry Smith   v = a->a;
866ebada01fSBarry Smith   if (usecprow) {
867ebada01fSBarry Smith     mbs  = a->compressedrow.nrows;
868ebada01fSBarry Smith     ii   = a->compressedrow.i;
869ebada01fSBarry Smith     ridx = a->compressedrow.rindex;
8709566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
871ebada01fSBarry Smith   } else {
872ebada01fSBarry Smith     mbs = a->mbs;
873ebada01fSBarry Smith     ii  = a->i;
874ebada01fSBarry Smith     z   = zarray;
875ebada01fSBarry Smith   }
876ebada01fSBarry Smith 
877ebada01fSBarry Smith   for (i = 0; i < mbs; i++) {
878ebada01fSBarry Smith     n     = ii[i + 1] - ii[i];
879ebada01fSBarry Smith     idx   = ij + ii[i];
8809371c9d4SSatish Balay     sum1  = 0.0;
8819371c9d4SSatish Balay     sum2  = 0.0;
8829371c9d4SSatish Balay     sum3  = 0.0;
8839371c9d4SSatish Balay     sum4  = 0.0;
8849371c9d4SSatish Balay     sum5  = 0.0;
8859371c9d4SSatish Balay     sum6  = 0.0;
8869371c9d4SSatish Balay     sum7  = 0.0;
8879371c9d4SSatish Balay     sum8  = 0.0;
8889371c9d4SSatish Balay     sum9  = 0.0;
8899371c9d4SSatish Balay     sum10 = 0.0;
8909371c9d4SSatish Balay     sum11 = 0.0;
891ebada01fSBarry Smith 
892ebada01fSBarry Smith     for (j = 0; j < n; j++) {
893ebada01fSBarry Smith       xb = x + 11 * (idx[j]);
894ebada01fSBarry Smith 
895ebada01fSBarry Smith       for (k = 0; k < 11; k++) {
896ebada01fSBarry Smith         xv = xb[k];
897ebada01fSBarry Smith         sum1 += v[0] * xv;
898ebada01fSBarry Smith         sum2 += v[1] * xv;
899ebada01fSBarry Smith         sum3 += v[2] * xv;
900ebada01fSBarry Smith         sum4 += v[3] * xv;
901ebada01fSBarry Smith         sum5 += v[4] * xv;
902ebada01fSBarry Smith         sum6 += v[5] * xv;
903ebada01fSBarry Smith         sum7 += v[6] * xv;
904ebada01fSBarry Smith         sum8 += v[7] * xv;
905ebada01fSBarry Smith         sum9 += v[8] * xv;
906ebada01fSBarry Smith         sum10 += v[9] * xv;
907ebada01fSBarry Smith         sum11 += v[10] * xv;
908ebada01fSBarry Smith         v += 11;
909ebada01fSBarry Smith       }
910ebada01fSBarry Smith     }
911ebada01fSBarry Smith     if (usecprow) z = zarray + 11 * ridx[i];
9129371c9d4SSatish Balay     z[0]  = sum1;
9139371c9d4SSatish Balay     z[1]  = sum2;
9149371c9d4SSatish Balay     z[2]  = sum3;
9159371c9d4SSatish Balay     z[3]  = sum4;
9169371c9d4SSatish Balay     z[4]  = sum5;
9179371c9d4SSatish Balay     z[5]  = sum6;
9189371c9d4SSatish Balay     z[6]  = sum7;
9199371c9d4SSatish Balay     z[7]  = sum8;
9209371c9d4SSatish Balay     z[8]  = sum9;
9219371c9d4SSatish Balay     z[9]  = sum10;
9229371c9d4SSatish Balay     z[10] = sum11;
923ebada01fSBarry Smith 
924ebada01fSBarry Smith     if (!usecprow) z += 11;
925ebada01fSBarry Smith   }
926ebada01fSBarry Smith 
9279566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9289566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
9299566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
930*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
931ebada01fSBarry Smith }
932ebada01fSBarry Smith 
9336679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
934d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
935d71ae5a4SJacob Faibussowitsch {
9366679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
9376679dcc1SBarry Smith   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
9386679dcc1SBarry Smith   const PetscScalar *x, *xb;
9396679dcc1SBarry Smith   PetscScalar       *zarray, xv;
9406679dcc1SBarry Smith   const MatScalar   *v;
9416679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
9426679dcc1SBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
9436679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
9446679dcc1SBarry Smith 
9456679dcc1SBarry Smith   PetscFunctionBegin;
9469566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9479566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
9486679dcc1SBarry Smith 
9496679dcc1SBarry Smith   v = a->a;
9506679dcc1SBarry Smith   if (usecprow) {
9516679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
9526679dcc1SBarry Smith     ii   = a->compressedrow.i;
9536679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
9549566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
9556679dcc1SBarry Smith   } else {
9566679dcc1SBarry Smith     mbs = a->mbs;
9576679dcc1SBarry Smith     ii  = a->i;
9586679dcc1SBarry Smith     z   = zarray;
9596679dcc1SBarry Smith   }
9606679dcc1SBarry Smith 
9616679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
9626679dcc1SBarry Smith     n     = ii[i + 1] - ii[i];
9636679dcc1SBarry Smith     idx   = ij + ii[i];
9649371c9d4SSatish Balay     sum1  = 0.0;
9659371c9d4SSatish Balay     sum2  = 0.0;
9669371c9d4SSatish Balay     sum3  = 0.0;
9679371c9d4SSatish Balay     sum4  = 0.0;
9689371c9d4SSatish Balay     sum5  = 0.0;
9699371c9d4SSatish Balay     sum6  = 0.0;
9709371c9d4SSatish Balay     sum7  = 0.0;
9719371c9d4SSatish Balay     sum8  = 0.0;
9729371c9d4SSatish Balay     sum9  = 0.0;
9739371c9d4SSatish Balay     sum10 = 0.0;
9749371c9d4SSatish Balay     sum11 = 0.0;
9759371c9d4SSatish Balay     sum12 = 0.0;
9766679dcc1SBarry Smith 
9776679dcc1SBarry Smith     for (j = 0; j < n; j++) {
9786679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
9796679dcc1SBarry Smith 
9806679dcc1SBarry Smith       for (k = 0; k < 12; k++) {
9816679dcc1SBarry Smith         xv = xb[k];
9826679dcc1SBarry Smith         sum1 += v[0] * xv;
9836679dcc1SBarry Smith         sum2 += v[1] * xv;
9846679dcc1SBarry Smith         sum3 += v[2] * xv;
9856679dcc1SBarry Smith         sum4 += v[3] * xv;
9866679dcc1SBarry Smith         sum5 += v[4] * xv;
9876679dcc1SBarry Smith         sum6 += v[5] * xv;
9886679dcc1SBarry Smith         sum7 += v[6] * xv;
9896679dcc1SBarry Smith         sum8 += v[7] * xv;
9906679dcc1SBarry Smith         sum9 += v[8] * xv;
9916679dcc1SBarry Smith         sum10 += v[9] * xv;
9926679dcc1SBarry Smith         sum11 += v[10] * xv;
9936679dcc1SBarry Smith         sum12 += v[11] * xv;
9946679dcc1SBarry Smith         v += 12;
9956679dcc1SBarry Smith       }
9966679dcc1SBarry Smith     }
9976679dcc1SBarry Smith     if (usecprow) z = zarray + 12 * ridx[i];
9989371c9d4SSatish Balay     z[0]  = sum1;
9999371c9d4SSatish Balay     z[1]  = sum2;
10009371c9d4SSatish Balay     z[2]  = sum3;
10019371c9d4SSatish Balay     z[3]  = sum4;
10029371c9d4SSatish Balay     z[4]  = sum5;
10039371c9d4SSatish Balay     z[5]  = sum6;
10049371c9d4SSatish Balay     z[6]  = sum7;
10059371c9d4SSatish Balay     z[7]  = sum8;
10069371c9d4SSatish Balay     z[8]  = sum9;
10079371c9d4SSatish Balay     z[9]  = sum10;
10089371c9d4SSatish Balay     z[10] = sum11;
10099371c9d4SSatish Balay     z[11] = sum12;
10106679dcc1SBarry Smith     if (!usecprow) z += 12;
10116679dcc1SBarry Smith   }
10129566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10139566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
10149566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1015*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
10166679dcc1SBarry Smith }
10176679dcc1SBarry Smith 
1018d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1019d71ae5a4SJacob Faibussowitsch {
10206679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
10216679dcc1SBarry Smith   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
10226679dcc1SBarry Smith   const PetscScalar *x, *xb;
10236679dcc1SBarry Smith   PetscScalar       *zarray, *yarray, xv;
10246679dcc1SBarry Smith   const MatScalar   *v;
10256679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx;
10266679dcc1SBarry Smith   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
10276679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
10286679dcc1SBarry Smith 
10296679dcc1SBarry Smith   PetscFunctionBegin;
10309566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10319566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
10326679dcc1SBarry Smith 
10336679dcc1SBarry Smith   v = a->a;
10346679dcc1SBarry Smith   if (usecprow) {
103548a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
10366679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
10376679dcc1SBarry Smith     ii   = a->compressedrow.i;
10386679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
10396679dcc1SBarry Smith   } else {
10406679dcc1SBarry Smith     ii = a->i;
10416679dcc1SBarry Smith     y  = yarray;
10426679dcc1SBarry Smith     z  = zarray;
10436679dcc1SBarry Smith   }
10446679dcc1SBarry Smith 
10456679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
10466679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
10476679dcc1SBarry Smith     idx = ij + ii[i];
10486679dcc1SBarry Smith 
10496679dcc1SBarry Smith     if (usecprow) {
10506679dcc1SBarry Smith       y = yarray + 12 * ridx[i];
10516679dcc1SBarry Smith       z = zarray + 12 * ridx[i];
10526679dcc1SBarry Smith     }
10539371c9d4SSatish Balay     sum1  = y[0];
10549371c9d4SSatish Balay     sum2  = y[1];
10559371c9d4SSatish Balay     sum3  = y[2];
10569371c9d4SSatish Balay     sum4  = y[3];
10579371c9d4SSatish Balay     sum5  = y[4];
10589371c9d4SSatish Balay     sum6  = y[5];
10599371c9d4SSatish Balay     sum7  = y[6];
10609371c9d4SSatish Balay     sum8  = y[7];
10619371c9d4SSatish Balay     sum9  = y[8];
10629371c9d4SSatish Balay     sum10 = y[9];
10639371c9d4SSatish Balay     sum11 = y[10];
10649371c9d4SSatish Balay     sum12 = y[11];
10656679dcc1SBarry Smith 
10666679dcc1SBarry Smith     for (j = 0; j < n; j++) {
10676679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
10686679dcc1SBarry Smith 
10696679dcc1SBarry Smith       for (k = 0; k < 12; k++) {
10706679dcc1SBarry Smith         xv = xb[k];
10716679dcc1SBarry Smith         sum1 += v[0] * xv;
10726679dcc1SBarry Smith         sum2 += v[1] * xv;
10736679dcc1SBarry Smith         sum3 += v[2] * xv;
10746679dcc1SBarry Smith         sum4 += v[3] * xv;
10756679dcc1SBarry Smith         sum5 += v[4] * xv;
10766679dcc1SBarry Smith         sum6 += v[5] * xv;
10776679dcc1SBarry Smith         sum7 += v[6] * xv;
10786679dcc1SBarry Smith         sum8 += v[7] * xv;
10796679dcc1SBarry Smith         sum9 += v[8] * xv;
10806679dcc1SBarry Smith         sum10 += v[9] * xv;
10816679dcc1SBarry Smith         sum11 += v[10] * xv;
10826679dcc1SBarry Smith         sum12 += v[11] * xv;
10836679dcc1SBarry Smith         v += 12;
10846679dcc1SBarry Smith       }
10856679dcc1SBarry Smith     }
10866679dcc1SBarry Smith 
10879371c9d4SSatish Balay     z[0]  = sum1;
10889371c9d4SSatish Balay     z[1]  = sum2;
10899371c9d4SSatish Balay     z[2]  = sum3;
10909371c9d4SSatish Balay     z[3]  = sum4;
10919371c9d4SSatish Balay     z[4]  = sum5;
10929371c9d4SSatish Balay     z[5]  = sum6;
10939371c9d4SSatish Balay     z[6]  = sum7;
10949371c9d4SSatish Balay     z[7]  = sum8;
10959371c9d4SSatish Balay     z[8]  = sum9;
10969371c9d4SSatish Balay     z[9]  = sum10;
10979371c9d4SSatish Balay     z[10] = sum11;
10989371c9d4SSatish Balay     z[11] = sum12;
10996679dcc1SBarry Smith     if (!usecprow) {
11006679dcc1SBarry Smith       y += 12;
11016679dcc1SBarry Smith       z += 12;
11026679dcc1SBarry Smith     }
11036679dcc1SBarry Smith   }
11049566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
11059566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
11069566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1107*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
11086679dcc1SBarry Smith }
11096679dcc1SBarry Smith 
11106679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1111d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1112d71ae5a4SJacob Faibussowitsch {
11136679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
11146679dcc1SBarry Smith   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
11156679dcc1SBarry Smith   const PetscScalar *x, *xb;
11166679dcc1SBarry Smith   PetscScalar        x1, x2, x3, x4, *zarray;
11176679dcc1SBarry Smith   const MatScalar   *v;
11186679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
11196679dcc1SBarry Smith   PetscInt           mbs, i, j, n;
11206679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
11216679dcc1SBarry Smith 
11226679dcc1SBarry Smith   PetscFunctionBegin;
11239566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
11249566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
11256679dcc1SBarry Smith 
11266679dcc1SBarry Smith   v = a->a;
11276679dcc1SBarry Smith   if (usecprow) {
11286679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
11296679dcc1SBarry Smith     ii   = a->compressedrow.i;
11306679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
11319566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
11326679dcc1SBarry Smith   } else {
11336679dcc1SBarry Smith     mbs = a->mbs;
11346679dcc1SBarry Smith     ii  = a->i;
11356679dcc1SBarry Smith     z   = zarray;
11366679dcc1SBarry Smith   }
11376679dcc1SBarry Smith 
11386679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
11396679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
11406679dcc1SBarry Smith     idx = ij + ii[i];
11416679dcc1SBarry Smith 
11426679dcc1SBarry Smith     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
11436679dcc1SBarry Smith     for (j = 0; j < n; j++) {
11446679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
11459371c9d4SSatish Balay       x1 = xb[0];
11469371c9d4SSatish Balay       x2 = xb[1];
11479371c9d4SSatish Balay       x3 = xb[2];
11489371c9d4SSatish Balay       x4 = xb[3];
11496679dcc1SBarry Smith 
11506679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11516679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11526679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11536679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11546679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11556679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11566679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11576679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11586679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11596679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11606679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11616679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11626679dcc1SBarry Smith       v += 48;
11636679dcc1SBarry Smith 
11649371c9d4SSatish Balay       x1 = xb[4];
11659371c9d4SSatish Balay       x2 = xb[5];
11669371c9d4SSatish Balay       x3 = xb[6];
11679371c9d4SSatish Balay       x4 = xb[7];
11686679dcc1SBarry Smith 
11696679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11706679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11716679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11726679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11736679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11746679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11756679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11766679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11776679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11786679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11796679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11806679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11816679dcc1SBarry Smith       v += 48;
11826679dcc1SBarry Smith 
11839371c9d4SSatish Balay       x1 = xb[8];
11849371c9d4SSatish Balay       x2 = xb[9];
11859371c9d4SSatish Balay       x3 = xb[10];
11869371c9d4SSatish Balay       x4 = xb[11];
11876679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
11886679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
11896679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
11906679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
11916679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
11926679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
11936679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
11946679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
11956679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
11966679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
11976679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
11986679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
11996679dcc1SBarry Smith       v += 48;
12006679dcc1SBarry Smith     }
12016679dcc1SBarry Smith     if (usecprow) z = zarray + 12 * ridx[i];
12029371c9d4SSatish Balay     z[0]  = sum1;
12039371c9d4SSatish Balay     z[1]  = sum2;
12049371c9d4SSatish Balay     z[2]  = sum3;
12059371c9d4SSatish Balay     z[3]  = sum4;
12069371c9d4SSatish Balay     z[4]  = sum5;
12079371c9d4SSatish Balay     z[5]  = sum6;
12089371c9d4SSatish Balay     z[6]  = sum7;
12099371c9d4SSatish Balay     z[7]  = sum8;
12109371c9d4SSatish Balay     z[8]  = sum9;
12119371c9d4SSatish Balay     z[9]  = sum10;
12129371c9d4SSatish Balay     z[10] = sum11;
12139371c9d4SSatish Balay     z[11] = sum12;
12146679dcc1SBarry Smith     if (!usecprow) z += 12;
12156679dcc1SBarry Smith   }
12169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
12179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
12189566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1219*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12206679dcc1SBarry Smith }
12216679dcc1SBarry Smith 
12226679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1223d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1224d71ae5a4SJacob Faibussowitsch {
12256679dcc1SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
12266679dcc1SBarry Smith   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
12276679dcc1SBarry Smith   const PetscScalar *x, *xb;
12286679dcc1SBarry Smith   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
12296679dcc1SBarry Smith   const MatScalar   *v;
12306679dcc1SBarry Smith   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
12316679dcc1SBarry Smith   PetscInt           mbs      = a->mbs, i, j, n;
12326679dcc1SBarry Smith   PetscBool          usecprow = a->compressedrow.use;
12336679dcc1SBarry Smith 
12346679dcc1SBarry Smith   PetscFunctionBegin;
12359566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
12369566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
12376679dcc1SBarry Smith 
12386679dcc1SBarry Smith   v = a->a;
12396679dcc1SBarry Smith   if (usecprow) {
124048a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
12416679dcc1SBarry Smith     mbs  = a->compressedrow.nrows;
12426679dcc1SBarry Smith     ii   = a->compressedrow.i;
12436679dcc1SBarry Smith     ridx = a->compressedrow.rindex;
12446679dcc1SBarry Smith   } else {
12456679dcc1SBarry Smith     ii = a->i;
12466679dcc1SBarry Smith     y  = yarray;
12476679dcc1SBarry Smith     z  = zarray;
12486679dcc1SBarry Smith   }
12496679dcc1SBarry Smith 
12506679dcc1SBarry Smith   for (i = 0; i < mbs; i++) {
12516679dcc1SBarry Smith     n   = ii[i + 1] - ii[i];
12526679dcc1SBarry Smith     idx = ij + ii[i];
12536679dcc1SBarry Smith 
12546679dcc1SBarry Smith     if (usecprow) {
12556679dcc1SBarry Smith       y = yarray + 12 * ridx[i];
12566679dcc1SBarry Smith       z = zarray + 12 * ridx[i];
12576679dcc1SBarry Smith     }
12589371c9d4SSatish Balay     sum1  = y[0];
12599371c9d4SSatish Balay     sum2  = y[1];
12609371c9d4SSatish Balay     sum3  = y[2];
12619371c9d4SSatish Balay     sum4  = y[3];
12629371c9d4SSatish Balay     sum5  = y[4];
12639371c9d4SSatish Balay     sum6  = y[5];
12649371c9d4SSatish Balay     sum7  = y[6];
12659371c9d4SSatish Balay     sum8  = y[7];
12669371c9d4SSatish Balay     sum9  = y[8];
12679371c9d4SSatish Balay     sum10 = y[9];
12689371c9d4SSatish Balay     sum11 = y[10];
12699371c9d4SSatish Balay     sum12 = y[11];
12706679dcc1SBarry Smith 
12716679dcc1SBarry Smith     for (j = 0; j < n; j++) {
12726679dcc1SBarry Smith       xb = x + 12 * (idx[j]);
12739371c9d4SSatish Balay       x1 = xb[0];
12749371c9d4SSatish Balay       x2 = xb[1];
12759371c9d4SSatish Balay       x3 = xb[2];
12769371c9d4SSatish Balay       x4 = xb[3];
12776679dcc1SBarry Smith 
12786679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
12796679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
12806679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
12816679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
12826679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
12836679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
12846679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
12856679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
12866679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
12876679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
12886679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
12896679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
12906679dcc1SBarry Smith       v += 48;
12916679dcc1SBarry Smith 
12929371c9d4SSatish Balay       x1 = xb[4];
12939371c9d4SSatish Balay       x2 = xb[5];
12949371c9d4SSatish Balay       x3 = xb[6];
12959371c9d4SSatish Balay       x4 = xb[7];
12966679dcc1SBarry Smith 
12976679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
12986679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
12996679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
13006679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
13016679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
13026679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
13036679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
13046679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
13056679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
13066679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
13076679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
13086679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
13096679dcc1SBarry Smith       v += 48;
13106679dcc1SBarry Smith 
13119371c9d4SSatish Balay       x1 = xb[8];
13129371c9d4SSatish Balay       x2 = xb[9];
13139371c9d4SSatish Balay       x3 = xb[10];
13149371c9d4SSatish Balay       x4 = xb[11];
13156679dcc1SBarry Smith       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
13166679dcc1SBarry Smith       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
13176679dcc1SBarry Smith       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
13186679dcc1SBarry Smith       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
13196679dcc1SBarry Smith       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
13206679dcc1SBarry Smith       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
13216679dcc1SBarry Smith       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
13226679dcc1SBarry Smith       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
13236679dcc1SBarry Smith       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
13246679dcc1SBarry Smith       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
13256679dcc1SBarry Smith       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
13266679dcc1SBarry Smith       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
13276679dcc1SBarry Smith       v += 48;
13286679dcc1SBarry Smith     }
13299371c9d4SSatish Balay     z[0]  = sum1;
13309371c9d4SSatish Balay     z[1]  = sum2;
13319371c9d4SSatish Balay     z[2]  = sum3;
13329371c9d4SSatish Balay     z[3]  = sum4;
13339371c9d4SSatish Balay     z[4]  = sum5;
13349371c9d4SSatish Balay     z[5]  = sum6;
13359371c9d4SSatish Balay     z[6]  = sum7;
13369371c9d4SSatish Balay     z[7]  = sum8;
13379371c9d4SSatish Balay     z[8]  = sum9;
13389371c9d4SSatish Balay     z[9]  = sum10;
13399371c9d4SSatish Balay     z[10] = sum11;
13409371c9d4SSatish Balay     z[11] = sum12;
13416679dcc1SBarry Smith     if (!usecprow) {
13426679dcc1SBarry Smith       y += 12;
13436679dcc1SBarry Smith       z += 12;
13446679dcc1SBarry Smith     }
13456679dcc1SBarry Smith   }
13469566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
13479566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
13489566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1349*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13506679dcc1SBarry Smith }
13516679dcc1SBarry Smith 
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
/*
   MatMult_SeqBAIJ_12_AVX2 - z = A*x for a SeqBAIJ matrix with block size 12,
   written with 256-bit AVX2/FMA intrinsics.

   The 12 entries of one output block row are held in three __m256d
   accumulators (entries 0-3, 4-7 and 8-11).  For every block, each of its 12
   columns (12 consecutive stored values) is loaded as three unaligned vectors
   and multiply-added, scaled by the matching entry of x, into the accumulators.

   When a->compressedrow.use is set, only the block rows listed in
   a->compressedrow.rindex are visited; zz is zeroed beforehand so the block
   rows that are never visited hold zeros on return.
*/
PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
{
  Mat_SeqBAIJ       *a        = (Mat_SeqBAIJ *)A->data;
  const PetscInt     bs = 12, bs2 = 144;
  const PetscBool    usecprow = a->compressedrow.use;
  const MatScalar   *v        = a->a;
  const PetscInt    *bcol     = a->j, *rowptr, *rindex = NULL;
  const PetscScalar *x, *xb;
  PetscScalar       *zarray, *z = NULL;
  PetscInt           nrows, row, j, c, nblocks;
  __m256d            acc0, acc1, acc2, xc;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  if (!usecprow) {
    nrows  = a->mbs;
    rowptr = a->i;
    z      = zarray;
  } else {
    nrows  = a->compressedrow.nrows;
    rowptr = a->compressedrow.i;
    rindex = a->compressedrow.rindex;
    PetscCall(PetscArrayzero(zarray, bs * a->mbs));
  }

  for (row = 0; row < nrows; row++) {
    acc0 = _mm256_setzero_pd();
    acc1 = _mm256_setzero_pd();
    acc2 = _mm256_setzero_pd();

    nblocks = rowptr[row + 1] - rowptr[row];
    for (j = 0; j < nblocks; j++) {
      /* block column indices are consumed sequentially across all rows */
      xb = x + bs * (*bcol++);

      /* one column of the block per iteration; after bs columns v has advanced by bs2 */
      for (c = 0; c < bs; c++) {
        xc   = _mm256_set1_pd(xb[c]);
        acc0 = _mm256_fmadd_pd(_mm256_loadu_pd(v + 0), xc, acc0);
        acc1 = _mm256_fmadd_pd(_mm256_loadu_pd(v + 4), xc, acc1);
        acc2 = _mm256_fmadd_pd(_mm256_loadu_pd(v + 8), xc, acc2);
        v += bs;
      }
    }

    /* compressed rows scatter to their true block row, otherwise z advances sequentially */
    if (usecprow) z = zarray + bs * rindex[row];
    _mm256_storeu_pd(&z[0], acc0);
    _mm256_storeu_pd(&z[4], acc1);
    _mm256_storeu_pd(&z[8], acc2);
    if (!usecprow) z += bs;
  }

  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
15156679dcc1SBarry Smith 
15168ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1517832cc040SShri Abhyankar /* Default MatMult for block size 15 */
1518d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1519d71ae5a4SJacob Faibussowitsch {
15208ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1521f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
15228ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
152353ef36baSBarry Smith   PetscScalar       *zarray, xv;
15248ab949d8SShri Abhyankar   const MatScalar   *v;
15258ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
15267c565772SBarry Smith   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1527ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
15288ab949d8SShri Abhyankar 
15298ab949d8SShri Abhyankar   PetscFunctionBegin;
15309566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
15319566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
15328ab949d8SShri Abhyankar 
15338ab949d8SShri Abhyankar   v = a->a;
15348ab949d8SShri Abhyankar   if (usecprow) {
15358ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
15368ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
15378ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
15389566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
15398ab949d8SShri Abhyankar   } else {
15408ab949d8SShri Abhyankar     mbs = a->mbs;
15418ab949d8SShri Abhyankar     ii  = a->i;
15428ab949d8SShri Abhyankar     z   = zarray;
15438ab949d8SShri Abhyankar   }
15448ab949d8SShri Abhyankar 
15458ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
15468ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
15478ab949d8SShri Abhyankar     idx   = ij + ii[i];
15489371c9d4SSatish Balay     sum1  = 0.0;
15499371c9d4SSatish Balay     sum2  = 0.0;
15509371c9d4SSatish Balay     sum3  = 0.0;
15519371c9d4SSatish Balay     sum4  = 0.0;
15529371c9d4SSatish Balay     sum5  = 0.0;
15539371c9d4SSatish Balay     sum6  = 0.0;
15549371c9d4SSatish Balay     sum7  = 0.0;
15559371c9d4SSatish Balay     sum8  = 0.0;
15569371c9d4SSatish Balay     sum9  = 0.0;
15579371c9d4SSatish Balay     sum10 = 0.0;
15589371c9d4SSatish Balay     sum11 = 0.0;
15599371c9d4SSatish Balay     sum12 = 0.0;
15609371c9d4SSatish Balay     sum13 = 0.0;
15619371c9d4SSatish Balay     sum14 = 0.0;
15629371c9d4SSatish Balay     sum15 = 0.0;
15638ab949d8SShri Abhyankar 
15648ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
15658ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
15668ab949d8SShri Abhyankar 
15678ab949d8SShri Abhyankar       for (k = 0; k < 15; k++) {
156853ef36baSBarry Smith         xv = xb[k];
156953ef36baSBarry Smith         sum1 += v[0] * xv;
157053ef36baSBarry Smith         sum2 += v[1] * xv;
157153ef36baSBarry Smith         sum3 += v[2] * xv;
157253ef36baSBarry Smith         sum4 += v[3] * xv;
157353ef36baSBarry Smith         sum5 += v[4] * xv;
157453ef36baSBarry Smith         sum6 += v[5] * xv;
157553ef36baSBarry Smith         sum7 += v[6] * xv;
157653ef36baSBarry Smith         sum8 += v[7] * xv;
157753ef36baSBarry Smith         sum9 += v[8] * xv;
157853ef36baSBarry Smith         sum10 += v[9] * xv;
157953ef36baSBarry Smith         sum11 += v[10] * xv;
158053ef36baSBarry Smith         sum12 += v[11] * xv;
158153ef36baSBarry Smith         sum13 += v[12] * xv;
158253ef36baSBarry Smith         sum14 += v[13] * xv;
158353ef36baSBarry Smith         sum15 += v[14] * xv;
15848ab949d8SShri Abhyankar         v += 15;
15858ab949d8SShri Abhyankar       }
15868ab949d8SShri Abhyankar     }
15878ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
15889371c9d4SSatish Balay     z[0]  = sum1;
15899371c9d4SSatish Balay     z[1]  = sum2;
15909371c9d4SSatish Balay     z[2]  = sum3;
15919371c9d4SSatish Balay     z[3]  = sum4;
15929371c9d4SSatish Balay     z[4]  = sum5;
15939371c9d4SSatish Balay     z[5]  = sum6;
15949371c9d4SSatish Balay     z[6]  = sum7;
15959371c9d4SSatish Balay     z[7]  = sum8;
15969371c9d4SSatish Balay     z[8]  = sum9;
15979371c9d4SSatish Balay     z[9]  = sum10;
15989371c9d4SSatish Balay     z[10] = sum11;
15999371c9d4SSatish Balay     z[11] = sum12;
16009371c9d4SSatish Balay     z[12] = sum13;
16019371c9d4SSatish Balay     z[13] = sum14;
16029371c9d4SSatish Balay     z[14] = sum15;
16038ab949d8SShri Abhyankar 
16048ab949d8SShri Abhyankar     if (!usecprow) z += 15;
16058ab949d8SShri Abhyankar   }
16068ab949d8SShri Abhyankar 
16079566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
16089566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
16099566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1610*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16118ab949d8SShri Abhyankar }
16128ab949d8SShri Abhyankar 
16138ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1614d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1615d71ae5a4SJacob Faibussowitsch {
16168ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1617f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
16188ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
16190b8f6341SShri Abhyankar   PetscScalar        x1, x2, x3, x4, *zarray;
16208ab949d8SShri Abhyankar   const MatScalar   *v;
16218ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
16227c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1623ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
16248ab949d8SShri Abhyankar 
16258ab949d8SShri Abhyankar   PetscFunctionBegin;
16269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
16279566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
16288ab949d8SShri Abhyankar 
16298ab949d8SShri Abhyankar   v = a->a;
16308ab949d8SShri Abhyankar   if (usecprow) {
16318ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
16328ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
16338ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
16349566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
16358ab949d8SShri Abhyankar   } else {
16368ab949d8SShri Abhyankar     mbs = a->mbs;
16378ab949d8SShri Abhyankar     ii  = a->i;
16388ab949d8SShri Abhyankar     z   = zarray;
16398ab949d8SShri Abhyankar   }
16408ab949d8SShri Abhyankar 
16418ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
16428ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
16438ab949d8SShri Abhyankar     idx   = ij + ii[i];
16449371c9d4SSatish Balay     sum1  = 0.0;
16459371c9d4SSatish Balay     sum2  = 0.0;
16469371c9d4SSatish Balay     sum3  = 0.0;
16479371c9d4SSatish Balay     sum4  = 0.0;
16489371c9d4SSatish Balay     sum5  = 0.0;
16499371c9d4SSatish Balay     sum6  = 0.0;
16509371c9d4SSatish Balay     sum7  = 0.0;
16519371c9d4SSatish Balay     sum8  = 0.0;
16529371c9d4SSatish Balay     sum9  = 0.0;
16539371c9d4SSatish Balay     sum10 = 0.0;
16549371c9d4SSatish Balay     sum11 = 0.0;
16559371c9d4SSatish Balay     sum12 = 0.0;
16569371c9d4SSatish Balay     sum13 = 0.0;
16579371c9d4SSatish Balay     sum14 = 0.0;
16589371c9d4SSatish Balay     sum15 = 0.0;
16598ab949d8SShri Abhyankar 
16608ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
16618ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
16629371c9d4SSatish Balay       x1 = xb[0];
16639371c9d4SSatish Balay       x2 = xb[1];
16649371c9d4SSatish Balay       x3 = xb[2];
16659371c9d4SSatish Balay       x4 = xb[3];
16668ab949d8SShri Abhyankar 
16678ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16688ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16698ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16708ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16718ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16728ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
16738ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
16748ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
16758ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
16768ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
16778ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
16788ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
16798ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
16808ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
16818ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
16828ab949d8SShri Abhyankar 
16838ab949d8SShri Abhyankar       v += 60;
16848ab949d8SShri Abhyankar 
16859371c9d4SSatish Balay       x1 = xb[4];
16869371c9d4SSatish Balay       x2 = xb[5];
16879371c9d4SSatish Balay       x3 = xb[6];
16889371c9d4SSatish Balay       x4 = xb[7];
16898ab949d8SShri Abhyankar 
16908ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
16918ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
16928ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
16938ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
16948ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
16958ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
16968ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
16978ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
16988ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
16998ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
17008ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
17018ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
17028ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
17038ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
17048ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
17058ab949d8SShri Abhyankar       v += 60;
17068ab949d8SShri Abhyankar 
17079371c9d4SSatish Balay       x1 = xb[8];
17089371c9d4SSatish Balay       x2 = xb[9];
17099371c9d4SSatish Balay       x3 = xb[10];
17109371c9d4SSatish Balay       x4 = xb[11];
17110b8f6341SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
17120b8f6341SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
17130b8f6341SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
17140b8f6341SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
17150b8f6341SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
17160b8f6341SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
17170b8f6341SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
17180b8f6341SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
17190b8f6341SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
17200b8f6341SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
17210b8f6341SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
17220b8f6341SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
17230b8f6341SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
17240b8f6341SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
17250b8f6341SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
17260b8f6341SShri Abhyankar       v += 60;
17270b8f6341SShri Abhyankar 
17289371c9d4SSatish Balay       x1 = xb[12];
17299371c9d4SSatish Balay       x2 = xb[13];
17309371c9d4SSatish Balay       x3 = xb[14];
17318ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
17328ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
17338ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
17348ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
17358ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
17368ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
17378ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
17388ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
17398ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
17408ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
17418ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
17428ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
17438ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
17448ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
17458ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
17468ab949d8SShri Abhyankar       v += 45;
17478ab949d8SShri Abhyankar     }
17488ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
17499371c9d4SSatish Balay     z[0]  = sum1;
17509371c9d4SSatish Balay     z[1]  = sum2;
17519371c9d4SSatish Balay     z[2]  = sum3;
17529371c9d4SSatish Balay     z[3]  = sum4;
17539371c9d4SSatish Balay     z[4]  = sum5;
17549371c9d4SSatish Balay     z[5]  = sum6;
17559371c9d4SSatish Balay     z[6]  = sum7;
17569371c9d4SSatish Balay     z[7]  = sum8;
17579371c9d4SSatish Balay     z[8]  = sum9;
17589371c9d4SSatish Balay     z[9]  = sum10;
17599371c9d4SSatish Balay     z[10] = sum11;
17609371c9d4SSatish Balay     z[11] = sum12;
17619371c9d4SSatish Balay     z[12] = sum13;
17629371c9d4SSatish Balay     z[13] = sum14;
17639371c9d4SSatish Balay     z[14] = sum15;
17648ab949d8SShri Abhyankar 
17658ab949d8SShri Abhyankar     if (!usecprow) z += 15;
17668ab949d8SShri Abhyankar   }
17678ab949d8SShri Abhyankar 
17689566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
17699566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
17709566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1771*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17728ab949d8SShri Abhyankar }
17738ab949d8SShri Abhyankar 
17748ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1775d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1776d71ae5a4SJacob Faibussowitsch {
17778ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1778f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
17798ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
17800b8f6341SShri Abhyankar   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
17818ab949d8SShri Abhyankar   const MatScalar   *v;
17828ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
17837c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1784ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
17858ab949d8SShri Abhyankar 
17868ab949d8SShri Abhyankar   PetscFunctionBegin;
17879566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
17889566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
17898ab949d8SShri Abhyankar 
17908ab949d8SShri Abhyankar   v = a->a;
17918ab949d8SShri Abhyankar   if (usecprow) {
17928ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
17938ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
17948ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
17959566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
17968ab949d8SShri Abhyankar   } else {
17978ab949d8SShri Abhyankar     mbs = a->mbs;
17988ab949d8SShri Abhyankar     ii  = a->i;
17998ab949d8SShri Abhyankar     z   = zarray;
18008ab949d8SShri Abhyankar   }
18018ab949d8SShri Abhyankar 
18028ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
18038ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
18048ab949d8SShri Abhyankar     idx   = ij + ii[i];
18059371c9d4SSatish Balay     sum1  = 0.0;
18069371c9d4SSatish Balay     sum2  = 0.0;
18079371c9d4SSatish Balay     sum3  = 0.0;
18089371c9d4SSatish Balay     sum4  = 0.0;
18099371c9d4SSatish Balay     sum5  = 0.0;
18109371c9d4SSatish Balay     sum6  = 0.0;
18119371c9d4SSatish Balay     sum7  = 0.0;
18129371c9d4SSatish Balay     sum8  = 0.0;
18139371c9d4SSatish Balay     sum9  = 0.0;
18149371c9d4SSatish Balay     sum10 = 0.0;
18159371c9d4SSatish Balay     sum11 = 0.0;
18169371c9d4SSatish Balay     sum12 = 0.0;
18179371c9d4SSatish Balay     sum13 = 0.0;
18189371c9d4SSatish Balay     sum14 = 0.0;
18199371c9d4SSatish Balay     sum15 = 0.0;
18208ab949d8SShri Abhyankar 
18218ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
18228ab949d8SShri Abhyankar       xb = x + 15 * (idx[j]);
18239371c9d4SSatish Balay       x1 = xb[0];
18249371c9d4SSatish Balay       x2 = xb[1];
18259371c9d4SSatish Balay       x3 = xb[2];
18269371c9d4SSatish Balay       x4 = xb[3];
18279371c9d4SSatish Balay       x5 = xb[4];
18289371c9d4SSatish Balay       x6 = xb[5];
18299371c9d4SSatish Balay       x7 = xb[6];
18300b8f6341SShri Abhyankar       x8 = xb[7];
18318ab949d8SShri Abhyankar 
18328ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
18338ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
18348ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
18358ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
18368ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
18378ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
18388ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
18398ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
18408ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
18418ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
18428ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
18438ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
18448ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
18458ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
18468ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
18478ab949d8SShri Abhyankar       v += 120;
18488ab949d8SShri Abhyankar 
18499371c9d4SSatish Balay       x1 = xb[8];
18509371c9d4SSatish Balay       x2 = xb[9];
18519371c9d4SSatish Balay       x3 = xb[10];
18529371c9d4SSatish Balay       x4 = xb[11];
18539371c9d4SSatish Balay       x5 = xb[12];
18549371c9d4SSatish Balay       x6 = xb[13];
18559371c9d4SSatish Balay       x7 = xb[14];
18560b8f6341SShri Abhyankar 
18578ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
18588ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
18598ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
18608ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
18618ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
18628ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
18638ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
18648ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
18658ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
18668ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
18678ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
18688ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
18698ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
18708ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
18718ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
18728ab949d8SShri Abhyankar       v += 105;
18738ab949d8SShri Abhyankar     }
18748ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
18759371c9d4SSatish Balay     z[0]  = sum1;
18769371c9d4SSatish Balay     z[1]  = sum2;
18779371c9d4SSatish Balay     z[2]  = sum3;
18789371c9d4SSatish Balay     z[3]  = sum4;
18799371c9d4SSatish Balay     z[4]  = sum5;
18809371c9d4SSatish Balay     z[5]  = sum6;
18819371c9d4SSatish Balay     z[6]  = sum7;
18829371c9d4SSatish Balay     z[7]  = sum8;
18839371c9d4SSatish Balay     z[8]  = sum9;
18849371c9d4SSatish Balay     z[9]  = sum10;
18859371c9d4SSatish Balay     z[10] = sum11;
18869371c9d4SSatish Balay     z[11] = sum12;
18879371c9d4SSatish Balay     z[12] = sum13;
18889371c9d4SSatish Balay     z[13] = sum14;
18899371c9d4SSatish Balay     z[14] = sum15;
18908ab949d8SShri Abhyankar 
18918ab949d8SShri Abhyankar     if (!usecprow) z += 15;
18928ab949d8SShri Abhyankar   }
18938ab949d8SShri Abhyankar 
18949566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
18959566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
18969566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1897*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18988ab949d8SShri Abhyankar }
18998ab949d8SShri Abhyankar 
19008ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1901d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1902d71ae5a4SJacob Faibussowitsch {
19038ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1904f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
19058ab949d8SShri Abhyankar   const PetscScalar *x, *xb;
19068ab949d8SShri Abhyankar   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
19078ab949d8SShri Abhyankar   const MatScalar   *v;
19088ab949d8SShri Abhyankar   const PetscInt    *ii, *ij = a->j, *idx;
19097c565772SBarry Smith   PetscInt           mbs, i, j, n, *ridx = NULL;
1910ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
19118ab949d8SShri Abhyankar 
19128ab949d8SShri Abhyankar   PetscFunctionBegin;
19139566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
19149566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
19158ab949d8SShri Abhyankar 
19168ab949d8SShri Abhyankar   v = a->a;
19178ab949d8SShri Abhyankar   if (usecprow) {
19188ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
19198ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
19208ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
19219566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
19228ab949d8SShri Abhyankar   } else {
19238ab949d8SShri Abhyankar     mbs = a->mbs;
19248ab949d8SShri Abhyankar     ii  = a->i;
19258ab949d8SShri Abhyankar     z   = zarray;
19268ab949d8SShri Abhyankar   }
19278ab949d8SShri Abhyankar 
19288ab949d8SShri Abhyankar   for (i = 0; i < mbs; i++) {
19298ab949d8SShri Abhyankar     n     = ii[i + 1] - ii[i];
19308ab949d8SShri Abhyankar     idx   = ij + ii[i];
19319371c9d4SSatish Balay     sum1  = 0.0;
19329371c9d4SSatish Balay     sum2  = 0.0;
19339371c9d4SSatish Balay     sum3  = 0.0;
19349371c9d4SSatish Balay     sum4  = 0.0;
19359371c9d4SSatish Balay     sum5  = 0.0;
19369371c9d4SSatish Balay     sum6  = 0.0;
19379371c9d4SSatish Balay     sum7  = 0.0;
19389371c9d4SSatish Balay     sum8  = 0.0;
19399371c9d4SSatish Balay     sum9  = 0.0;
19409371c9d4SSatish Balay     sum10 = 0.0;
19419371c9d4SSatish Balay     sum11 = 0.0;
19429371c9d4SSatish Balay     sum12 = 0.0;
19439371c9d4SSatish Balay     sum13 = 0.0;
19449371c9d4SSatish Balay     sum14 = 0.0;
19459371c9d4SSatish Balay     sum15 = 0.0;
19468ab949d8SShri Abhyankar 
19478ab949d8SShri Abhyankar     for (j = 0; j < n; j++) {
19488ab949d8SShri Abhyankar       xb  = x + 15 * (idx[j]);
19499371c9d4SSatish Balay       x1  = xb[0];
19509371c9d4SSatish Balay       x2  = xb[1];
19519371c9d4SSatish Balay       x3  = xb[2];
19529371c9d4SSatish Balay       x4  = xb[3];
19539371c9d4SSatish Balay       x5  = xb[4];
19549371c9d4SSatish Balay       x6  = xb[5];
19559371c9d4SSatish Balay       x7  = xb[6];
19569371c9d4SSatish Balay       x8  = xb[7];
19579371c9d4SSatish Balay       x9  = xb[8];
19589371c9d4SSatish Balay       x10 = xb[9];
19599371c9d4SSatish Balay       x11 = xb[10];
19609371c9d4SSatish Balay       x12 = xb[11];
19619371c9d4SSatish Balay       x13 = xb[12];
19629371c9d4SSatish Balay       x14 = xb[13];
19639371c9d4SSatish Balay       x15 = xb[14];
19648ab949d8SShri Abhyankar 
19658ab949d8SShri Abhyankar       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
19668ab949d8SShri Abhyankar       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
19678ab949d8SShri Abhyankar       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
19688ab949d8SShri Abhyankar       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
19698ab949d8SShri Abhyankar       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
19708ab949d8SShri Abhyankar       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
19718ab949d8SShri Abhyankar       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
19728ab949d8SShri Abhyankar       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
19738ab949d8SShri Abhyankar       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
19748ab949d8SShri Abhyankar       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
19758ab949d8SShri Abhyankar       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
19768ab949d8SShri Abhyankar       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
19778ab949d8SShri Abhyankar       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
19788ab949d8SShri Abhyankar       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
19798ab949d8SShri Abhyankar       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
19808ab949d8SShri Abhyankar       v += 225;
19818ab949d8SShri Abhyankar     }
19828ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15 * ridx[i];
19839371c9d4SSatish Balay     z[0]  = sum1;
19849371c9d4SSatish Balay     z[1]  = sum2;
19859371c9d4SSatish Balay     z[2]  = sum3;
19869371c9d4SSatish Balay     z[3]  = sum4;
19879371c9d4SSatish Balay     z[4]  = sum5;
19889371c9d4SSatish Balay     z[5]  = sum6;
19899371c9d4SSatish Balay     z[6]  = sum7;
19909371c9d4SSatish Balay     z[7]  = sum8;
19919371c9d4SSatish Balay     z[8]  = sum9;
19929371c9d4SSatish Balay     z[9]  = sum10;
19939371c9d4SSatish Balay     z[10] = sum11;
19949371c9d4SSatish Balay     z[11] = sum12;
19959371c9d4SSatish Balay     z[12] = sum13;
19969371c9d4SSatish Balay     z[13] = sum14;
19979371c9d4SSatish Balay     z[14] = sum15;
19988ab949d8SShri Abhyankar 
19998ab949d8SShri Abhyankar     if (!usecprow) z += 15;
20008ab949d8SShri Abhyankar   }
20018ab949d8SShri Abhyankar 
20029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
20039566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
20049566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
2005*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20068ab949d8SShri Abhyankar }
20078ab949d8SShri Abhyankar 
20083f1db9ecSBarry Smith /*
20093f1db9ecSBarry Smith     This will not work with MatScalar == float because it calls the BLAS
20103f1db9ecSBarry Smith */
2011d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
2012d71ae5a4SJacob Faibussowitsch {
20132d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2014f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
2015d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2016d9ca1df4SBarry Smith   const MatScalar   *v;
2017d9ca1df4SBarry Smith   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2018d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2019d9ca1df4SBarry Smith   PetscInt           ncols, k;
2020ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
20212d61bbb3SSatish Balay 
20222d61bbb3SSatish Balay   PetscFunctionBegin;
20239566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
20249566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(zz, &zarray));
20252d61bbb3SSatish Balay 
20262d61bbb3SSatish Balay   idx = a->j;
20272d61bbb3SSatish Balay   v   = a->a;
202826e093fcSHong Zhang   if (usecprow) {
202926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
203026e093fcSHong Zhang     ii   = a->compressedrow.i;
20317b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
20329566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
203326e093fcSHong Zhang   } else {
203426e093fcSHong Zhang     mbs = a->mbs;
20352d61bbb3SSatish Balay     ii  = a->i;
203626e093fcSHong Zhang     z   = zarray;
203726e093fcSHong Zhang   }
2038218c64b6SSatish Balay 
20392d61bbb3SSatish Balay   if (!a->mult_work) {
2040d0f46423SBarry Smith     k = PetscMax(A->rmap->n, A->cmap->n);
20419566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
20422d61bbb3SSatish Balay   }
20432d61bbb3SSatish Balay   work = a->mult_work;
20442d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
20459371c9d4SSatish Balay     n = ii[1] - ii[0];
20469371c9d4SSatish Balay     ii++;
20472d61bbb3SSatish Balay     ncols = n * bs;
20482d61bbb3SSatish Balay     workt = work;
20492d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
20502d61bbb3SSatish Balay       xb = x + bs * (*idx++);
20512d61bbb3SSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
20522d61bbb3SSatish Balay       workt += bs;
20532d61bbb3SSatish Balay     }
20547b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs * ridx[i];
205596b95a6bSBarry Smith     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
20562d61bbb3SSatish Balay     v += n * bs2;
205726e093fcSHong Zhang     if (!usecprow) z += bs;
20582d61bbb3SSatish Balay   }
20599566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
20609566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(zz, &zarray));
20619566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2062*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20632d61bbb3SSatish Balay }
20642d61bbb3SSatish Balay 
2065d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2066d71ae5a4SJacob Faibussowitsch {
20672d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2068122f12eaSBarry Smith   const PetscScalar *x;
2069122f12eaSBarry Smith   PetscScalar       *y, *z, sum;
2070122f12eaSBarry Smith   const MatScalar   *v;
20717c565772SBarry Smith   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2072122f12eaSBarry Smith   const PetscInt    *idx, *ii;
2073ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
20742d61bbb3SSatish Balay 
20752d61bbb3SSatish Balay   PetscFunctionBegin;
20769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
20779566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
20782d61bbb3SSatish Balay 
20792d61bbb3SSatish Balay   idx = a->j;
20802d61bbb3SSatish Balay   v   = a->a;
208126e093fcSHong Zhang   if (usecprow) {
208248a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
208326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
208426e093fcSHong Zhang     ii   = a->compressedrow.i;
20857b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
208626e093fcSHong Zhang   } else {
20872d61bbb3SSatish Balay     ii = a->i;
208826e093fcSHong Zhang   }
20892d61bbb3SSatish Balay 
20902d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
2091122f12eaSBarry Smith     n = ii[1] - ii[0];
2092122f12eaSBarry Smith     ii++;
209326e093fcSHong Zhang     if (!usecprow) {
2094122f12eaSBarry Smith       sum = y[i];
2095122f12eaSBarry Smith     } else {
2096122f12eaSBarry Smith       sum = y[ridx[i]];
2097122f12eaSBarry Smith     }
2098444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2099444d8c10SJed Brown     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2100122f12eaSBarry Smith     PetscSparseDensePlusDot(sum, x, v, idx, n);
2101122f12eaSBarry Smith     v += n;
2102122f12eaSBarry Smith     idx += n;
2103122f12eaSBarry Smith     if (usecprow) {
2104122f12eaSBarry Smith       z[ridx[i]] = sum;
2105122f12eaSBarry Smith     } else {
2106122f12eaSBarry Smith       z[i] = sum;
210726e093fcSHong Zhang     }
21082d61bbb3SSatish Balay   }
21099566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
21109566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
21119566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
2112*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21132d61bbb3SSatish Balay }
21142d61bbb3SSatish Balay 
2115d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2116d71ae5a4SJacob Faibussowitsch {
21172d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2118f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2119d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
212026e093fcSHong Zhang   PetscScalar        x1, x2, *yarray, *zarray;
2121d9ca1df4SBarry Smith   const MatScalar   *v;
2122d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, j;
2123d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2124ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
21252d61bbb3SSatish Balay 
21262d61bbb3SSatish Balay   PetscFunctionBegin;
21279566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
21289566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
21292d61bbb3SSatish Balay 
21302d61bbb3SSatish Balay   idx = a->j;
21312d61bbb3SSatish Balay   v   = a->a;
213226e093fcSHong Zhang   if (usecprow) {
213348a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
213426e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
213526e093fcSHong Zhang     ii   = a->compressedrow.i;
21367b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
213726e093fcSHong Zhang   } else {
21382d61bbb3SSatish Balay     ii = a->i;
213926e093fcSHong Zhang     y  = yarray;
214026e093fcSHong Zhang     z  = zarray;
214126e093fcSHong Zhang   }
21422d61bbb3SSatish Balay 
21432d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
21449371c9d4SSatish Balay     n = ii[1] - ii[0];
21459371c9d4SSatish Balay     ii++;
214626e093fcSHong Zhang     if (usecprow) {
21477b2bb3b9SHong Zhang       z = zarray + 2 * ridx[i];
21487b2bb3b9SHong Zhang       y = yarray + 2 * ridx[i];
214926e093fcSHong Zhang     }
21509371c9d4SSatish Balay     sum1 = y[0];
21519371c9d4SSatish Balay     sum2 = y[1];
2152444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2153444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
21542d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
215526fbe8dcSKarl Rupp       xb = x + 2 * (*idx++);
215626fbe8dcSKarl Rupp       x1 = xb[0];
215726fbe8dcSKarl Rupp       x2 = xb[1];
215826fbe8dcSKarl Rupp 
21592d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[2] * x2;
21602d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[3] * x2;
21612d61bbb3SSatish Balay       v += 4;
21622d61bbb3SSatish Balay     }
21639371c9d4SSatish Balay     z[0] = sum1;
21649371c9d4SSatish Balay     z[1] = sum2;
216526e093fcSHong Zhang     if (!usecprow) {
21669371c9d4SSatish Balay       z += 2;
21679371c9d4SSatish Balay       y += 2;
21682d61bbb3SSatish Balay     }
216926e093fcSHong Zhang   }
21709566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
21719566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
21729566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0 * a->nz));
2173*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21742d61bbb3SSatish Balay }
21752d61bbb3SSatish Balay 
2176d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2177d71ae5a4SJacob Faibussowitsch {
21782d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2179f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2180d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2181d9ca1df4SBarry Smith   const MatScalar   *v;
2182d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2183d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2184ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
21852d61bbb3SSatish Balay 
21862d61bbb3SSatish Balay   PetscFunctionBegin;
21879566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
21889566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
21892d61bbb3SSatish Balay 
21902d61bbb3SSatish Balay   idx = a->j;
21912d61bbb3SSatish Balay   v   = a->a;
219226e093fcSHong Zhang   if (usecprow) {
219348a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
219426e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
219526e093fcSHong Zhang     ii   = a->compressedrow.i;
21967b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
219726e093fcSHong Zhang   } else {
21982d61bbb3SSatish Balay     ii = a->i;
219926e093fcSHong Zhang     y  = yarray;
220026e093fcSHong Zhang     z  = zarray;
220126e093fcSHong Zhang   }
22022d61bbb3SSatish Balay 
22032d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
22049371c9d4SSatish Balay     n = ii[1] - ii[0];
22059371c9d4SSatish Balay     ii++;
220626e093fcSHong Zhang     if (usecprow) {
22077b2bb3b9SHong Zhang       z = zarray + 3 * ridx[i];
22087b2bb3b9SHong Zhang       y = yarray + 3 * ridx[i];
220926e093fcSHong Zhang     }
22109371c9d4SSatish Balay     sum1 = y[0];
22119371c9d4SSatish Balay     sum2 = y[1];
22129371c9d4SSatish Balay     sum3 = y[2];
2213444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2214444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
22152d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
22169371c9d4SSatish Balay       xb = x + 3 * (*idx++);
22179371c9d4SSatish Balay       x1 = xb[0];
22189371c9d4SSatish Balay       x2 = xb[1];
22199371c9d4SSatish Balay       x3 = xb[2];
22202d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
22212d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
22222d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
22232d61bbb3SSatish Balay       v += 9;
22242d61bbb3SSatish Balay     }
22259371c9d4SSatish Balay     z[0] = sum1;
22269371c9d4SSatish Balay     z[1] = sum2;
22279371c9d4SSatish Balay     z[2] = sum3;
222826e093fcSHong Zhang     if (!usecprow) {
22299371c9d4SSatish Balay       z += 3;
22309371c9d4SSatish Balay       y += 3;
22312d61bbb3SSatish Balay     }
223226e093fcSHong Zhang   }
22339566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
22349566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
22359566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * a->nz));
2236*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
22372d61bbb3SSatish Balay }
22382d61bbb3SSatish Balay 
2239d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2240d71ae5a4SJacob Faibussowitsch {
22412d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2242f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2243d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2244d9ca1df4SBarry Smith   const MatScalar   *v;
2245d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2246d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2247ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
22482d61bbb3SSatish Balay 
22492d61bbb3SSatish Balay   PetscFunctionBegin;
22509566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
22519566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
22522d61bbb3SSatish Balay 
22532d61bbb3SSatish Balay   idx = a->j;
22542d61bbb3SSatish Balay   v   = a->a;
225526e093fcSHong Zhang   if (usecprow) {
225648a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
225726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
225826e093fcSHong Zhang     ii   = a->compressedrow.i;
22597b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
226026e093fcSHong Zhang   } else {
22612d61bbb3SSatish Balay     ii = a->i;
226226e093fcSHong Zhang     y  = yarray;
226326e093fcSHong Zhang     z  = zarray;
226426e093fcSHong Zhang   }
22652d61bbb3SSatish Balay 
22662d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
22679371c9d4SSatish Balay     n = ii[1] - ii[0];
22689371c9d4SSatish Balay     ii++;
226926e093fcSHong Zhang     if (usecprow) {
22707b2bb3b9SHong Zhang       z = zarray + 4 * ridx[i];
22717b2bb3b9SHong Zhang       y = yarray + 4 * ridx[i];
227226e093fcSHong Zhang     }
22739371c9d4SSatish Balay     sum1 = y[0];
22749371c9d4SSatish Balay     sum2 = y[1];
22759371c9d4SSatish Balay     sum3 = y[2];
22769371c9d4SSatish Balay     sum4 = y[3];
2277444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2278444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
22792d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
22802d61bbb3SSatish Balay       xb = x + 4 * (*idx++);
22819371c9d4SSatish Balay       x1 = xb[0];
22829371c9d4SSatish Balay       x2 = xb[1];
22839371c9d4SSatish Balay       x3 = xb[2];
22849371c9d4SSatish Balay       x4 = xb[3];
22852d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
22862d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
22872d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
22882d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
22892d61bbb3SSatish Balay       v += 16;
22902d61bbb3SSatish Balay     }
22919371c9d4SSatish Balay     z[0] = sum1;
22929371c9d4SSatish Balay     z[1] = sum2;
22939371c9d4SSatish Balay     z[2] = sum3;
22949371c9d4SSatish Balay     z[3] = sum4;
229526e093fcSHong Zhang     if (!usecprow) {
22969371c9d4SSatish Balay       z += 4;
22979371c9d4SSatish Balay       y += 4;
22982d61bbb3SSatish Balay     }
229926e093fcSHong Zhang   }
23009566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
23019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
23029566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * a->nz));
2303*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
23042d61bbb3SSatish Balay }
23052d61bbb3SSatish Balay 
2306d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2307d71ae5a4SJacob Faibussowitsch {
23082d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2309f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2310d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
231126e093fcSHong Zhang   PetscScalar       *yarray, *zarray;
2312d9ca1df4SBarry Smith   const MatScalar   *v;
2313d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2314d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2315ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
23162d61bbb3SSatish Balay 
23172d61bbb3SSatish Balay   PetscFunctionBegin;
23189566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
23199566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
23202d61bbb3SSatish Balay 
23212d61bbb3SSatish Balay   idx = a->j;
23222d61bbb3SSatish Balay   v   = a->a;
232326e093fcSHong Zhang   if (usecprow) {
232448a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
232526e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
232626e093fcSHong Zhang     ii   = a->compressedrow.i;
23277b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
232826e093fcSHong Zhang   } else {
23292d61bbb3SSatish Balay     ii = a->i;
233026e093fcSHong Zhang     y  = yarray;
233126e093fcSHong Zhang     z  = zarray;
233226e093fcSHong Zhang   }
23332d61bbb3SSatish Balay 
23342d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
23359371c9d4SSatish Balay     n = ii[1] - ii[0];
23369371c9d4SSatish Balay     ii++;
233726e093fcSHong Zhang     if (usecprow) {
23387b2bb3b9SHong Zhang       z = zarray + 5 * ridx[i];
23397b2bb3b9SHong Zhang       y = yarray + 5 * ridx[i];
234026e093fcSHong Zhang     }
23419371c9d4SSatish Balay     sum1 = y[0];
23429371c9d4SSatish Balay     sum2 = y[1];
23439371c9d4SSatish Balay     sum3 = y[2];
23449371c9d4SSatish Balay     sum4 = y[3];
23459371c9d4SSatish Balay     sum5 = y[4];
2346444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2347444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
23482d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
23492d61bbb3SSatish Balay       xb = x + 5 * (*idx++);
23509371c9d4SSatish Balay       x1 = xb[0];
23519371c9d4SSatish Balay       x2 = xb[1];
23529371c9d4SSatish Balay       x3 = xb[2];
23539371c9d4SSatish Balay       x4 = xb[3];
23549371c9d4SSatish Balay       x5 = xb[4];
23552d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
23562d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
23572d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
23582d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
23592d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
23602d61bbb3SSatish Balay       v += 25;
23612d61bbb3SSatish Balay     }
23629371c9d4SSatish Balay     z[0] = sum1;
23639371c9d4SSatish Balay     z[1] = sum2;
23649371c9d4SSatish Balay     z[2] = sum3;
23659371c9d4SSatish Balay     z[3] = sum4;
23669371c9d4SSatish Balay     z[4] = sum5;
236726e093fcSHong Zhang     if (!usecprow) {
23689371c9d4SSatish Balay       z += 5;
23699371c9d4SSatish Balay       y += 5;
23702d61bbb3SSatish Balay     }
237126e093fcSHong Zhang   }
23729566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
23739566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
23749566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * a->nz));
2375*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
23762d61bbb3SSatish Balay }
2377c2916339SPierre Jolivet 
2378d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2379d71ae5a4SJacob Faibussowitsch {
238015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2381f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2382d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
238326e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2384d9ca1df4SBarry Smith   const MatScalar   *v;
2385d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2386d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2387ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
238815091d37SBarry Smith 
238915091d37SBarry Smith   PetscFunctionBegin;
23909566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
23919566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
239215091d37SBarry Smith 
239315091d37SBarry Smith   idx = a->j;
239415091d37SBarry Smith   v   = a->a;
239526e093fcSHong Zhang   if (usecprow) {
239648a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
239726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
239826e093fcSHong Zhang     ii   = a->compressedrow.i;
23997b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
240026e093fcSHong Zhang   } else {
240115091d37SBarry Smith     ii = a->i;
240226e093fcSHong Zhang     y  = yarray;
240326e093fcSHong Zhang     z  = zarray;
240426e093fcSHong Zhang   }
240515091d37SBarry Smith 
240615091d37SBarry Smith   for (i = 0; i < mbs; i++) {
24079371c9d4SSatish Balay     n = ii[1] - ii[0];
24089371c9d4SSatish Balay     ii++;
240926e093fcSHong Zhang     if (usecprow) {
24107b2bb3b9SHong Zhang       z = zarray + 6 * ridx[i];
24117b2bb3b9SHong Zhang       y = yarray + 6 * ridx[i];
241226e093fcSHong Zhang     }
24139371c9d4SSatish Balay     sum1 = y[0];
24149371c9d4SSatish Balay     sum2 = y[1];
24159371c9d4SSatish Balay     sum3 = y[2];
24169371c9d4SSatish Balay     sum4 = y[3];
24179371c9d4SSatish Balay     sum5 = y[4];
24189371c9d4SSatish Balay     sum6 = y[5];
2419444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2420444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
242115091d37SBarry Smith     for (j = 0; j < n; j++) {
24223b95cb0eSSatish Balay       xb = x + 6 * (*idx++);
24239371c9d4SSatish Balay       x1 = xb[0];
24249371c9d4SSatish Balay       x2 = xb[1];
24259371c9d4SSatish Balay       x3 = xb[2];
24269371c9d4SSatish Balay       x4 = xb[3];
24279371c9d4SSatish Balay       x5 = xb[4];
24289371c9d4SSatish Balay       x6 = xb[5];
242915091d37SBarry Smith       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
243015091d37SBarry Smith       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
243115091d37SBarry Smith       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
243215091d37SBarry Smith       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
243315091d37SBarry Smith       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
243415091d37SBarry Smith       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
243515091d37SBarry Smith       v += 36;
243615091d37SBarry Smith     }
24379371c9d4SSatish Balay     z[0] = sum1;
24389371c9d4SSatish Balay     z[1] = sum2;
24399371c9d4SSatish Balay     z[2] = sum3;
24409371c9d4SSatish Balay     z[3] = sum4;
24419371c9d4SSatish Balay     z[4] = sum5;
24429371c9d4SSatish Balay     z[5] = sum6;
244326e093fcSHong Zhang     if (!usecprow) {
24449371c9d4SSatish Balay       z += 6;
24459371c9d4SSatish Balay       y += 6;
244615091d37SBarry Smith     }
244726e093fcSHong Zhang   }
24489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
24499566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
24509566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * a->nz));
2451*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
245215091d37SBarry Smith }
24532d61bbb3SSatish Balay 
2454d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2455d71ae5a4SJacob Faibussowitsch {
24562d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2457f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2458d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
245926e093fcSHong Zhang   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2460d9ca1df4SBarry Smith   const MatScalar   *v;
2461d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2462d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2463ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
24642d61bbb3SSatish Balay 
24652d61bbb3SSatish Balay   PetscFunctionBegin;
24669566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
24679566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
24682d61bbb3SSatish Balay 
24692d61bbb3SSatish Balay   idx = a->j;
24702d61bbb3SSatish Balay   v   = a->a;
247126e093fcSHong Zhang   if (usecprow) {
247248a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
247326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
247426e093fcSHong Zhang     ii   = a->compressedrow.i;
24757b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
247626e093fcSHong Zhang   } else {
24772d61bbb3SSatish Balay     ii = a->i;
247826e093fcSHong Zhang     y  = yarray;
247926e093fcSHong Zhang     z  = zarray;
248026e093fcSHong Zhang   }
24812d61bbb3SSatish Balay 
24822d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
24839371c9d4SSatish Balay     n = ii[1] - ii[0];
24849371c9d4SSatish Balay     ii++;
248526e093fcSHong Zhang     if (usecprow) {
24867b2bb3b9SHong Zhang       z = zarray + 7 * ridx[i];
24877b2bb3b9SHong Zhang       y = yarray + 7 * ridx[i];
248826e093fcSHong Zhang     }
24899371c9d4SSatish Balay     sum1 = y[0];
24909371c9d4SSatish Balay     sum2 = y[1];
24919371c9d4SSatish Balay     sum3 = y[2];
24929371c9d4SSatish Balay     sum4 = y[3];
24939371c9d4SSatish Balay     sum5 = y[4];
24949371c9d4SSatish Balay     sum6 = y[5];
24959371c9d4SSatish Balay     sum7 = y[6];
2496444d8c10SJed Brown     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2497444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
24982d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
24992d61bbb3SSatish Balay       xb = x + 7 * (*idx++);
25009371c9d4SSatish Balay       x1 = xb[0];
25019371c9d4SSatish Balay       x2 = xb[1];
25029371c9d4SSatish Balay       x3 = xb[2];
25039371c9d4SSatish Balay       x4 = xb[3];
25049371c9d4SSatish Balay       x5 = xb[4];
25059371c9d4SSatish Balay       x6 = xb[5];
25069371c9d4SSatish Balay       x7 = xb[6];
25072d61bbb3SSatish Balay       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
25082d61bbb3SSatish Balay       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
25092d61bbb3SSatish Balay       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
25102d61bbb3SSatish Balay       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
25112d61bbb3SSatish Balay       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
25122d61bbb3SSatish Balay       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
25132d61bbb3SSatish Balay       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
25142d61bbb3SSatish Balay       v += 49;
25152d61bbb3SSatish Balay     }
25169371c9d4SSatish Balay     z[0] = sum1;
25179371c9d4SSatish Balay     z[1] = sum2;
25189371c9d4SSatish Balay     z[2] = sum3;
25199371c9d4SSatish Balay     z[3] = sum4;
25209371c9d4SSatish Balay     z[4] = sum5;
25219371c9d4SSatish Balay     z[5] = sum6;
25229371c9d4SSatish Balay     z[6] = sum7;
252326e093fcSHong Zhang     if (!usecprow) {
25249371c9d4SSatish Balay       z += 7;
25259371c9d4SSatish Balay       y += 7;
25262d61bbb3SSatish Balay     }
252726e093fcSHong Zhang   }
25289566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
25299566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
25309566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * a->nz));
2531*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
25322d61bbb3SSatish Balay }
2533218c64b6SSatish Balay 
25345f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2535d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2536d71ae5a4SJacob Faibussowitsch {
253796e086a2SDaniel Kokron   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2538f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
253996e086a2SDaniel Kokron   const PetscScalar *x, *xb;
254096e086a2SDaniel Kokron   const MatScalar   *v;
25416679dcc1SBarry Smith   PetscInt           mbs, i, j, n;
2542ce68d72fSJed Brown   PetscInt           k;
254396e086a2SDaniel Kokron   PetscBool          usecprow = a->compressedrow.use;
25446679dcc1SBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
254596e086a2SDaniel Kokron 
254696e086a2SDaniel Kokron   __m256d a0, a1, a2, a3, a4, a5;
2547ce68d72fSJed Brown   __m256d w0, w1, w2, w3;
254896e086a2SDaniel Kokron   __m256d z0, z1, z2;
254996e086a2SDaniel Kokron   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
255096e086a2SDaniel Kokron 
255196e086a2SDaniel Kokron   PetscFunctionBegin;
25529566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
25539566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
25549566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &zarray));
255596e086a2SDaniel Kokron 
255696e086a2SDaniel Kokron   idx = a->j;
255796e086a2SDaniel Kokron   v   = a->a;
255896e086a2SDaniel Kokron   if (usecprow) {
255996e086a2SDaniel Kokron     mbs  = a->compressedrow.nrows;
256096e086a2SDaniel Kokron     ii   = a->compressedrow.i;
256196e086a2SDaniel Kokron     ridx = a->compressedrow.rindex;
256296e086a2SDaniel Kokron   } else {
256396e086a2SDaniel Kokron     mbs = a->mbs;
256496e086a2SDaniel Kokron     ii  = a->i;
256596e086a2SDaniel Kokron     z   = zarray;
256696e086a2SDaniel Kokron   }
256796e086a2SDaniel Kokron 
256896e086a2SDaniel Kokron   if (!a->mult_work) {
256996e086a2SDaniel Kokron     k = PetscMax(A->rmap->n, A->cmap->n);
25709566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
257196e086a2SDaniel Kokron   }
257296e086a2SDaniel Kokron 
257396e086a2SDaniel Kokron   work = a->mult_work;
257496e086a2SDaniel Kokron   for (i = 0; i < mbs; i++) {
25759371c9d4SSatish Balay     n = ii[1] - ii[0];
25769371c9d4SSatish Balay     ii++;
257796e086a2SDaniel Kokron     workt = work;
257896e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
257996e086a2SDaniel Kokron       xb = x + bs * (*idx++);
258096e086a2SDaniel Kokron       for (k = 0; k < bs; k++) workt[k] = xb[k];
258196e086a2SDaniel Kokron       workt += bs;
258296e086a2SDaniel Kokron     }
258396e086a2SDaniel Kokron     if (usecprow) z = zarray + bs * ridx[i];
258496e086a2SDaniel Kokron 
25859371c9d4SSatish Balay     z0 = _mm256_loadu_pd(&z[0]);
25869371c9d4SSatish Balay     z1 = _mm256_loadu_pd(&z[4]);
25879371c9d4SSatish Balay     z2 = _mm256_set1_pd(z[8]);
258896e086a2SDaniel Kokron 
258996e086a2SDaniel Kokron     for (j = 0; j < n; j++) {
2590c05b70c4SSatish Balay       /* first column of a */
259196e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9]);
25929371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81]);
25939371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
25949371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
25959371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
25969371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
25979371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
259896e086a2SDaniel Kokron 
2599c05b70c4SSatish Balay       /* second column of a */
260096e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 1]);
26019371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
26029371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
26039371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
26049371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
26059371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
26069371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
260796e086a2SDaniel Kokron 
2608c05b70c4SSatish Balay       /* third column of a */
260996e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 2]);
26109371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
26119371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w2, z0);
26129371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
26139371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w2, z1);
26149371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
26159371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w2, z2);
261696e086a2SDaniel Kokron 
2617c05b70c4SSatish Balay       /* fourth column of a */
261896e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 3]);
26199371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
26209371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w3, z0);
26219371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
26229371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w3, z1);
26239371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
26249371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w3, z2);
262596e086a2SDaniel Kokron 
2626c05b70c4SSatish Balay       /* fifth column of a */
262796e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 4]);
26289371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
26299371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w0, z0);
26309371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
26319371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w0, z1);
26329371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
26339371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w0, z2);
263496e086a2SDaniel Kokron 
2635c05b70c4SSatish Balay       /* sixth column of a */
263696e086a2SDaniel Kokron       w1 = _mm256_set1_pd(work[j * 9 + 5]);
26379371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
26389371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w1, z0);
26399371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
26409371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w1, z1);
26419371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
26429371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w1, z2);
264396e086a2SDaniel Kokron 
2644c05b70c4SSatish Balay       /* seventh column of a */
264596e086a2SDaniel Kokron       w2 = _mm256_set1_pd(work[j * 9 + 6]);
26469371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
26479371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w2, z0);
26489371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
26499371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w2, z1);
26509371c9d4SSatish Balay       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
26519371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w2, z2);
265296e086a2SDaniel Kokron 
26536aad120cSJose E. Roman       /* eighth column of a */
265496e086a2SDaniel Kokron       w3 = _mm256_set1_pd(work[j * 9 + 7]);
26559371c9d4SSatish Balay       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
26569371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a3, w3, z0);
26579371c9d4SSatish Balay       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
26589371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a4, w3, z1);
26599371c9d4SSatish Balay       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
26609371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a5, w3, z2);
266196e086a2SDaniel Kokron 
2662c05b70c4SSatish Balay       /* ninth column of a */
266396e086a2SDaniel Kokron       w0 = _mm256_set1_pd(work[j * 9 + 8]);
26649371c9d4SSatish Balay       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
26659371c9d4SSatish Balay       z0 = _mm256_fmadd_pd(a0, w0, z0);
26669371c9d4SSatish Balay       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
26679371c9d4SSatish Balay       z1 = _mm256_fmadd_pd(a1, w0, z1);
26689371c9d4SSatish Balay       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
26699371c9d4SSatish Balay       z2 = _mm256_fmadd_pd(a2, w0, z2);
267096e086a2SDaniel Kokron     }
267196e086a2SDaniel Kokron 
26729371c9d4SSatish Balay     _mm256_storeu_pd(&z[0], z0);
26739371c9d4SSatish Balay     _mm256_storeu_pd(&z[4], z1);
26749371c9d4SSatish Balay     _mm256_maskstore_pd(&z[8], mask1, z2);
267596e086a2SDaniel Kokron 
267696e086a2SDaniel Kokron     v += n * bs2;
267796e086a2SDaniel Kokron     if (!usecprow) z += bs;
267896e086a2SDaniel Kokron   }
26799566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
26809566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &zarray));
26819566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(162.0 * a->nz));
2682*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
268396e086a2SDaniel Kokron }
268496e086a2SDaniel Kokron #endif
268596e086a2SDaniel Kokron 
2686d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2687d71ae5a4SJacob Faibussowitsch {
2688ebada01fSBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2689f4259b30SLisandro Dalcin   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2690ebada01fSBarry Smith   const PetscScalar *x, *xb;
2691ebada01fSBarry Smith   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2692ebada01fSBarry Smith   const MatScalar   *v;
2693ebada01fSBarry Smith   PetscInt           mbs = a->mbs, i, j, n;
2694ebada01fSBarry Smith   const PetscInt    *idx, *ii, *ridx = NULL;
2695ebada01fSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2696ebada01fSBarry Smith 
2697ebada01fSBarry Smith   PetscFunctionBegin;
26989566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
26999566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2700ebada01fSBarry Smith 
2701ebada01fSBarry Smith   idx = a->j;
2702ebada01fSBarry Smith   v   = a->a;
2703ebada01fSBarry Smith   if (usecprow) {
270448a46eb9SPierre Jolivet     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2705ebada01fSBarry Smith     mbs  = a->compressedrow.nrows;
2706ebada01fSBarry Smith     ii   = a->compressedrow.i;
2707ebada01fSBarry Smith     ridx = a->compressedrow.rindex;
2708ebada01fSBarry Smith   } else {
2709ebada01fSBarry Smith     ii = a->i;
2710ebada01fSBarry Smith     y  = yarray;
2711ebada01fSBarry Smith     z  = zarray;
2712ebada01fSBarry Smith   }
2713ebada01fSBarry Smith 
2714ebada01fSBarry Smith   for (i = 0; i < mbs; i++) {
27159371c9d4SSatish Balay     n = ii[1] - ii[0];
27169371c9d4SSatish Balay     ii++;
2717ebada01fSBarry Smith     if (usecprow) {
2718ebada01fSBarry Smith       z = zarray + 11 * ridx[i];
2719ebada01fSBarry Smith       y = yarray + 11 * ridx[i];
2720ebada01fSBarry Smith     }
27219371c9d4SSatish Balay     sum1  = y[0];
27229371c9d4SSatish Balay     sum2  = y[1];
27239371c9d4SSatish Balay     sum3  = y[2];
27249371c9d4SSatish Balay     sum4  = y[3];
27259371c9d4SSatish Balay     sum5  = y[4];
27269371c9d4SSatish Balay     sum6  = y[5];
27279371c9d4SSatish Balay     sum7  = y[6];
27289371c9d4SSatish Balay     sum8  = y[7];
27299371c9d4SSatish Balay     sum9  = y[8];
27309371c9d4SSatish Balay     sum10 = y[9];
27319371c9d4SSatish Balay     sum11 = y[10];
2732ebada01fSBarry Smith     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2733ebada01fSBarry Smith     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2734ebada01fSBarry Smith     for (j = 0; j < n; j++) {
2735ebada01fSBarry Smith       xb  = x + 11 * (*idx++);
27369371c9d4SSatish Balay       x1  = xb[0];
27379371c9d4SSatish Balay       x2  = xb[1];
27389371c9d4SSatish Balay       x3  = xb[2];
27399371c9d4SSatish Balay       x4  = xb[3];
27409371c9d4SSatish Balay       x5  = xb[4];
27419371c9d4SSatish Balay       x6  = xb[5];
27429371c9d4SSatish Balay       x7  = xb[6];
27439371c9d4SSatish Balay       x8  = xb[7];
27449371c9d4SSatish Balay       x9  = xb[8];
27459371c9d4SSatish Balay       x10 = xb[9];
27469371c9d4SSatish Balay       x11 = xb[10];
2747ebada01fSBarry Smith       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2748ebada01fSBarry Smith       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2749ebada01fSBarry Smith       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2750ebada01fSBarry Smith       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2751ebada01fSBarry Smith       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2752ebada01fSBarry Smith       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2753ebada01fSBarry Smith       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2754ebada01fSBarry Smith       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2755ebada01fSBarry Smith       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2756ebada01fSBarry Smith       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2757ebada01fSBarry Smith       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2758ebada01fSBarry Smith       v += 121;
2759ebada01fSBarry Smith     }
27609371c9d4SSatish Balay     z[0]  = sum1;
27619371c9d4SSatish Balay     z[1]  = sum2;
27629371c9d4SSatish Balay     z[2]  = sum3;
27639371c9d4SSatish Balay     z[3]  = sum4;
27649371c9d4SSatish Balay     z[4]  = sum5;
27659371c9d4SSatish Balay     z[5]  = sum6;
27669371c9d4SSatish Balay     z[6]  = sum7;
27679371c9d4SSatish Balay     z[7]  = sum8;
27689371c9d4SSatish Balay     z[8]  = sum9;
27699371c9d4SSatish Balay     z[9]  = sum10;
27709371c9d4SSatish Balay     z[10] = sum11;
2771ebada01fSBarry Smith     if (!usecprow) {
27729371c9d4SSatish Balay       z += 11;
27739371c9d4SSatish Balay       y += 11;
2774ebada01fSBarry Smith     }
2775ebada01fSBarry Smith   }
27769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
27779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
27789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(242.0 * a->nz));
2779*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2780ebada01fSBarry Smith }
2781ebada01fSBarry Smith 
2782d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2783d71ae5a4SJacob Faibussowitsch {
27842d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2785f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, *work, *workt, *zarray;
2786d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
2787d9ca1df4SBarry Smith   const MatScalar   *v;
2788d9ca1df4SBarry Smith   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2789d9ca1df4SBarry Smith   PetscInt           ncols, k;
2790d9ca1df4SBarry Smith   const PetscInt    *ridx     = NULL, *idx, *ii;
2791ace3abfcSBarry Smith   PetscBool          usecprow = a->compressedrow.use;
2792218c64b6SSatish Balay 
27932d61bbb3SSatish Balay   PetscFunctionBegin;
27949566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
27959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
27969566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &zarray));
27972d61bbb3SSatish Balay 
27982d61bbb3SSatish Balay   idx = a->j;
27992d61bbb3SSatish Balay   v   = a->a;
280026e093fcSHong Zhang   if (usecprow) {
280126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
280226e093fcSHong Zhang     ii   = a->compressedrow.i;
28037b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
280426e093fcSHong Zhang   } else {
280526e093fcSHong Zhang     mbs = a->mbs;
28062d61bbb3SSatish Balay     ii  = a->i;
280726e093fcSHong Zhang     z   = zarray;
280826e093fcSHong Zhang   }
28092d61bbb3SSatish Balay 
28102d61bbb3SSatish Balay   if (!a->mult_work) {
2811d0f46423SBarry Smith     k = PetscMax(A->rmap->n, A->cmap->n);
28129566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
28132d61bbb3SSatish Balay   }
28142d61bbb3SSatish Balay   work = a->mult_work;
28152d61bbb3SSatish Balay   for (i = 0; i < mbs; i++) {
28169371c9d4SSatish Balay     n = ii[1] - ii[0];
28179371c9d4SSatish Balay     ii++;
28182d61bbb3SSatish Balay     ncols = n * bs;
28192d61bbb3SSatish Balay     workt = work;
28202d61bbb3SSatish Balay     for (j = 0; j < n; j++) {
28212d61bbb3SSatish Balay       xb = x + bs * (*idx++);
28222d61bbb3SSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
28232d61bbb3SSatish Balay       workt += bs;
28242d61bbb3SSatish Balay     }
28257b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs * ridx[i];
282696b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
28272d61bbb3SSatish Balay     v += n * bs2;
282826fbe8dcSKarl Rupp     if (!usecprow) z += bs;
282926e093fcSHong Zhang   }
28309566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
28319566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &zarray));
28329566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2833*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
28342d61bbb3SSatish Balay }
28352d61bbb3SSatish Balay 
2836d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2837d71ae5a4SJacob Faibussowitsch {
2838547795f9SHong Zhang   PetscScalar zero = 0.0;
2839547795f9SHong Zhang 
2840547795f9SHong Zhang   PetscFunctionBegin;
28419566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
28429566063dSJacob Faibussowitsch   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2843*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2844547795f9SHong Zhang }
2845547795f9SHong Zhang 
2846d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2847d71ae5a4SJacob Faibussowitsch {
28483447b6efSHong Zhang   PetscScalar zero = 0.0;
28492d61bbb3SSatish Balay 
28502d61bbb3SSatish Balay   PetscFunctionBegin;
28519566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
28529566063dSJacob Faibussowitsch   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2853*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
28542d61bbb3SSatish Balay }
28552d61bbb3SSatish Balay 
2856d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2857d71ae5a4SJacob Faibussowitsch {
2858547795f9SHong Zhang   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2859b8c08b77SHong Zhang   PetscScalar       *z, x1, x2, x3, x4, x5;
2860d9ca1df4SBarry Smith   const PetscScalar *x, *xb = NULL;
2861d9ca1df4SBarry Smith   const MatScalar   *v;
2862b8c08b77SHong Zhang   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2863d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2864547795f9SHong Zhang   Mat_CompressedRow  cprow    = a->compressedrow;
2865ace3abfcSBarry Smith   PetscBool          usecprow = cprow.use;
2866547795f9SHong Zhang 
2867547795f9SHong Zhang   PetscFunctionBegin;
28689566063dSJacob Faibussowitsch   if (yy != zz) PetscCall(VecCopy(yy, zz));
28699566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
28709566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
2871547795f9SHong Zhang 
2872547795f9SHong Zhang   idx = a->j;
2873547795f9SHong Zhang   v   = a->a;
2874547795f9SHong Zhang   if (usecprow) {
2875547795f9SHong Zhang     mbs  = cprow.nrows;
2876547795f9SHong Zhang     ii   = cprow.i;
2877547795f9SHong Zhang     ridx = cprow.rindex;
2878547795f9SHong Zhang   } else {
2879547795f9SHong Zhang     mbs = a->mbs;
2880547795f9SHong Zhang     ii  = a->i;
2881547795f9SHong Zhang     xb  = x;
2882547795f9SHong Zhang   }
2883547795f9SHong Zhang 
2884547795f9SHong Zhang   switch (bs) {
2885547795f9SHong Zhang   case 1:
2886547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2887547795f9SHong Zhang       if (usecprow) xb = x + ridx[i];
2888547795f9SHong Zhang       x1 = xb[0];
2889547795f9SHong Zhang       ib = idx + ii[0];
28909371c9d4SSatish Balay       n  = ii[1] - ii[0];
28919371c9d4SSatish Balay       ii++;
2892547795f9SHong Zhang       for (j = 0; j < n; j++) {
2893547795f9SHong Zhang         rval = ib[j];
2894547795f9SHong Zhang         z[rval] += PetscConj(*v) * x1;
2895547795f9SHong Zhang         v++;
2896547795f9SHong Zhang       }
2897547795f9SHong Zhang       if (!usecprow) xb++;
2898547795f9SHong Zhang     }
2899547795f9SHong Zhang     break;
2900547795f9SHong Zhang   case 2:
2901547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2902547795f9SHong Zhang       if (usecprow) xb = x + 2 * ridx[i];
29039371c9d4SSatish Balay       x1 = xb[0];
29049371c9d4SSatish Balay       x2 = xb[1];
2905547795f9SHong Zhang       ib = idx + ii[0];
29069371c9d4SSatish Balay       n  = ii[1] - ii[0];
29079371c9d4SSatish Balay       ii++;
2908547795f9SHong Zhang       for (j = 0; j < n; j++) {
2909547795f9SHong Zhang         rval = ib[j] * 2;
2910547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2911547795f9SHong Zhang         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2912547795f9SHong Zhang         v += 4;
2913547795f9SHong Zhang       }
2914547795f9SHong Zhang       if (!usecprow) xb += 2;
2915547795f9SHong Zhang     }
2916547795f9SHong Zhang     break;
2917547795f9SHong Zhang   case 3:
2918547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2919547795f9SHong Zhang       if (usecprow) xb = x + 3 * ridx[i];
29209371c9d4SSatish Balay       x1 = xb[0];
29219371c9d4SSatish Balay       x2 = xb[1];
29229371c9d4SSatish Balay       x3 = xb[2];
2923547795f9SHong Zhang       ib = idx + ii[0];
29249371c9d4SSatish Balay       n  = ii[1] - ii[0];
29259371c9d4SSatish Balay       ii++;
2926547795f9SHong Zhang       for (j = 0; j < n; j++) {
2927547795f9SHong Zhang         rval = ib[j] * 3;
2928547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2929547795f9SHong Zhang         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2930547795f9SHong Zhang         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2931547795f9SHong Zhang         v += 9;
2932547795f9SHong Zhang       }
2933547795f9SHong Zhang       if (!usecprow) xb += 3;
2934547795f9SHong Zhang     }
2935547795f9SHong Zhang     break;
2936547795f9SHong Zhang   case 4:
2937547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2938547795f9SHong Zhang       if (usecprow) xb = x + 4 * ridx[i];
29399371c9d4SSatish Balay       x1 = xb[0];
29409371c9d4SSatish Balay       x2 = xb[1];
29419371c9d4SSatish Balay       x3 = xb[2];
29429371c9d4SSatish Balay       x4 = xb[3];
2943547795f9SHong Zhang       ib = idx + ii[0];
29449371c9d4SSatish Balay       n  = ii[1] - ii[0];
29459371c9d4SSatish Balay       ii++;
2946547795f9SHong Zhang       for (j = 0; j < n; j++) {
2947547795f9SHong Zhang         rval = ib[j] * 4;
2948547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2949547795f9SHong Zhang         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2950547795f9SHong Zhang         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2951547795f9SHong Zhang         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2952547795f9SHong Zhang         v += 16;
2953547795f9SHong Zhang       }
2954547795f9SHong Zhang       if (!usecprow) xb += 4;
2955547795f9SHong Zhang     }
2956547795f9SHong Zhang     break;
2957547795f9SHong Zhang   case 5:
2958547795f9SHong Zhang     for (i = 0; i < mbs; i++) {
2959547795f9SHong Zhang       if (usecprow) xb = x + 5 * ridx[i];
29609371c9d4SSatish Balay       x1 = xb[0];
29619371c9d4SSatish Balay       x2 = xb[1];
29629371c9d4SSatish Balay       x3 = xb[2];
29639371c9d4SSatish Balay       x4 = xb[3];
29649371c9d4SSatish Balay       x5 = xb[4];
2965547795f9SHong Zhang       ib = idx + ii[0];
29669371c9d4SSatish Balay       n  = ii[1] - ii[0];
29679371c9d4SSatish Balay       ii++;
2968547795f9SHong Zhang       for (j = 0; j < n; j++) {
2969547795f9SHong Zhang         rval = ib[j] * 5;
2970547795f9SHong Zhang         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2971547795f9SHong Zhang         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2972547795f9SHong Zhang         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2973547795f9SHong Zhang         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2974547795f9SHong Zhang         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2975547795f9SHong Zhang         v += 25;
2976547795f9SHong Zhang       }
2977547795f9SHong Zhang       if (!usecprow) xb += 5;
2978547795f9SHong Zhang     }
2979547795f9SHong Zhang     break;
2980d71ae5a4SJacob Faibussowitsch   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2981d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2982968ae2c8SSatish Balay #if 0
2983968ae2c8SSatish Balay     {
2984b8c08b77SHong Zhang       PetscInt          ncols,k,bs2=a->bs2;
2985b8c08b77SHong Zhang       PetscScalar       *work,*workt,zb;
2986d9ca1df4SBarry Smith       const PetscScalar *xtmp;
2987547795f9SHong Zhang       if (!a->mult_work) {
2988547795f9SHong Zhang         k    = PetscMax(A->rmap->n,A->cmap->n);
29899566063dSJacob Faibussowitsch         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2990547795f9SHong Zhang       }
2991547795f9SHong Zhang       work = a->mult_work;
2992547795f9SHong Zhang       xtmp = x;
2993547795f9SHong Zhang       for (i=0; i<mbs; i++) {
2994547795f9SHong Zhang         n     = ii[1] - ii[0]; ii++;
2995547795f9SHong Zhang         ncols = n*bs;
29969566063dSJacob Faibussowitsch         PetscCall(PetscArrayzero(work,ncols));
299726fbe8dcSKarl Rupp         if (usecprow) xtmp = x + bs*ridx[i];
299896b95a6bSBarry Smith         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2999547795f9SHong Zhang         v += n*bs2;
3000547795f9SHong Zhang         if (!usecprow) xtmp += bs;
3001547795f9SHong Zhang         workt = work;
3002547795f9SHong Zhang         for (j=0; j<n; j++) {
3003547795f9SHong Zhang           zb = z + bs*(*idx++);
3004547795f9SHong Zhang           for (k=0; k<bs; k++) zb[k] += workt[k] ;
3005547795f9SHong Zhang           workt += bs;
3006547795f9SHong Zhang         }
3007547795f9SHong Zhang       }
3008547795f9SHong Zhang     }
3009968ae2c8SSatish Balay #endif
3010547795f9SHong Zhang   }
30119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
30129566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
30139566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3014*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3015547795f9SHong Zhang }
3016547795f9SHong Zhang 
3017d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
3018d71ae5a4SJacob Faibussowitsch {
30192d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3020d9ca1df4SBarry Smith   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
3021f4259b30SLisandro Dalcin   const PetscScalar *x, *xb = NULL;
3022d9ca1df4SBarry Smith   const MatScalar   *v;
3023d9ca1df4SBarry Smith   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3024d9ca1df4SBarry Smith   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
30253447b6efSHong Zhang   Mat_CompressedRow  cprow    = a->compressedrow;
3026ace3abfcSBarry Smith   PetscBool          usecprow = cprow.use;
30272d61bbb3SSatish Balay 
30282d61bbb3SSatish Balay   PetscFunctionBegin;
30299566063dSJacob Faibussowitsch   if (yy != zz) PetscCall(VecCopy(yy, zz));
30309566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
30319566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
30322d61bbb3SSatish Balay 
30332d61bbb3SSatish Balay   idx = a->j;
30342d61bbb3SSatish Balay   v   = a->a;
30353447b6efSHong Zhang   if (usecprow) {
30363447b6efSHong Zhang     mbs  = cprow.nrows;
30373447b6efSHong Zhang     ii   = cprow.i;
30387b2bb3b9SHong Zhang     ridx = cprow.rindex;
30393447b6efSHong Zhang   } else {
30403447b6efSHong Zhang     mbs = a->mbs;
30412d61bbb3SSatish Balay     ii  = a->i;
3042f1af5d2fSBarry Smith     xb  = x;
30433447b6efSHong Zhang   }
30442d61bbb3SSatish Balay 
30452d61bbb3SSatish Balay   switch (bs) {
30462d61bbb3SSatish Balay   case 1:
30472d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30487b2bb3b9SHong Zhang       if (usecprow) xb = x + ridx[i];
3049f1af5d2fSBarry Smith       x1 = xb[0];
30503447b6efSHong Zhang       ib = idx + ii[0];
30519371c9d4SSatish Balay       n  = ii[1] - ii[0];
30529371c9d4SSatish Balay       ii++;
30532d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30542d61bbb3SSatish Balay         rval = ib[j];
3055f1af5d2fSBarry Smith         z[rval] += *v * x1;
3056f1af5d2fSBarry Smith         v++;
30572d61bbb3SSatish Balay       }
30583447b6efSHong Zhang       if (!usecprow) xb++;
30592d61bbb3SSatish Balay     }
30602d61bbb3SSatish Balay     break;
30612d61bbb3SSatish Balay   case 2:
30622d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30637b2bb3b9SHong Zhang       if (usecprow) xb = x + 2 * ridx[i];
30649371c9d4SSatish Balay       x1 = xb[0];
30659371c9d4SSatish Balay       x2 = xb[1];
30663447b6efSHong Zhang       ib = idx + ii[0];
30679371c9d4SSatish Balay       n  = ii[1] - ii[0];
30689371c9d4SSatish Balay       ii++;
30692d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30702d61bbb3SSatish Balay         rval = ib[j] * 2;
30712d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2;
30722d61bbb3SSatish Balay         z[rval++] += v[2] * x1 + v[3] * x2;
30732d61bbb3SSatish Balay         v += 4;
30742d61bbb3SSatish Balay       }
30753447b6efSHong Zhang       if (!usecprow) xb += 2;
30762d61bbb3SSatish Balay     }
30772d61bbb3SSatish Balay     break;
30782d61bbb3SSatish Balay   case 3:
30792d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30807b2bb3b9SHong Zhang       if (usecprow) xb = x + 3 * ridx[i];
30819371c9d4SSatish Balay       x1 = xb[0];
30829371c9d4SSatish Balay       x2 = xb[1];
30839371c9d4SSatish Balay       x3 = xb[2];
30843447b6efSHong Zhang       ib = idx + ii[0];
30859371c9d4SSatish Balay       n  = ii[1] - ii[0];
30869371c9d4SSatish Balay       ii++;
30872d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
30882d61bbb3SSatish Balay         rval = ib[j] * 3;
30892d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
30902d61bbb3SSatish Balay         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
30912d61bbb3SSatish Balay         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
30922d61bbb3SSatish Balay         v += 9;
30932d61bbb3SSatish Balay       }
30943447b6efSHong Zhang       if (!usecprow) xb += 3;
30952d61bbb3SSatish Balay     }
30962d61bbb3SSatish Balay     break;
30972d61bbb3SSatish Balay   case 4:
30982d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
30997b2bb3b9SHong Zhang       if (usecprow) xb = x + 4 * ridx[i];
31009371c9d4SSatish Balay       x1 = xb[0];
31019371c9d4SSatish Balay       x2 = xb[1];
31029371c9d4SSatish Balay       x3 = xb[2];
31039371c9d4SSatish Balay       x4 = xb[3];
31043447b6efSHong Zhang       ib = idx + ii[0];
31059371c9d4SSatish Balay       n  = ii[1] - ii[0];
31069371c9d4SSatish Balay       ii++;
31072d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
31082d61bbb3SSatish Balay         rval = ib[j] * 4;
31092d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
31102d61bbb3SSatish Balay         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
31112d61bbb3SSatish Balay         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
31122d61bbb3SSatish Balay         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
31132d61bbb3SSatish Balay         v += 16;
31142d61bbb3SSatish Balay       }
31153447b6efSHong Zhang       if (!usecprow) xb += 4;
31162d61bbb3SSatish Balay     }
31172d61bbb3SSatish Balay     break;
31182d61bbb3SSatish Balay   case 5:
31192d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
31207b2bb3b9SHong Zhang       if (usecprow) xb = x + 5 * ridx[i];
31219371c9d4SSatish Balay       x1 = xb[0];
31229371c9d4SSatish Balay       x2 = xb[1];
31239371c9d4SSatish Balay       x3 = xb[2];
31249371c9d4SSatish Balay       x4 = xb[3];
31259371c9d4SSatish Balay       x5 = xb[4];
31263447b6efSHong Zhang       ib = idx + ii[0];
31279371c9d4SSatish Balay       n  = ii[1] - ii[0];
31289371c9d4SSatish Balay       ii++;
31292d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
31302d61bbb3SSatish Balay         rval = ib[j] * 5;
31312d61bbb3SSatish Balay         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
31322d61bbb3SSatish Balay         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
31332d61bbb3SSatish Balay         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
31342d61bbb3SSatish Balay         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
31352d61bbb3SSatish Balay         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
31362d61bbb3SSatish Balay         v += 25;
31372d61bbb3SSatish Balay       }
31383447b6efSHong Zhang       if (!usecprow) xb += 5;
31392d61bbb3SSatish Balay     }
31402d61bbb3SSatish Balay     break;
3141f1af5d2fSBarry Smith   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3142690b6cddSBarry Smith     PetscInt           ncols, k;
3143d9ca1df4SBarry Smith     PetscScalar       *work, *workt;
3144d9ca1df4SBarry Smith     const PetscScalar *xtmp;
31452d61bbb3SSatish Balay     if (!a->mult_work) {
3146d0f46423SBarry Smith       k = PetscMax(A->rmap->n, A->cmap->n);
31479566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
31482d61bbb3SSatish Balay     }
31492d61bbb3SSatish Balay     work = a->mult_work;
31503447b6efSHong Zhang     xtmp = x;
31512d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) {
31529371c9d4SSatish Balay       n = ii[1] - ii[0];
31539371c9d4SSatish Balay       ii++;
31542d61bbb3SSatish Balay       ncols = n * bs;
31559566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(work, ncols));
315626fbe8dcSKarl Rupp       if (usecprow) xtmp = x + bs * ridx[i];
315796b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
31582d61bbb3SSatish Balay       v += n * bs2;
31593447b6efSHong Zhang       if (!usecprow) xtmp += bs;
31602d61bbb3SSatish Balay       workt = work;
31612d61bbb3SSatish Balay       for (j = 0; j < n; j++) {
31622d61bbb3SSatish Balay         zb = z + bs * (*idx++);
31632d61bbb3SSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
31642d61bbb3SSatish Balay         workt += bs;
31652d61bbb3SSatish Balay       }
31662d61bbb3SSatish Balay     }
31672d61bbb3SSatish Balay   }
31682d61bbb3SSatish Balay   }
31699566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
31709566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
31719566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3172*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
31732d61bbb3SSatish Balay }
31742d61bbb3SSatish Balay 
3175d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3176d71ae5a4SJacob Faibussowitsch {
31772d61bbb3SSatish Balay   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3178690b6cddSBarry Smith   PetscInt     totalnz = a->bs2 * a->nz;
3179f4df32b1SMatthew Knepley   PetscScalar  oalpha  = alpha;
3180c5df96a5SBarry Smith   PetscBLASInt one     = 1, tnz;
31812d61bbb3SSatish Balay 
31822d61bbb3SSatish Balay   PetscFunctionBegin;
31839566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3184792fecdfSBarry Smith   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
31859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
3186*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
31872d61bbb3SSatish Balay }
31882d61bbb3SSatish Balay 
3189d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3190d71ae5a4SJacob Faibussowitsch {
31912d61bbb3SSatish Balay   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
31923f1db9ecSBarry Smith   MatScalar   *v   = a->a;
3193329f5518SBarry Smith   PetscReal    sum = 0.0;
3194d0f46423SBarry Smith   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
31952d61bbb3SSatish Balay 
31962d61bbb3SSatish Balay   PetscFunctionBegin;
31972d61bbb3SSatish Balay   if (type == NORM_FROBENIUS) {
3198570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16)
3199570b7f6dSBarry Smith     PetscBLASInt one = 1, cnt = bs2 * nz;
3200792fecdfSBarry Smith     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3201570b7f6dSBarry Smith #else
32022d61bbb3SSatish Balay     for (i = 0; i < bs2 * nz; i++) {
32039371c9d4SSatish Balay       sum += PetscRealPart(PetscConj(*v) * (*v));
32049371c9d4SSatish Balay       v++;
32052d61bbb3SSatish Balay     }
3206570b7f6dSBarry Smith #endif
32078f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum);
32089566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
32098a62d963SHong Zhang   } else if (type == NORM_1) { /* maximum column sum */
32108a62d963SHong Zhang     PetscReal *tmp;
32118a62d963SHong Zhang     PetscInt  *bcol = a->j;
32129566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
32138a62d963SHong Zhang     for (i = 0; i < nz; i++) {
32148a62d963SHong Zhang       for (j = 0; j < bs; j++) {
32158a62d963SHong Zhang         k1 = bs * (*bcol) + j; /* column index */
32168a62d963SHong Zhang         for (k = 0; k < bs; k++) {
32179371c9d4SSatish Balay           tmp[k1] += PetscAbsScalar(*v);
32189371c9d4SSatish Balay           v++;
32198a62d963SHong Zhang         }
32208a62d963SHong Zhang       }
32218a62d963SHong Zhang       bcol++;
32228a62d963SHong Zhang     }
32238a62d963SHong Zhang     *norm = 0.0;
3224d0f46423SBarry Smith     for (j = 0; j < A->cmap->n; j++) {
32258a62d963SHong Zhang       if (tmp[j] > *norm) *norm = tmp[j];
32268a62d963SHong Zhang     }
32279566063dSJacob Faibussowitsch     PetscCall(PetscFree(tmp));
32289566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3229596552b5SBarry Smith   } else if (type == NORM_INFINITY) { /* maximum row sum */
3230596552b5SBarry Smith     *norm = 0.0;
3231596552b5SBarry Smith     for (k = 0; k < bs; k++) {
323274f84c7bSSatish Balay       for (j = 0; j < a->mbs; j++) {
3233596552b5SBarry Smith         v   = a->a + bs2 * a->i[j] + k;
3234596552b5SBarry Smith         sum = 0.0;
3235596552b5SBarry Smith         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
32360e90e235SBarry Smith           for (k1 = 0; k1 < bs; k1++) {
3237596552b5SBarry Smith             sum += PetscAbsScalar(*v);
3238596552b5SBarry Smith             v += bs;
32392d61bbb3SSatish Balay           }
32400e90e235SBarry Smith         }
3241596552b5SBarry Smith         if (sum > *norm) *norm = sum;
3242596552b5SBarry Smith       }
3243596552b5SBarry Smith     }
32449566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3245e7e72b3dSBarry Smith   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3246*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
32472d61bbb3SSatish Balay }
32482d61bbb3SSatish Balay 
3249d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3250d71ae5a4SJacob Faibussowitsch {
32512d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
32522d61bbb3SSatish Balay 
32532d61bbb3SSatish Balay   PetscFunctionBegin;
32542d61bbb3SSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3255d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3256273d9f13SBarry Smith     *flg = PETSC_FALSE;
3257*3ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
32582d61bbb3SSatish Balay   }
32592d61bbb3SSatish Balay 
32602d61bbb3SSatish Balay   /* if the a->i are the same */
32619566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3262*3ba16761SJacob Faibussowitsch   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
32632d61bbb3SSatish Balay 
32642d61bbb3SSatish Balay   /* if a->j are the same */
32659566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3266*3ba16761SJacob Faibussowitsch   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
326726fbe8dcSKarl Rupp 
32682d61bbb3SSatish Balay   /* if a->a are the same */
32699566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3270*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
32712d61bbb3SSatish Balay }
32722d61bbb3SSatish Balay 
3273d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3274d71ae5a4SJacob Faibussowitsch {
32752d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3276690b6cddSBarry Smith   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
327787828ca2SBarry Smith   PetscScalar *x, zero = 0.0;
32783f1db9ecSBarry Smith   MatScalar   *aa, *aa_j;
32792d61bbb3SSatish Balay 
32802d61bbb3SSatish Balay   PetscFunctionBegin;
328128b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3282d0f46423SBarry Smith   bs   = A->rmap->bs;
32832d61bbb3SSatish Balay   aa   = a->a;
32842d61bbb3SSatish Balay   ai   = a->i;
32852d61bbb3SSatish Balay   aj   = a->j;
32862d61bbb3SSatish Balay   ambs = a->mbs;
32872d61bbb3SSatish Balay   bs2  = a->bs2;
32882d61bbb3SSatish Balay 
32899566063dSJacob Faibussowitsch   PetscCall(VecSet(v, zero));
32909566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
32919566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v, &n));
329208401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
32932d61bbb3SSatish Balay   for (i = 0; i < ambs; i++) {
32942d61bbb3SSatish Balay     for (j = ai[i]; j < ai[i + 1]; j++) {
32952d61bbb3SSatish Balay       if (aj[j] == i) {
32962d61bbb3SSatish Balay         row  = i * bs;
32972d61bbb3SSatish Balay         aa_j = aa + j * bs2;
32982d61bbb3SSatish Balay         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
32992d61bbb3SSatish Balay         break;
33002d61bbb3SSatish Balay       }
33012d61bbb3SSatish Balay     }
33022d61bbb3SSatish Balay   }
33039566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
3304*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
33052d61bbb3SSatish Balay }
33062d61bbb3SSatish Balay 
3307d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3308d71ae5a4SJacob Faibussowitsch {
33092d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
331053ef36baSBarry Smith   const PetscScalar *l, *r, *li, *ri;
331153ef36baSBarry Smith   PetscScalar        x;
33123f1db9ecSBarry Smith   MatScalar         *aa, *v;
331353ef36baSBarry Smith   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
331453ef36baSBarry Smith   const PetscInt    *ai, *aj;
33152d61bbb3SSatish Balay 
33162d61bbb3SSatish Balay   PetscFunctionBegin;
33172d61bbb3SSatish Balay   ai  = a->i;
33182d61bbb3SSatish Balay   aj  = a->j;
33192d61bbb3SSatish Balay   aa  = a->a;
3320d0f46423SBarry Smith   m   = A->rmap->n;
3321d0f46423SBarry Smith   n   = A->cmap->n;
3322d0f46423SBarry Smith   bs  = A->rmap->bs;
33232d61bbb3SSatish Balay   mbs = a->mbs;
33242d61bbb3SSatish Balay   bs2 = a->bs2;
33252d61bbb3SSatish Balay   if (ll) {
33269566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(ll, &l));
33279566063dSJacob Faibussowitsch     PetscCall(VecGetLocalSize(ll, &lm));
332808401ef6SPierre Jolivet     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
33292d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) { /* for each block row */
33302d61bbb3SSatish Balay       M  = ai[i + 1] - ai[i];
33312d61bbb3SSatish Balay       li = l + i * bs;
33322d61bbb3SSatish Balay       v  = aa + bs2 * ai[i];
33332d61bbb3SSatish Balay       for (j = 0; j < M; j++) { /* for each block */
3334ad540459SPierre Jolivet         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
33352d61bbb3SSatish Balay       }
33362d61bbb3SSatish Balay     }
33379566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(ll, &l));
33389566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
33392d61bbb3SSatish Balay   }
33402d61bbb3SSatish Balay 
33412d61bbb3SSatish Balay   if (rr) {
33429566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(rr, &r));
33439566063dSJacob Faibussowitsch     PetscCall(VecGetLocalSize(rr, &rn));
334408401ef6SPierre Jolivet     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
33452d61bbb3SSatish Balay     for (i = 0; i < mbs; i++) { /* for each block row */
334653ef36baSBarry Smith       iai = ai[i];
334753ef36baSBarry Smith       M   = ai[i + 1] - iai;
334853ef36baSBarry Smith       v   = aa + bs2 * iai;
33492d61bbb3SSatish Balay       for (j = 0; j < M; j++) { /* for each block */
335053ef36baSBarry Smith         ri = r + bs * aj[iai + j];
33512d61bbb3SSatish Balay         for (k = 0; k < bs; k++) {
33522d61bbb3SSatish Balay           x = ri[k];
335353ef36baSBarry Smith           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
335453ef36baSBarry Smith           v += bs;
33552d61bbb3SSatish Balay         }
33562d61bbb3SSatish Balay       }
33572d61bbb3SSatish Balay     }
33589566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(rr, &r));
33599566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
33602d61bbb3SSatish Balay   }
3361*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
33622d61bbb3SSatish Balay }
33632d61bbb3SSatish Balay 
3364d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3365d71ae5a4SJacob Faibussowitsch {
33662d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33672d61bbb3SSatish Balay 
33682d61bbb3SSatish Balay   PetscFunctionBegin;
33692d61bbb3SSatish Balay   info->block_size   = a->bs2;
3370ceed8ce5SJed Brown   info->nz_allocated = a->bs2 * a->maxnz;
33712d61bbb3SSatish Balay   info->nz_used      = a->bs2 * a->nz;
33723966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
33732d61bbb3SSatish Balay   info->assemblies   = A->num_ass;
33748e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
33754dfa11a4SJacob Faibussowitsch   info->memory       = 0; /* REVIEW ME */
3376d5f3da31SBarry Smith   if (A->factortype) {
33772d61bbb3SSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
33782d61bbb3SSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
33792d61bbb3SSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
33802d61bbb3SSatish Balay   } else {
33812d61bbb3SSatish Balay     info->fill_ratio_given  = 0;
33822d61bbb3SSatish Balay     info->fill_ratio_needed = 0;
33832d61bbb3SSatish Balay     info->factor_mallocs    = 0;
33842d61bbb3SSatish Balay   }
3385*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
33862d61bbb3SSatish Balay }
33872d61bbb3SSatish Balay 
3388d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3389d71ae5a4SJacob Faibussowitsch {
33902d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33912d61bbb3SSatish Balay 
33922d61bbb3SSatish Balay   PetscFunctionBegin;
33939566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3394*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
33952d61bbb3SSatish Balay }
3396a001520aSPierre Jolivet 
3397d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3398d71ae5a4SJacob Faibussowitsch {
3399a001520aSPierre Jolivet   PetscFunctionBegin;
34009566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
34014222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3402*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3403a001520aSPierre Jolivet }
3404a001520aSPierre Jolivet 
3405d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3406d71ae5a4SJacob Faibussowitsch {
340774eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3408f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1;
3409bcf10a7aSPierre Jolivet   const PetscScalar *xb;
341074eeabc5SPierre Jolivet   PetscScalar        x1;
341174eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
341274eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
341374eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
341474eeabc5SPierre Jolivet 
341574eeabc5SPierre Jolivet   PetscFunctionBegin;
341674eeabc5SPierre Jolivet   idx = a->j;
341774eeabc5SPierre Jolivet   v   = a->a;
341874eeabc5SPierre Jolivet   if (usecprow) {
341974eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
342074eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
342174eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
342274eeabc5SPierre Jolivet   } else {
342374eeabc5SPierre Jolivet     mbs = a->mbs;
342474eeabc5SPierre Jolivet     ii  = a->i;
342574eeabc5SPierre Jolivet     z   = c;
342674eeabc5SPierre Jolivet   }
342774eeabc5SPierre Jolivet 
342874eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
34299371c9d4SSatish Balay     n = ii[1] - ii[0];
34309371c9d4SSatish Balay     ii++;
343174eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
343274eeabc5SPierre Jolivet     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
343374eeabc5SPierre Jolivet     if (usecprow) z = c + ridx[i];
343474eeabc5SPierre Jolivet     jj = idx;
343574eeabc5SPierre Jolivet     vv = v;
343674eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
343774eeabc5SPierre Jolivet       idx  = jj;
343874eeabc5SPierre Jolivet       v    = vv;
343974eeabc5SPierre Jolivet       sum1 = 0.0;
344074eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
34419371c9d4SSatish Balay         xb = b + (*idx++);
34429371c9d4SSatish Balay         x1 = xb[0 + k * bm];
344374eeabc5SPierre Jolivet         sum1 += v[0] * x1;
344474eeabc5SPierre Jolivet         v += 1;
344574eeabc5SPierre Jolivet       }
3446feb237baSPierre Jolivet       z[0 + k * cm] = sum1;
344774eeabc5SPierre Jolivet     }
344874eeabc5SPierre Jolivet     if (!usecprow) z += 1;
344974eeabc5SPierre Jolivet   }
3450*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
345174eeabc5SPierre Jolivet }
345274eeabc5SPierre Jolivet 
3453d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3454d71ae5a4SJacob Faibussowitsch {
34554b7054f4SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3456f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2;
3457bcf10a7aSPierre Jolivet   const PetscScalar *xb;
34584b7054f4SPierre Jolivet   PetscScalar        x1, x2;
34594b7054f4SPierre Jolivet   const MatScalar   *v, *vv;
34604b7054f4SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
34614b7054f4SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
34624b7054f4SPierre Jolivet 
34634b7054f4SPierre Jolivet   PetscFunctionBegin;
34644b7054f4SPierre Jolivet   idx = a->j;
34654b7054f4SPierre Jolivet   v   = a->a;
34664b7054f4SPierre Jolivet   if (usecprow) {
34674b7054f4SPierre Jolivet     mbs  = a->compressedrow.nrows;
34684b7054f4SPierre Jolivet     ii   = a->compressedrow.i;
34694b7054f4SPierre Jolivet     ridx = a->compressedrow.rindex;
34704b7054f4SPierre Jolivet   } else {
34714b7054f4SPierre Jolivet     mbs = a->mbs;
34724b7054f4SPierre Jolivet     ii  = a->i;
34734b7054f4SPierre Jolivet     z   = c;
34744b7054f4SPierre Jolivet   }
34754b7054f4SPierre Jolivet 
34764b7054f4SPierre Jolivet   for (i = 0; i < mbs; i++) {
34779371c9d4SSatish Balay     n = ii[1] - ii[0];
34789371c9d4SSatish Balay     ii++;
34794b7054f4SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
34804b7054f4SPierre Jolivet     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
34814b7054f4SPierre Jolivet     if (usecprow) z = c + 2 * ridx[i];
34824b7054f4SPierre Jolivet     jj = idx;
34834b7054f4SPierre Jolivet     vv = v;
34844b7054f4SPierre Jolivet     for (k = 0; k < cn; k++) {
34854b7054f4SPierre Jolivet       idx  = jj;
34864b7054f4SPierre Jolivet       v    = vv;
34879371c9d4SSatish Balay       sum1 = 0.0;
34889371c9d4SSatish Balay       sum2 = 0.0;
34894b7054f4SPierre Jolivet       for (j = 0; j < n; j++) {
34909371c9d4SSatish Balay         xb = b + 2 * (*idx++);
34919371c9d4SSatish Balay         x1 = xb[0 + k * bm];
34929371c9d4SSatish Balay         x2 = xb[1 + k * bm];
34934b7054f4SPierre Jolivet         sum1 += v[0] * x1 + v[2] * x2;
34944b7054f4SPierre Jolivet         sum2 += v[1] * x1 + v[3] * x2;
34954b7054f4SPierre Jolivet         v += 4;
34964b7054f4SPierre Jolivet       }
34979371c9d4SSatish Balay       z[0 + k * cm] = sum1;
34989371c9d4SSatish Balay       z[1 + k * cm] = sum2;
34994b7054f4SPierre Jolivet     }
35004b7054f4SPierre Jolivet     if (!usecprow) z += 2;
35014b7054f4SPierre Jolivet   }
3502*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
35034b7054f4SPierre Jolivet }
35044b7054f4SPierre Jolivet 
3505d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3506d71ae5a4SJacob Faibussowitsch {
350774eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3508f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3;
3509bcf10a7aSPierre Jolivet   const PetscScalar *xb;
351074eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3;
351174eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
351274eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
351374eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
351474eeabc5SPierre Jolivet 
351574eeabc5SPierre Jolivet   PetscFunctionBegin;
351674eeabc5SPierre Jolivet   idx = a->j;
351774eeabc5SPierre Jolivet   v   = a->a;
351874eeabc5SPierre Jolivet   if (usecprow) {
351974eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
352074eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
352174eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
352274eeabc5SPierre Jolivet   } else {
352374eeabc5SPierre Jolivet     mbs = a->mbs;
352474eeabc5SPierre Jolivet     ii  = a->i;
352574eeabc5SPierre Jolivet     z   = c;
352674eeabc5SPierre Jolivet   }
352774eeabc5SPierre Jolivet 
352874eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
35299371c9d4SSatish Balay     n = ii[1] - ii[0];
35309371c9d4SSatish Balay     ii++;
353174eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
353274eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
353374eeabc5SPierre Jolivet     if (usecprow) z = c + 3 * ridx[i];
353474eeabc5SPierre Jolivet     jj = idx;
353574eeabc5SPierre Jolivet     vv = v;
353674eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
353774eeabc5SPierre Jolivet       idx  = jj;
353874eeabc5SPierre Jolivet       v    = vv;
35399371c9d4SSatish Balay       sum1 = 0.0;
35409371c9d4SSatish Balay       sum2 = 0.0;
35419371c9d4SSatish Balay       sum3 = 0.0;
354274eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
35439371c9d4SSatish Balay         xb = b + 3 * (*idx++);
35449371c9d4SSatish Balay         x1 = xb[0 + k * bm];
35459371c9d4SSatish Balay         x2 = xb[1 + k * bm];
35469371c9d4SSatish Balay         x3 = xb[2 + k * bm];
354774eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
354874eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
354974eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
355074eeabc5SPierre Jolivet         v += 9;
355174eeabc5SPierre Jolivet       }
35529371c9d4SSatish Balay       z[0 + k * cm] = sum1;
35539371c9d4SSatish Balay       z[1 + k * cm] = sum2;
35549371c9d4SSatish Balay       z[2 + k * cm] = sum3;
355574eeabc5SPierre Jolivet     }
355674eeabc5SPierre Jolivet     if (!usecprow) z += 3;
355774eeabc5SPierre Jolivet   }
3558*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
355974eeabc5SPierre Jolivet }
356074eeabc5SPierre Jolivet 
3561d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3562d71ae5a4SJacob Faibussowitsch {
356374eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3564f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3565bcf10a7aSPierre Jolivet   const PetscScalar *xb;
356674eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3, x4;
356774eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
356874eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
356974eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
357074eeabc5SPierre Jolivet 
357174eeabc5SPierre Jolivet   PetscFunctionBegin;
357274eeabc5SPierre Jolivet   idx = a->j;
357374eeabc5SPierre Jolivet   v   = a->a;
357474eeabc5SPierre Jolivet   if (usecprow) {
357574eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
357674eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
357774eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
357874eeabc5SPierre Jolivet   } else {
357974eeabc5SPierre Jolivet     mbs = a->mbs;
358074eeabc5SPierre Jolivet     ii  = a->i;
358174eeabc5SPierre Jolivet     z   = c;
358274eeabc5SPierre Jolivet   }
358374eeabc5SPierre Jolivet 
358474eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
35859371c9d4SSatish Balay     n = ii[1] - ii[0];
35869371c9d4SSatish Balay     ii++;
358774eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
358874eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358974eeabc5SPierre Jolivet     if (usecprow) z = c + 4 * ridx[i];
359074eeabc5SPierre Jolivet     jj = idx;
359174eeabc5SPierre Jolivet     vv = v;
359274eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
359374eeabc5SPierre Jolivet       idx  = jj;
359474eeabc5SPierre Jolivet       v    = vv;
35959371c9d4SSatish Balay       sum1 = 0.0;
35969371c9d4SSatish Balay       sum2 = 0.0;
35979371c9d4SSatish Balay       sum3 = 0.0;
35989371c9d4SSatish Balay       sum4 = 0.0;
359974eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
36009371c9d4SSatish Balay         xb = b + 4 * (*idx++);
36019371c9d4SSatish Balay         x1 = xb[0 + k * bm];
36029371c9d4SSatish Balay         x2 = xb[1 + k * bm];
36039371c9d4SSatish Balay         x3 = xb[2 + k * bm];
36049371c9d4SSatish Balay         x4 = xb[3 + k * bm];
360574eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
360674eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
360774eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
360874eeabc5SPierre Jolivet         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
360974eeabc5SPierre Jolivet         v += 16;
361074eeabc5SPierre Jolivet       }
36119371c9d4SSatish Balay       z[0 + k * cm] = sum1;
36129371c9d4SSatish Balay       z[1 + k * cm] = sum2;
36139371c9d4SSatish Balay       z[2 + k * cm] = sum3;
36149371c9d4SSatish Balay       z[3 + k * cm] = sum4;
361574eeabc5SPierre Jolivet     }
361674eeabc5SPierre Jolivet     if (!usecprow) z += 4;
361774eeabc5SPierre Jolivet   }
3618*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
361974eeabc5SPierre Jolivet }
362074eeabc5SPierre Jolivet 
3621d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3622d71ae5a4SJacob Faibussowitsch {
362374eeabc5SPierre Jolivet   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3624f4259b30SLisandro Dalcin   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3625bcf10a7aSPierre Jolivet   const PetscScalar *xb;
362674eeabc5SPierre Jolivet   PetscScalar        x1, x2, x3, x4, x5;
362774eeabc5SPierre Jolivet   const MatScalar   *v, *vv;
362874eeabc5SPierre Jolivet   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
362974eeabc5SPierre Jolivet   PetscBool          usecprow = a->compressedrow.use;
363074eeabc5SPierre Jolivet 
363174eeabc5SPierre Jolivet   PetscFunctionBegin;
363274eeabc5SPierre Jolivet   idx = a->j;
363374eeabc5SPierre Jolivet   v   = a->a;
363474eeabc5SPierre Jolivet   if (usecprow) {
363574eeabc5SPierre Jolivet     mbs  = a->compressedrow.nrows;
363674eeabc5SPierre Jolivet     ii   = a->compressedrow.i;
363774eeabc5SPierre Jolivet     ridx = a->compressedrow.rindex;
363874eeabc5SPierre Jolivet   } else {
363974eeabc5SPierre Jolivet     mbs = a->mbs;
364074eeabc5SPierre Jolivet     ii  = a->i;
364174eeabc5SPierre Jolivet     z   = c;
364274eeabc5SPierre Jolivet   }
364374eeabc5SPierre Jolivet 
364474eeabc5SPierre Jolivet   for (i = 0; i < mbs; i++) {
36459371c9d4SSatish Balay     n = ii[1] - ii[0];
36469371c9d4SSatish Balay     ii++;
364774eeabc5SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
364874eeabc5SPierre Jolivet     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
364974eeabc5SPierre Jolivet     if (usecprow) z = c + 5 * ridx[i];
365074eeabc5SPierre Jolivet     jj = idx;
365174eeabc5SPierre Jolivet     vv = v;
365274eeabc5SPierre Jolivet     for (k = 0; k < cn; k++) {
365374eeabc5SPierre Jolivet       idx  = jj;
365474eeabc5SPierre Jolivet       v    = vv;
36559371c9d4SSatish Balay       sum1 = 0.0;
36569371c9d4SSatish Balay       sum2 = 0.0;
36579371c9d4SSatish Balay       sum3 = 0.0;
36589371c9d4SSatish Balay       sum4 = 0.0;
36599371c9d4SSatish Balay       sum5 = 0.0;
366074eeabc5SPierre Jolivet       for (j = 0; j < n; j++) {
36619371c9d4SSatish Balay         xb = b + 5 * (*idx++);
36629371c9d4SSatish Balay         x1 = xb[0 + k * bm];
36639371c9d4SSatish Balay         x2 = xb[1 + k * bm];
36649371c9d4SSatish Balay         x3 = xb[2 + k * bm];
36659371c9d4SSatish Balay         x4 = xb[3 + k * bm];
36669371c9d4SSatish Balay         x5 = xb[4 + k * bm];
366774eeabc5SPierre Jolivet         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
366874eeabc5SPierre Jolivet         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
366974eeabc5SPierre Jolivet         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
367074eeabc5SPierre Jolivet         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
367174eeabc5SPierre Jolivet         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
367274eeabc5SPierre Jolivet         v += 25;
367374eeabc5SPierre Jolivet       }
36749371c9d4SSatish Balay       z[0 + k * cm] = sum1;
36759371c9d4SSatish Balay       z[1 + k * cm] = sum2;
36769371c9d4SSatish Balay       z[2 + k * cm] = sum3;
36779371c9d4SSatish Balay       z[3 + k * cm] = sum4;
36789371c9d4SSatish Balay       z[4 + k * cm] = sum5;
367974eeabc5SPierre Jolivet     }
368074eeabc5SPierre Jolivet     if (!usecprow) z += 5;
368174eeabc5SPierre Jolivet   }
3682*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
368374eeabc5SPierre Jolivet }
368474eeabc5SPierre Jolivet 
3685d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3686d71ae5a4SJacob Faibussowitsch {
3687a001520aSPierre Jolivet   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3688a001520aSPierre Jolivet   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3689910cf402Sprj-   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3690bcf10a7aSPierre Jolivet   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3691a001520aSPierre Jolivet   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3692a001520aSPierre Jolivet   PetscBLASInt     bbs, bcn, bbm, bcm;
3693f4259b30SLisandro Dalcin   PetscScalar     *z = NULL;
3694a001520aSPierre Jolivet   PetscScalar     *c, *b;
3695a001520aSPierre Jolivet   const MatScalar *v;
3696a001520aSPierre Jolivet   const PetscInt  *idx, *ii, *ridx = NULL;
36974b7054f4SPierre Jolivet   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3698a001520aSPierre Jolivet   PetscBool        usecprow = a->compressedrow.use;
3699a001520aSPierre Jolivet 
3700a001520aSPierre Jolivet   PetscFunctionBegin;
3701*3ba16761SJacob Faibussowitsch   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
370208401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
370308401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
370408401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3705a001520aSPierre Jolivet   b = bd->v;
370648a46eb9SPierre Jolivet   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
37079566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C, &c));
370874eeabc5SPierre Jolivet   switch (bs) {
3709d71ae5a4SJacob Faibussowitsch   case 1:
3710d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3711d71ae5a4SJacob Faibussowitsch     break;
3712d71ae5a4SJacob Faibussowitsch   case 2:
3713d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3714d71ae5a4SJacob Faibussowitsch     break;
3715d71ae5a4SJacob Faibussowitsch   case 3:
3716d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3717d71ae5a4SJacob Faibussowitsch     break;
3718d71ae5a4SJacob Faibussowitsch   case 4:
3719d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3720d71ae5a4SJacob Faibussowitsch     break;
3721d71ae5a4SJacob Faibussowitsch   case 5:
3722d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3723d71ae5a4SJacob Faibussowitsch     break;
372474eeabc5SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
37259566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs, &bbs));
37269566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn, &bcn));
37279566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm, &bbm));
37289566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm, &bcm));
3729a001520aSPierre Jolivet     idx = a->j;
3730a001520aSPierre Jolivet     v   = a->a;
3731a001520aSPierre Jolivet     if (usecprow) {
3732a001520aSPierre Jolivet       mbs  = a->compressedrow.nrows;
3733a001520aSPierre Jolivet       ii   = a->compressedrow.i;
3734a001520aSPierre Jolivet       ridx = a->compressedrow.rindex;
3735a001520aSPierre Jolivet     } else {
3736a001520aSPierre Jolivet       mbs = a->mbs;
3737a001520aSPierre Jolivet       ii  = a->i;
3738a001520aSPierre Jolivet       z   = c;
3739a001520aSPierre Jolivet     }
3740a001520aSPierre Jolivet     for (i = 0; i < mbs; i++) {
37419371c9d4SSatish Balay       n = ii[1] - ii[0];
37429371c9d4SSatish Balay       ii++;
3743a001520aSPierre Jolivet       if (usecprow) z = c + bs * ridx[i];
37444b7054f4SPierre Jolivet       if (n) {
3745792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
37464b7054f4SPierre Jolivet         v += bs2;
37474b7054f4SPierre Jolivet       }
37484b7054f4SPierre Jolivet       for (j = 1; j < n; j++) {
3749792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3750a001520aSPierre Jolivet         v += bs2;
3751a001520aSPierre Jolivet       }
3752a001520aSPierre Jolivet       if (!usecprow) z += bs;
3753a001520aSPierre Jolivet     }
37544b7054f4SPierre Jolivet   }
37559566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C, &c));
37569566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3757*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3758a001520aSPierre Jolivet }
3759