1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <petscblaslapack.h> 6cac129eeSSatish Balay 75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 896e086a2SDaniel Kokron #include <immintrin.h> 996e086a2SDaniel Kokron #endif 1096e086a2SDaniel Kokron 11*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 12*d71ae5a4SJacob Faibussowitsch { 13a3192f15SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 145d0c19d7SBarry Smith PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 155d0c19d7SBarry Smith const PetscInt *idx; 16690b6cddSBarry Smith PetscInt start, end, *ai, *aj, bs, *nidx2; 17f1af5d2fSBarry Smith PetscBT table; 18a3192f15SSatish Balay 193a40ed3dSBarry Smith PetscFunctionBegin; 20a3192f15SSatish Balay m = a->mbs; 21a3192f15SSatish Balay ai = a->i; 22a3192f15SSatish Balay aj = a->j; 23d0f46423SBarry Smith bs = A->rmap->bs; 24a3192f15SSatish Balay 2508401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 26a3192f15SSatish Balay 279566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(m, &table)); 289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &nidx)); 299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2)); 30a3192f15SSatish Balay 31a3192f15SSatish Balay for (i = 0; i < is_max; i++) { 32a3192f15SSatish Balay /* Initialise the two local arrays */ 33a3192f15SSatish Balay isz = 0; 349566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(m, table)); 35a3192f15SSatish Balay 36a3192f15SSatish Balay /* Extract the indices, assume there 
can be duplicate entries */ 379566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 389566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 39a3192f15SSatish Balay 40a3192f15SSatish Balay /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 41a3192f15SSatish Balay for (j = 0; j < n; ++j) { 42218c64b6SSatish Balay ival = idx[j] / bs; /* convert the indices into block indices */ 4308401ef6SPierre Jolivet PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 4426fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 45a3192f15SSatish Balay } 469566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 479566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 48a3192f15SSatish Balay 49a3192f15SSatish Balay k = 0; 50a3192f15SSatish Balay for (j = 0; j < ov; j++) { /* for each overlap*/ 51a3192f15SSatish Balay n = isz; 52a3192f15SSatish Balay for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 53a3192f15SSatish Balay row = nidx[k]; 54a3192f15SSatish Balay start = ai[row]; 55a3192f15SSatish Balay end = ai[row + 1]; 56a3192f15SSatish Balay for (l = start; l < end; l++) { 57a3192f15SSatish Balay val = aj[l]; 5826fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 59a3192f15SSatish Balay } 60a3192f15SSatish Balay } 61a3192f15SSatish Balay } 62218c64b6SSatish Balay /* expand the Index Set */ 63218c64b6SSatish Balay for (j = 0; j < isz; j++) { 6426fbe8dcSKarl Rupp for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k; 65218c64b6SSatish Balay } 669566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i)); 67a3192f15SSatish Balay } 689566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table)); 699566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 709566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx2)); 713a40ed3dSBarry 
Smith PetscFunctionReturn(0); 72a3192f15SSatish Balay } 731c351548SSatish Balay 74*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 75*d71ae5a4SJacob Faibussowitsch { 76736121d4SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 77690b6cddSBarry Smith PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 78690b6cddSBarry Smith PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 795d0c19d7SBarry Smith const PetscInt *irow, *icol; 805d0c19d7SBarry Smith PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 81690b6cddSBarry Smith PetscInt *aj = a->j, *ai = a->i; 823f1db9ecSBarry Smith MatScalar *mat_a; 83736121d4SSatish Balay Mat C; 846041f1b1SToby Isaac PetscBool flag; 85736121d4SSatish Balay 863a40ed3dSBarry Smith PetscFunctionBegin; 879566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 889566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 899566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 909566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 91736121d4SSatish Balay 929566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 93736121d4SSatish Balay ssmap = smap; 949566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 95736121d4SSatish Balay for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 96736121d4SSatish Balay /* determine lens of each row */ 97736121d4SSatish Balay for (i = 0; i < nrows; i++) { 98736121d4SSatish Balay kstart = ai[irow[i]]; 99736121d4SSatish Balay kend = kstart + a->ilen[irow[i]]; 100736121d4SSatish Balay lens[i] = 0; 101736121d4SSatish Balay for (k = kstart; k < kend; k++) { 10226fbe8dcSKarl Rupp if (ssmap[aj[k]]) lens[i]++; 103736121d4SSatish Balay } 104736121d4SSatish Balay } 105736121d4SSatish Balay /* Create and fill new matrix */ 106736121d4SSatish Balay if (scall == MAT_REUSE_MATRIX) { 107736121d4SSatish Balay c = (Mat_SeqBAIJ 
*)((*B)->data); 108736121d4SSatish Balay 109aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1109566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 11128b400f6SJacob Faibussowitsch PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 1129566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 113736121d4SSatish Balay C = *B; 1143a40ed3dSBarry Smith } else { 1159566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1169566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 1179566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1189566063dSJacob Faibussowitsch PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 119736121d4SSatish Balay } 120736121d4SSatish Balay c = (Mat_SeqBAIJ *)(C->data); 121736121d4SSatish Balay for (i = 0; i < nrows; i++) { 122736121d4SSatish Balay row = irow[i]; 123736121d4SSatish Balay kstart = ai[row]; 124736121d4SSatish Balay kend = kstart + a->ilen[row]; 125736121d4SSatish Balay mat_i = c->i[i]; 126d29f2997SMatthew Woehlke mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 127d29f2997SMatthew Woehlke mat_a = c->a ? 
c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 128736121d4SSatish Balay mat_ilen = c->ilen + i; 129736121d4SSatish Balay for (k = kstart; k < kend; k++) { 130736121d4SSatish Balay if ((tcol = ssmap[a->j[k]])) { 131736121d4SSatish Balay *mat_j++ = tcol - 1; 1329566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 133549d3d68SSatish Balay mat_a += bs2; 134736121d4SSatish Balay (*mat_ilen)++; 135736121d4SSatish Balay } 136736121d4SSatish Balay } 137736121d4SSatish Balay } 138cdc6f3adSToby Isaac /* sort */ 139d29f2997SMatthew Woehlke if (c->j && c->a) { 140cdc6f3adSToby Isaac MatScalar *work; 1419566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 142cdc6f3adSToby Isaac for (i = 0; i < nrows; i++) { 143cdc6f3adSToby Isaac PetscInt ilen; 144cdc6f3adSToby Isaac mat_i = c->i[i]; 145cdc6f3adSToby Isaac mat_j = c->j + mat_i; 146cdc6f3adSToby Isaac mat_a = c->a + mat_i * bs2; 147cdc6f3adSToby Isaac ilen = c->ilen[i]; 1489566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 149cdc6f3adSToby Isaac } 1509566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 151cdc6f3adSToby Isaac } 152218c64b6SSatish Balay 153736121d4SSatish Balay /* Free work space */ 1549566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1559566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1569566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1579566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1589566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 159736121d4SSatish Balay 1609566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 161736121d4SSatish Balay *B = C; 1623a40ed3dSBarry Smith PetscFunctionReturn(0); 163736121d4SSatish Balay } 164736121d4SSatish Balay 165*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse 
scall, Mat *B) 166*d71ae5a4SJacob Faibussowitsch { 167218c64b6SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 168218c64b6SSatish Balay IS is1, is2; 169afebec48SHong Zhang PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j; 1705d0c19d7SBarry Smith const PetscInt *irow, *icol; 171218c64b6SSatish Balay 1723a40ed3dSBarry Smith PetscFunctionBegin; 1739566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1749566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1759566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1769566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 177218c64b6SSatish Balay 178218c64b6SSatish Balay /* Verify if the indices corespond to each element in a block 179218c64b6SSatish Balay and form the IS with compressed IS */ 180f8ecb639SStefano Zampini maxmnbs = PetscMax(a->mbs, a->nbs); 1819566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 1829566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->mbs)); 183218c64b6SSatish Balay for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 184ad540459SPierre Jolivet for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 1856041f1b1SToby Isaac count = 0; 1866041f1b1SToby Isaac for (i = 0; i < nrows; i++) { 187afebec48SHong Zhang j = irow[i] / bs; 1886041f1b1SToby Isaac if ((vary[j]--) == bs) iary[count++] = j; 189218c64b6SSatish Balay } 1909566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 191218c64b6SSatish Balay 1929566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->nbs)); 193218c64b6SSatish Balay for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 194ad540459SPierre Jolivet for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 
1956041f1b1SToby Isaac count = 0; 1966041f1b1SToby Isaac for (i = 0; i < ncols; i++) { 197afebec48SHong Zhang j = icol[i] / bs; 1986041f1b1SToby Isaac if ((vary[j]--) == bs) iary[count++] = j; 1996041f1b1SToby Isaac } 2009566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 2019566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 2029566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 2039566063dSJacob Faibussowitsch PetscCall(PetscFree2(vary, iary)); 204218c64b6SSatish Balay 2059566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 2069566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2079566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 2083a40ed3dSBarry Smith PetscFunctionReturn(0); 209218c64b6SSatish Balay } 210218c64b6SSatish Balay 211*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 212*d71ae5a4SJacob Faibussowitsch { 21316b64355SHong Zhang Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 2145c39f6d9SHong Zhang Mat_SubSppt *submatj = c->submatis1; 21516b64355SHong Zhang 21616b64355SHong Zhang PetscFunctionBegin; 2179566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 2189566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 21916b64355SHong Zhang PetscFunctionReturn(0); 22016b64355SHong Zhang } 22116b64355SHong Zhang 22289a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 223*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) 224*d71ae5a4SJacob Faibussowitsch { 22586e85357SHong Zhang PetscInt i; 22686e85357SHong Zhang Mat C; 22786e85357SHong Zhang Mat_SeqBAIJ *c; 22886e85357SHong Zhang Mat_SubSppt *submatj; 22986e85357SHong Zhang 23086e85357SHong Zhang PetscFunctionBegin; 23186e85357SHong Zhang for (i = 0; i < n; i++) { 23286e85357SHong Zhang C = (*mat)[i]; 
23386e85357SHong Zhang c = (Mat_SeqBAIJ *)C->data; 23486e85357SHong Zhang submatj = c->submatis1; 23586e85357SHong Zhang if (submatj) { 2367daefbafSJunchao Zhang if (--((PetscObject)C)->refct <= 0) { 23726cc229bSBarry Smith PetscCall(PetscFree(C->factorprefix)); 2389566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 2399566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 2409566063dSJacob Faibussowitsch PetscCall(PetscFree(C->defaultvectype)); 2413faff063SStefano Zampini PetscCall(PetscFree(C->defaultrandtype)); 2429566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->rmap)); 2439566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->cmap)); 2449566063dSJacob Faibussowitsch PetscCall(PetscHeaderDestroy(&C)); 2457daefbafSJunchao Zhang } 24686e85357SHong Zhang } else { 2479566063dSJacob Faibussowitsch PetscCall(MatDestroy(&C)); 24886e85357SHong Zhang } 24986e85357SHong Zhang } 2507daefbafSJunchao Zhang 2517daefbafSJunchao Zhang /* Destroy Dummy submatrices created for reuse */ 2529566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 2537daefbafSJunchao Zhang 2549566063dSJacob Faibussowitsch PetscCall(PetscFree(*mat)); 25586e85357SHong Zhang PetscFunctionReturn(0); 25686e85357SHong Zhang } 25786e85357SHong Zhang 258*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 259*d71ae5a4SJacob Faibussowitsch { 260690b6cddSBarry Smith PetscInt i; 261736121d4SSatish Balay 2623a40ed3dSBarry Smith PetscFunctionBegin; 26348a46eb9SPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 264736121d4SSatish Balay 26548a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 2663a40ed3dSBarry Smith PetscFunctionReturn(0); 267736121d4SSatish Balay } 268218c64b6SSatish Balay 2692d61bbb3SSatish Balay /* 
-------------------------------------------------------*/ 2702d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */ 2712d61bbb3SSatish Balay /* -------------------------------------------------------*/ 2722d61bbb3SSatish Balay 273*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) 274*d71ae5a4SJacob Faibussowitsch { 2752d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 276d9fead3dSBarry Smith PetscScalar *z, sum; 277d9fead3dSBarry Smith const PetscScalar *x; 278d9fead3dSBarry Smith const MatScalar *v; 2797c565772SBarry Smith PetscInt mbs, i, n; 2800298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 281ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2822d61bbb3SSatish Balay 2832d61bbb3SSatish Balay PetscFunctionBegin; 2849566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2859566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &z)); 2862d61bbb3SSatish Balay 28726e093fcSHong Zhang if (usecprow) { 28826e093fcSHong Zhang mbs = a->compressedrow.nrows; 28926e093fcSHong Zhang ii = a->compressedrow.i; 2907b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 2919566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(z, a->mbs)); 29226e093fcSHong Zhang } else { 29326e093fcSHong Zhang mbs = a->mbs; 2942d61bbb3SSatish Balay ii = a->i; 29526e093fcSHong Zhang } 2962d61bbb3SSatish Balay 2972d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 298ee54c7eeSHong Zhang n = ii[1] - ii[0]; 299ee54c7eeSHong Zhang v = a->a + ii[0]; 300ee54c7eeSHong Zhang idx = a->j + ii[0]; 301ee54c7eeSHong Zhang ii++; 302444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 303444d8c10SJed Brown PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3042d61bbb3SSatish Balay sum = 0.0; 3052162cab8SBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 
30626e093fcSHong Zhang if (usecprow) { 3077b2bb3b9SHong Zhang z[ridx[i]] = sum; 30826e093fcSHong Zhang } else { 3092d61bbb3SSatish Balay z[i] = sum; 3102d61bbb3SSatish Balay } 31126e093fcSHong Zhang } 3129566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3139566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &z)); 3149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 3152d61bbb3SSatish Balay PetscFunctionReturn(0); 3162d61bbb3SSatish Balay } 3172d61bbb3SSatish Balay 318*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) 319*d71ae5a4SJacob Faibussowitsch { 3202d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 321f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, *zarray; 322d9fead3dSBarry Smith const PetscScalar *x, *xb; 32387828ca2SBarry Smith PetscScalar x1, x2; 324d9fead3dSBarry Smith const MatScalar *v; 3257c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 326ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 3272d61bbb3SSatish Balay 3282d61bbb3SSatish Balay PetscFunctionBegin; 3299566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3309566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 3312d61bbb3SSatish Balay 3322d61bbb3SSatish Balay idx = a->j; 3332d61bbb3SSatish Balay v = a->a; 33426e093fcSHong Zhang if (usecprow) { 33526e093fcSHong Zhang mbs = a->compressedrow.nrows; 33626e093fcSHong Zhang ii = a->compressedrow.i; 3377b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3389566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 33926e093fcSHong Zhang } else { 34026e093fcSHong Zhang mbs = a->mbs; 3412d61bbb3SSatish Balay ii = a->i; 34226e093fcSHong Zhang z = zarray; 34326e093fcSHong Zhang } 3442d61bbb3SSatish Balay 3452d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 3469371c9d4SSatish Balay n = ii[1] - ii[0]; 3479371c9d4SSatish Balay ii++; 
3489371c9d4SSatish Balay sum1 = 0.0; 3499371c9d4SSatish Balay sum2 = 0.0; 350444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 351444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3522d61bbb3SSatish Balay for (j = 0; j < n; j++) { 3539371c9d4SSatish Balay xb = x + 2 * (*idx++); 3549371c9d4SSatish Balay x1 = xb[0]; 3559371c9d4SSatish Balay x2 = xb[1]; 3562d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 3572d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 3582d61bbb3SSatish Balay v += 4; 3592d61bbb3SSatish Balay } 3607b2bb3b9SHong Zhang if (usecprow) z = zarray + 2 * ridx[i]; 3619371c9d4SSatish Balay z[0] = sum1; 3629371c9d4SSatish Balay z[1] = sum2; 36326e093fcSHong Zhang if (!usecprow) z += 2; 3642d61bbb3SSatish Balay } 3659566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3669566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 3679566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 3682d61bbb3SSatish Balay PetscFunctionReturn(0); 3692d61bbb3SSatish Balay } 3702d61bbb3SSatish Balay 371*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) 372*d71ae5a4SJacob Faibussowitsch { 3732d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 374f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 375d9fead3dSBarry Smith const PetscScalar *x, *xb; 376d9fead3dSBarry Smith const MatScalar *v; 3777c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 378ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 37926e093fcSHong Zhang 380b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 381fee21e36SBarry Smith #pragma disjoint(*v, *z, *xb) 382fee21e36SBarry Smith #endif 383fee21e36SBarry Smith 3842d61bbb3SSatish Balay 
PetscFunctionBegin; 3859566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3869566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 3872d61bbb3SSatish Balay 3882d61bbb3SSatish Balay idx = a->j; 3892d61bbb3SSatish Balay v = a->a; 39026e093fcSHong Zhang if (usecprow) { 39126e093fcSHong Zhang mbs = a->compressedrow.nrows; 39226e093fcSHong Zhang ii = a->compressedrow.i; 3937b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3949566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 39526e093fcSHong Zhang } else { 39626e093fcSHong Zhang mbs = a->mbs; 3972d61bbb3SSatish Balay ii = a->i; 39826e093fcSHong Zhang z = zarray; 39926e093fcSHong Zhang } 4002d61bbb3SSatish Balay 4012d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 4029371c9d4SSatish Balay n = ii[1] - ii[0]; 4039371c9d4SSatish Balay ii++; 4049371c9d4SSatish Balay sum1 = 0.0; 4059371c9d4SSatish Balay sum2 = 0.0; 4069371c9d4SSatish Balay sum3 = 0.0; 407444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 408444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4092d61bbb3SSatish Balay for (j = 0; j < n; j++) { 41026fbe8dcSKarl Rupp xb = x + 3 * (*idx++); 41126fbe8dcSKarl Rupp x1 = xb[0]; 41226fbe8dcSKarl Rupp x2 = xb[1]; 41326fbe8dcSKarl Rupp x3 = xb[2]; 41426fbe8dcSKarl Rupp 4152d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 4162d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 4172d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 4182d61bbb3SSatish Balay v += 9; 4192d61bbb3SSatish Balay } 4207b2bb3b9SHong Zhang if (usecprow) z = zarray + 3 * ridx[i]; 4219371c9d4SSatish Balay z[0] = sum1; 4229371c9d4SSatish Balay z[1] = sum2; 4239371c9d4SSatish Balay z[2] = sum3; 42426e093fcSHong Zhang if (!usecprow) z += 3; 4252d61bbb3SSatish Balay } 4269566063dSJacob Faibussowitsch 
PetscCall(VecRestoreArrayRead(xx, &x)); 4279566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 4289566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 4292d61bbb3SSatish Balay PetscFunctionReturn(0); 4302d61bbb3SSatish Balay } 4312d61bbb3SSatish Balay 432*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) 433*d71ae5a4SJacob Faibussowitsch { 4342d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 435f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 436d9fead3dSBarry Smith const PetscScalar *x, *xb; 437d9fead3dSBarry Smith const MatScalar *v; 4387c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 439ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4402d61bbb3SSatish Balay 4412d61bbb3SSatish Balay PetscFunctionBegin; 4429566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4439566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 4442d61bbb3SSatish Balay 4452d61bbb3SSatish Balay idx = a->j; 4462d61bbb3SSatish Balay v = a->a; 44726e093fcSHong Zhang if (usecprow) { 44826e093fcSHong Zhang mbs = a->compressedrow.nrows; 44926e093fcSHong Zhang ii = a->compressedrow.i; 4507b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 4519566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 45226e093fcSHong Zhang } else { 45326e093fcSHong Zhang mbs = a->mbs; 4542d61bbb3SSatish Balay ii = a->i; 45526e093fcSHong Zhang z = zarray; 45626e093fcSHong Zhang } 4572d61bbb3SSatish Balay 4582d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 45926fbe8dcSKarl Rupp n = ii[1] - ii[0]; 46026fbe8dcSKarl Rupp ii++; 46126fbe8dcSKarl Rupp sum1 = 0.0; 46226fbe8dcSKarl Rupp sum2 = 0.0; 46326fbe8dcSKarl Rupp sum3 = 0.0; 46426fbe8dcSKarl Rupp sum4 = 0.0; 46526fbe8dcSKarl Rupp 466444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for 
the next row (assumes same size as this one) */ 467444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4682d61bbb3SSatish Balay for (j = 0; j < n; j++) { 4692d61bbb3SSatish Balay xb = x + 4 * (*idx++); 4709371c9d4SSatish Balay x1 = xb[0]; 4719371c9d4SSatish Balay x2 = xb[1]; 4729371c9d4SSatish Balay x3 = xb[2]; 4739371c9d4SSatish Balay x4 = xb[3]; 4742d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 4752d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 4762d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 4772d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 4782d61bbb3SSatish Balay v += 16; 4792d61bbb3SSatish Balay } 4807b2bb3b9SHong Zhang if (usecprow) z = zarray + 4 * ridx[i]; 4819371c9d4SSatish Balay z[0] = sum1; 4829371c9d4SSatish Balay z[1] = sum2; 4839371c9d4SSatish Balay z[2] = sum3; 4849371c9d4SSatish Balay z[3] = sum4; 48526e093fcSHong Zhang if (!usecprow) z += 4; 4862d61bbb3SSatish Balay } 4879566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4889566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 4899566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 4902d61bbb3SSatish Balay PetscFunctionReturn(0); 4912d61bbb3SSatish Balay } 4922d61bbb3SSatish Balay 493*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) 494*d71ae5a4SJacob Faibussowitsch { 4952d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 496f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 497d9fead3dSBarry Smith const PetscScalar *xb, *x; 498d9fead3dSBarry Smith const MatScalar *v; 4990298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 5007c565772SBarry Smith PetscInt mbs, i, j, n; 501ace3abfcSBarry Smith PetscBool usecprow = 
a->compressedrow.use; 5022d61bbb3SSatish Balay 503433994e6SBarry Smith PetscFunctionBegin; 5049566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5059566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 5062d61bbb3SSatish Balay 5072d61bbb3SSatish Balay idx = a->j; 5082d61bbb3SSatish Balay v = a->a; 50926e093fcSHong Zhang if (usecprow) { 51026e093fcSHong Zhang mbs = a->compressedrow.nrows; 51126e093fcSHong Zhang ii = a->compressedrow.i; 5127b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 5139566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 51426e093fcSHong Zhang } else { 51526e093fcSHong Zhang mbs = a->mbs; 5162d61bbb3SSatish Balay ii = a->i; 51726e093fcSHong Zhang z = zarray; 51826e093fcSHong Zhang } 5192d61bbb3SSatish Balay 5202d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 5219371c9d4SSatish Balay n = ii[1] - ii[0]; 5229371c9d4SSatish Balay ii++; 5239371c9d4SSatish Balay sum1 = 0.0; 5249371c9d4SSatish Balay sum2 = 0.0; 5259371c9d4SSatish Balay sum3 = 0.0; 5269371c9d4SSatish Balay sum4 = 0.0; 5279371c9d4SSatish Balay sum5 = 0.0; 528444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 529444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 5302d61bbb3SSatish Balay for (j = 0; j < n; j++) { 5312d61bbb3SSatish Balay xb = x + 5 * (*idx++); 5329371c9d4SSatish Balay x1 = xb[0]; 5339371c9d4SSatish Balay x2 = xb[1]; 5349371c9d4SSatish Balay x3 = xb[2]; 5359371c9d4SSatish Balay x4 = xb[3]; 5369371c9d4SSatish Balay x5 = xb[4]; 5372d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 5382d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 5392d61bbb3SSatish Balay sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 5402d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 
+ v[13] * x3 + v[18] * x4 + v[23] * x5; 5412d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 5422d61bbb3SSatish Balay v += 25; 5432d61bbb3SSatish Balay } 5447b2bb3b9SHong Zhang if (usecprow) z = zarray + 5 * ridx[i]; 5459371c9d4SSatish Balay z[0] = sum1; 5469371c9d4SSatish Balay z[1] = sum2; 5479371c9d4SSatish Balay z[2] = sum3; 5489371c9d4SSatish Balay z[3] = sum4; 5499371c9d4SSatish Balay z[4] = sum5; 55026e093fcSHong Zhang if (!usecprow) z += 5; 5512d61bbb3SSatish Balay } 5529566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5539566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 5549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 5552d61bbb3SSatish Balay PetscFunctionReturn(0); 5562d61bbb3SSatish Balay } 5572d61bbb3SSatish Balay 558*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) 559*d71ae5a4SJacob Faibussowitsch { 56015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 561f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 562d9fead3dSBarry Smith const PetscScalar *x, *xb; 56326e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 564d9fead3dSBarry Smith const MatScalar *v; 5657c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 566ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 56715091d37SBarry Smith 568433994e6SBarry Smith PetscFunctionBegin; 5699566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5709566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 57115091d37SBarry Smith 57215091d37SBarry Smith idx = a->j; 57315091d37SBarry Smith v = a->a; 57426e093fcSHong Zhang if (usecprow) { 57526e093fcSHong Zhang mbs = a->compressedrow.nrows; 57626e093fcSHong Zhang ii = a->compressedrow.i; 5777b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 5789566063dSJacob Faibussowitsch 
PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 57926e093fcSHong Zhang } else { 58026e093fcSHong Zhang mbs = a->mbs; 58115091d37SBarry Smith ii = a->i; 58226e093fcSHong Zhang z = zarray; 58326e093fcSHong Zhang } 58415091d37SBarry Smith 58515091d37SBarry Smith for (i = 0; i < mbs; i++) { 58626fbe8dcSKarl Rupp n = ii[1] - ii[0]; 58726fbe8dcSKarl Rupp ii++; 58826fbe8dcSKarl Rupp sum1 = 0.0; 58926fbe8dcSKarl Rupp sum2 = 0.0; 59026fbe8dcSKarl Rupp sum3 = 0.0; 59126fbe8dcSKarl Rupp sum4 = 0.0; 59226fbe8dcSKarl Rupp sum5 = 0.0; 59326fbe8dcSKarl Rupp sum6 = 0.0; 59426fbe8dcSKarl Rupp 595444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 596444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 59715091d37SBarry Smith for (j = 0; j < n; j++) { 59815091d37SBarry Smith xb = x + 6 * (*idx++); 5999371c9d4SSatish Balay x1 = xb[0]; 6009371c9d4SSatish Balay x2 = xb[1]; 6019371c9d4SSatish Balay x3 = xb[2]; 6029371c9d4SSatish Balay x4 = xb[3]; 6039371c9d4SSatish Balay x5 = xb[4]; 6049371c9d4SSatish Balay x6 = xb[5]; 60515091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 60615091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 60715091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 60815091d37SBarry Smith sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 60915091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 61015091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 61115091d37SBarry Smith v += 36; 61215091d37SBarry Smith } 6137b2bb3b9SHong Zhang if (usecprow) z = zarray + 6 * ridx[i]; 6149371c9d4SSatish Balay z[0] = sum1; 6159371c9d4SSatish Balay 
z[1] = sum2; 6169371c9d4SSatish Balay z[2] = sum3; 6179371c9d4SSatish Balay z[3] = sum4; 6189371c9d4SSatish Balay z[4] = sum5; 6199371c9d4SSatish Balay z[5] = sum6; 62026e093fcSHong Zhang if (!usecprow) z += 6; 62115091d37SBarry Smith } 62215091d37SBarry Smith 6239566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6249566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 6259566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 62615091d37SBarry Smith PetscFunctionReturn(0); 62715091d37SBarry Smith } 6288ab949d8SShri Abhyankar 629*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) 630*d71ae5a4SJacob Faibussowitsch { 6312d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 632f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 633d9fead3dSBarry Smith const PetscScalar *x, *xb; 63426e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 635d9fead3dSBarry Smith const MatScalar *v; 6367c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 637ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 6382d61bbb3SSatish Balay 639433994e6SBarry Smith PetscFunctionBegin; 6409566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6419566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 6422d61bbb3SSatish Balay 6432d61bbb3SSatish Balay idx = a->j; 6442d61bbb3SSatish Balay v = a->a; 64526e093fcSHong Zhang if (usecprow) { 64626e093fcSHong Zhang mbs = a->compressedrow.nrows; 64726e093fcSHong Zhang ii = a->compressedrow.i; 6487b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 6499566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 65026e093fcSHong Zhang } else { 65126e093fcSHong Zhang mbs = a->mbs; 6522d61bbb3SSatish Balay ii = a->i; 65326e093fcSHong Zhang z = zarray; 65426e093fcSHong Zhang } 6552d61bbb3SSatish Balay 6562d61bbb3SSatish 
Balay for (i = 0; i < mbs; i++) { 65726fbe8dcSKarl Rupp n = ii[1] - ii[0]; 65826fbe8dcSKarl Rupp ii++; 65926fbe8dcSKarl Rupp sum1 = 0.0; 66026fbe8dcSKarl Rupp sum2 = 0.0; 66126fbe8dcSKarl Rupp sum3 = 0.0; 66226fbe8dcSKarl Rupp sum4 = 0.0; 66326fbe8dcSKarl Rupp sum5 = 0.0; 66426fbe8dcSKarl Rupp sum6 = 0.0; 66526fbe8dcSKarl Rupp sum7 = 0.0; 66626fbe8dcSKarl Rupp 667444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 668444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 6692d61bbb3SSatish Balay for (j = 0; j < n; j++) { 6702d61bbb3SSatish Balay xb = x + 7 * (*idx++); 6719371c9d4SSatish Balay x1 = xb[0]; 6729371c9d4SSatish Balay x2 = xb[1]; 6739371c9d4SSatish Balay x3 = xb[2]; 6749371c9d4SSatish Balay x4 = xb[3]; 6759371c9d4SSatish Balay x5 = xb[4]; 6769371c9d4SSatish Balay x6 = xb[5]; 6779371c9d4SSatish Balay x7 = xb[6]; 6782d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 6792d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 6802d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 6812d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 6822d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 6832d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 6842d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 6852d61bbb3SSatish Balay v += 49; 6862d61bbb3SSatish Balay } 6877b2bb3b9SHong Zhang if (usecprow) z = zarray + 7 * ridx[i]; 6889371c9d4SSatish Balay z[0] = sum1; 
6899371c9d4SSatish Balay z[1] = sum2; 6909371c9d4SSatish Balay z[2] = sum3; 6919371c9d4SSatish Balay z[3] = sum4; 6929371c9d4SSatish Balay z[4] = sum5; 6939371c9d4SSatish Balay z[5] = sum6; 6949371c9d4SSatish Balay z[6] = sum7; 69526e093fcSHong Zhang if (!usecprow) z += 7; 6962d61bbb3SSatish Balay } 6972d61bbb3SSatish Balay 6989566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6999566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 7009566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 7012d61bbb3SSatish Balay PetscFunctionReturn(0); 7022d61bbb3SSatish Balay } 7032d61bbb3SSatish Balay 7045f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 705*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) 706*d71ae5a4SJacob Faibussowitsch { 70796e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 708f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 70996e086a2SDaniel Kokron const PetscScalar *x, *xb; 71096e086a2SDaniel Kokron const MatScalar *v; 71196e086a2SDaniel Kokron PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 71296e086a2SDaniel Kokron const PetscInt *idx, *ii, *ridx = NULL; 713ce68d72fSJed Brown PetscInt k; 71496e086a2SDaniel Kokron PetscBool usecprow = a->compressedrow.use; 71596e086a2SDaniel Kokron 71696e086a2SDaniel Kokron __m256d a0, a1, a2, a3, a4, a5; 717ce68d72fSJed Brown __m256d w0, w1, w2, w3; 71896e086a2SDaniel Kokron __m256d z0, z1, z2; 71996e086a2SDaniel Kokron __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 72096e086a2SDaniel Kokron 72196e086a2SDaniel Kokron PetscFunctionBegin; 7229566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7239566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 72496e086a2SDaniel 
Kokron 72596e086a2SDaniel Kokron idx = a->j; 72696e086a2SDaniel Kokron v = a->a; 72796e086a2SDaniel Kokron if (usecprow) { 72896e086a2SDaniel Kokron mbs = a->compressedrow.nrows; 72996e086a2SDaniel Kokron ii = a->compressedrow.i; 73096e086a2SDaniel Kokron ridx = a->compressedrow.rindex; 7319566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 73296e086a2SDaniel Kokron } else { 73396e086a2SDaniel Kokron mbs = a->mbs; 73496e086a2SDaniel Kokron ii = a->i; 73596e086a2SDaniel Kokron z = zarray; 73696e086a2SDaniel Kokron } 73796e086a2SDaniel Kokron 73896e086a2SDaniel Kokron if (!a->mult_work) { 73996e086a2SDaniel Kokron k = PetscMax(A->rmap->n, A->cmap->n); 7409566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 74196e086a2SDaniel Kokron } 74296e086a2SDaniel Kokron 74396e086a2SDaniel Kokron work = a->mult_work; 74496e086a2SDaniel Kokron for (i = 0; i < mbs; i++) { 7459371c9d4SSatish Balay n = ii[1] - ii[0]; 7469371c9d4SSatish Balay ii++; 74796e086a2SDaniel Kokron workt = work; 74896e086a2SDaniel Kokron for (j = 0; j < n; j++) { 74996e086a2SDaniel Kokron xb = x + bs * (*idx++); 75096e086a2SDaniel Kokron for (k = 0; k < bs; k++) workt[k] = xb[k]; 75196e086a2SDaniel Kokron workt += bs; 75296e086a2SDaniel Kokron } 75396e086a2SDaniel Kokron if (usecprow) z = zarray + bs * ridx[i]; 75496e086a2SDaniel Kokron 7559371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 7569371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 7579371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 75896e086a2SDaniel Kokron 75996e086a2SDaniel Kokron for (j = 0; j < n; j++) { 760c05b70c4SSatish Balay /* first column of a */ 76196e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9]); 7629371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81]); 7639371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 7649371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 7659371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 7669371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 
81 + 8]); 7679371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 76896e086a2SDaniel Kokron 769c05b70c4SSatish Balay /* second column of a */ 77096e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 1]); 7719371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 7729371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 7739371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 7749371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 7759371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 7769371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 77796e086a2SDaniel Kokron 778c05b70c4SSatish Balay /* third column of a */ 77996e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 2]); 7809371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 7819371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w2, z0); 7829371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 7839371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w2, z1); 7849371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 7859371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w2, z2); 78696e086a2SDaniel Kokron 787c05b70c4SSatish Balay /* fourth column of a */ 78896e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 3]); 7899371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 7909371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w3, z0); 7919371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 7929371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w3, z1); 7939371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 7949371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w3, z2); 79596e086a2SDaniel Kokron 796c05b70c4SSatish Balay /* fifth column of a */ 79796e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 4]); 7989371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 7999371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w0, z0); 8009371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 8019371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w0, z1); 8029371c9d4SSatish 
Balay a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 8039371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w0, z2); 80496e086a2SDaniel Kokron 805c05b70c4SSatish Balay /* sixth column of a */ 80696e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 5]); 8079371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 8089371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 8099371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 8109371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 8119371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 8129371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 81396e086a2SDaniel Kokron 814c05b70c4SSatish Balay /* seventh column of a */ 81596e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 6]); 8169371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 8179371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 8189371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 8199371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 8209371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 8219371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 82296e086a2SDaniel Kokron 8236aad120cSJose E. 
Roman /* eighth column of a */ 82496e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 7]); 8259371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 8269371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 8279371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 8289371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 8299371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 8309371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 83196e086a2SDaniel Kokron 832c05b70c4SSatish Balay /* ninth column of a */ 83396e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 8]); 8349371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 8359371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 8369371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 8379371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 8389371c9d4SSatish Balay a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 8399371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 84096e086a2SDaniel Kokron } 84196e086a2SDaniel Kokron 8429371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 8439371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 8449371c9d4SSatish Balay _mm256_maskstore_pd(&z[8], mask1, z2); 84596e086a2SDaniel Kokron 84696e086a2SDaniel Kokron v += n * bs2; 84796e086a2SDaniel Kokron if (!usecprow) z += bs; 84896e086a2SDaniel Kokron } 8499566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8509566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 8519566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 85296e086a2SDaniel Kokron PetscFunctionReturn(0); 85396e086a2SDaniel Kokron } 85496e086a2SDaniel Kokron #endif 85596e086a2SDaniel Kokron 856*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) 857*d71ae5a4SJacob Faibussowitsch { 858ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 859f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, 
sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 860ebada01fSBarry Smith const PetscScalar *x, *xb; 861ebada01fSBarry Smith PetscScalar *zarray, xv; 862ebada01fSBarry Smith const MatScalar *v; 863ebada01fSBarry Smith const PetscInt *ii, *ij = a->j, *idx; 864ebada01fSBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 865ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 866ebada01fSBarry Smith 867ebada01fSBarry Smith PetscFunctionBegin; 8689566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8699566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 870ebada01fSBarry Smith 871ebada01fSBarry Smith v = a->a; 872ebada01fSBarry Smith if (usecprow) { 873ebada01fSBarry Smith mbs = a->compressedrow.nrows; 874ebada01fSBarry Smith ii = a->compressedrow.i; 875ebada01fSBarry Smith ridx = a->compressedrow.rindex; 8769566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 877ebada01fSBarry Smith } else { 878ebada01fSBarry Smith mbs = a->mbs; 879ebada01fSBarry Smith ii = a->i; 880ebada01fSBarry Smith z = zarray; 881ebada01fSBarry Smith } 882ebada01fSBarry Smith 883ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 884ebada01fSBarry Smith n = ii[i + 1] - ii[i]; 885ebada01fSBarry Smith idx = ij + ii[i]; 8869371c9d4SSatish Balay sum1 = 0.0; 8879371c9d4SSatish Balay sum2 = 0.0; 8889371c9d4SSatish Balay sum3 = 0.0; 8899371c9d4SSatish Balay sum4 = 0.0; 8909371c9d4SSatish Balay sum5 = 0.0; 8919371c9d4SSatish Balay sum6 = 0.0; 8929371c9d4SSatish Balay sum7 = 0.0; 8939371c9d4SSatish Balay sum8 = 0.0; 8949371c9d4SSatish Balay sum9 = 0.0; 8959371c9d4SSatish Balay sum10 = 0.0; 8969371c9d4SSatish Balay sum11 = 0.0; 897ebada01fSBarry Smith 898ebada01fSBarry Smith for (j = 0; j < n; j++) { 899ebada01fSBarry Smith xb = x + 11 * (idx[j]); 900ebada01fSBarry Smith 901ebada01fSBarry Smith for (k = 0; k < 11; k++) { 902ebada01fSBarry Smith xv = xb[k]; 903ebada01fSBarry Smith sum1 += v[0] * xv; 904ebada01fSBarry Smith sum2 += v[1] * 
xv; 905ebada01fSBarry Smith sum3 += v[2] * xv; 906ebada01fSBarry Smith sum4 += v[3] * xv; 907ebada01fSBarry Smith sum5 += v[4] * xv; 908ebada01fSBarry Smith sum6 += v[5] * xv; 909ebada01fSBarry Smith sum7 += v[6] * xv; 910ebada01fSBarry Smith sum8 += v[7] * xv; 911ebada01fSBarry Smith sum9 += v[8] * xv; 912ebada01fSBarry Smith sum10 += v[9] * xv; 913ebada01fSBarry Smith sum11 += v[10] * xv; 914ebada01fSBarry Smith v += 11; 915ebada01fSBarry Smith } 916ebada01fSBarry Smith } 917ebada01fSBarry Smith if (usecprow) z = zarray + 11 * ridx[i]; 9189371c9d4SSatish Balay z[0] = sum1; 9199371c9d4SSatish Balay z[1] = sum2; 9209371c9d4SSatish Balay z[2] = sum3; 9219371c9d4SSatish Balay z[3] = sum4; 9229371c9d4SSatish Balay z[4] = sum5; 9239371c9d4SSatish Balay z[5] = sum6; 9249371c9d4SSatish Balay z[6] = sum7; 9259371c9d4SSatish Balay z[7] = sum8; 9269371c9d4SSatish Balay z[8] = sum9; 9279371c9d4SSatish Balay z[9] = sum10; 9289371c9d4SSatish Balay z[10] = sum11; 929ebada01fSBarry Smith 930ebada01fSBarry Smith if (!usecprow) z += 11; 931ebada01fSBarry Smith } 932ebada01fSBarry Smith 9339566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9349566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 9359566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 936ebada01fSBarry Smith PetscFunctionReturn(0); 937ebada01fSBarry Smith } 938ebada01fSBarry Smith 9396679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 940*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) 941*d71ae5a4SJacob Faibussowitsch { 9426679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 9436679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 9446679dcc1SBarry Smith const PetscScalar *x, *xb; 9456679dcc1SBarry Smith PetscScalar *zarray, xv; 9466679dcc1SBarry Smith const 
MatScalar *v; 9476679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 9486679dcc1SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 9496679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 9506679dcc1SBarry Smith 9516679dcc1SBarry Smith PetscFunctionBegin; 9529566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9539566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 9546679dcc1SBarry Smith 9556679dcc1SBarry Smith v = a->a; 9566679dcc1SBarry Smith if (usecprow) { 9576679dcc1SBarry Smith mbs = a->compressedrow.nrows; 9586679dcc1SBarry Smith ii = a->compressedrow.i; 9596679dcc1SBarry Smith ridx = a->compressedrow.rindex; 9609566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 9616679dcc1SBarry Smith } else { 9626679dcc1SBarry Smith mbs = a->mbs; 9636679dcc1SBarry Smith ii = a->i; 9646679dcc1SBarry Smith z = zarray; 9656679dcc1SBarry Smith } 9666679dcc1SBarry Smith 9676679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 9686679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 9696679dcc1SBarry Smith idx = ij + ii[i]; 9709371c9d4SSatish Balay sum1 = 0.0; 9719371c9d4SSatish Balay sum2 = 0.0; 9729371c9d4SSatish Balay sum3 = 0.0; 9739371c9d4SSatish Balay sum4 = 0.0; 9749371c9d4SSatish Balay sum5 = 0.0; 9759371c9d4SSatish Balay sum6 = 0.0; 9769371c9d4SSatish Balay sum7 = 0.0; 9779371c9d4SSatish Balay sum8 = 0.0; 9789371c9d4SSatish Balay sum9 = 0.0; 9799371c9d4SSatish Balay sum10 = 0.0; 9809371c9d4SSatish Balay sum11 = 0.0; 9819371c9d4SSatish Balay sum12 = 0.0; 9826679dcc1SBarry Smith 9836679dcc1SBarry Smith for (j = 0; j < n; j++) { 9846679dcc1SBarry Smith xb = x + 12 * (idx[j]); 9856679dcc1SBarry Smith 9866679dcc1SBarry Smith for (k = 0; k < 12; k++) { 9876679dcc1SBarry Smith xv = xb[k]; 9886679dcc1SBarry Smith sum1 += v[0] * xv; 9896679dcc1SBarry Smith sum2 += v[1] * xv; 9906679dcc1SBarry Smith sum3 += v[2] * xv; 9916679dcc1SBarry Smith sum4 += v[3] * xv; 9926679dcc1SBarry Smith sum5 += v[4] * xv; 
9936679dcc1SBarry Smith sum6 += v[5] * xv; 9946679dcc1SBarry Smith sum7 += v[6] * xv; 9956679dcc1SBarry Smith sum8 += v[7] * xv; 9966679dcc1SBarry Smith sum9 += v[8] * xv; 9976679dcc1SBarry Smith sum10 += v[9] * xv; 9986679dcc1SBarry Smith sum11 += v[10] * xv; 9996679dcc1SBarry Smith sum12 += v[11] * xv; 10006679dcc1SBarry Smith v += 12; 10016679dcc1SBarry Smith } 10026679dcc1SBarry Smith } 10036679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 10049371c9d4SSatish Balay z[0] = sum1; 10059371c9d4SSatish Balay z[1] = sum2; 10069371c9d4SSatish Balay z[2] = sum3; 10079371c9d4SSatish Balay z[3] = sum4; 10089371c9d4SSatish Balay z[4] = sum5; 10099371c9d4SSatish Balay z[5] = sum6; 10109371c9d4SSatish Balay z[6] = sum7; 10119371c9d4SSatish Balay z[7] = sum8; 10129371c9d4SSatish Balay z[8] = sum9; 10139371c9d4SSatish Balay z[9] = sum10; 10149371c9d4SSatish Balay z[10] = sum11; 10159371c9d4SSatish Balay z[11] = sum12; 10166679dcc1SBarry Smith if (!usecprow) z += 12; 10176679dcc1SBarry Smith } 10189566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10199566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 10209566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 10216679dcc1SBarry Smith PetscFunctionReturn(0); 10226679dcc1SBarry Smith } 10236679dcc1SBarry Smith 1024*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) 1025*d71ae5a4SJacob Faibussowitsch { 10266679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 10276679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 10286679dcc1SBarry Smith const PetscScalar *x, *xb; 10296679dcc1SBarry Smith PetscScalar *zarray, *yarray, xv; 10306679dcc1SBarry Smith const MatScalar *v; 10316679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 10326679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, k, n, *ridx = 
NULL; 10336679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 10346679dcc1SBarry Smith 10356679dcc1SBarry Smith PetscFunctionBegin; 10369566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10379566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 10386679dcc1SBarry Smith 10396679dcc1SBarry Smith v = a->a; 10406679dcc1SBarry Smith if (usecprow) { 104148a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 10426679dcc1SBarry Smith mbs = a->compressedrow.nrows; 10436679dcc1SBarry Smith ii = a->compressedrow.i; 10446679dcc1SBarry Smith ridx = a->compressedrow.rindex; 10456679dcc1SBarry Smith } else { 10466679dcc1SBarry Smith ii = a->i; 10476679dcc1SBarry Smith y = yarray; 10486679dcc1SBarry Smith z = zarray; 10496679dcc1SBarry Smith } 10506679dcc1SBarry Smith 10516679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 10526679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 10536679dcc1SBarry Smith idx = ij + ii[i]; 10546679dcc1SBarry Smith 10556679dcc1SBarry Smith if (usecprow) { 10566679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 10576679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 10586679dcc1SBarry Smith } 10599371c9d4SSatish Balay sum1 = y[0]; 10609371c9d4SSatish Balay sum2 = y[1]; 10619371c9d4SSatish Balay sum3 = y[2]; 10629371c9d4SSatish Balay sum4 = y[3]; 10639371c9d4SSatish Balay sum5 = y[4]; 10649371c9d4SSatish Balay sum6 = y[5]; 10659371c9d4SSatish Balay sum7 = y[6]; 10669371c9d4SSatish Balay sum8 = y[7]; 10679371c9d4SSatish Balay sum9 = y[8]; 10689371c9d4SSatish Balay sum10 = y[9]; 10699371c9d4SSatish Balay sum11 = y[10]; 10709371c9d4SSatish Balay sum12 = y[11]; 10716679dcc1SBarry Smith 10726679dcc1SBarry Smith for (j = 0; j < n; j++) { 10736679dcc1SBarry Smith xb = x + 12 * (idx[j]); 10746679dcc1SBarry Smith 10756679dcc1SBarry Smith for (k = 0; k < 12; k++) { 10766679dcc1SBarry Smith xv = xb[k]; 10776679dcc1SBarry Smith sum1 += v[0] * xv; 10786679dcc1SBarry Smith sum2 += v[1] * xv; 
10796679dcc1SBarry Smith sum3 += v[2] * xv; 10806679dcc1SBarry Smith sum4 += v[3] * xv; 10816679dcc1SBarry Smith sum5 += v[4] * xv; 10826679dcc1SBarry Smith sum6 += v[5] * xv; 10836679dcc1SBarry Smith sum7 += v[6] * xv; 10846679dcc1SBarry Smith sum8 += v[7] * xv; 10856679dcc1SBarry Smith sum9 += v[8] * xv; 10866679dcc1SBarry Smith sum10 += v[9] * xv; 10876679dcc1SBarry Smith sum11 += v[10] * xv; 10886679dcc1SBarry Smith sum12 += v[11] * xv; 10896679dcc1SBarry Smith v += 12; 10906679dcc1SBarry Smith } 10916679dcc1SBarry Smith } 10926679dcc1SBarry Smith 10939371c9d4SSatish Balay z[0] = sum1; 10949371c9d4SSatish Balay z[1] = sum2; 10959371c9d4SSatish Balay z[2] = sum3; 10969371c9d4SSatish Balay z[3] = sum4; 10979371c9d4SSatish Balay z[4] = sum5; 10989371c9d4SSatish Balay z[5] = sum6; 10999371c9d4SSatish Balay z[6] = sum7; 11009371c9d4SSatish Balay z[7] = sum8; 11019371c9d4SSatish Balay z[8] = sum9; 11029371c9d4SSatish Balay z[9] = sum10; 11039371c9d4SSatish Balay z[10] = sum11; 11049371c9d4SSatish Balay z[11] = sum12; 11056679dcc1SBarry Smith if (!usecprow) { 11066679dcc1SBarry Smith y += 12; 11076679dcc1SBarry Smith z += 12; 11086679dcc1SBarry Smith } 11096679dcc1SBarry Smith } 11109566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11119566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 11129566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 11136679dcc1SBarry Smith PetscFunctionReturn(0); 11146679dcc1SBarry Smith } 11156679dcc1SBarry Smith 11166679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1117*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) 1118*d71ae5a4SJacob Faibussowitsch { 11196679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 11206679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 
11216679dcc1SBarry Smith const PetscScalar *x, *xb; 11226679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray; 11236679dcc1SBarry Smith const MatScalar *v; 11246679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 11256679dcc1SBarry Smith PetscInt mbs, i, j, n; 11266679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 11276679dcc1SBarry Smith 11286679dcc1SBarry Smith PetscFunctionBegin; 11299566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 11309566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 11316679dcc1SBarry Smith 11326679dcc1SBarry Smith v = a->a; 11336679dcc1SBarry Smith if (usecprow) { 11346679dcc1SBarry Smith mbs = a->compressedrow.nrows; 11356679dcc1SBarry Smith ii = a->compressedrow.i; 11366679dcc1SBarry Smith ridx = a->compressedrow.rindex; 11379566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 11386679dcc1SBarry Smith } else { 11396679dcc1SBarry Smith mbs = a->mbs; 11406679dcc1SBarry Smith ii = a->i; 11416679dcc1SBarry Smith z = zarray; 11426679dcc1SBarry Smith } 11436679dcc1SBarry Smith 11446679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 11456679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 11466679dcc1SBarry Smith idx = ij + ii[i]; 11476679dcc1SBarry Smith 11486679dcc1SBarry Smith sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 11496679dcc1SBarry Smith for (j = 0; j < n; j++) { 11506679dcc1SBarry Smith xb = x + 12 * (idx[j]); 11519371c9d4SSatish Balay x1 = xb[0]; 11529371c9d4SSatish Balay x2 = xb[1]; 11539371c9d4SSatish Balay x3 = xb[2]; 11549371c9d4SSatish Balay x4 = xb[3]; 11556679dcc1SBarry Smith 11566679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11576679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11586679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11596679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * 
x4; 11606679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11616679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11626679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11636679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11646679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11656679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11666679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11676679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11686679dcc1SBarry Smith v += 48; 11696679dcc1SBarry Smith 11709371c9d4SSatish Balay x1 = xb[4]; 11719371c9d4SSatish Balay x2 = xb[5]; 11729371c9d4SSatish Balay x3 = xb[6]; 11739371c9d4SSatish Balay x4 = xb[7]; 11746679dcc1SBarry Smith 11756679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11766679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11776679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11786679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11796679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11806679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11816679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11826679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11836679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11846679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11856679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11866679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11876679dcc1SBarry Smith v += 48; 11886679dcc1SBarry Smith 11899371c9d4SSatish Balay x1 = 
xb[8]; 11909371c9d4SSatish Balay x2 = xb[9]; 11919371c9d4SSatish Balay x3 = xb[10]; 11929371c9d4SSatish Balay x4 = xb[11]; 11936679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11946679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11956679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11966679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11976679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11986679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11996679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12006679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12016679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12026679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12036679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12046679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12056679dcc1SBarry Smith v += 48; 12066679dcc1SBarry Smith } 12076679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 12089371c9d4SSatish Balay z[0] = sum1; 12099371c9d4SSatish Balay z[1] = sum2; 12109371c9d4SSatish Balay z[2] = sum3; 12119371c9d4SSatish Balay z[3] = sum4; 12129371c9d4SSatish Balay z[4] = sum5; 12139371c9d4SSatish Balay z[5] = sum6; 12149371c9d4SSatish Balay z[6] = sum7; 12159371c9d4SSatish Balay z[7] = sum8; 12169371c9d4SSatish Balay z[8] = sum9; 12179371c9d4SSatish Balay z[9] = sum10; 12189371c9d4SSatish Balay z[10] = sum11; 12199371c9d4SSatish Balay z[11] = sum12; 12206679dcc1SBarry Smith if (!usecprow) z += 12; 12216679dcc1SBarry Smith } 12229566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12239566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 12249566063dSJacob Faibussowitsch 
PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 12256679dcc1SBarry Smith PetscFunctionReturn(0); 12266679dcc1SBarry Smith } 12276679dcc1SBarry Smith 12286679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1229*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) 1230*d71ae5a4SJacob Faibussowitsch { 12316679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 12326679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 12336679dcc1SBarry Smith const PetscScalar *x, *xb; 12346679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray, *yarray; 12356679dcc1SBarry Smith const MatScalar *v; 12366679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 12376679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, n; 12386679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 12396679dcc1SBarry Smith 12406679dcc1SBarry Smith PetscFunctionBegin; 12419566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 12429566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 12436679dcc1SBarry Smith 12446679dcc1SBarry Smith v = a->a; 12456679dcc1SBarry Smith if (usecprow) { 124648a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 12476679dcc1SBarry Smith mbs = a->compressedrow.nrows; 12486679dcc1SBarry Smith ii = a->compressedrow.i; 12496679dcc1SBarry Smith ridx = a->compressedrow.rindex; 12506679dcc1SBarry Smith } else { 12516679dcc1SBarry Smith ii = a->i; 12526679dcc1SBarry Smith y = yarray; 12536679dcc1SBarry Smith z = zarray; 12546679dcc1SBarry Smith } 12556679dcc1SBarry Smith 12566679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 12576679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 12586679dcc1SBarry Smith idx = ij + ii[i]; 12596679dcc1SBarry Smith 12606679dcc1SBarry Smith if (usecprow) { 
12616679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 12626679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 12636679dcc1SBarry Smith } 12649371c9d4SSatish Balay sum1 = y[0]; 12659371c9d4SSatish Balay sum2 = y[1]; 12669371c9d4SSatish Balay sum3 = y[2]; 12679371c9d4SSatish Balay sum4 = y[3]; 12689371c9d4SSatish Balay sum5 = y[4]; 12699371c9d4SSatish Balay sum6 = y[5]; 12709371c9d4SSatish Balay sum7 = y[6]; 12719371c9d4SSatish Balay sum8 = y[7]; 12729371c9d4SSatish Balay sum9 = y[8]; 12739371c9d4SSatish Balay sum10 = y[9]; 12749371c9d4SSatish Balay sum11 = y[10]; 12759371c9d4SSatish Balay sum12 = y[11]; 12766679dcc1SBarry Smith 12776679dcc1SBarry Smith for (j = 0; j < n; j++) { 12786679dcc1SBarry Smith xb = x + 12 * (idx[j]); 12799371c9d4SSatish Balay x1 = xb[0]; 12809371c9d4SSatish Balay x2 = xb[1]; 12819371c9d4SSatish Balay x3 = xb[2]; 12829371c9d4SSatish Balay x4 = xb[3]; 12836679dcc1SBarry Smith 12846679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12856679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12866679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12876679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12886679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12896679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12906679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12916679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12926679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12936679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12946679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12956679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12966679dcc1SBarry Smith v += 48; 12976679dcc1SBarry Smith 12989371c9d4SSatish Balay x1 = 
xb[4]; 12999371c9d4SSatish Balay x2 = xb[5]; 13009371c9d4SSatish Balay x3 = xb[6]; 13019371c9d4SSatish Balay x4 = xb[7]; 13026679dcc1SBarry Smith 13036679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 13046679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 13056679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 13066679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 13076679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 13086679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 13096679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 13106679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 13116679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 13126679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 13136679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 13146679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 13156679dcc1SBarry Smith v += 48; 13166679dcc1SBarry Smith 13179371c9d4SSatish Balay x1 = xb[8]; 13189371c9d4SSatish Balay x2 = xb[9]; 13199371c9d4SSatish Balay x3 = xb[10]; 13209371c9d4SSatish Balay x4 = xb[11]; 13216679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 13226679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 13236679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 13246679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 13256679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 13266679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 13276679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 13286679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + 
v[31] * x3 + v[43] * x4; 13296679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 13306679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 13316679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 13326679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 13336679dcc1SBarry Smith v += 48; 13346679dcc1SBarry Smith } 13359371c9d4SSatish Balay z[0] = sum1; 13369371c9d4SSatish Balay z[1] = sum2; 13379371c9d4SSatish Balay z[2] = sum3; 13389371c9d4SSatish Balay z[3] = sum4; 13399371c9d4SSatish Balay z[4] = sum5; 13409371c9d4SSatish Balay z[5] = sum6; 13419371c9d4SSatish Balay z[6] = sum7; 13429371c9d4SSatish Balay z[7] = sum8; 13439371c9d4SSatish Balay z[8] = sum9; 13449371c9d4SSatish Balay z[9] = sum10; 13459371c9d4SSatish Balay z[10] = sum11; 13469371c9d4SSatish Balay z[11] = sum12; 13476679dcc1SBarry Smith if (!usecprow) { 13486679dcc1SBarry Smith y += 12; 13496679dcc1SBarry Smith z += 12; 13506679dcc1SBarry Smith } 13516679dcc1SBarry Smith } 13529566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 13539566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 13549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 13556679dcc1SBarry Smith PetscFunctionReturn(0); 13566679dcc1SBarry Smith } 13576679dcc1SBarry Smith 13586679dcc1SBarry Smith #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1359*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) 1360*d71ae5a4SJacob Faibussowitsch { 13616679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 13626679dcc1SBarry Smith PetscScalar *z = NULL, *zarray; 13636679dcc1SBarry Smith const PetscScalar *x, *work; 13646679dcc1SBarry Smith const MatScalar *v = a->a; 
13656679dcc1SBarry Smith PetscInt mbs, i, j, n; 13666679dcc1SBarry Smith const PetscInt *idx = a->j, *ii, *ridx = NULL; 13676679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 13686679dcc1SBarry Smith const PetscInt bs = 12, bs2 = 144; 13696679dcc1SBarry Smith 13706679dcc1SBarry Smith __m256d a0, a1, a2, a3, a4, a5; 13716679dcc1SBarry Smith __m256d w0, w1, w2, w3; 13726679dcc1SBarry Smith __m256d z0, z1, z2; 13736679dcc1SBarry Smith 13746679dcc1SBarry Smith PetscFunctionBegin; 13759566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 13769566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 13776679dcc1SBarry Smith 13786679dcc1SBarry Smith if (usecprow) { 13796679dcc1SBarry Smith mbs = a->compressedrow.nrows; 13806679dcc1SBarry Smith ii = a->compressedrow.i; 13816679dcc1SBarry Smith ridx = a->compressedrow.rindex; 13829566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 13836679dcc1SBarry Smith } else { 13846679dcc1SBarry Smith mbs = a->mbs; 13856679dcc1SBarry Smith ii = a->i; 13866679dcc1SBarry Smith z = zarray; 13876679dcc1SBarry Smith } 13886679dcc1SBarry Smith 13896679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 13909371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 13919371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 13929371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 13936679dcc1SBarry Smith 13949371c9d4SSatish Balay n = ii[1] - ii[0]; 13959371c9d4SSatish Balay ii++; 13966679dcc1SBarry Smith for (j = 0; j < n; j++) { 13976679dcc1SBarry Smith work = x + bs * (*idx++); 13986679dcc1SBarry Smith 13996679dcc1SBarry Smith /* first column of a */ 14006679dcc1SBarry Smith w0 = _mm256_set1_pd(work[0]); 14019371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 0); 14029371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14039371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 4); 14049371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14059371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 8); 14069371c9d4SSatish 
Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14076679dcc1SBarry Smith 14086679dcc1SBarry Smith /* second column of a */ 14096679dcc1SBarry Smith w1 = _mm256_set1_pd(work[1]); 14109371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 12); 14119371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14129371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 16); 14139371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14149371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 20); 14159371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14166679dcc1SBarry Smith 14176679dcc1SBarry Smith /* third column of a */ 14186679dcc1SBarry Smith w2 = _mm256_set1_pd(work[2]); 14199371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 24); 14209371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14219371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 28); 14229371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14239371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 32); 14249371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14256679dcc1SBarry Smith 14266679dcc1SBarry Smith /* fourth column of a */ 14276679dcc1SBarry Smith w3 = _mm256_set1_pd(work[3]); 14289371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 36); 14299371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14309371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 40); 14319371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14329371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 44); 14339371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14346679dcc1SBarry Smith 14356679dcc1SBarry Smith /* fifth column of a */ 14366679dcc1SBarry Smith w0 = _mm256_set1_pd(work[4]); 14379371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 48); 14389371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14399371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 52); 14409371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14419371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 56); 14429371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14436679dcc1SBarry Smith 14446679dcc1SBarry Smith 
/* sixth column of a */ 14456679dcc1SBarry Smith w1 = _mm256_set1_pd(work[5]); 14469371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 60); 14479371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14489371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 64); 14499371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14509371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 68); 14519371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14526679dcc1SBarry Smith 14536679dcc1SBarry Smith /* seventh column of a */ 14546679dcc1SBarry Smith w2 = _mm256_set1_pd(work[6]); 14559371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 72); 14569371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14579371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 76); 14589371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14599371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 80); 14609371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14616679dcc1SBarry Smith 14626aad120cSJose E. Roman /* eighth column of a */ 14636679dcc1SBarry Smith w3 = _mm256_set1_pd(work[7]); 14649371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 84); 14659371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14669371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 88); 14679371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14689371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 92); 14699371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14706679dcc1SBarry Smith 14716679dcc1SBarry Smith /* ninth column of a */ 14726679dcc1SBarry Smith w0 = _mm256_set1_pd(work[8]); 14739371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 96); 14749371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14759371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 100); 14769371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14779371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 104); 14789371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14796679dcc1SBarry Smith 14806679dcc1SBarry Smith /* tenth column of a */ 14816679dcc1SBarry Smith w1 = _mm256_set1_pd(work[9]); 
14829371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 108); 14839371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14849371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 112); 14859371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14869371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 116); 14879371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14886679dcc1SBarry Smith 14896679dcc1SBarry Smith /* eleventh column of a */ 14906679dcc1SBarry Smith w2 = _mm256_set1_pd(work[10]); 14919371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 120); 14929371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14939371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 124); 14949371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14959371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 128); 14969371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14976679dcc1SBarry Smith 14986679dcc1SBarry Smith /* twelveth column of a */ 14996679dcc1SBarry Smith w3 = _mm256_set1_pd(work[11]); 15009371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 132); 15019371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 15029371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 136); 15039371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 15049371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 140); 15059371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 15066679dcc1SBarry Smith 15076679dcc1SBarry Smith v += bs2; 15086679dcc1SBarry Smith } 15096679dcc1SBarry Smith if (usecprow) z = zarray + bs * ridx[i]; 15109371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 15119371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 15129371c9d4SSatish Balay _mm256_storeu_pd(&z[8], z2); 15136679dcc1SBarry Smith if (!usecprow) z += bs; 15146679dcc1SBarry Smith } 15159566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 15169566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 15179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 15186679dcc1SBarry Smith 
PetscFunctionReturn(0); 15196679dcc1SBarry Smith } 15206679dcc1SBarry Smith #endif 15216679dcc1SBarry Smith 15228ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1523832cc040SShri Abhyankar /* Default MatMult for block size 15 */ 1524*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) 1525*d71ae5a4SJacob Faibussowitsch { 15268ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1527f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 15288ab949d8SShri Abhyankar const PetscScalar *x, *xb; 152953ef36baSBarry Smith PetscScalar *zarray, xv; 15308ab949d8SShri Abhyankar const MatScalar *v; 15318ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 15327c565772SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 1533ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 15348ab949d8SShri Abhyankar 15358ab949d8SShri Abhyankar PetscFunctionBegin; 15369566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 15379566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 15388ab949d8SShri Abhyankar 15398ab949d8SShri Abhyankar v = a->a; 15408ab949d8SShri Abhyankar if (usecprow) { 15418ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 15428ab949d8SShri Abhyankar ii = a->compressedrow.i; 15438ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 15449566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 15458ab949d8SShri Abhyankar } else { 15468ab949d8SShri Abhyankar mbs = a->mbs; 15478ab949d8SShri Abhyankar ii = a->i; 15488ab949d8SShri Abhyankar z = zarray; 15498ab949d8SShri Abhyankar } 15508ab949d8SShri Abhyankar 15518ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 15528ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 15538ab949d8SShri Abhyankar idx = ij + ii[i]; 15549371c9d4SSatish Balay sum1 = 0.0; 
15559371c9d4SSatish Balay sum2 = 0.0; 15569371c9d4SSatish Balay sum3 = 0.0; 15579371c9d4SSatish Balay sum4 = 0.0; 15589371c9d4SSatish Balay sum5 = 0.0; 15599371c9d4SSatish Balay sum6 = 0.0; 15609371c9d4SSatish Balay sum7 = 0.0; 15619371c9d4SSatish Balay sum8 = 0.0; 15629371c9d4SSatish Balay sum9 = 0.0; 15639371c9d4SSatish Balay sum10 = 0.0; 15649371c9d4SSatish Balay sum11 = 0.0; 15659371c9d4SSatish Balay sum12 = 0.0; 15669371c9d4SSatish Balay sum13 = 0.0; 15679371c9d4SSatish Balay sum14 = 0.0; 15689371c9d4SSatish Balay sum15 = 0.0; 15698ab949d8SShri Abhyankar 15708ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 15718ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 15728ab949d8SShri Abhyankar 15738ab949d8SShri Abhyankar for (k = 0; k < 15; k++) { 157453ef36baSBarry Smith xv = xb[k]; 157553ef36baSBarry Smith sum1 += v[0] * xv; 157653ef36baSBarry Smith sum2 += v[1] * xv; 157753ef36baSBarry Smith sum3 += v[2] * xv; 157853ef36baSBarry Smith sum4 += v[3] * xv; 157953ef36baSBarry Smith sum5 += v[4] * xv; 158053ef36baSBarry Smith sum6 += v[5] * xv; 158153ef36baSBarry Smith sum7 += v[6] * xv; 158253ef36baSBarry Smith sum8 += v[7] * xv; 158353ef36baSBarry Smith sum9 += v[8] * xv; 158453ef36baSBarry Smith sum10 += v[9] * xv; 158553ef36baSBarry Smith sum11 += v[10] * xv; 158653ef36baSBarry Smith sum12 += v[11] * xv; 158753ef36baSBarry Smith sum13 += v[12] * xv; 158853ef36baSBarry Smith sum14 += v[13] * xv; 158953ef36baSBarry Smith sum15 += v[14] * xv; 15908ab949d8SShri Abhyankar v += 15; 15918ab949d8SShri Abhyankar } 15928ab949d8SShri Abhyankar } 15938ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 15949371c9d4SSatish Balay z[0] = sum1; 15959371c9d4SSatish Balay z[1] = sum2; 15969371c9d4SSatish Balay z[2] = sum3; 15979371c9d4SSatish Balay z[3] = sum4; 15989371c9d4SSatish Balay z[4] = sum5; 15999371c9d4SSatish Balay z[5] = sum6; 16009371c9d4SSatish Balay z[6] = sum7; 16019371c9d4SSatish Balay z[7] = sum8; 16029371c9d4SSatish Balay z[8] = sum9; 16039371c9d4SSatish 
Balay z[9] = sum10; 16049371c9d4SSatish Balay z[10] = sum11; 16059371c9d4SSatish Balay z[11] = sum12; 16069371c9d4SSatish Balay z[12] = sum13; 16079371c9d4SSatish Balay z[13] = sum14; 16089371c9d4SSatish Balay z[14] = sum15; 16098ab949d8SShri Abhyankar 16108ab949d8SShri Abhyankar if (!usecprow) z += 15; 16118ab949d8SShri Abhyankar } 16128ab949d8SShri Abhyankar 16139566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 16149566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 16159566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 16168ab949d8SShri Abhyankar PetscFunctionReturn(0); 16178ab949d8SShri Abhyankar } 16188ab949d8SShri Abhyankar 16198ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1620*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) 1621*d71ae5a4SJacob Faibussowitsch { 16228ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1623f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 16248ab949d8SShri Abhyankar const PetscScalar *x, *xb; 16250b8f6341SShri Abhyankar PetscScalar x1, x2, x3, x4, *zarray; 16268ab949d8SShri Abhyankar const MatScalar *v; 16278ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 16287c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1629ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 16308ab949d8SShri Abhyankar 16318ab949d8SShri Abhyankar PetscFunctionBegin; 16329566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 16339566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 16348ab949d8SShri Abhyankar 16358ab949d8SShri Abhyankar v = a->a; 16368ab949d8SShri Abhyankar if (usecprow) { 16378ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 16388ab949d8SShri Abhyankar ii = a->compressedrow.i; 
16398ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 16409566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 16418ab949d8SShri Abhyankar } else { 16428ab949d8SShri Abhyankar mbs = a->mbs; 16438ab949d8SShri Abhyankar ii = a->i; 16448ab949d8SShri Abhyankar z = zarray; 16458ab949d8SShri Abhyankar } 16468ab949d8SShri Abhyankar 16478ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 16488ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 16498ab949d8SShri Abhyankar idx = ij + ii[i]; 16509371c9d4SSatish Balay sum1 = 0.0; 16519371c9d4SSatish Balay sum2 = 0.0; 16529371c9d4SSatish Balay sum3 = 0.0; 16539371c9d4SSatish Balay sum4 = 0.0; 16549371c9d4SSatish Balay sum5 = 0.0; 16559371c9d4SSatish Balay sum6 = 0.0; 16569371c9d4SSatish Balay sum7 = 0.0; 16579371c9d4SSatish Balay sum8 = 0.0; 16589371c9d4SSatish Balay sum9 = 0.0; 16599371c9d4SSatish Balay sum10 = 0.0; 16609371c9d4SSatish Balay sum11 = 0.0; 16619371c9d4SSatish Balay sum12 = 0.0; 16629371c9d4SSatish Balay sum13 = 0.0; 16639371c9d4SSatish Balay sum14 = 0.0; 16649371c9d4SSatish Balay sum15 = 0.0; 16658ab949d8SShri Abhyankar 16668ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 16678ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 16689371c9d4SSatish Balay x1 = xb[0]; 16699371c9d4SSatish Balay x2 = xb[1]; 16709371c9d4SSatish Balay x3 = xb[2]; 16719371c9d4SSatish Balay x4 = xb[3]; 16728ab949d8SShri Abhyankar 16738ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16748ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16758ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16768ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16778ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16788ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16798ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * 
x4; 16808ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16818ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16828ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16838ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16848ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16858ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16868ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16878ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16888ab949d8SShri Abhyankar 16898ab949d8SShri Abhyankar v += 60; 16908ab949d8SShri Abhyankar 16919371c9d4SSatish Balay x1 = xb[4]; 16929371c9d4SSatish Balay x2 = xb[5]; 16939371c9d4SSatish Balay x3 = xb[6]; 16949371c9d4SSatish Balay x4 = xb[7]; 16958ab949d8SShri Abhyankar 16968ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16978ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16988ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16998ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 17008ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 17018ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 17028ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 17038ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 17048ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 17058ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 17068ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 17078ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 
+ v[56] * x4; 17088ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 17098ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 17108ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 17118ab949d8SShri Abhyankar v += 60; 17128ab949d8SShri Abhyankar 17139371c9d4SSatish Balay x1 = xb[8]; 17149371c9d4SSatish Balay x2 = xb[9]; 17159371c9d4SSatish Balay x3 = xb[10]; 17169371c9d4SSatish Balay x4 = xb[11]; 17170b8f6341SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 17180b8f6341SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 17190b8f6341SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 17200b8f6341SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 17210b8f6341SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 17220b8f6341SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 17230b8f6341SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 17240b8f6341SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 17250b8f6341SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 17260b8f6341SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 17270b8f6341SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 17280b8f6341SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 17290b8f6341SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 17300b8f6341SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 17310b8f6341SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 17320b8f6341SShri Abhyankar v += 60; 17330b8f6341SShri Abhyankar 17349371c9d4SSatish Balay x1 = xb[12]; 17359371c9d4SSatish Balay x2 = xb[13]; 17369371c9d4SSatish Balay x3 = xb[14]; 17378ab949d8SShri 
Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 17388ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 17398ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 17408ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 17418ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 17428ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 17438ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 17448ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 17458ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 17468ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 17478ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 17488ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 17498ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 17508ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 17518ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 17528ab949d8SShri Abhyankar v += 45; 17538ab949d8SShri Abhyankar } 17548ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 17559371c9d4SSatish Balay z[0] = sum1; 17569371c9d4SSatish Balay z[1] = sum2; 17579371c9d4SSatish Balay z[2] = sum3; 17589371c9d4SSatish Balay z[3] = sum4; 17599371c9d4SSatish Balay z[4] = sum5; 17609371c9d4SSatish Balay z[5] = sum6; 17619371c9d4SSatish Balay z[6] = sum7; 17629371c9d4SSatish Balay z[7] = sum8; 17639371c9d4SSatish Balay z[8] = sum9; 17649371c9d4SSatish Balay z[9] = sum10; 17659371c9d4SSatish Balay z[10] = sum11; 17669371c9d4SSatish Balay z[11] = sum12; 17679371c9d4SSatish Balay z[12] = sum13; 17689371c9d4SSatish Balay z[13] = sum14; 17699371c9d4SSatish Balay z[14] = sum15; 17708ab949d8SShri Abhyankar 17718ab949d8SShri Abhyankar if (!usecprow) z += 15; 17728ab949d8SShri Abhyankar } 17738ab949d8SShri Abhyankar 17749566063dSJacob Faibussowitsch 
PetscCall(VecRestoreArrayRead(xx, &x)); 17759566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 17769566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 17778ab949d8SShri Abhyankar PetscFunctionReturn(0); 17788ab949d8SShri Abhyankar } 17798ab949d8SShri Abhyankar 17808ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1781*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) 1782*d71ae5a4SJacob Faibussowitsch { 17838ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1784f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 17858ab949d8SShri Abhyankar const PetscScalar *x, *xb; 17860b8f6341SShri Abhyankar PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 17878ab949d8SShri Abhyankar const MatScalar *v; 17888ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 17897c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1790ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 17918ab949d8SShri Abhyankar 17928ab949d8SShri Abhyankar PetscFunctionBegin; 17939566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 17949566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 17958ab949d8SShri Abhyankar 17968ab949d8SShri Abhyankar v = a->a; 17978ab949d8SShri Abhyankar if (usecprow) { 17988ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 17998ab949d8SShri Abhyankar ii = a->compressedrow.i; 18008ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 18019566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 18028ab949d8SShri Abhyankar } else { 18038ab949d8SShri Abhyankar mbs = a->mbs; 18048ab949d8SShri Abhyankar ii = a->i; 18058ab949d8SShri Abhyankar z = zarray; 18068ab949d8SShri Abhyankar } 18078ab949d8SShri Abhyankar 18088ab949d8SShri Abhyankar 
for (i = 0; i < mbs; i++) { 18098ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 18108ab949d8SShri Abhyankar idx = ij + ii[i]; 18119371c9d4SSatish Balay sum1 = 0.0; 18129371c9d4SSatish Balay sum2 = 0.0; 18139371c9d4SSatish Balay sum3 = 0.0; 18149371c9d4SSatish Balay sum4 = 0.0; 18159371c9d4SSatish Balay sum5 = 0.0; 18169371c9d4SSatish Balay sum6 = 0.0; 18179371c9d4SSatish Balay sum7 = 0.0; 18189371c9d4SSatish Balay sum8 = 0.0; 18199371c9d4SSatish Balay sum9 = 0.0; 18209371c9d4SSatish Balay sum10 = 0.0; 18219371c9d4SSatish Balay sum11 = 0.0; 18229371c9d4SSatish Balay sum12 = 0.0; 18239371c9d4SSatish Balay sum13 = 0.0; 18249371c9d4SSatish Balay sum14 = 0.0; 18259371c9d4SSatish Balay sum15 = 0.0; 18268ab949d8SShri Abhyankar 18278ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 18288ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 18299371c9d4SSatish Balay x1 = xb[0]; 18309371c9d4SSatish Balay x2 = xb[1]; 18319371c9d4SSatish Balay x3 = xb[2]; 18329371c9d4SSatish Balay x4 = xb[3]; 18339371c9d4SSatish Balay x5 = xb[4]; 18349371c9d4SSatish Balay x6 = xb[5]; 18359371c9d4SSatish Balay x7 = xb[6]; 18360b8f6341SShri Abhyankar x8 = xb[7]; 18378ab949d8SShri Abhyankar 18388ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 18398ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 18408ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 18418ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 18428ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 18438ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * 
x8; 18448ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 18458ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 18468ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 18478ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 18488ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 18498ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 18508ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 18518ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; 18528ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 18538ab949d8SShri Abhyankar v += 120; 18548ab949d8SShri Abhyankar 18559371c9d4SSatish Balay x1 = xb[8]; 18569371c9d4SSatish Balay x2 = xb[9]; 18579371c9d4SSatish Balay x3 = xb[10]; 18589371c9d4SSatish Balay x4 = xb[11]; 18599371c9d4SSatish Balay x5 = xb[12]; 18609371c9d4SSatish Balay x6 = xb[13]; 18619371c9d4SSatish Balay x7 = xb[14]; 18620b8f6341SShri Abhyankar 18638ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 18648ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 18658ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + 
v[92] * x7; 18668ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 18678ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 18688ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 18698ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 18708ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 18718ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 18728ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; 18738ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 18748ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 18758ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 18768ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 18778ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 18788ab949d8SShri Abhyankar v += 105; 18798ab949d8SShri Abhyankar } 18808ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 18819371c9d4SSatish Balay z[0] = sum1; 18829371c9d4SSatish Balay z[1] = sum2; 18839371c9d4SSatish Balay z[2] = sum3; 18849371c9d4SSatish Balay z[3] = sum4; 18859371c9d4SSatish Balay z[4] = sum5; 18869371c9d4SSatish Balay z[5] = sum6; 18879371c9d4SSatish Balay z[6] = sum7; 18889371c9d4SSatish Balay z[7] = sum8; 18899371c9d4SSatish 
Balay z[8] = sum9; 18909371c9d4SSatish Balay z[9] = sum10; 18919371c9d4SSatish Balay z[10] = sum11; 18929371c9d4SSatish Balay z[11] = sum12; 18939371c9d4SSatish Balay z[12] = sum13; 18949371c9d4SSatish Balay z[13] = sum14; 18959371c9d4SSatish Balay z[14] = sum15; 18968ab949d8SShri Abhyankar 18978ab949d8SShri Abhyankar if (!usecprow) z += 15; 18988ab949d8SShri Abhyankar } 18998ab949d8SShri Abhyankar 19009566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 19019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 19029566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 19038ab949d8SShri Abhyankar PetscFunctionReturn(0); 19048ab949d8SShri Abhyankar } 19058ab949d8SShri Abhyankar 19068ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1907*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) 1908*d71ae5a4SJacob Faibussowitsch { 19098ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1910f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 19118ab949d8SShri Abhyankar const PetscScalar *x, *xb; 19128ab949d8SShri Abhyankar PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray; 19138ab949d8SShri Abhyankar const MatScalar *v; 19148ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 19157c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1916ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 19178ab949d8SShri Abhyankar 19188ab949d8SShri Abhyankar PetscFunctionBegin; 19199566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 19209566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 19218ab949d8SShri Abhyankar 19228ab949d8SShri Abhyankar v = a->a; 19238ab949d8SShri Abhyankar if (usecprow) { 19248ab949d8SShri Abhyankar 
mbs = a->compressedrow.nrows; 19258ab949d8SShri Abhyankar ii = a->compressedrow.i; 19268ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 19279566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 19288ab949d8SShri Abhyankar } else { 19298ab949d8SShri Abhyankar mbs = a->mbs; 19308ab949d8SShri Abhyankar ii = a->i; 19318ab949d8SShri Abhyankar z = zarray; 19328ab949d8SShri Abhyankar } 19338ab949d8SShri Abhyankar 19348ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 19358ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 19368ab949d8SShri Abhyankar idx = ij + ii[i]; 19379371c9d4SSatish Balay sum1 = 0.0; 19389371c9d4SSatish Balay sum2 = 0.0; 19399371c9d4SSatish Balay sum3 = 0.0; 19409371c9d4SSatish Balay sum4 = 0.0; 19419371c9d4SSatish Balay sum5 = 0.0; 19429371c9d4SSatish Balay sum6 = 0.0; 19439371c9d4SSatish Balay sum7 = 0.0; 19449371c9d4SSatish Balay sum8 = 0.0; 19459371c9d4SSatish Balay sum9 = 0.0; 19469371c9d4SSatish Balay sum10 = 0.0; 19479371c9d4SSatish Balay sum11 = 0.0; 19489371c9d4SSatish Balay sum12 = 0.0; 19499371c9d4SSatish Balay sum13 = 0.0; 19509371c9d4SSatish Balay sum14 = 0.0; 19519371c9d4SSatish Balay sum15 = 0.0; 19528ab949d8SShri Abhyankar 19538ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 19548ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 19559371c9d4SSatish Balay x1 = xb[0]; 19569371c9d4SSatish Balay x2 = xb[1]; 19579371c9d4SSatish Balay x3 = xb[2]; 19589371c9d4SSatish Balay x4 = xb[3]; 19599371c9d4SSatish Balay x5 = xb[4]; 19609371c9d4SSatish Balay x6 = xb[5]; 19619371c9d4SSatish Balay x7 = xb[6]; 19629371c9d4SSatish Balay x8 = xb[7]; 19639371c9d4SSatish Balay x9 = xb[8]; 19649371c9d4SSatish Balay x10 = xb[9]; 19659371c9d4SSatish Balay x11 = xb[10]; 19669371c9d4SSatish Balay x12 = xb[11]; 19679371c9d4SSatish Balay x13 = xb[12]; 19689371c9d4SSatish Balay x14 = xb[13]; 19699371c9d4SSatish Balay x15 = xb[14]; 19708ab949d8SShri Abhyankar 19718ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + 
v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15; 19728ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15; 19738ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15; 19748ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15; 19758ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15; 19768ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15; 19778ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15; 19788ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15; 19798ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + 
v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15; 19808ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15; 19818ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15; 19828ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15; 19838ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15; 19848ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15; 19858ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15; 19868ab949d8SShri Abhyankar v += 225; 19878ab949d8SShri Abhyankar } 19888ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 19899371c9d4SSatish Balay z[0] = sum1; 19909371c9d4SSatish Balay z[1] = sum2; 19919371c9d4SSatish Balay z[2] = sum3; 19929371c9d4SSatish Balay z[3] = sum4; 19939371c9d4SSatish Balay z[4] = sum5; 19949371c9d4SSatish Balay z[5] = sum6; 19959371c9d4SSatish Balay z[6] = sum7; 
19969371c9d4SSatish Balay z[7] = sum8; 19979371c9d4SSatish Balay z[8] = sum9; 19989371c9d4SSatish Balay z[9] = sum10; 19999371c9d4SSatish Balay z[10] = sum11; 20009371c9d4SSatish Balay z[11] = sum12; 20019371c9d4SSatish Balay z[12] = sum13; 20029371c9d4SSatish Balay z[13] = sum14; 20039371c9d4SSatish Balay z[14] = sum15; 20048ab949d8SShri Abhyankar 20058ab949d8SShri Abhyankar if (!usecprow) z += 15; 20068ab949d8SShri Abhyankar } 20078ab949d8SShri Abhyankar 20089566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 20099566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 20109566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 20118ab949d8SShri Abhyankar PetscFunctionReturn(0); 20128ab949d8SShri Abhyankar } 20138ab949d8SShri Abhyankar 20143f1db9ecSBarry Smith /* 20153f1db9ecSBarry Smith This will not work with MatScalar == float because it calls the BLAS 20163f1db9ecSBarry Smith */ 2017*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) 2018*d71ae5a4SJacob Faibussowitsch { 20192d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2020f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 2021d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2022d9ca1df4SBarry Smith const MatScalar *v; 2023d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2024d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2025d9ca1df4SBarry Smith PetscInt ncols, k; 2026ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 20272d61bbb3SSatish Balay 20282d61bbb3SSatish Balay PetscFunctionBegin; 20299566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 20309566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 20312d61bbb3SSatish Balay 20322d61bbb3SSatish Balay idx = a->j; 20332d61bbb3SSatish Balay v = a->a; 203426e093fcSHong Zhang if (usecprow) { 203526e093fcSHong Zhang mbs = 
a->compressedrow.nrows; 203626e093fcSHong Zhang ii = a->compressedrow.i; 20377b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 20389566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 203926e093fcSHong Zhang } else { 204026e093fcSHong Zhang mbs = a->mbs; 20412d61bbb3SSatish Balay ii = a->i; 204226e093fcSHong Zhang z = zarray; 204326e093fcSHong Zhang } 2044218c64b6SSatish Balay 20452d61bbb3SSatish Balay if (!a->mult_work) { 2046d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 20479566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 20482d61bbb3SSatish Balay } 20492d61bbb3SSatish Balay work = a->mult_work; 20502d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 20519371c9d4SSatish Balay n = ii[1] - ii[0]; 20529371c9d4SSatish Balay ii++; 20532d61bbb3SSatish Balay ncols = n * bs; 20542d61bbb3SSatish Balay workt = work; 20552d61bbb3SSatish Balay for (j = 0; j < n; j++) { 20562d61bbb3SSatish Balay xb = x + bs * (*idx++); 20572d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 20582d61bbb3SSatish Balay workt += bs; 20592d61bbb3SSatish Balay } 20607b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * ridx[i]; 206196b95a6bSBarry Smith PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 20622d61bbb3SSatish Balay v += n * bs2; 206326e093fcSHong Zhang if (!usecprow) z += bs; 20642d61bbb3SSatish Balay } 20659566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 20669566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 20679566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 20682d61bbb3SSatish Balay PetscFunctionReturn(0); 20692d61bbb3SSatish Balay } 20702d61bbb3SSatish Balay 2071*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 2072*d71ae5a4SJacob Faibussowitsch { 20732d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2074122f12eaSBarry Smith const PetscScalar *x; 
2075122f12eaSBarry Smith PetscScalar *y, *z, sum; 2076122f12eaSBarry Smith const MatScalar *v; 20777c565772SBarry Smith PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2078122f12eaSBarry Smith const PetscInt *idx, *ii; 2079ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 20802d61bbb3SSatish Balay 20812d61bbb3SSatish Balay PetscFunctionBegin; 20829566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 20839566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 20842d61bbb3SSatish Balay 20852d61bbb3SSatish Balay idx = a->j; 20862d61bbb3SSatish Balay v = a->a; 208726e093fcSHong Zhang if (usecprow) { 208848a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 208926e093fcSHong Zhang mbs = a->compressedrow.nrows; 209026e093fcSHong Zhang ii = a->compressedrow.i; 20917b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 209226e093fcSHong Zhang } else { 20932d61bbb3SSatish Balay ii = a->i; 209426e093fcSHong Zhang } 20952d61bbb3SSatish Balay 20962d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 2097122f12eaSBarry Smith n = ii[1] - ii[0]; 2098122f12eaSBarry Smith ii++; 209926e093fcSHong Zhang if (!usecprow) { 2100122f12eaSBarry Smith sum = y[i]; 2101122f12eaSBarry Smith } else { 2102122f12eaSBarry Smith sum = y[ridx[i]]; 2103122f12eaSBarry Smith } 2104444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2105444d8c10SJed Brown PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2106122f12eaSBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 2107122f12eaSBarry Smith v += n; 2108122f12eaSBarry Smith idx += n; 2109122f12eaSBarry Smith if (usecprow) { 2110122f12eaSBarry Smith z[ridx[i]] = sum; 2111122f12eaSBarry Smith } else { 2112122f12eaSBarry Smith z[i] = sum; 211326e093fcSHong Zhang } 21142d61bbb3SSatish Balay } 21159566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 
21169566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 21179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 21182d61bbb3SSatish Balay PetscFunctionReturn(0); 21192d61bbb3SSatish Balay } 21202d61bbb3SSatish Balay 2121*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 2122*d71ae5a4SJacob Faibussowitsch { 21232d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2124f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2125d9ca1df4SBarry Smith const PetscScalar *x, *xb; 212626e093fcSHong Zhang PetscScalar x1, x2, *yarray, *zarray; 2127d9ca1df4SBarry Smith const MatScalar *v; 2128d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, j; 2129d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2130ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 21312d61bbb3SSatish Balay 21322d61bbb3SSatish Balay PetscFunctionBegin; 21339566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 21349566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 21352d61bbb3SSatish Balay 21362d61bbb3SSatish Balay idx = a->j; 21372d61bbb3SSatish Balay v = a->a; 213826e093fcSHong Zhang if (usecprow) { 213948a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 214026e093fcSHong Zhang mbs = a->compressedrow.nrows; 214126e093fcSHong Zhang ii = a->compressedrow.i; 21427b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 214326e093fcSHong Zhang } else { 21442d61bbb3SSatish Balay ii = a->i; 214526e093fcSHong Zhang y = yarray; 214626e093fcSHong Zhang z = zarray; 214726e093fcSHong Zhang } 21482d61bbb3SSatish Balay 21492d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 21509371c9d4SSatish Balay n = ii[1] - ii[0]; 21519371c9d4SSatish Balay ii++; 215226e093fcSHong Zhang if (usecprow) { 21537b2bb3b9SHong Zhang z = zarray + 2 * ridx[i]; 21547b2bb3b9SHong Zhang y = yarray + 2 * ridx[i]; 215526e093fcSHong Zhang 
} 21569371c9d4SSatish Balay sum1 = y[0]; 21579371c9d4SSatish Balay sum2 = y[1]; 2158444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2159444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 21602d61bbb3SSatish Balay for (j = 0; j < n; j++) { 216126fbe8dcSKarl Rupp xb = x + 2 * (*idx++); 216226fbe8dcSKarl Rupp x1 = xb[0]; 216326fbe8dcSKarl Rupp x2 = xb[1]; 216426fbe8dcSKarl Rupp 21652d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 21662d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 21672d61bbb3SSatish Balay v += 4; 21682d61bbb3SSatish Balay } 21699371c9d4SSatish Balay z[0] = sum1; 21709371c9d4SSatish Balay z[1] = sum2; 217126e093fcSHong Zhang if (!usecprow) { 21729371c9d4SSatish Balay z += 2; 21739371c9d4SSatish Balay y += 2; 21742d61bbb3SSatish Balay } 217526e093fcSHong Zhang } 21769566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 21779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 21789566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * a->nz)); 21792d61bbb3SSatish Balay PetscFunctionReturn(0); 21802d61bbb3SSatish Balay } 21812d61bbb3SSatish Balay 2182*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 2183*d71ae5a4SJacob Faibussowitsch { 21842d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2185f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2186d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2187d9ca1df4SBarry Smith const MatScalar *v; 2188d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2189d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2190ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 21912d61bbb3SSatish Balay 21922d61bbb3SSatish Balay PetscFunctionBegin; 21939566063dSJacob 
Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 21949566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 21952d61bbb3SSatish Balay 21962d61bbb3SSatish Balay idx = a->j; 21972d61bbb3SSatish Balay v = a->a; 219826e093fcSHong Zhang if (usecprow) { 219948a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 220026e093fcSHong Zhang mbs = a->compressedrow.nrows; 220126e093fcSHong Zhang ii = a->compressedrow.i; 22027b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 220326e093fcSHong Zhang } else { 22042d61bbb3SSatish Balay ii = a->i; 220526e093fcSHong Zhang y = yarray; 220626e093fcSHong Zhang z = zarray; 220726e093fcSHong Zhang } 22082d61bbb3SSatish Balay 22092d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 22109371c9d4SSatish Balay n = ii[1] - ii[0]; 22119371c9d4SSatish Balay ii++; 221226e093fcSHong Zhang if (usecprow) { 22137b2bb3b9SHong Zhang z = zarray + 3 * ridx[i]; 22147b2bb3b9SHong Zhang y = yarray + 3 * ridx[i]; 221526e093fcSHong Zhang } 22169371c9d4SSatish Balay sum1 = y[0]; 22179371c9d4SSatish Balay sum2 = y[1]; 22189371c9d4SSatish Balay sum3 = y[2]; 2219444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2220444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 22212d61bbb3SSatish Balay for (j = 0; j < n; j++) { 22229371c9d4SSatish Balay xb = x + 3 * (*idx++); 22239371c9d4SSatish Balay x1 = xb[0]; 22249371c9d4SSatish Balay x2 = xb[1]; 22259371c9d4SSatish Balay x3 = xb[2]; 22262d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 22272d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 22282d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 22292d61bbb3SSatish Balay v += 9; 22302d61bbb3SSatish Balay } 22319371c9d4SSatish Balay z[0] = sum1; 22329371c9d4SSatish Balay z[1] = sum2; 22339371c9d4SSatish Balay z[2] = 
sum3; 223426e093fcSHong Zhang if (!usecprow) { 22359371c9d4SSatish Balay z += 3; 22369371c9d4SSatish Balay y += 3; 22372d61bbb3SSatish Balay } 223826e093fcSHong Zhang } 22399566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 22409566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 22419566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz)); 22422d61bbb3SSatish Balay PetscFunctionReturn(0); 22432d61bbb3SSatish Balay } 22442d61bbb3SSatish Balay 2245*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 2246*d71ae5a4SJacob Faibussowitsch { 22472d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2248f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2249d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2250d9ca1df4SBarry Smith const MatScalar *v; 2251d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2252d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2253ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 22542d61bbb3SSatish Balay 22552d61bbb3SSatish Balay PetscFunctionBegin; 22569566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 22579566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 22582d61bbb3SSatish Balay 22592d61bbb3SSatish Balay idx = a->j; 22602d61bbb3SSatish Balay v = a->a; 226126e093fcSHong Zhang if (usecprow) { 226248a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 226326e093fcSHong Zhang mbs = a->compressedrow.nrows; 226426e093fcSHong Zhang ii = a->compressedrow.i; 22657b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 226626e093fcSHong Zhang } else { 22672d61bbb3SSatish Balay ii = a->i; 226826e093fcSHong Zhang y = yarray; 226926e093fcSHong Zhang z = zarray; 227026e093fcSHong Zhang } 22712d61bbb3SSatish Balay 22722d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 
22739371c9d4SSatish Balay n = ii[1] - ii[0]; 22749371c9d4SSatish Balay ii++; 227526e093fcSHong Zhang if (usecprow) { 22767b2bb3b9SHong Zhang z = zarray + 4 * ridx[i]; 22777b2bb3b9SHong Zhang y = yarray + 4 * ridx[i]; 227826e093fcSHong Zhang } 22799371c9d4SSatish Balay sum1 = y[0]; 22809371c9d4SSatish Balay sum2 = y[1]; 22819371c9d4SSatish Balay sum3 = y[2]; 22829371c9d4SSatish Balay sum4 = y[3]; 2283444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2284444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 22852d61bbb3SSatish Balay for (j = 0; j < n; j++) { 22862d61bbb3SSatish Balay xb = x + 4 * (*idx++); 22879371c9d4SSatish Balay x1 = xb[0]; 22889371c9d4SSatish Balay x2 = xb[1]; 22899371c9d4SSatish Balay x3 = xb[2]; 22909371c9d4SSatish Balay x4 = xb[3]; 22912d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 22922d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 22932d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 22942d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 22952d61bbb3SSatish Balay v += 16; 22962d61bbb3SSatish Balay } 22979371c9d4SSatish Balay z[0] = sum1; 22989371c9d4SSatish Balay z[1] = sum2; 22999371c9d4SSatish Balay z[2] = sum3; 23009371c9d4SSatish Balay z[3] = sum4; 230126e093fcSHong Zhang if (!usecprow) { 23029371c9d4SSatish Balay z += 4; 23039371c9d4SSatish Balay y += 4; 23042d61bbb3SSatish Balay } 230526e093fcSHong Zhang } 23069566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 23079566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 23089566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz)); 23092d61bbb3SSatish Balay PetscFunctionReturn(0); 23102d61bbb3SSatish Balay } 23112d61bbb3SSatish Balay 2312*d71ae5a4SJacob 
Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 2313*d71ae5a4SJacob Faibussowitsch { 23142d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2315f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2316d9ca1df4SBarry Smith const PetscScalar *x, *xb; 231726e093fcSHong Zhang PetscScalar *yarray, *zarray; 2318d9ca1df4SBarry Smith const MatScalar *v; 2319d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2320d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2321ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 23222d61bbb3SSatish Balay 23232d61bbb3SSatish Balay PetscFunctionBegin; 23249566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 23259566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 23262d61bbb3SSatish Balay 23272d61bbb3SSatish Balay idx = a->j; 23282d61bbb3SSatish Balay v = a->a; 232926e093fcSHong Zhang if (usecprow) { 233048a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 233126e093fcSHong Zhang mbs = a->compressedrow.nrows; 233226e093fcSHong Zhang ii = a->compressedrow.i; 23337b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 233426e093fcSHong Zhang } else { 23352d61bbb3SSatish Balay ii = a->i; 233626e093fcSHong Zhang y = yarray; 233726e093fcSHong Zhang z = zarray; 233826e093fcSHong Zhang } 23392d61bbb3SSatish Balay 23402d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 23419371c9d4SSatish Balay n = ii[1] - ii[0]; 23429371c9d4SSatish Balay ii++; 234326e093fcSHong Zhang if (usecprow) { 23447b2bb3b9SHong Zhang z = zarray + 5 * ridx[i]; 23457b2bb3b9SHong Zhang y = yarray + 5 * ridx[i]; 234626e093fcSHong Zhang } 23479371c9d4SSatish Balay sum1 = y[0]; 23489371c9d4SSatish Balay sum2 = y[1]; 23499371c9d4SSatish Balay sum3 = y[2]; 23509371c9d4SSatish Balay sum4 = y[3]; 23519371c9d4SSatish Balay sum5 = y[4]; 2352444d8c10SJed Brown PetscPrefetchBlock(idx + n, 
n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2353444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 23542d61bbb3SSatish Balay for (j = 0; j < n; j++) { 23552d61bbb3SSatish Balay xb = x + 5 * (*idx++); 23569371c9d4SSatish Balay x1 = xb[0]; 23579371c9d4SSatish Balay x2 = xb[1]; 23589371c9d4SSatish Balay x3 = xb[2]; 23599371c9d4SSatish Balay x4 = xb[3]; 23609371c9d4SSatish Balay x5 = xb[4]; 23612d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 23622d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 23632d61bbb3SSatish Balay sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 23642d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 23652d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 23662d61bbb3SSatish Balay v += 25; 23672d61bbb3SSatish Balay } 23689371c9d4SSatish Balay z[0] = sum1; 23699371c9d4SSatish Balay z[1] = sum2; 23709371c9d4SSatish Balay z[2] = sum3; 23719371c9d4SSatish Balay z[3] = sum4; 23729371c9d4SSatish Balay z[4] = sum5; 237326e093fcSHong Zhang if (!usecprow) { 23749371c9d4SSatish Balay z += 5; 23759371c9d4SSatish Balay y += 5; 23762d61bbb3SSatish Balay } 237726e093fcSHong Zhang } 23789566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 23799566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 23809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz)); 23812d61bbb3SSatish Balay PetscFunctionReturn(0); 23822d61bbb3SSatish Balay } 2383c2916339SPierre Jolivet 2384*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 2385*d71ae5a4SJacob Faibussowitsch { 238615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2387f4259b30SLisandro Dalcin 
PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2388d9ca1df4SBarry Smith const PetscScalar *x, *xb; 238926e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2390d9ca1df4SBarry Smith const MatScalar *v; 2391d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2392d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2393ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 239415091d37SBarry Smith 239515091d37SBarry Smith PetscFunctionBegin; 23969566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 23979566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 239815091d37SBarry Smith 239915091d37SBarry Smith idx = a->j; 240015091d37SBarry Smith v = a->a; 240126e093fcSHong Zhang if (usecprow) { 240248a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 240326e093fcSHong Zhang mbs = a->compressedrow.nrows; 240426e093fcSHong Zhang ii = a->compressedrow.i; 24057b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 240626e093fcSHong Zhang } else { 240715091d37SBarry Smith ii = a->i; 240826e093fcSHong Zhang y = yarray; 240926e093fcSHong Zhang z = zarray; 241026e093fcSHong Zhang } 241115091d37SBarry Smith 241215091d37SBarry Smith for (i = 0; i < mbs; i++) { 24139371c9d4SSatish Balay n = ii[1] - ii[0]; 24149371c9d4SSatish Balay ii++; 241526e093fcSHong Zhang if (usecprow) { 24167b2bb3b9SHong Zhang z = zarray + 6 * ridx[i]; 24177b2bb3b9SHong Zhang y = yarray + 6 * ridx[i]; 241826e093fcSHong Zhang } 24199371c9d4SSatish Balay sum1 = y[0]; 24209371c9d4SSatish Balay sum2 = y[1]; 24219371c9d4SSatish Balay sum3 = y[2]; 24229371c9d4SSatish Balay sum4 = y[3]; 24239371c9d4SSatish Balay sum5 = y[4]; 24249371c9d4SSatish Balay sum6 = y[5]; 2425444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2426444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, 
PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 242715091d37SBarry Smith for (j = 0; j < n; j++) { 24283b95cb0eSSatish Balay xb = x + 6 * (*idx++); 24299371c9d4SSatish Balay x1 = xb[0]; 24309371c9d4SSatish Balay x2 = xb[1]; 24319371c9d4SSatish Balay x3 = xb[2]; 24329371c9d4SSatish Balay x4 = xb[3]; 24339371c9d4SSatish Balay x5 = xb[4]; 24349371c9d4SSatish Balay x6 = xb[5]; 243515091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 243615091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 243715091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 243815091d37SBarry Smith sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 243915091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 244015091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 244115091d37SBarry Smith v += 36; 244215091d37SBarry Smith } 24439371c9d4SSatish Balay z[0] = sum1; 24449371c9d4SSatish Balay z[1] = sum2; 24459371c9d4SSatish Balay z[2] = sum3; 24469371c9d4SSatish Balay z[3] = sum4; 24479371c9d4SSatish Balay z[4] = sum5; 24489371c9d4SSatish Balay z[5] = sum6; 244926e093fcSHong Zhang if (!usecprow) { 24509371c9d4SSatish Balay z += 6; 24519371c9d4SSatish Balay y += 6; 245215091d37SBarry Smith } 245326e093fcSHong Zhang } 24549566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 24559566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 24569566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * a->nz)); 245715091d37SBarry Smith PetscFunctionReturn(0); 245815091d37SBarry Smith } 24592d61bbb3SSatish Balay 2460*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 2461*d71ae5a4SJacob Faibussowitsch { 24622d61bbb3SSatish Balay 
Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2463f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2464d9ca1df4SBarry Smith const PetscScalar *x, *xb; 246526e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2466d9ca1df4SBarry Smith const MatScalar *v; 2467d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2468d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2469ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 24702d61bbb3SSatish Balay 24712d61bbb3SSatish Balay PetscFunctionBegin; 24729566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 24739566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 24742d61bbb3SSatish Balay 24752d61bbb3SSatish Balay idx = a->j; 24762d61bbb3SSatish Balay v = a->a; 247726e093fcSHong Zhang if (usecprow) { 247848a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 247926e093fcSHong Zhang mbs = a->compressedrow.nrows; 248026e093fcSHong Zhang ii = a->compressedrow.i; 24817b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 248226e093fcSHong Zhang } else { 24832d61bbb3SSatish Balay ii = a->i; 248426e093fcSHong Zhang y = yarray; 248526e093fcSHong Zhang z = zarray; 248626e093fcSHong Zhang } 24872d61bbb3SSatish Balay 24882d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 24899371c9d4SSatish Balay n = ii[1] - ii[0]; 24909371c9d4SSatish Balay ii++; 249126e093fcSHong Zhang if (usecprow) { 24927b2bb3b9SHong Zhang z = zarray + 7 * ridx[i]; 24937b2bb3b9SHong Zhang y = yarray + 7 * ridx[i]; 249426e093fcSHong Zhang } 24959371c9d4SSatish Balay sum1 = y[0]; 24969371c9d4SSatish Balay sum2 = y[1]; 24979371c9d4SSatish Balay sum3 = y[2]; 24989371c9d4SSatish Balay sum4 = y[3]; 24999371c9d4SSatish Balay sum5 = y[4]; 25009371c9d4SSatish Balay sum6 = y[5]; 25019371c9d4SSatish Balay sum7 = y[6]; 2502444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for 
the next row (assumes same size as this one) */ 2503444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 25042d61bbb3SSatish Balay for (j = 0; j < n; j++) { 25052d61bbb3SSatish Balay xb = x + 7 * (*idx++); 25069371c9d4SSatish Balay x1 = xb[0]; 25079371c9d4SSatish Balay x2 = xb[1]; 25089371c9d4SSatish Balay x3 = xb[2]; 25099371c9d4SSatish Balay x4 = xb[3]; 25109371c9d4SSatish Balay x5 = xb[4]; 25119371c9d4SSatish Balay x6 = xb[5]; 25129371c9d4SSatish Balay x7 = xb[6]; 25132d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 25142d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 25152d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 25162d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 25172d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 25182d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 25192d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 25202d61bbb3SSatish Balay v += 49; 25212d61bbb3SSatish Balay } 25229371c9d4SSatish Balay z[0] = sum1; 25239371c9d4SSatish Balay z[1] = sum2; 25249371c9d4SSatish Balay z[2] = sum3; 25259371c9d4SSatish Balay z[3] = sum4; 25269371c9d4SSatish Balay z[4] = sum5; 25279371c9d4SSatish Balay z[5] = sum6; 25289371c9d4SSatish Balay z[6] = sum7; 252926e093fcSHong Zhang if (!usecprow) { 25309371c9d4SSatish Balay z += 7; 25319371c9d4SSatish Balay y += 7; 25322d61bbb3SSatish Balay } 253326e093fcSHong Zhang } 25349566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 25359566063dSJacob 
Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 25369566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz)); 25372d61bbb3SSatish Balay PetscFunctionReturn(0); 25382d61bbb3SSatish Balay } 2539218c64b6SSatish Balay 25405f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2541*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) 2542*d71ae5a4SJacob Faibussowitsch { 254396e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2544f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 254596e086a2SDaniel Kokron const PetscScalar *x, *xb; 254696e086a2SDaniel Kokron const MatScalar *v; 25476679dcc1SBarry Smith PetscInt mbs, i, j, n; 2548ce68d72fSJed Brown PetscInt k; 254996e086a2SDaniel Kokron PetscBool usecprow = a->compressedrow.use; 25506679dcc1SBarry Smith const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 255196e086a2SDaniel Kokron 255296e086a2SDaniel Kokron __m256d a0, a1, a2, a3, a4, a5; 2553ce68d72fSJed Brown __m256d w0, w1, w2, w3; 255496e086a2SDaniel Kokron __m256d z0, z1, z2; 255596e086a2SDaniel Kokron __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 255696e086a2SDaniel Kokron 255796e086a2SDaniel Kokron PetscFunctionBegin; 25589566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 25599566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 25609566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &zarray)); 256196e086a2SDaniel Kokron 256296e086a2SDaniel Kokron idx = a->j; 256396e086a2SDaniel Kokron v = a->a; 256496e086a2SDaniel Kokron if (usecprow) { 256596e086a2SDaniel Kokron mbs = a->compressedrow.nrows; 256696e086a2SDaniel Kokron ii = a->compressedrow.i; 256796e086a2SDaniel Kokron ridx = a->compressedrow.rindex; 256896e086a2SDaniel Kokron } else { 256996e086a2SDaniel 
Kokron mbs = a->mbs; 257096e086a2SDaniel Kokron ii = a->i; 257196e086a2SDaniel Kokron z = zarray; 257296e086a2SDaniel Kokron } 257396e086a2SDaniel Kokron 257496e086a2SDaniel Kokron if (!a->mult_work) { 257596e086a2SDaniel Kokron k = PetscMax(A->rmap->n, A->cmap->n); 25769566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 257796e086a2SDaniel Kokron } 257896e086a2SDaniel Kokron 257996e086a2SDaniel Kokron work = a->mult_work; 258096e086a2SDaniel Kokron for (i = 0; i < mbs; i++) { 25819371c9d4SSatish Balay n = ii[1] - ii[0]; 25829371c9d4SSatish Balay ii++; 258396e086a2SDaniel Kokron workt = work; 258496e086a2SDaniel Kokron for (j = 0; j < n; j++) { 258596e086a2SDaniel Kokron xb = x + bs * (*idx++); 258696e086a2SDaniel Kokron for (k = 0; k < bs; k++) workt[k] = xb[k]; 258796e086a2SDaniel Kokron workt += bs; 258896e086a2SDaniel Kokron } 258996e086a2SDaniel Kokron if (usecprow) z = zarray + bs * ridx[i]; 259096e086a2SDaniel Kokron 25919371c9d4SSatish Balay z0 = _mm256_loadu_pd(&z[0]); 25929371c9d4SSatish Balay z1 = _mm256_loadu_pd(&z[4]); 25939371c9d4SSatish Balay z2 = _mm256_set1_pd(z[8]); 259496e086a2SDaniel Kokron 259596e086a2SDaniel Kokron for (j = 0; j < n; j++) { 2596c05b70c4SSatish Balay /* first column of a */ 259796e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9]); 25989371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81]); 25999371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 26009371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 26019371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 26029371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 26039371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 260496e086a2SDaniel Kokron 2605c05b70c4SSatish Balay /* second column of a */ 260696e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 1]); 26079371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 26089371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 26099371c9d4SSatish Balay a1 = 
_mm256_loadu_pd(&v[j * 81 + 13]); 26109371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 26119371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 26129371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 261396e086a2SDaniel Kokron 2614c05b70c4SSatish Balay /* third column of a */ 261596e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 2]); 26169371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 26179371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w2, z0); 26189371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 26199371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w2, z1); 26209371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 26219371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w2, z2); 262296e086a2SDaniel Kokron 2623c05b70c4SSatish Balay /* fourth column of a */ 262496e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 3]); 26259371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 26269371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w3, z0); 26279371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 26289371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w3, z1); 26299371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 26309371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w3, z2); 263196e086a2SDaniel Kokron 2632c05b70c4SSatish Balay /* fifth column of a */ 263396e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 4]); 26349371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 26359371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w0, z0); 26369371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 26379371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w0, z1); 26389371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 26399371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w0, z2); 264096e086a2SDaniel Kokron 2641c05b70c4SSatish Balay /* sixth column of a */ 264296e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 5]); 26439371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 26449371c9d4SSatish 
Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 26459371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 26469371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 26479371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 26489371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 264996e086a2SDaniel Kokron 2650c05b70c4SSatish Balay /* seventh column of a */ 265196e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 6]); 26529371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 26539371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 26549371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 26559371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 26569371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 26579371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 265896e086a2SDaniel Kokron 26596aad120cSJose E. Roman /* eighth column of a */ 266096e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 7]); 26619371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 26629371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 26639371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 26649371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 26659371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 26669371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 266796e086a2SDaniel Kokron 2668c05b70c4SSatish Balay /* ninth column of a */ 266996e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 8]); 26709371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 26719371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 26729371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 26739371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 26749371c9d4SSatish Balay a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 26759371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 267696e086a2SDaniel Kokron } 267796e086a2SDaniel Kokron 26789371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 26799371c9d4SSatish Balay 
_mm256_storeu_pd(&z[4], z1); 26809371c9d4SSatish Balay _mm256_maskstore_pd(&z[8], mask1, z2); 268196e086a2SDaniel Kokron 268296e086a2SDaniel Kokron v += n * bs2; 268396e086a2SDaniel Kokron if (!usecprow) z += bs; 268496e086a2SDaniel Kokron } 26859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 26869566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &zarray)); 26879566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(162.0 * a->nz)); 268896e086a2SDaniel Kokron PetscFunctionReturn(0); 268996e086a2SDaniel Kokron } 269096e086a2SDaniel Kokron #endif 269196e086a2SDaniel Kokron 2692*d71ae5a4SJacob Faibussowitsch /* zz = yy + A * xx for 11x11 blocks; with compressed rows only the block rows listed in rindex are visited */ PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) 2693*d71ae5a4SJacob Faibussowitsch { 2694ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2695f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2696ebada01fSBarry Smith const PetscScalar *x, *xb; 2697ebada01fSBarry Smith PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2698ebada01fSBarry Smith const MatScalar *v; 2699ebada01fSBarry Smith PetscInt mbs = a->mbs, i, j, n; 2700ebada01fSBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2701ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 2702ebada01fSBarry Smith 2703ebada01fSBarry Smith PetscFunctionBegin; 27049566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 27059566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2706ebada01fSBarry Smith 2707ebada01fSBarry Smith idx = a->j; 2708ebada01fSBarry Smith v = a->a; 2709ebada01fSBarry Smith if (usecprow) { 271048a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 11 * mbs)); /* seed z with all 11 * mbs entries of y (mbs is still a->mbs here) so block rows skipped by the compressed-row loop keep z = y */ 2711ebada01fSBarry Smith mbs = a->compressedrow.nrows; 2712ebada01fSBarry Smith ii = a->compressedrow.i; 2713ebada01fSBarry Smith ridx = a->compressedrow.rindex; 2714ebada01fSBarry Smith } else { 
2715ebada01fSBarry Smith ii = a->i; 2716ebada01fSBarry Smith y = yarray; 2717ebada01fSBarry Smith z = zarray; 2718ebada01fSBarry Smith } 2719ebada01fSBarry Smith 2720ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 27219371c9d4SSatish Balay n = ii[1] - ii[0]; 27229371c9d4SSatish Balay ii++; 2723ebada01fSBarry Smith if (usecprow) { 2724ebada01fSBarry Smith z = zarray + 11 * ridx[i]; 2725ebada01fSBarry Smith y = yarray + 11 * ridx[i]; 2726ebada01fSBarry Smith } 27279371c9d4SSatish Balay sum1 = y[0]; 27289371c9d4SSatish Balay sum2 = y[1]; 27299371c9d4SSatish Balay sum3 = y[2]; 27309371c9d4SSatish Balay sum4 = y[3]; 27319371c9d4SSatish Balay sum5 = y[4]; 27329371c9d4SSatish Balay sum6 = y[5]; 27339371c9d4SSatish Balay sum7 = y[6]; 27349371c9d4SSatish Balay sum8 = y[7]; 27359371c9d4SSatish Balay sum9 = y[8]; 27369371c9d4SSatish Balay sum10 = y[9]; 27379371c9d4SSatish Balay sum11 = y[10]; 2738ebada01fSBarry Smith PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2739ebada01fSBarry Smith PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2740ebada01fSBarry Smith for (j = 0; j < n; j++) { 2741ebada01fSBarry Smith xb = x + 11 * (*idx++); 27429371c9d4SSatish Balay x1 = xb[0]; 27439371c9d4SSatish Balay x2 = xb[1]; 27449371c9d4SSatish Balay x3 = xb[2]; 27459371c9d4SSatish Balay x4 = xb[3]; 27469371c9d4SSatish Balay x5 = xb[4]; 27479371c9d4SSatish Balay x6 = xb[5]; 27489371c9d4SSatish Balay x7 = xb[6]; 27499371c9d4SSatish Balay x8 = xb[7]; 27509371c9d4SSatish Balay x9 = xb[8]; 27519371c9d4SSatish Balay x10 = xb[9]; 27529371c9d4SSatish Balay x11 = xb[10]; 2753ebada01fSBarry Smith sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2754ebada01fSBarry Smith sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 
* 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2755ebada01fSBarry Smith sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2756ebada01fSBarry Smith sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2757ebada01fSBarry Smith sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2758ebada01fSBarry Smith sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2759ebada01fSBarry Smith sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2760ebada01fSBarry Smith sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2761ebada01fSBarry Smith sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2762ebada01fSBarry Smith sum10 += v[9 
+ 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2763ebada01fSBarry Smith sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2764ebada01fSBarry Smith v += 121; 2765ebada01fSBarry Smith } 27669371c9d4SSatish Balay z[0] = sum1; 27679371c9d4SSatish Balay z[1] = sum2; 27689371c9d4SSatish Balay z[2] = sum3; 27699371c9d4SSatish Balay z[3] = sum4; 27709371c9d4SSatish Balay z[4] = sum5; 27719371c9d4SSatish Balay z[5] = sum6; 27729371c9d4SSatish Balay z[6] = sum7; 27739371c9d4SSatish Balay z[7] = sum8; 27749371c9d4SSatish Balay z[8] = sum9; 27759371c9d4SSatish Balay z[9] = sum10; 27769371c9d4SSatish Balay z[10] = sum11; 2777ebada01fSBarry Smith if (!usecprow) { 27789371c9d4SSatish Balay z += 11; 27799371c9d4SSatish Balay y += 11; 2780ebada01fSBarry Smith } 2781ebada01fSBarry Smith } 27829566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 27839566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 27849566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz)); 2785ebada01fSBarry Smith PetscFunctionReturn(0); 2786ebada01fSBarry Smith } 2787ebada01fSBarry Smith 2788*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 2789*d71ae5a4SJacob Faibussowitsch { 27902d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2791f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 2792d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2793d9ca1df4SBarry Smith const MatScalar *v; 2794d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2795d9ca1df4SBarry Smith PetscInt ncols, k; 
2796d9ca1df4SBarry Smith const PetscInt *ridx = NULL, *idx, *ii; 2797ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2798218c64b6SSatish Balay 27992d61bbb3SSatish Balay PetscFunctionBegin; 28009566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 28019566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 28029566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &zarray)); 28032d61bbb3SSatish Balay 28042d61bbb3SSatish Balay idx = a->j; 28052d61bbb3SSatish Balay v = a->a; 280626e093fcSHong Zhang if (usecprow) { 280726e093fcSHong Zhang mbs = a->compressedrow.nrows; 280826e093fcSHong Zhang ii = a->compressedrow.i; 28097b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 281026e093fcSHong Zhang } else { 281126e093fcSHong Zhang mbs = a->mbs; 28122d61bbb3SSatish Balay ii = a->i; 281326e093fcSHong Zhang z = zarray; 281426e093fcSHong Zhang } 28152d61bbb3SSatish Balay 28162d61bbb3SSatish Balay if (!a->mult_work) { 2817d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 28189566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 28192d61bbb3SSatish Balay } 28202d61bbb3SSatish Balay work = a->mult_work; 28212d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 28229371c9d4SSatish Balay n = ii[1] - ii[0]; 28239371c9d4SSatish Balay ii++; 28242d61bbb3SSatish Balay ncols = n * bs; 28252d61bbb3SSatish Balay workt = work; 28262d61bbb3SSatish Balay for (j = 0; j < n; j++) { 28272d61bbb3SSatish Balay xb = x + bs * (*idx++); 28282d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 28292d61bbb3SSatish Balay workt += bs; 28302d61bbb3SSatish Balay } 28317b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * ridx[i]; 283296b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 28332d61bbb3SSatish Balay v += n * bs2; 283426fbe8dcSKarl Rupp if (!usecprow) z += bs; 283526e093fcSHong Zhang } 28369566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 28379566063dSJacob Faibussowitsch 
PetscCall(VecRestoreArray(zz, &zarray)); 28389566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 28392d61bbb3SSatish Balay PetscFunctionReturn(0); 28402d61bbb3SSatish Balay } 28412d61bbb3SSatish Balay 2842*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2843*d71ae5a4SJacob Faibussowitsch { 2844547795f9SHong Zhang PetscScalar zero = 0.0; 2845547795f9SHong Zhang 2846547795f9SHong Zhang PetscFunctionBegin; 28479566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 28489566063dSJacob Faibussowitsch PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2849547795f9SHong Zhang PetscFunctionReturn(0); 2850547795f9SHong Zhang } 2851547795f9SHong Zhang 2852*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2853*d71ae5a4SJacob Faibussowitsch { 28543447b6efSHong Zhang PetscScalar zero = 0.0; 28552d61bbb3SSatish Balay 28562d61bbb3SSatish Balay PetscFunctionBegin; 28579566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 28589566063dSJacob Faibussowitsch PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 28592d61bbb3SSatish Balay PetscFunctionReturn(0); 28602d61bbb3SSatish Balay } 28612d61bbb3SSatish Balay 2862*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2863*d71ae5a4SJacob Faibussowitsch { 2864547795f9SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2865b8c08b77SHong Zhang PetscScalar *z, x1, x2, x3, x4, x5; 2866d9ca1df4SBarry Smith const PetscScalar *x, *xb = NULL; 2867d9ca1df4SBarry Smith const MatScalar *v; 2868b8c08b77SHong Zhang PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2869d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 2870547795f9SHong Zhang Mat_CompressedRow cprow = a->compressedrow; 2871ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 2872547795f9SHong Zhang 2873547795f9SHong Zhang PetscFunctionBegin; 28749566063dSJacob 
Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 28759566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 28769566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 2877547795f9SHong Zhang 2878547795f9SHong Zhang idx = a->j; 2879547795f9SHong Zhang v = a->a; 2880547795f9SHong Zhang if (usecprow) { 2881547795f9SHong Zhang mbs = cprow.nrows; 2882547795f9SHong Zhang ii = cprow.i; 2883547795f9SHong Zhang ridx = cprow.rindex; 2884547795f9SHong Zhang } else { 2885547795f9SHong Zhang mbs = a->mbs; 2886547795f9SHong Zhang ii = a->i; 2887547795f9SHong Zhang xb = x; 2888547795f9SHong Zhang } 2889547795f9SHong Zhang 2890547795f9SHong Zhang switch (bs) { 2891547795f9SHong Zhang case 1: 2892547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2893547795f9SHong Zhang if (usecprow) xb = x + ridx[i]; 2894547795f9SHong Zhang x1 = xb[0]; 2895547795f9SHong Zhang ib = idx + ii[0]; 28969371c9d4SSatish Balay n = ii[1] - ii[0]; 28979371c9d4SSatish Balay ii++; 2898547795f9SHong Zhang for (j = 0; j < n; j++) { 2899547795f9SHong Zhang rval = ib[j]; 2900547795f9SHong Zhang z[rval] += PetscConj(*v) * x1; 2901547795f9SHong Zhang v++; 2902547795f9SHong Zhang } 2903547795f9SHong Zhang if (!usecprow) xb++; 2904547795f9SHong Zhang } 2905547795f9SHong Zhang break; 2906547795f9SHong Zhang case 2: 2907547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2908547795f9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 29099371c9d4SSatish Balay x1 = xb[0]; 29109371c9d4SSatish Balay x2 = xb[1]; 2911547795f9SHong Zhang ib = idx + ii[0]; 29129371c9d4SSatish Balay n = ii[1] - ii[0]; 29139371c9d4SSatish Balay ii++; 2914547795f9SHong Zhang for (j = 0; j < n; j++) { 2915547795f9SHong Zhang rval = ib[j] * 2; 2916547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2917547795f9SHong Zhang z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2918547795f9SHong Zhang v += 4; 2919547795f9SHong Zhang } 2920547795f9SHong Zhang if (!usecprow) xb += 2; 2921547795f9SHong Zhang 
} 2922547795f9SHong Zhang break; 2923547795f9SHong Zhang case 3: 2924547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2925547795f9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 29269371c9d4SSatish Balay x1 = xb[0]; 29279371c9d4SSatish Balay x2 = xb[1]; 29289371c9d4SSatish Balay x3 = xb[2]; 2929547795f9SHong Zhang ib = idx + ii[0]; 29309371c9d4SSatish Balay n = ii[1] - ii[0]; 29319371c9d4SSatish Balay ii++; 2932547795f9SHong Zhang for (j = 0; j < n; j++) { 2933547795f9SHong Zhang rval = ib[j] * 3; 2934547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2935547795f9SHong Zhang z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2936547795f9SHong Zhang z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2937547795f9SHong Zhang v += 9; 2938547795f9SHong Zhang } 2939547795f9SHong Zhang if (!usecprow) xb += 3; 2940547795f9SHong Zhang } 2941547795f9SHong Zhang break; 2942547795f9SHong Zhang case 4: 2943547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2944547795f9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 29459371c9d4SSatish Balay x1 = xb[0]; 29469371c9d4SSatish Balay x2 = xb[1]; 29479371c9d4SSatish Balay x3 = xb[2]; 29489371c9d4SSatish Balay x4 = xb[3]; 2949547795f9SHong Zhang ib = idx + ii[0]; 29509371c9d4SSatish Balay n = ii[1] - ii[0]; 29519371c9d4SSatish Balay ii++; 2952547795f9SHong Zhang for (j = 0; j < n; j++) { 2953547795f9SHong Zhang rval = ib[j] * 4; 2954547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2955547795f9SHong Zhang z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2956547795f9SHong Zhang z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2957547795f9SHong Zhang z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) 
* x4; 2958547795f9SHong Zhang v += 16; 2959547795f9SHong Zhang } 2960547795f9SHong Zhang if (!usecprow) xb += 4; 2961547795f9SHong Zhang } 2962547795f9SHong Zhang break; 2963547795f9SHong Zhang case 5: 2964547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2965547795f9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 29669371c9d4SSatish Balay x1 = xb[0]; 29679371c9d4SSatish Balay x2 = xb[1]; 29689371c9d4SSatish Balay x3 = xb[2]; 29699371c9d4SSatish Balay x4 = xb[3]; 29709371c9d4SSatish Balay x5 = xb[4]; 2971547795f9SHong Zhang ib = idx + ii[0]; 29729371c9d4SSatish Balay n = ii[1] - ii[0]; 29739371c9d4SSatish Balay ii++; 2974547795f9SHong Zhang for (j = 0; j < n; j++) { 2975547795f9SHong Zhang rval = ib[j] * 5; 2976547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2977547795f9SHong Zhang z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2978547795f9SHong Zhang z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2979547795f9SHong Zhang z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2980547795f9SHong Zhang z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2981547795f9SHong Zhang v += 25; 2982547795f9SHong Zhang } 2983547795f9SHong Zhang if (!usecprow) xb += 5; 2984547795f9SHong Zhang } 2985547795f9SHong Zhang break; 2986*d71ae5a4SJacob Faibussowitsch default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2987*d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2988968ae2c8SSatish Balay #if 0 2989968ae2c8SSatish Balay { 2990b8c08b77SHong Zhang PetscInt ncols,k,bs2=a->bs2; 
2991b8c08b77SHong Zhang PetscScalar *work,*workt,zb; 2992d9ca1df4SBarry Smith const PetscScalar *xtmp; 2993547795f9SHong Zhang if (!a->mult_work) { 2994547795f9SHong Zhang k = PetscMax(A->rmap->n,A->cmap->n); 29959566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2996547795f9SHong Zhang } 2997547795f9SHong Zhang work = a->mult_work; 2998547795f9SHong Zhang xtmp = x; 2999547795f9SHong Zhang for (i=0; i<mbs; i++) { 3000547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 3001547795f9SHong Zhang ncols = n*bs; 30029566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work,ncols)); 300326fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs*ridx[i]; 300496b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 3005547795f9SHong Zhang v += n*bs2; 3006547795f9SHong Zhang if (!usecprow) xtmp += bs; 3007547795f9SHong Zhang workt = work; 3008547795f9SHong Zhang for (j=0; j<n; j++) { 3009547795f9SHong Zhang zb = z + bs*(*idx++); 3010547795f9SHong Zhang for (k=0; k<bs; k++) zb[k] += workt[k] ; 3011547795f9SHong Zhang workt += bs; 3012547795f9SHong Zhang } 3013547795f9SHong Zhang } 3014547795f9SHong Zhang } 3015968ae2c8SSatish Balay #endif 3016547795f9SHong Zhang } 30179566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 30189566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 30199566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3020547795f9SHong Zhang PetscFunctionReturn(0); 3021547795f9SHong Zhang } 3022547795f9SHong Zhang 3023*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 3024*d71ae5a4SJacob Faibussowitsch { 30252d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3026d9ca1df4SBarry Smith PetscScalar *zb, *z, x1, x2, x3, x4, x5; 3027f4259b30SLisandro Dalcin const PetscScalar *x, *xb = NULL; 3028d9ca1df4SBarry Smith const MatScalar *v; 3029d9ca1df4SBarry Smith PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = 
a->bs2; 3030d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 30313447b6efSHong Zhang Mat_CompressedRow cprow = a->compressedrow; 3032ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 30332d61bbb3SSatish Balay 30342d61bbb3SSatish Balay PetscFunctionBegin; 30359566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 30369566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 30379566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 30382d61bbb3SSatish Balay 30392d61bbb3SSatish Balay idx = a->j; 30402d61bbb3SSatish Balay v = a->a; 30413447b6efSHong Zhang if (usecprow) { 30423447b6efSHong Zhang mbs = cprow.nrows; 30433447b6efSHong Zhang ii = cprow.i; 30447b2bb3b9SHong Zhang ridx = cprow.rindex; 30453447b6efSHong Zhang } else { 30463447b6efSHong Zhang mbs = a->mbs; 30472d61bbb3SSatish Balay ii = a->i; 3048f1af5d2fSBarry Smith xb = x; 30493447b6efSHong Zhang } 30502d61bbb3SSatish Balay 30512d61bbb3SSatish Balay switch (bs) { 30522d61bbb3SSatish Balay case 1: 30532d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30547b2bb3b9SHong Zhang if (usecprow) xb = x + ridx[i]; 3055f1af5d2fSBarry Smith x1 = xb[0]; 30563447b6efSHong Zhang ib = idx + ii[0]; 30579371c9d4SSatish Balay n = ii[1] - ii[0]; 30589371c9d4SSatish Balay ii++; 30592d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30602d61bbb3SSatish Balay rval = ib[j]; 3061f1af5d2fSBarry Smith z[rval] += *v * x1; 3062f1af5d2fSBarry Smith v++; 30632d61bbb3SSatish Balay } 30643447b6efSHong Zhang if (!usecprow) xb++; 30652d61bbb3SSatish Balay } 30662d61bbb3SSatish Balay break; 30672d61bbb3SSatish Balay case 2: 30682d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30697b2bb3b9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 30709371c9d4SSatish Balay x1 = xb[0]; 30719371c9d4SSatish Balay x2 = xb[1]; 30723447b6efSHong Zhang ib = idx + ii[0]; 30739371c9d4SSatish Balay n = ii[1] - ii[0]; 30749371c9d4SSatish Balay ii++; 30752d61bbb3SSatish Balay for (j = 0; j < n; j++) { 
30762d61bbb3SSatish Balay rval = ib[j] * 2; 30772d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2; 30782d61bbb3SSatish Balay z[rval++] += v[2] * x1 + v[3] * x2; 30792d61bbb3SSatish Balay v += 4; 30802d61bbb3SSatish Balay } 30813447b6efSHong Zhang if (!usecprow) xb += 2; 30822d61bbb3SSatish Balay } 30832d61bbb3SSatish Balay break; 30842d61bbb3SSatish Balay case 3: 30852d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30867b2bb3b9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 30879371c9d4SSatish Balay x1 = xb[0]; 30889371c9d4SSatish Balay x2 = xb[1]; 30899371c9d4SSatish Balay x3 = xb[2]; 30903447b6efSHong Zhang ib = idx + ii[0]; 30919371c9d4SSatish Balay n = ii[1] - ii[0]; 30929371c9d4SSatish Balay ii++; 30932d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30942d61bbb3SSatish Balay rval = ib[j] * 3; 30952d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 30962d61bbb3SSatish Balay z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 30972d61bbb3SSatish Balay z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 30982d61bbb3SSatish Balay v += 9; 30992d61bbb3SSatish Balay } 31003447b6efSHong Zhang if (!usecprow) xb += 3; 31012d61bbb3SSatish Balay } 31022d61bbb3SSatish Balay break; 31032d61bbb3SSatish Balay case 4: 31042d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 31057b2bb3b9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 31069371c9d4SSatish Balay x1 = xb[0]; 31079371c9d4SSatish Balay x2 = xb[1]; 31089371c9d4SSatish Balay x3 = xb[2]; 31099371c9d4SSatish Balay x4 = xb[3]; 31103447b6efSHong Zhang ib = idx + ii[0]; 31119371c9d4SSatish Balay n = ii[1] - ii[0]; 31129371c9d4SSatish Balay ii++; 31132d61bbb3SSatish Balay for (j = 0; j < n; j++) { 31142d61bbb3SSatish Balay rval = ib[j] * 4; 31152d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 31162d61bbb3SSatish Balay z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 31172d61bbb3SSatish Balay z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 
31182d61bbb3SSatish Balay z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 31192d61bbb3SSatish Balay v += 16; 31202d61bbb3SSatish Balay } 31213447b6efSHong Zhang if (!usecprow) xb += 4; 31222d61bbb3SSatish Balay } 31232d61bbb3SSatish Balay break; 31242d61bbb3SSatish Balay case 5: 31252d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 31267b2bb3b9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 31279371c9d4SSatish Balay x1 = xb[0]; 31289371c9d4SSatish Balay x2 = xb[1]; 31299371c9d4SSatish Balay x3 = xb[2]; 31309371c9d4SSatish Balay x4 = xb[3]; 31319371c9d4SSatish Balay x5 = xb[4]; 31323447b6efSHong Zhang ib = idx + ii[0]; 31339371c9d4SSatish Balay n = ii[1] - ii[0]; 31349371c9d4SSatish Balay ii++; 31352d61bbb3SSatish Balay for (j = 0; j < n; j++) { 31362d61bbb3SSatish Balay rval = ib[j] * 5; 31372d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 31382d61bbb3SSatish Balay z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 31392d61bbb3SSatish Balay z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 31402d61bbb3SSatish Balay z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 31412d61bbb3SSatish Balay z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 31422d61bbb3SSatish Balay v += 25; 31432d61bbb3SSatish Balay } 31443447b6efSHong Zhang if (!usecprow) xb += 5; 31452d61bbb3SSatish Balay } 31462d61bbb3SSatish Balay break; 3147f1af5d2fSBarry Smith default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3148690b6cddSBarry Smith PetscInt ncols, k; 3149d9ca1df4SBarry Smith PetscScalar *work, *workt; 3150d9ca1df4SBarry Smith const PetscScalar *xtmp; 31512d61bbb3SSatish Balay if (!a->mult_work) { 3152d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 31539566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 31542d61bbb3SSatish Balay } 31552d61bbb3SSatish Balay work = a->mult_work; 
31563447b6efSHong Zhang xtmp = x; 31572d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 31589371c9d4SSatish Balay n = ii[1] - ii[0]; 31599371c9d4SSatish Balay ii++; 31602d61bbb3SSatish Balay ncols = n * bs; 31619566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work, ncols)); 316226fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs * ridx[i]; 316396b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 31642d61bbb3SSatish Balay v += n * bs2; 31653447b6efSHong Zhang if (!usecprow) xtmp += bs; 31662d61bbb3SSatish Balay workt = work; 31672d61bbb3SSatish Balay for (j = 0; j < n; j++) { 31682d61bbb3SSatish Balay zb = z + bs * (*idx++); 31692d61bbb3SSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 31702d61bbb3SSatish Balay workt += bs; 31712d61bbb3SSatish Balay } 31722d61bbb3SSatish Balay } 31732d61bbb3SSatish Balay } 31742d61bbb3SSatish Balay } 31759566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 31769566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 31779566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 31782d61bbb3SSatish Balay PetscFunctionReturn(0); 31792d61bbb3SSatish Balay } 31802d61bbb3SSatish Balay 3181*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) 3182*d71ae5a4SJacob Faibussowitsch { 31832d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3184690b6cddSBarry Smith PetscInt totalnz = a->bs2 * a->nz; 3185f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 3186c5df96a5SBarry Smith PetscBLASInt one = 1, tnz; 31872d61bbb3SSatish Balay 31882d61bbb3SSatish Balay PetscFunctionBegin; 31899566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3190792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 31919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 31922d61bbb3SSatish Balay PetscFunctionReturn(0); 31932d61bbb3SSatish Balay } 31942d61bbb3SSatish 
Balay 3195*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) 3196*d71ae5a4SJacob Faibussowitsch { 31972d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 31983f1db9ecSBarry Smith MatScalar *v = a->a; 3199329f5518SBarry Smith PetscReal sum = 0.0; 3200d0f46423SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 32012d61bbb3SSatish Balay 32022d61bbb3SSatish Balay PetscFunctionBegin; 32032d61bbb3SSatish Balay if (type == NORM_FROBENIUS) { 3204570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16) 3205570b7f6dSBarry Smith PetscBLASInt one = 1, cnt = bs2 * nz; 3206792fecdfSBarry Smith PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3207570b7f6dSBarry Smith #else 32082d61bbb3SSatish Balay for (i = 0; i < bs2 * nz; i++) { 32099371c9d4SSatish Balay sum += PetscRealPart(PetscConj(*v) * (*v)); 32109371c9d4SSatish Balay v++; 32112d61bbb3SSatish Balay } 3212570b7f6dSBarry Smith #endif 32138f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum); 32149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 32158a62d963SHong Zhang } else if (type == NORM_1) { /* maximum column sum */ 32168a62d963SHong Zhang PetscReal *tmp; 32178a62d963SHong Zhang PetscInt *bcol = a->j; 32189566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 32198a62d963SHong Zhang for (i = 0; i < nz; i++) { 32208a62d963SHong Zhang for (j = 0; j < bs; j++) { 32218a62d963SHong Zhang k1 = bs * (*bcol) + j; /* column index */ 32228a62d963SHong Zhang for (k = 0; k < bs; k++) { 32239371c9d4SSatish Balay tmp[k1] += PetscAbsScalar(*v); 32249371c9d4SSatish Balay v++; 32258a62d963SHong Zhang } 32268a62d963SHong Zhang } 32278a62d963SHong Zhang bcol++; 32288a62d963SHong Zhang } 32298a62d963SHong Zhang *norm = 0.0; 3230d0f46423SBarry Smith for (j = 0; j < A->cmap->n; j++) { 32318a62d963SHong Zhang if (tmp[j] > *norm) *norm = tmp[j]; 32328a62d963SHong Zhang } 32339566063dSJacob Faibussowitsch 
PetscCall(PetscFree(tmp)); 32349566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3235596552b5SBarry Smith } else if (type == NORM_INFINITY) { /* maximum row sum */ 3236596552b5SBarry Smith *norm = 0.0; 3237596552b5SBarry Smith for (k = 0; k < bs; k++) { 323874f84c7bSSatish Balay for (j = 0; j < a->mbs; j++) { 3239596552b5SBarry Smith v = a->a + bs2 * a->i[j] + k; 3240596552b5SBarry Smith sum = 0.0; 3241596552b5SBarry Smith for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 32420e90e235SBarry Smith for (k1 = 0; k1 < bs; k1++) { 3243596552b5SBarry Smith sum += PetscAbsScalar(*v); 3244596552b5SBarry Smith v += bs; 32452d61bbb3SSatish Balay } 32460e90e235SBarry Smith } 3247596552b5SBarry Smith if (sum > *norm) *norm = sum; 3248596552b5SBarry Smith } 3249596552b5SBarry Smith } 32509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3251e7e72b3dSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 32522d61bbb3SSatish Balay PetscFunctionReturn(0); 32532d61bbb3SSatish Balay } 32542d61bbb3SSatish Balay 3255*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) 3256*d71ae5a4SJacob Faibussowitsch { 32572d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 32582d61bbb3SSatish Balay 32592d61bbb3SSatish Balay PetscFunctionBegin; 32602d61bbb3SSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3261d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3262273d9f13SBarry Smith *flg = PETSC_FALSE; 3263273d9f13SBarry Smith PetscFunctionReturn(0); 32642d61bbb3SSatish Balay } 32652d61bbb3SSatish Balay 32662d61bbb3SSatish Balay /* if the a->i are the same */ 32679566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 326826fbe8dcSKarl Rupp if (!*flg) 
PetscFunctionReturn(0); 32692d61bbb3SSatish Balay 32702d61bbb3SSatish Balay /* if a->j are the same */ 32719566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 327226fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 327326fbe8dcSKarl Rupp 32742d61bbb3SSatish Balay /* if a->a are the same */ 32759566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 32762d61bbb3SSatish Balay PetscFunctionReturn(0); 32772d61bbb3SSatish Balay } 32782d61bbb3SSatish Balay 3279*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) 3280*d71ae5a4SJacob Faibussowitsch { 32812d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3282690b6cddSBarry Smith PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 328387828ca2SBarry Smith PetscScalar *x, zero = 0.0; 32843f1db9ecSBarry Smith MatScalar *aa, *aa_j; 32852d61bbb3SSatish Balay 32862d61bbb3SSatish Balay PetscFunctionBegin; 328728b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3288d0f46423SBarry Smith bs = A->rmap->bs; 32892d61bbb3SSatish Balay aa = a->a; 32902d61bbb3SSatish Balay ai = a->i; 32912d61bbb3SSatish Balay aj = a->j; 32922d61bbb3SSatish Balay ambs = a->mbs; 32932d61bbb3SSatish Balay bs2 = a->bs2; 32942d61bbb3SSatish Balay 32959566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 32969566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 32979566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 329808401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 32992d61bbb3SSatish Balay for (i = 0; i < ambs; i++) { 33002d61bbb3SSatish Balay for (j = ai[i]; j < ai[i + 1]; j++) { 33012d61bbb3SSatish Balay if (aj[j] == i) { 33022d61bbb3SSatish Balay row = i * bs; 33032d61bbb3SSatish Balay aa_j = aa + j * bs2; 33042d61bbb3SSatish Balay for (k = 0; k < bs2; k 
+= (bs + 1), row++) x[row] = aa_j[k]; 33052d61bbb3SSatish Balay break; 33062d61bbb3SSatish Balay } 33072d61bbb3SSatish Balay } 33082d61bbb3SSatish Balay } 33099566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 33102d61bbb3SSatish Balay PetscFunctionReturn(0); 33112d61bbb3SSatish Balay } 33122d61bbb3SSatish Balay 3313*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) 3314*d71ae5a4SJacob Faibussowitsch { 33152d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 331653ef36baSBarry Smith const PetscScalar *l, *r, *li, *ri; 331753ef36baSBarry Smith PetscScalar x; 33183f1db9ecSBarry Smith MatScalar *aa, *v; 331953ef36baSBarry Smith PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 332053ef36baSBarry Smith const PetscInt *ai, *aj; 33212d61bbb3SSatish Balay 33222d61bbb3SSatish Balay PetscFunctionBegin; 33232d61bbb3SSatish Balay ai = a->i; 33242d61bbb3SSatish Balay aj = a->j; 33252d61bbb3SSatish Balay aa = a->a; 3326d0f46423SBarry Smith m = A->rmap->n; 3327d0f46423SBarry Smith n = A->cmap->n; 3328d0f46423SBarry Smith bs = A->rmap->bs; 33292d61bbb3SSatish Balay mbs = a->mbs; 33302d61bbb3SSatish Balay bs2 = a->bs2; 33312d61bbb3SSatish Balay if (ll) { 33329566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 33339566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 333408401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 33352d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 33362d61bbb3SSatish Balay M = ai[i + 1] - ai[i]; 33372d61bbb3SSatish Balay li = l + i * bs; 33382d61bbb3SSatish Balay v = aa + bs2 * ai[i]; 33392d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 3340ad540459SPierre Jolivet for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 33412d61bbb3SSatish Balay } 33422d61bbb3SSatish Balay } 33439566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 
33449566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33452d61bbb3SSatish Balay } 33462d61bbb3SSatish Balay 33472d61bbb3SSatish Balay if (rr) { 33489566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(rr, &r)); 33499566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(rr, &rn)); 335008401ef6SPierre Jolivet PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 33512d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 335253ef36baSBarry Smith iai = ai[i]; 335353ef36baSBarry Smith M = ai[i + 1] - iai; 335453ef36baSBarry Smith v = aa + bs2 * iai; 33552d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 335653ef36baSBarry Smith ri = r + bs * aj[iai + j]; 33572d61bbb3SSatish Balay for (k = 0; k < bs; k++) { 33582d61bbb3SSatish Balay x = ri[k]; 335953ef36baSBarry Smith for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 336053ef36baSBarry Smith v += bs; 33612d61bbb3SSatish Balay } 33622d61bbb3SSatish Balay } 33632d61bbb3SSatish Balay } 33649566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(rr, &r)); 33659566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33662d61bbb3SSatish Balay } 33672d61bbb3SSatish Balay PetscFunctionReturn(0); 33682d61bbb3SSatish Balay } 33692d61bbb3SSatish Balay 3370*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) 3371*d71ae5a4SJacob Faibussowitsch { 33722d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33732d61bbb3SSatish Balay 33742d61bbb3SSatish Balay PetscFunctionBegin; 33752d61bbb3SSatish Balay info->block_size = a->bs2; 3376ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; 33772d61bbb3SSatish Balay info->nz_used = a->bs2 * a->nz; 33783966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 33792d61bbb3SSatish Balay info->assemblies = A->num_ass; 33808e58a170SBarry Smith info->mallocs = A->info.mallocs; 33814dfa11a4SJacob Faibussowitsch 
info->memory = 0; /* REVIEW ME */ 3382d5f3da31SBarry Smith if (A->factortype) { 33832d61bbb3SSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 33842d61bbb3SSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 33852d61bbb3SSatish Balay info->factor_mallocs = A->info.factor_mallocs; 33862d61bbb3SSatish Balay } else { 33872d61bbb3SSatish Balay info->fill_ratio_given = 0; 33882d61bbb3SSatish Balay info->fill_ratio_needed = 0; 33892d61bbb3SSatish Balay info->factor_mallocs = 0; 33902d61bbb3SSatish Balay } 33912d61bbb3SSatish Balay PetscFunctionReturn(0); 33922d61bbb3SSatish Balay } 33932d61bbb3SSatish Balay 3394*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 3395*d71ae5a4SJacob Faibussowitsch { 33962d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33972d61bbb3SSatish Balay 33982d61bbb3SSatish Balay PetscFunctionBegin; 33999566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 34002d61bbb3SSatish Balay PetscFunctionReturn(0); 34012d61bbb3SSatish Balay } 3402a001520aSPierre Jolivet 3403*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 3404*d71ae5a4SJacob Faibussowitsch { 3405a001520aSPierre Jolivet PetscFunctionBegin; 34069566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 34074222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3408a001520aSPierre Jolivet PetscFunctionReturn(0); 3409a001520aSPierre Jolivet } 3410a001520aSPierre Jolivet 3411*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3412*d71ae5a4SJacob Faibussowitsch { 341374eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3414f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1; 3415bcf10a7aSPierre Jolivet const PetscScalar *xb; 341674eeabc5SPierre Jolivet 
PetscScalar x1; 341774eeabc5SPierre Jolivet const MatScalar *v, *vv; 341874eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 341974eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 342074eeabc5SPierre Jolivet 342174eeabc5SPierre Jolivet PetscFunctionBegin; 342274eeabc5SPierre Jolivet idx = a->j; 342374eeabc5SPierre Jolivet v = a->a; 342474eeabc5SPierre Jolivet if (usecprow) { 342574eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 342674eeabc5SPierre Jolivet ii = a->compressedrow.i; 342774eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 342874eeabc5SPierre Jolivet } else { 342974eeabc5SPierre Jolivet mbs = a->mbs; 343074eeabc5SPierre Jolivet ii = a->i; 343174eeabc5SPierre Jolivet z = c; 343274eeabc5SPierre Jolivet } 343374eeabc5SPierre Jolivet 343474eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 34359371c9d4SSatish Balay n = ii[1] - ii[0]; 34369371c9d4SSatish Balay ii++; 343774eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 343874eeabc5SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 343974eeabc5SPierre Jolivet if (usecprow) z = c + ridx[i]; 344074eeabc5SPierre Jolivet jj = idx; 344174eeabc5SPierre Jolivet vv = v; 344274eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 344374eeabc5SPierre Jolivet idx = jj; 344474eeabc5SPierre Jolivet v = vv; 344574eeabc5SPierre Jolivet sum1 = 0.0; 344674eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 34479371c9d4SSatish Balay xb = b + (*idx++); 34489371c9d4SSatish Balay x1 = xb[0 + k * bm]; 344974eeabc5SPierre Jolivet sum1 += v[0] * x1; 345074eeabc5SPierre Jolivet v += 1; 345174eeabc5SPierre Jolivet } 3452feb237baSPierre Jolivet z[0 + k * cm] = sum1; 345374eeabc5SPierre Jolivet } 345474eeabc5SPierre Jolivet if (!usecprow) z += 1; 345574eeabc5SPierre Jolivet } 345674eeabc5SPierre Jolivet PetscFunctionReturn(0); 345774eeabc5SPierre 
Jolivet } 345874eeabc5SPierre Jolivet 3459*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3460*d71ae5a4SJacob Faibussowitsch { 34614b7054f4SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3462f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2; 3463bcf10a7aSPierre Jolivet const PetscScalar *xb; 34644b7054f4SPierre Jolivet PetscScalar x1, x2; 34654b7054f4SPierre Jolivet const MatScalar *v, *vv; 34664b7054f4SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 34674b7054f4SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 34684b7054f4SPierre Jolivet 34694b7054f4SPierre Jolivet PetscFunctionBegin; 34704b7054f4SPierre Jolivet idx = a->j; 34714b7054f4SPierre Jolivet v = a->a; 34724b7054f4SPierre Jolivet if (usecprow) { 34734b7054f4SPierre Jolivet mbs = a->compressedrow.nrows; 34744b7054f4SPierre Jolivet ii = a->compressedrow.i; 34754b7054f4SPierre Jolivet ridx = a->compressedrow.rindex; 34764b7054f4SPierre Jolivet } else { 34774b7054f4SPierre Jolivet mbs = a->mbs; 34784b7054f4SPierre Jolivet ii = a->i; 34794b7054f4SPierre Jolivet z = c; 34804b7054f4SPierre Jolivet } 34814b7054f4SPierre Jolivet 34824b7054f4SPierre Jolivet for (i = 0; i < mbs; i++) { 34839371c9d4SSatish Balay n = ii[1] - ii[0]; 34849371c9d4SSatish Balay ii++; 34854b7054f4SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 34864b7054f4SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 34874b7054f4SPierre Jolivet if (usecprow) z = c + 2 * ridx[i]; 34884b7054f4SPierre Jolivet jj = idx; 34894b7054f4SPierre Jolivet vv = v; 34904b7054f4SPierre Jolivet for (k = 0; k < cn; k++) { 34914b7054f4SPierre Jolivet idx = jj; 34924b7054f4SPierre Jolivet v = vv; 34939371c9d4SSatish Balay sum1 = 0.0; 34949371c9d4SSatish Balay sum2 = 
0.0; 34954b7054f4SPierre Jolivet for (j = 0; j < n; j++) { 34969371c9d4SSatish Balay xb = b + 2 * (*idx++); 34979371c9d4SSatish Balay x1 = xb[0 + k * bm]; 34989371c9d4SSatish Balay x2 = xb[1 + k * bm]; 34994b7054f4SPierre Jolivet sum1 += v[0] * x1 + v[2] * x2; 35004b7054f4SPierre Jolivet sum2 += v[1] * x1 + v[3] * x2; 35014b7054f4SPierre Jolivet v += 4; 35024b7054f4SPierre Jolivet } 35039371c9d4SSatish Balay z[0 + k * cm] = sum1; 35049371c9d4SSatish Balay z[1 + k * cm] = sum2; 35054b7054f4SPierre Jolivet } 35064b7054f4SPierre Jolivet if (!usecprow) z += 2; 35074b7054f4SPierre Jolivet } 35084b7054f4SPierre Jolivet PetscFunctionReturn(0); 35094b7054f4SPierre Jolivet } 35104b7054f4SPierre Jolivet 3511*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3512*d71ae5a4SJacob Faibussowitsch { 351374eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3514f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3; 3515bcf10a7aSPierre Jolivet const PetscScalar *xb; 351674eeabc5SPierre Jolivet PetscScalar x1, x2, x3; 351774eeabc5SPierre Jolivet const MatScalar *v, *vv; 351874eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 351974eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 352074eeabc5SPierre Jolivet 352174eeabc5SPierre Jolivet PetscFunctionBegin; 352274eeabc5SPierre Jolivet idx = a->j; 352374eeabc5SPierre Jolivet v = a->a; 352474eeabc5SPierre Jolivet if (usecprow) { 352574eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 352674eeabc5SPierre Jolivet ii = a->compressedrow.i; 352774eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 352874eeabc5SPierre Jolivet } else { 352974eeabc5SPierre Jolivet mbs = a->mbs; 353074eeabc5SPierre Jolivet ii = a->i; 353174eeabc5SPierre Jolivet z = c; 353274eeabc5SPierre Jolivet } 353374eeabc5SPierre Jolivet 353474eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 
35359371c9d4SSatish Balay n = ii[1] - ii[0]; 35369371c9d4SSatish Balay ii++; 353774eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 353874eeabc5SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 353974eeabc5SPierre Jolivet if (usecprow) z = c + 3 * ridx[i]; 354074eeabc5SPierre Jolivet jj = idx; 354174eeabc5SPierre Jolivet vv = v; 354274eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 354374eeabc5SPierre Jolivet idx = jj; 354474eeabc5SPierre Jolivet v = vv; 35459371c9d4SSatish Balay sum1 = 0.0; 35469371c9d4SSatish Balay sum2 = 0.0; 35479371c9d4SSatish Balay sum3 = 0.0; 354874eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 35499371c9d4SSatish Balay xb = b + 3 * (*idx++); 35509371c9d4SSatish Balay x1 = xb[0 + k * bm]; 35519371c9d4SSatish Balay x2 = xb[1 + k * bm]; 35529371c9d4SSatish Balay x3 = xb[2 + k * bm]; 355374eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 355474eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 355574eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 355674eeabc5SPierre Jolivet v += 9; 355774eeabc5SPierre Jolivet } 35589371c9d4SSatish Balay z[0 + k * cm] = sum1; 35599371c9d4SSatish Balay z[1 + k * cm] = sum2; 35609371c9d4SSatish Balay z[2 + k * cm] = sum3; 356174eeabc5SPierre Jolivet } 356274eeabc5SPierre Jolivet if (!usecprow) z += 3; 356374eeabc5SPierre Jolivet } 356474eeabc5SPierre Jolivet PetscFunctionReturn(0); 356574eeabc5SPierre Jolivet } 356674eeabc5SPierre Jolivet 3567*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3568*d71ae5a4SJacob Faibussowitsch { 356974eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3570f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3571bcf10a7aSPierre Jolivet const 
PetscScalar *xb; 357274eeabc5SPierre Jolivet PetscScalar x1, x2, x3, x4; 357374eeabc5SPierre Jolivet const MatScalar *v, *vv; 357474eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 357574eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 357674eeabc5SPierre Jolivet 357774eeabc5SPierre Jolivet PetscFunctionBegin; 357874eeabc5SPierre Jolivet idx = a->j; 357974eeabc5SPierre Jolivet v = a->a; 358074eeabc5SPierre Jolivet if (usecprow) { 358174eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 358274eeabc5SPierre Jolivet ii = a->compressedrow.i; 358374eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 358474eeabc5SPierre Jolivet } else { 358574eeabc5SPierre Jolivet mbs = a->mbs; 358674eeabc5SPierre Jolivet ii = a->i; 358774eeabc5SPierre Jolivet z = c; 358874eeabc5SPierre Jolivet } 358974eeabc5SPierre Jolivet 359074eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 35919371c9d4SSatish Balay n = ii[1] - ii[0]; 35929371c9d4SSatish Balay ii++; 359374eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 359474eeabc5SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 359574eeabc5SPierre Jolivet if (usecprow) z = c + 4 * ridx[i]; 359674eeabc5SPierre Jolivet jj = idx; 359774eeabc5SPierre Jolivet vv = v; 359874eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 359974eeabc5SPierre Jolivet idx = jj; 360074eeabc5SPierre Jolivet v = vv; 36019371c9d4SSatish Balay sum1 = 0.0; 36029371c9d4SSatish Balay sum2 = 0.0; 36039371c9d4SSatish Balay sum3 = 0.0; 36049371c9d4SSatish Balay sum4 = 0.0; 360574eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 36069371c9d4SSatish Balay xb = b + 4 * (*idx++); 36079371c9d4SSatish Balay x1 = xb[0 + k * bm]; 36089371c9d4SSatish Balay x2 = xb[1 + k * bm]; 36099371c9d4SSatish Balay x3 = xb[2 + k * bm]; 36109371c9d4SSatish Balay x4 = xb[3 + k * bm]; 
361174eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 361274eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 361374eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 361474eeabc5SPierre Jolivet sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 361574eeabc5SPierre Jolivet v += 16; 361674eeabc5SPierre Jolivet } 36179371c9d4SSatish Balay z[0 + k * cm] = sum1; 36189371c9d4SSatish Balay z[1 + k * cm] = sum2; 36199371c9d4SSatish Balay z[2 + k * cm] = sum3; 36209371c9d4SSatish Balay z[3 + k * cm] = sum4; 362174eeabc5SPierre Jolivet } 362274eeabc5SPierre Jolivet if (!usecprow) z += 4; 362374eeabc5SPierre Jolivet } 362474eeabc5SPierre Jolivet PetscFunctionReturn(0); 362574eeabc5SPierre Jolivet } 362674eeabc5SPierre Jolivet 3627*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3628*d71ae5a4SJacob Faibussowitsch { 362974eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3630f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3631bcf10a7aSPierre Jolivet const PetscScalar *xb; 363274eeabc5SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 363374eeabc5SPierre Jolivet const MatScalar *v, *vv; 363474eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 363574eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 363674eeabc5SPierre Jolivet 363774eeabc5SPierre Jolivet PetscFunctionBegin; 363874eeabc5SPierre Jolivet idx = a->j; 363974eeabc5SPierre Jolivet v = a->a; 364074eeabc5SPierre Jolivet if (usecprow) { 364174eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 364274eeabc5SPierre Jolivet ii = a->compressedrow.i; 364374eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 364474eeabc5SPierre Jolivet } else { 364574eeabc5SPierre Jolivet mbs = a->mbs; 364674eeabc5SPierre Jolivet ii = a->i; 364774eeabc5SPierre Jolivet z 
= c; 364874eeabc5SPierre Jolivet } 364974eeabc5SPierre Jolivet 365074eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 36519371c9d4SSatish Balay n = ii[1] - ii[0]; 36529371c9d4SSatish Balay ii++; 365374eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 365474eeabc5SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 365574eeabc5SPierre Jolivet if (usecprow) z = c + 5 * ridx[i]; 365674eeabc5SPierre Jolivet jj = idx; 365774eeabc5SPierre Jolivet vv = v; 365874eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 365974eeabc5SPierre Jolivet idx = jj; 366074eeabc5SPierre Jolivet v = vv; 36619371c9d4SSatish Balay sum1 = 0.0; 36629371c9d4SSatish Balay sum2 = 0.0; 36639371c9d4SSatish Balay sum3 = 0.0; 36649371c9d4SSatish Balay sum4 = 0.0; 36659371c9d4SSatish Balay sum5 = 0.0; 366674eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 36679371c9d4SSatish Balay xb = b + 5 * (*idx++); 36689371c9d4SSatish Balay x1 = xb[0 + k * bm]; 36699371c9d4SSatish Balay x2 = xb[1 + k * bm]; 36709371c9d4SSatish Balay x3 = xb[2 + k * bm]; 36719371c9d4SSatish Balay x4 = xb[3 + k * bm]; 36729371c9d4SSatish Balay x5 = xb[4 + k * bm]; 367374eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 367474eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 367574eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 367674eeabc5SPierre Jolivet sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 367774eeabc5SPierre Jolivet sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 367874eeabc5SPierre Jolivet v += 25; 367974eeabc5SPierre Jolivet } 36809371c9d4SSatish Balay z[0 + k * cm] = sum1; 36819371c9d4SSatish Balay z[1 + k * cm] = sum2; 36829371c9d4SSatish Balay z[2 + k * cm] = sum3; 36839371c9d4SSatish Balay z[3 
+ k * cm] = sum4; 36849371c9d4SSatish Balay z[4 + k * cm] = sum5; 368574eeabc5SPierre Jolivet } 368674eeabc5SPierre Jolivet if (!usecprow) z += 5; 368774eeabc5SPierre Jolivet } 368874eeabc5SPierre Jolivet PetscFunctionReturn(0); 368974eeabc5SPierre Jolivet } 369074eeabc5SPierre Jolivet
/*
  MatMatMultNumeric_SeqBAIJ_SeqDense - Numeric phase of C = A * B, where A is a SeqBAIJ matrix
  and B and C are SeqDense matrices.

  Input Parameters:
+ A - the SeqBAIJ matrix (block size taken from A->rmap->bs)
- B - the SeqDense right-hand matrix; its values are read directly from its Mat_SeqDense data

  Output Parameter:
. C - the SeqDense product; its array is obtained with MatDenseGetArray() and written in place

  Notes:
  Returns immediately, doing nothing, when the leading dimension of C or the number of columns
  of B is zero.

  C is zeroed up front whenever A's nonzerorowcnt differs from A->rmap->n. In the BLAS path
  below a block row that owns no blocks issues no GEMM, so its rows of C are not written there.

  Block sizes 1 through 5 are dispatched to the MatMatMult_SeqBAIJ_<bs>_Private() kernels; every
  other block size multiplies one bs x bs block at a time with BLASgemm.
*/
PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
{
  Mat_SeqBAIJ    *baij       = (Mat_SeqBAIJ *)A->data;
  Mat_SeqDense   *bdense     = (Mat_SeqDense *)B->data;
  Mat_SeqDense   *cdense     = (Mat_SeqDense *)C->data;
  const PetscInt  bs         = A->rmap->bs;
  const PetscInt  bs2        = baij->bs2;
  const PetscInt  ldb        = bdense->lda;
  const PetscInt  ldc        = cdense->lda;
  const PetscInt  ncols      = B->cmap->n;
  const PetscBool compressed = baij->compressedrow.use;
  PetscScalar    *barray, *carray;

  PetscFunctionBegin;
  /* Nothing to compute for an empty product */
  if (!ldc || !ncols) PetscFunctionReturn(0);

  PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
  PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
  PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);

  barray = bdense->v;
  if (baij->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
  PetscCall(MatDenseGetArray(C, &carray));

  switch (bs) {
  case 1:
    PetscCall(MatMatMult_SeqBAIJ_1_Private(A, barray, ldb, carray, ldc, ncols));
    break;
  case 2:
    PetscCall(MatMatMult_SeqBAIJ_2_Private(A, barray, ldb, carray, ldc, ncols));
    break;
  case 3:
    PetscCall(MatMatMult_SeqBAIJ_3_Private(A, barray, ldb, carray, ldc, ncols));
    break;
  case 4:
    PetscCall(MatMatMult_SeqBAIJ_4_Private(A, barray, ldb, carray, ldc, ncols));
    break;
  case 5:
    PetscCall(MatMatMult_SeqBAIJ_5_Private(A, barray, ldb, carray, ldc, ncols));
    break;
  default: {
    /* No hand-written kernel for this block size: one GEMM per stored block */
    PetscScalar      zero = 0.0, one = 1.0;
    PetscBLASInt     blas_bs, blas_ncols, blas_ldb, blas_ldc;
    const MatScalar *blockvals;
    const PetscInt  *blockcol, *rowptr, *rowmap = NULL;
    PetscInt         nbrows, brow, k, nblocks;

    PetscCall(PetscBLASIntCast(bs, &blas_bs));
    PetscCall(PetscBLASIntCast(ncols, &blas_ncols));
    PetscCall(PetscBLASIntCast(ldb, &blas_ldb));
    PetscCall(PetscBLASIntCast(ldc, &blas_ldc));

    blockcol  = baij->j;
    blockvals = baij->a;
    if (compressed) {
      nbrows = baij->compressedrow.nrows;
      rowptr = baij->compressedrow.i;
      rowmap = baij->compressedrow.rindex;
    } else {
      nbrows = baij->mbs;
      rowptr = baij->i;
    }

    for (brow = 0; brow < nbrows; brow++) {
      /* Compressed storage maps the loop index to the real block row; otherwise rows are consecutive */
      PetscScalar *crow = carray + bs * (compressed ? rowmap[brow] : brow);

      nblocks = rowptr[brow + 1] - rowptr[brow];
      for (k = 0; k < nblocks; k++) {
        /* beta = 0 for the first block of the row (overwrite), beta = 1 afterwards (accumulate) */
        PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &blas_bs, &blas_ncols, &blas_bs, &one, blockvals, &blas_bs, barray + bs * blockcol[0], &blas_ldb, k ? &one : &zero, crow, &blas_ldc));
        blockcol++;
        blockvals += bs2;
      }
    }
  } break;
  }

  PetscCall(MatDenseRestoreArray(C, &carray));
  PetscCall(PetscLogFlops((2.0 * baij->nz * bs2 - bs * baij->nonzerorowcnt) * ncols));
  PetscFunctionReturn(0);
}