1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <petscblaslapack.h> 6cac129eeSSatish Balay 75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 896e086a2SDaniel Kokron #include <immintrin.h> 996e086a2SDaniel Kokron #endif 1096e086a2SDaniel Kokron 11d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 12d71ae5a4SJacob Faibussowitsch { 13a3192f15SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 145d0c19d7SBarry Smith PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 155d0c19d7SBarry Smith const PetscInt *idx; 167bede89fSBarry Smith PetscInt start, end, *ai, *aj, bs; 17f1af5d2fSBarry Smith PetscBT table; 18a3192f15SSatish Balay 193a40ed3dSBarry Smith PetscFunctionBegin; 20a3192f15SSatish Balay m = a->mbs; 21a3192f15SSatish Balay ai = a->i; 22a3192f15SSatish Balay aj = a->j; 23d0f46423SBarry Smith bs = A->rmap->bs; 24a3192f15SSatish Balay 2508401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 26a3192f15SSatish Balay 279566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(m, &table)); 289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &nidx)); 29a3192f15SSatish Balay 30a3192f15SSatish Balay for (i = 0; i < is_max; i++) { 31a3192f15SSatish Balay /* Initialise the two local arrays */ 32a3192f15SSatish Balay isz = 0; 339566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(m, table)); 34a3192f15SSatish Balay 35a3192f15SSatish Balay /* Extract the indices, assume there can be duplicate entries */ 369566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], 
&idx)); 379566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 38a3192f15SSatish Balay 39a3192f15SSatish Balay /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40a3192f15SSatish Balay for (j = 0; j < n; ++j) { 41218c64b6SSatish Balay ival = idx[j] / bs; /* convert the indices into block indices */ 4208401ef6SPierre Jolivet PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 4326fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 44a3192f15SSatish Balay } 459566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 469566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 47a3192f15SSatish Balay 48a3192f15SSatish Balay k = 0; 49a3192f15SSatish Balay for (j = 0; j < ov; j++) { /* for each overlap*/ 50a3192f15SSatish Balay n = isz; 51a3192f15SSatish Balay for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52a3192f15SSatish Balay row = nidx[k]; 53a3192f15SSatish Balay start = ai[row]; 54a3192f15SSatish Balay end = ai[row + 1]; 55a3192f15SSatish Balay for (l = start; l < end; l++) { 56a3192f15SSatish Balay val = aj[l]; 5726fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 58a3192f15SSatish Balay } 59a3192f15SSatish Balay } 60a3192f15SSatish Balay } 617bede89fSBarry Smith PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 62a3192f15SSatish Balay } 639566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table)); 649566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 653ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 66a3192f15SSatish Balay } 671c351548SSatish Balay 68d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 69d71ae5a4SJacob Faibussowitsch { 70736121d4SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 71690b6cddSBarry Smith PetscInt *smap, 
i, k, kstart, kend, oldcols = a->nbs, *lens; 72690b6cddSBarry Smith PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 735d0c19d7SBarry Smith const PetscInt *irow, *icol; 745d0c19d7SBarry Smith PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 75690b6cddSBarry Smith PetscInt *aj = a->j, *ai = a->i; 763f1db9ecSBarry Smith MatScalar *mat_a; 77736121d4SSatish Balay Mat C; 786041f1b1SToby Isaac PetscBool flag; 79736121d4SSatish Balay 803a40ed3dSBarry Smith PetscFunctionBegin; 819566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 829566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 839566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 849566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 85736121d4SSatish Balay 869566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 87736121d4SSatish Balay ssmap = smap; 889566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 89736121d4SSatish Balay for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 90736121d4SSatish Balay /* determine lens of each row */ 91736121d4SSatish Balay for (i = 0; i < nrows; i++) { 92736121d4SSatish Balay kstart = ai[irow[i]]; 93736121d4SSatish Balay kend = kstart + a->ilen[irow[i]]; 94736121d4SSatish Balay lens[i] = 0; 95736121d4SSatish Balay for (k = kstart; k < kend; k++) { 9626fbe8dcSKarl Rupp if (ssmap[aj[k]]) lens[i]++; 97736121d4SSatish Balay } 98736121d4SSatish Balay } 99736121d4SSatish Balay /* Create and fill new matrix */ 100736121d4SSatish Balay if (scall == MAT_REUSE_MATRIX) { 101736121d4SSatish Balay c = (Mat_SeqBAIJ *)((*B)->data); 102736121d4SSatish Balay 103aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1049566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 10528b400f6SJacob Faibussowitsch PetscCheck(flag, PETSC_COMM_SELF, 
PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 1069566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 107736121d4SSatish Balay C = *B; 1083a40ed3dSBarry Smith } else { 1099566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1109566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 1119566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1129566063dSJacob Faibussowitsch PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 113736121d4SSatish Balay } 114736121d4SSatish Balay c = (Mat_SeqBAIJ *)(C->data); 115736121d4SSatish Balay for (i = 0; i < nrows; i++) { 116736121d4SSatish Balay row = irow[i]; 117736121d4SSatish Balay kstart = ai[row]; 118736121d4SSatish Balay kend = kstart + a->ilen[row]; 119736121d4SSatish Balay mat_i = c->i[i]; 120d29f2997SMatthew Woehlke mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 121d29f2997SMatthew Woehlke mat_a = c->a ? 
c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 122736121d4SSatish Balay mat_ilen = c->ilen + i; 123736121d4SSatish Balay for (k = kstart; k < kend; k++) { 124736121d4SSatish Balay if ((tcol = ssmap[a->j[k]])) { 125736121d4SSatish Balay *mat_j++ = tcol - 1; 1269566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 127549d3d68SSatish Balay mat_a += bs2; 128736121d4SSatish Balay (*mat_ilen)++; 129736121d4SSatish Balay } 130736121d4SSatish Balay } 131736121d4SSatish Balay } 132cdc6f3adSToby Isaac /* sort */ 133d29f2997SMatthew Woehlke if (c->j && c->a) { 134cdc6f3adSToby Isaac MatScalar *work; 1359566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 136cdc6f3adSToby Isaac for (i = 0; i < nrows; i++) { 137cdc6f3adSToby Isaac PetscInt ilen; 138cdc6f3adSToby Isaac mat_i = c->i[i]; 139cdc6f3adSToby Isaac mat_j = c->j + mat_i; 140cdc6f3adSToby Isaac mat_a = c->a + mat_i * bs2; 141cdc6f3adSToby Isaac ilen = c->ilen[i]; 1429566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 143cdc6f3adSToby Isaac } 1449566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 145cdc6f3adSToby Isaac } 146218c64b6SSatish Balay 147736121d4SSatish Balay /* Free work space */ 1489566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1499566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1509566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1519566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1529566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 153736121d4SSatish Balay 1549566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 155736121d4SSatish Balay *B = C; 1563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 157736121d4SSatish Balay } 158736121d4SSatish Balay 159d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS 
isrow, IS iscol, MatReuse scall, Mat *B) 160d71ae5a4SJacob Faibussowitsch { 161218c64b6SSatish Balay IS is1, is2; 162218c64b6SSatish Balay 1633a40ed3dSBarry Smith PetscFunctionBegin; 164*f9a48b90SPierre Jolivet PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 165*f9a48b90SPierre Jolivet if (isrow == iscol) { 166*f9a48b90SPierre Jolivet is2 = is1; 167*f9a48b90SPierre Jolivet PetscCall(PetscObjectReference((PetscObject)is2)); 168*f9a48b90SPierre Jolivet } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 1699566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 1709566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 1719566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 1723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 173218c64b6SSatish Balay } 174218c64b6SSatish Balay 175d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 176d71ae5a4SJacob Faibussowitsch { 17716b64355SHong Zhang Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 1785c39f6d9SHong Zhang Mat_SubSppt *submatj = c->submatis1; 17916b64355SHong Zhang 18016b64355SHong Zhang PetscFunctionBegin; 1819566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 1829566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 1833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18416b64355SHong Zhang } 18516b64355SHong Zhang 18689a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 187d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) 188d71ae5a4SJacob Faibussowitsch { 18986e85357SHong Zhang PetscInt i; 19086e85357SHong Zhang Mat C; 19186e85357SHong Zhang Mat_SeqBAIJ *c; 19286e85357SHong Zhang Mat_SubSppt *submatj; 19386e85357SHong Zhang 19486e85357SHong Zhang PetscFunctionBegin; 19586e85357SHong Zhang for (i = 0; 
i < n; i++) { 19686e85357SHong Zhang C = (*mat)[i]; 19786e85357SHong Zhang c = (Mat_SeqBAIJ *)C->data; 19886e85357SHong Zhang submatj = c->submatis1; 19986e85357SHong Zhang if (submatj) { 2007daefbafSJunchao Zhang if (--((PetscObject)C)->refct <= 0) { 20126cc229bSBarry Smith PetscCall(PetscFree(C->factorprefix)); 2029566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 2039566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 2049566063dSJacob Faibussowitsch PetscCall(PetscFree(C->defaultvectype)); 2053faff063SStefano Zampini PetscCall(PetscFree(C->defaultrandtype)); 2069566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->rmap)); 2079566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->cmap)); 2089566063dSJacob Faibussowitsch PetscCall(PetscHeaderDestroy(&C)); 2097daefbafSJunchao Zhang } 21086e85357SHong Zhang } else { 2119566063dSJacob Faibussowitsch PetscCall(MatDestroy(&C)); 21286e85357SHong Zhang } 21386e85357SHong Zhang } 2147daefbafSJunchao Zhang 2157daefbafSJunchao Zhang /* Destroy Dummy submatrices created for reuse */ 2169566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 2177daefbafSJunchao Zhang 2189566063dSJacob Faibussowitsch PetscCall(PetscFree(*mat)); 2193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 22086e85357SHong Zhang } 22186e85357SHong Zhang 222d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 223d71ae5a4SJacob Faibussowitsch { 224690b6cddSBarry Smith PetscInt i; 225736121d4SSatish Balay 2263a40ed3dSBarry Smith PetscFunctionBegin; 22748a46eb9SPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 228736121d4SSatish Balay 22948a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 2303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
231736121d4SSatish Balay } 232218c64b6SSatish Balay 2332d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */ 234d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) 235d71ae5a4SJacob Faibussowitsch { 2362d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 237d9fead3dSBarry Smith PetscScalar *z, sum; 238d9fead3dSBarry Smith const PetscScalar *x; 239d9fead3dSBarry Smith const MatScalar *v; 2407c565772SBarry Smith PetscInt mbs, i, n; 2410298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 242ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2432d61bbb3SSatish Balay 2442d61bbb3SSatish Balay PetscFunctionBegin; 2459566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2469566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &z)); 2472d61bbb3SSatish Balay 24826e093fcSHong Zhang if (usecprow) { 24926e093fcSHong Zhang mbs = a->compressedrow.nrows; 25026e093fcSHong Zhang ii = a->compressedrow.i; 2517b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 2529566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(z, a->mbs)); 25326e093fcSHong Zhang } else { 25426e093fcSHong Zhang mbs = a->mbs; 2552d61bbb3SSatish Balay ii = a->i; 25626e093fcSHong Zhang } 2572d61bbb3SSatish Balay 2582d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 259ee54c7eeSHong Zhang n = ii[1] - ii[0]; 260ee54c7eeSHong Zhang v = a->a + ii[0]; 261ee54c7eeSHong Zhang idx = a->j + ii[0]; 262ee54c7eeSHong Zhang ii++; 263444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 264444d8c10SJed Brown PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2652d61bbb3SSatish Balay sum = 0.0; 2662162cab8SBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 26726e093fcSHong Zhang if (usecprow) { 2687b2bb3b9SHong Zhang z[ridx[i]] = sum; 26926e093fcSHong Zhang } else { 2702d61bbb3SSatish 
Balay z[i] = sum; 2712d61bbb3SSatish Balay } 27226e093fcSHong Zhang } 2739566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 2749566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &z)); 2759566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 2763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2772d61bbb3SSatish Balay } 2782d61bbb3SSatish Balay 279d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) 280d71ae5a4SJacob Faibussowitsch { 2812d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 282f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, *zarray; 283d9fead3dSBarry Smith const PetscScalar *x, *xb; 28487828ca2SBarry Smith PetscScalar x1, x2; 285d9fead3dSBarry Smith const MatScalar *v; 2867c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 287ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2882d61bbb3SSatish Balay 2892d61bbb3SSatish Balay PetscFunctionBegin; 2909566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2919566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 2922d61bbb3SSatish Balay 2932d61bbb3SSatish Balay idx = a->j; 2942d61bbb3SSatish Balay v = a->a; 29526e093fcSHong Zhang if (usecprow) { 29626e093fcSHong Zhang mbs = a->compressedrow.nrows; 29726e093fcSHong Zhang ii = a->compressedrow.i; 2987b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 2999566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 30026e093fcSHong Zhang } else { 30126e093fcSHong Zhang mbs = a->mbs; 3022d61bbb3SSatish Balay ii = a->i; 30326e093fcSHong Zhang z = zarray; 30426e093fcSHong Zhang } 3052d61bbb3SSatish Balay 3062d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 3079371c9d4SSatish Balay n = ii[1] - ii[0]; 3089371c9d4SSatish Balay ii++; 3099371c9d4SSatish Balay sum1 = 0.0; 3109371c9d4SSatish Balay sum2 = 0.0; 311444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 
0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 312444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3132d61bbb3SSatish Balay for (j = 0; j < n; j++) { 3149371c9d4SSatish Balay xb = x + 2 * (*idx++); 3159371c9d4SSatish Balay x1 = xb[0]; 3169371c9d4SSatish Balay x2 = xb[1]; 3172d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 3182d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 3192d61bbb3SSatish Balay v += 4; 3202d61bbb3SSatish Balay } 3217b2bb3b9SHong Zhang if (usecprow) z = zarray + 2 * ridx[i]; 3229371c9d4SSatish Balay z[0] = sum1; 3239371c9d4SSatish Balay z[1] = sum2; 32426e093fcSHong Zhang if (!usecprow) z += 2; 3252d61bbb3SSatish Balay } 3269566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3279566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 3289566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 3293ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3302d61bbb3SSatish Balay } 3312d61bbb3SSatish Balay 332d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) 333d71ae5a4SJacob Faibussowitsch { 3342d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 335f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 336d9fead3dSBarry Smith const PetscScalar *x, *xb; 337d9fead3dSBarry Smith const MatScalar *v; 3387c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 339ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 34026e093fcSHong Zhang 341b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 342fee21e36SBarry Smith #pragma disjoint(*v, *z, *xb) 343fee21e36SBarry Smith #endif 344fee21e36SBarry Smith 3452d61bbb3SSatish Balay PetscFunctionBegin; 3469566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3479566063dSJacob Faibussowitsch 
PetscCall(VecGetArrayWrite(zz, &zarray)); 3482d61bbb3SSatish Balay 3492d61bbb3SSatish Balay idx = a->j; 3502d61bbb3SSatish Balay v = a->a; 35126e093fcSHong Zhang if (usecprow) { 35226e093fcSHong Zhang mbs = a->compressedrow.nrows; 35326e093fcSHong Zhang ii = a->compressedrow.i; 3547b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3559566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 35626e093fcSHong Zhang } else { 35726e093fcSHong Zhang mbs = a->mbs; 3582d61bbb3SSatish Balay ii = a->i; 35926e093fcSHong Zhang z = zarray; 36026e093fcSHong Zhang } 3612d61bbb3SSatish Balay 3622d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 3639371c9d4SSatish Balay n = ii[1] - ii[0]; 3649371c9d4SSatish Balay ii++; 3659371c9d4SSatish Balay sum1 = 0.0; 3669371c9d4SSatish Balay sum2 = 0.0; 3679371c9d4SSatish Balay sum3 = 0.0; 368444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 369444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3702d61bbb3SSatish Balay for (j = 0; j < n; j++) { 37126fbe8dcSKarl Rupp xb = x + 3 * (*idx++); 37226fbe8dcSKarl Rupp x1 = xb[0]; 37326fbe8dcSKarl Rupp x2 = xb[1]; 37426fbe8dcSKarl Rupp x3 = xb[2]; 37526fbe8dcSKarl Rupp 3762d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 3772d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 3782d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 3792d61bbb3SSatish Balay v += 9; 3802d61bbb3SSatish Balay } 3817b2bb3b9SHong Zhang if (usecprow) z = zarray + 3 * ridx[i]; 3829371c9d4SSatish Balay z[0] = sum1; 3839371c9d4SSatish Balay z[1] = sum2; 3849371c9d4SSatish Balay z[2] = sum3; 38526e093fcSHong Zhang if (!usecprow) z += 3; 3862d61bbb3SSatish Balay } 3879566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3889566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 
3899566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 3903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3912d61bbb3SSatish Balay } 3922d61bbb3SSatish Balay 393d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) 394d71ae5a4SJacob Faibussowitsch { 3952d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 396f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 397d9fead3dSBarry Smith const PetscScalar *x, *xb; 398d9fead3dSBarry Smith const MatScalar *v; 3997c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 400ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4012d61bbb3SSatish Balay 4022d61bbb3SSatish Balay PetscFunctionBegin; 4039566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4049566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 4052d61bbb3SSatish Balay 4062d61bbb3SSatish Balay idx = a->j; 4072d61bbb3SSatish Balay v = a->a; 40826e093fcSHong Zhang if (usecprow) { 40926e093fcSHong Zhang mbs = a->compressedrow.nrows; 41026e093fcSHong Zhang ii = a->compressedrow.i; 4117b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 4129566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 41326e093fcSHong Zhang } else { 41426e093fcSHong Zhang mbs = a->mbs; 4152d61bbb3SSatish Balay ii = a->i; 41626e093fcSHong Zhang z = zarray; 41726e093fcSHong Zhang } 4182d61bbb3SSatish Balay 4192d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 42026fbe8dcSKarl Rupp n = ii[1] - ii[0]; 42126fbe8dcSKarl Rupp ii++; 42226fbe8dcSKarl Rupp sum1 = 0.0; 42326fbe8dcSKarl Rupp sum2 = 0.0; 42426fbe8dcSKarl Rupp sum3 = 0.0; 42526fbe8dcSKarl Rupp sum4 = 0.0; 42626fbe8dcSKarl Rupp 427444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 428444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 
* n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4292d61bbb3SSatish Balay for (j = 0; j < n; j++) { 4302d61bbb3SSatish Balay xb = x + 4 * (*idx++); 4319371c9d4SSatish Balay x1 = xb[0]; 4329371c9d4SSatish Balay x2 = xb[1]; 4339371c9d4SSatish Balay x3 = xb[2]; 4349371c9d4SSatish Balay x4 = xb[3]; 4352d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 4362d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 4372d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 4382d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 4392d61bbb3SSatish Balay v += 16; 4402d61bbb3SSatish Balay } 4417b2bb3b9SHong Zhang if (usecprow) z = zarray + 4 * ridx[i]; 4429371c9d4SSatish Balay z[0] = sum1; 4439371c9d4SSatish Balay z[1] = sum2; 4449371c9d4SSatish Balay z[2] = sum3; 4459371c9d4SSatish Balay z[3] = sum4; 44626e093fcSHong Zhang if (!usecprow) z += 4; 4472d61bbb3SSatish Balay } 4489566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4499566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 4509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 4513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4522d61bbb3SSatish Balay } 4532d61bbb3SSatish Balay 454d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) 455d71ae5a4SJacob Faibussowitsch { 4562d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 457f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 458d9fead3dSBarry Smith const PetscScalar *xb, *x; 459d9fead3dSBarry Smith const MatScalar *v; 4600298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 4617c565772SBarry Smith PetscInt mbs, i, j, n; 462ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4632d61bbb3SSatish Balay 464433994e6SBarry Smith PetscFunctionBegin; 
4659566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4669566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 4672d61bbb3SSatish Balay 4682d61bbb3SSatish Balay idx = a->j; 4692d61bbb3SSatish Balay v = a->a; 47026e093fcSHong Zhang if (usecprow) { 47126e093fcSHong Zhang mbs = a->compressedrow.nrows; 47226e093fcSHong Zhang ii = a->compressedrow.i; 4737b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 4749566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 47526e093fcSHong Zhang } else { 47626e093fcSHong Zhang mbs = a->mbs; 4772d61bbb3SSatish Balay ii = a->i; 47826e093fcSHong Zhang z = zarray; 47926e093fcSHong Zhang } 4802d61bbb3SSatish Balay 4812d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 4829371c9d4SSatish Balay n = ii[1] - ii[0]; 4839371c9d4SSatish Balay ii++; 4849371c9d4SSatish Balay sum1 = 0.0; 4859371c9d4SSatish Balay sum2 = 0.0; 4869371c9d4SSatish Balay sum3 = 0.0; 4879371c9d4SSatish Balay sum4 = 0.0; 4889371c9d4SSatish Balay sum5 = 0.0; 489444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 490444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4912d61bbb3SSatish Balay for (j = 0; j < n; j++) { 4922d61bbb3SSatish Balay xb = x + 5 * (*idx++); 4939371c9d4SSatish Balay x1 = xb[0]; 4949371c9d4SSatish Balay x2 = xb[1]; 4959371c9d4SSatish Balay x3 = xb[2]; 4969371c9d4SSatish Balay x4 = xb[3]; 4979371c9d4SSatish Balay x5 = xb[4]; 4982d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 4992d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 5002d61bbb3SSatish Balay sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 5012d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 5022d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] 
* x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 5032d61bbb3SSatish Balay v += 25; 5042d61bbb3SSatish Balay } 5057b2bb3b9SHong Zhang if (usecprow) z = zarray + 5 * ridx[i]; 5069371c9d4SSatish Balay z[0] = sum1; 5079371c9d4SSatish Balay z[1] = sum2; 5089371c9d4SSatish Balay z[2] = sum3; 5099371c9d4SSatish Balay z[3] = sum4; 5109371c9d4SSatish Balay z[4] = sum5; 51126e093fcSHong Zhang if (!usecprow) z += 5; 5122d61bbb3SSatish Balay } 5139566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5149566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 5159566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 5163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5172d61bbb3SSatish Balay } 5182d61bbb3SSatish Balay 519d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) 520d71ae5a4SJacob Faibussowitsch { 52115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 522f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 523d9fead3dSBarry Smith const PetscScalar *x, *xb; 52426e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 525d9fead3dSBarry Smith const MatScalar *v; 5267c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 527ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 52815091d37SBarry Smith 529433994e6SBarry Smith PetscFunctionBegin; 5309566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5319566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 53215091d37SBarry Smith 53315091d37SBarry Smith idx = a->j; 53415091d37SBarry Smith v = a->a; 53526e093fcSHong Zhang if (usecprow) { 53626e093fcSHong Zhang mbs = a->compressedrow.nrows; 53726e093fcSHong Zhang ii = a->compressedrow.i; 5387b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 5399566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 54026e093fcSHong Zhang } else { 
54126e093fcSHong Zhang mbs = a->mbs; 54215091d37SBarry Smith ii = a->i; 54326e093fcSHong Zhang z = zarray; 54426e093fcSHong Zhang } 54515091d37SBarry Smith 54615091d37SBarry Smith for (i = 0; i < mbs; i++) { 54726fbe8dcSKarl Rupp n = ii[1] - ii[0]; 54826fbe8dcSKarl Rupp ii++; 54926fbe8dcSKarl Rupp sum1 = 0.0; 55026fbe8dcSKarl Rupp sum2 = 0.0; 55126fbe8dcSKarl Rupp sum3 = 0.0; 55226fbe8dcSKarl Rupp sum4 = 0.0; 55326fbe8dcSKarl Rupp sum5 = 0.0; 55426fbe8dcSKarl Rupp sum6 = 0.0; 55526fbe8dcSKarl Rupp 556444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 557444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 55815091d37SBarry Smith for (j = 0; j < n; j++) { 55915091d37SBarry Smith xb = x + 6 * (*idx++); 5609371c9d4SSatish Balay x1 = xb[0]; 5619371c9d4SSatish Balay x2 = xb[1]; 5629371c9d4SSatish Balay x3 = xb[2]; 5639371c9d4SSatish Balay x4 = xb[3]; 5649371c9d4SSatish Balay x5 = xb[4]; 5659371c9d4SSatish Balay x6 = xb[5]; 56615091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 56715091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 56815091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 56915091d37SBarry Smith sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 57015091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 57115091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 57215091d37SBarry Smith v += 36; 57315091d37SBarry Smith } 5747b2bb3b9SHong Zhang if (usecprow) z = zarray + 6 * ridx[i]; 5759371c9d4SSatish Balay z[0] = sum1; 5769371c9d4SSatish Balay z[1] = sum2; 5779371c9d4SSatish Balay z[2] = sum3; 5789371c9d4SSatish Balay z[3] = 
sum4; 5799371c9d4SSatish Balay z[4] = sum5; 5809371c9d4SSatish Balay z[5] = sum6; 58126e093fcSHong Zhang if (!usecprow) z += 6; 58215091d37SBarry Smith } 58315091d37SBarry Smith 5849566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 5869566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 5873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 58815091d37SBarry Smith } 5898ab949d8SShri Abhyankar 590d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) 591d71ae5a4SJacob Faibussowitsch { 5922d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 593f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 594d9fead3dSBarry Smith const PetscScalar *x, *xb; 59526e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 596d9fead3dSBarry Smith const MatScalar *v; 5977c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 598ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 5992d61bbb3SSatish Balay 600433994e6SBarry Smith PetscFunctionBegin; 6019566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6029566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 6032d61bbb3SSatish Balay 6042d61bbb3SSatish Balay idx = a->j; 6052d61bbb3SSatish Balay v = a->a; 60626e093fcSHong Zhang if (usecprow) { 60726e093fcSHong Zhang mbs = a->compressedrow.nrows; 60826e093fcSHong Zhang ii = a->compressedrow.i; 6097b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 6109566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 61126e093fcSHong Zhang } else { 61226e093fcSHong Zhang mbs = a->mbs; 6132d61bbb3SSatish Balay ii = a->i; 61426e093fcSHong Zhang z = zarray; 61526e093fcSHong Zhang } 6162d61bbb3SSatish Balay 6172d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 61826fbe8dcSKarl Rupp n = ii[1] 
- ii[0]; 61926fbe8dcSKarl Rupp ii++; 62026fbe8dcSKarl Rupp sum1 = 0.0; 62126fbe8dcSKarl Rupp sum2 = 0.0; 62226fbe8dcSKarl Rupp sum3 = 0.0; 62326fbe8dcSKarl Rupp sum4 = 0.0; 62426fbe8dcSKarl Rupp sum5 = 0.0; 62526fbe8dcSKarl Rupp sum6 = 0.0; 62626fbe8dcSKarl Rupp sum7 = 0.0; 62726fbe8dcSKarl Rupp 628444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 629444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 6302d61bbb3SSatish Balay for (j = 0; j < n; j++) { 6312d61bbb3SSatish Balay xb = x + 7 * (*idx++); 6329371c9d4SSatish Balay x1 = xb[0]; 6339371c9d4SSatish Balay x2 = xb[1]; 6349371c9d4SSatish Balay x3 = xb[2]; 6359371c9d4SSatish Balay x4 = xb[3]; 6369371c9d4SSatish Balay x5 = xb[4]; 6379371c9d4SSatish Balay x6 = xb[5]; 6389371c9d4SSatish Balay x7 = xb[6]; 6392d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 6402d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 6412d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 6422d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 6432d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 6442d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 6452d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 6462d61bbb3SSatish Balay v += 49; 6472d61bbb3SSatish Balay } 6487b2bb3b9SHong Zhang if (usecprow) z = zarray + 7 * ridx[i]; 6499371c9d4SSatish Balay z[0] = sum1; 6509371c9d4SSatish Balay z[1] = sum2; 6519371c9d4SSatish Balay 
z[2] = sum3; 6529371c9d4SSatish Balay z[3] = sum4; 6539371c9d4SSatish Balay z[4] = sum5; 6549371c9d4SSatish Balay z[5] = sum6; 6559371c9d4SSatish Balay z[6] = sum7; 65626e093fcSHong Zhang if (!usecprow) z += 7; 6572d61bbb3SSatish Balay } 6582d61bbb3SSatish Balay 6599566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6609566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 6619566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 6623ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6632d61bbb3SSatish Balay } 6642d61bbb3SSatish Balay 6655f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 666d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) 667d71ae5a4SJacob Faibussowitsch { 66896e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 669f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 67096e086a2SDaniel Kokron const PetscScalar *x, *xb; 67196e086a2SDaniel Kokron const MatScalar *v; 67296e086a2SDaniel Kokron PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 67396e086a2SDaniel Kokron const PetscInt *idx, *ii, *ridx = NULL; 674ce68d72fSJed Brown PetscInt k; 67596e086a2SDaniel Kokron PetscBool usecprow = a->compressedrow.use; 67696e086a2SDaniel Kokron 67796e086a2SDaniel Kokron __m256d a0, a1, a2, a3, a4, a5; 678ce68d72fSJed Brown __m256d w0, w1, w2, w3; 67996e086a2SDaniel Kokron __m256d z0, z1, z2; 68096e086a2SDaniel Kokron __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 68196e086a2SDaniel Kokron 68296e086a2SDaniel Kokron PetscFunctionBegin; 6839566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6849566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 68596e086a2SDaniel Kokron 68696e086a2SDaniel Kokron idx = a->j; 
68796e086a2SDaniel Kokron v = a->a; 68896e086a2SDaniel Kokron if (usecprow) { 68996e086a2SDaniel Kokron mbs = a->compressedrow.nrows; 69096e086a2SDaniel Kokron ii = a->compressedrow.i; 69196e086a2SDaniel Kokron ridx = a->compressedrow.rindex; 6929566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 69396e086a2SDaniel Kokron } else { 69496e086a2SDaniel Kokron mbs = a->mbs; 69596e086a2SDaniel Kokron ii = a->i; 69696e086a2SDaniel Kokron z = zarray; 69796e086a2SDaniel Kokron } 69896e086a2SDaniel Kokron 69996e086a2SDaniel Kokron if (!a->mult_work) { 70096e086a2SDaniel Kokron k = PetscMax(A->rmap->n, A->cmap->n); 7019566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 70296e086a2SDaniel Kokron } 70396e086a2SDaniel Kokron 70496e086a2SDaniel Kokron work = a->mult_work; 70596e086a2SDaniel Kokron for (i = 0; i < mbs; i++) { 7069371c9d4SSatish Balay n = ii[1] - ii[0]; 7079371c9d4SSatish Balay ii++; 70896e086a2SDaniel Kokron workt = work; 70996e086a2SDaniel Kokron for (j = 0; j < n; j++) { 71096e086a2SDaniel Kokron xb = x + bs * (*idx++); 71196e086a2SDaniel Kokron for (k = 0; k < bs; k++) workt[k] = xb[k]; 71296e086a2SDaniel Kokron workt += bs; 71396e086a2SDaniel Kokron } 71496e086a2SDaniel Kokron if (usecprow) z = zarray + bs * ridx[i]; 71596e086a2SDaniel Kokron 7169371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 7179371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 7189371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 71996e086a2SDaniel Kokron 72096e086a2SDaniel Kokron for (j = 0; j < n; j++) { 721c05b70c4SSatish Balay /* first column of a */ 72296e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9]); 7239371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81]); 7249371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 7259371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 7269371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 7279371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 7289371c9d4SSatish Balay z2 = 
_mm256_fmadd_pd(a2, w0, z2); 72996e086a2SDaniel Kokron 730c05b70c4SSatish Balay /* second column of a */ 73196e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 1]); 7329371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 7339371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 7349371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 7359371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 7369371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 7379371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 73896e086a2SDaniel Kokron 739c05b70c4SSatish Balay /* third column of a */ 74096e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 2]); 7419371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 7429371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w2, z0); 7439371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 7449371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w2, z1); 7459371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 7469371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w2, z2); 74796e086a2SDaniel Kokron 748c05b70c4SSatish Balay /* fourth column of a */ 74996e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 3]); 7509371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 7519371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w3, z0); 7529371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 7539371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w3, z1); 7549371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 7559371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w3, z2); 75696e086a2SDaniel Kokron 757c05b70c4SSatish Balay /* fifth column of a */ 75896e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 4]); 7599371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 7609371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w0, z0); 7619371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 7629371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w0, z1); 7639371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 
44]); 7649371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w0, z2); 76596e086a2SDaniel Kokron 766c05b70c4SSatish Balay /* sixth column of a */ 76796e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 5]); 7689371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 7699371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 7709371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 7719371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 7729371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 7739371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 77496e086a2SDaniel Kokron 775c05b70c4SSatish Balay /* seventh column of a */ 77696e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 6]); 7779371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 7789371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 7799371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 7809371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 7819371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 7829371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 78396e086a2SDaniel Kokron 7846aad120cSJose E. 
Roman /* eighth column of a */ 78596e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 7]); 7869371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 7879371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 7889371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 7899371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 7909371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 7919371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 79296e086a2SDaniel Kokron 793c05b70c4SSatish Balay /* ninth column of a */ 79496e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 8]); 7959371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 7969371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 7979371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 7989371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 7999371c9d4SSatish Balay a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 8009371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 80196e086a2SDaniel Kokron } 80296e086a2SDaniel Kokron 8039371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 8049371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 8059371c9d4SSatish Balay _mm256_maskstore_pd(&z[8], mask1, z2); 80696e086a2SDaniel Kokron 80796e086a2SDaniel Kokron v += n * bs2; 80896e086a2SDaniel Kokron if (!usecprow) z += bs; 80996e086a2SDaniel Kokron } 8109566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8119566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 8129566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 8133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 81496e086a2SDaniel Kokron } 81596e086a2SDaniel Kokron #endif 81696e086a2SDaniel Kokron 817d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) 818d71ae5a4SJacob Faibussowitsch { 819ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 820f4259b30SLisandro Dalcin PetscScalar *z = 
NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 821ebada01fSBarry Smith const PetscScalar *x, *xb; 822ebada01fSBarry Smith PetscScalar *zarray, xv; 823ebada01fSBarry Smith const MatScalar *v; 824ebada01fSBarry Smith const PetscInt *ii, *ij = a->j, *idx; 825ebada01fSBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 826ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 827ebada01fSBarry Smith 828ebada01fSBarry Smith PetscFunctionBegin; 8299566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8309566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 831ebada01fSBarry Smith 832ebada01fSBarry Smith v = a->a; 833ebada01fSBarry Smith if (usecprow) { 834ebada01fSBarry Smith mbs = a->compressedrow.nrows; 835ebada01fSBarry Smith ii = a->compressedrow.i; 836ebada01fSBarry Smith ridx = a->compressedrow.rindex; 8379566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 838ebada01fSBarry Smith } else { 839ebada01fSBarry Smith mbs = a->mbs; 840ebada01fSBarry Smith ii = a->i; 841ebada01fSBarry Smith z = zarray; 842ebada01fSBarry Smith } 843ebada01fSBarry Smith 844ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 845ebada01fSBarry Smith n = ii[i + 1] - ii[i]; 846ebada01fSBarry Smith idx = ij + ii[i]; 8479371c9d4SSatish Balay sum1 = 0.0; 8489371c9d4SSatish Balay sum2 = 0.0; 8499371c9d4SSatish Balay sum3 = 0.0; 8509371c9d4SSatish Balay sum4 = 0.0; 8519371c9d4SSatish Balay sum5 = 0.0; 8529371c9d4SSatish Balay sum6 = 0.0; 8539371c9d4SSatish Balay sum7 = 0.0; 8549371c9d4SSatish Balay sum8 = 0.0; 8559371c9d4SSatish Balay sum9 = 0.0; 8569371c9d4SSatish Balay sum10 = 0.0; 8579371c9d4SSatish Balay sum11 = 0.0; 858ebada01fSBarry Smith 859ebada01fSBarry Smith for (j = 0; j < n; j++) { 860ebada01fSBarry Smith xb = x + 11 * (idx[j]); 861ebada01fSBarry Smith 862ebada01fSBarry Smith for (k = 0; k < 11; k++) { 863ebada01fSBarry Smith xv = xb[k]; 864ebada01fSBarry Smith sum1 += v[0] * xv; 865ebada01fSBarry 
Smith sum2 += v[1] * xv; 866ebada01fSBarry Smith sum3 += v[2] * xv; 867ebada01fSBarry Smith sum4 += v[3] * xv; 868ebada01fSBarry Smith sum5 += v[4] * xv; 869ebada01fSBarry Smith sum6 += v[5] * xv; 870ebada01fSBarry Smith sum7 += v[6] * xv; 871ebada01fSBarry Smith sum8 += v[7] * xv; 872ebada01fSBarry Smith sum9 += v[8] * xv; 873ebada01fSBarry Smith sum10 += v[9] * xv; 874ebada01fSBarry Smith sum11 += v[10] * xv; 875ebada01fSBarry Smith v += 11; 876ebada01fSBarry Smith } 877ebada01fSBarry Smith } 878ebada01fSBarry Smith if (usecprow) z = zarray + 11 * ridx[i]; 8799371c9d4SSatish Balay z[0] = sum1; 8809371c9d4SSatish Balay z[1] = sum2; 8819371c9d4SSatish Balay z[2] = sum3; 8829371c9d4SSatish Balay z[3] = sum4; 8839371c9d4SSatish Balay z[4] = sum5; 8849371c9d4SSatish Balay z[5] = sum6; 8859371c9d4SSatish Balay z[6] = sum7; 8869371c9d4SSatish Balay z[7] = sum8; 8879371c9d4SSatish Balay z[8] = sum9; 8889371c9d4SSatish Balay z[9] = sum10; 8899371c9d4SSatish Balay z[10] = sum11; 890ebada01fSBarry Smith 891ebada01fSBarry Smith if (!usecprow) z += 11; 892ebada01fSBarry Smith } 893ebada01fSBarry Smith 8949566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8959566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 8969566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 8973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 898ebada01fSBarry Smith } 899ebada01fSBarry Smith 9006679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 901d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) 902d71ae5a4SJacob Faibussowitsch { 9036679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 9046679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 9056679dcc1SBarry Smith const PetscScalar *x, *xb; 9066679dcc1SBarry Smith PetscScalar *zarray, 
xv; 9076679dcc1SBarry Smith const MatScalar *v; 9086679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 9096679dcc1SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 9106679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 9116679dcc1SBarry Smith 9126679dcc1SBarry Smith PetscFunctionBegin; 9139566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9149566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 9156679dcc1SBarry Smith 9166679dcc1SBarry Smith v = a->a; 9176679dcc1SBarry Smith if (usecprow) { 9186679dcc1SBarry Smith mbs = a->compressedrow.nrows; 9196679dcc1SBarry Smith ii = a->compressedrow.i; 9206679dcc1SBarry Smith ridx = a->compressedrow.rindex; 9219566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 9226679dcc1SBarry Smith } else { 9236679dcc1SBarry Smith mbs = a->mbs; 9246679dcc1SBarry Smith ii = a->i; 9256679dcc1SBarry Smith z = zarray; 9266679dcc1SBarry Smith } 9276679dcc1SBarry Smith 9286679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 9296679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 9306679dcc1SBarry Smith idx = ij + ii[i]; 9319371c9d4SSatish Balay sum1 = 0.0; 9329371c9d4SSatish Balay sum2 = 0.0; 9339371c9d4SSatish Balay sum3 = 0.0; 9349371c9d4SSatish Balay sum4 = 0.0; 9359371c9d4SSatish Balay sum5 = 0.0; 9369371c9d4SSatish Balay sum6 = 0.0; 9379371c9d4SSatish Balay sum7 = 0.0; 9389371c9d4SSatish Balay sum8 = 0.0; 9399371c9d4SSatish Balay sum9 = 0.0; 9409371c9d4SSatish Balay sum10 = 0.0; 9419371c9d4SSatish Balay sum11 = 0.0; 9429371c9d4SSatish Balay sum12 = 0.0; 9436679dcc1SBarry Smith 9446679dcc1SBarry Smith for (j = 0; j < n; j++) { 9456679dcc1SBarry Smith xb = x + 12 * (idx[j]); 9466679dcc1SBarry Smith 9476679dcc1SBarry Smith for (k = 0; k < 12; k++) { 9486679dcc1SBarry Smith xv = xb[k]; 9496679dcc1SBarry Smith sum1 += v[0] * xv; 9506679dcc1SBarry Smith sum2 += v[1] * xv; 9516679dcc1SBarry Smith sum3 += v[2] * xv; 9526679dcc1SBarry Smith sum4 += v[3] * xv; 9536679dcc1SBarry Smith 
sum5 += v[4] * xv; 9546679dcc1SBarry Smith sum6 += v[5] * xv; 9556679dcc1SBarry Smith sum7 += v[6] * xv; 9566679dcc1SBarry Smith sum8 += v[7] * xv; 9576679dcc1SBarry Smith sum9 += v[8] * xv; 9586679dcc1SBarry Smith sum10 += v[9] * xv; 9596679dcc1SBarry Smith sum11 += v[10] * xv; 9606679dcc1SBarry Smith sum12 += v[11] * xv; 9616679dcc1SBarry Smith v += 12; 9626679dcc1SBarry Smith } 9636679dcc1SBarry Smith } 9646679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 9659371c9d4SSatish Balay z[0] = sum1; 9669371c9d4SSatish Balay z[1] = sum2; 9679371c9d4SSatish Balay z[2] = sum3; 9689371c9d4SSatish Balay z[3] = sum4; 9699371c9d4SSatish Balay z[4] = sum5; 9709371c9d4SSatish Balay z[5] = sum6; 9719371c9d4SSatish Balay z[6] = sum7; 9729371c9d4SSatish Balay z[7] = sum8; 9739371c9d4SSatish Balay z[8] = sum9; 9749371c9d4SSatish Balay z[9] = sum10; 9759371c9d4SSatish Balay z[10] = sum11; 9769371c9d4SSatish Balay z[11] = sum12; 9776679dcc1SBarry Smith if (!usecprow) z += 12; 9786679dcc1SBarry Smith } 9799566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9809566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 9819566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 9823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 9836679dcc1SBarry Smith } 9846679dcc1SBarry Smith 985d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) 986d71ae5a4SJacob Faibussowitsch { 9876679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 9886679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 9896679dcc1SBarry Smith const PetscScalar *x, *xb; 9906679dcc1SBarry Smith PetscScalar *zarray, *yarray, xv; 9916679dcc1SBarry Smith const MatScalar *v; 9926679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 9936679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, k, n, *ridx 
= NULL; 9946679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 9956679dcc1SBarry Smith 9966679dcc1SBarry Smith PetscFunctionBegin; 9979566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9989566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 9996679dcc1SBarry Smith 10006679dcc1SBarry Smith v = a->a; 10016679dcc1SBarry Smith if (usecprow) { 100248a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 10036679dcc1SBarry Smith mbs = a->compressedrow.nrows; 10046679dcc1SBarry Smith ii = a->compressedrow.i; 10056679dcc1SBarry Smith ridx = a->compressedrow.rindex; 10066679dcc1SBarry Smith } else { 10076679dcc1SBarry Smith ii = a->i; 10086679dcc1SBarry Smith y = yarray; 10096679dcc1SBarry Smith z = zarray; 10106679dcc1SBarry Smith } 10116679dcc1SBarry Smith 10126679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 10136679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 10146679dcc1SBarry Smith idx = ij + ii[i]; 10156679dcc1SBarry Smith 10166679dcc1SBarry Smith if (usecprow) { 10176679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 10186679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 10196679dcc1SBarry Smith } 10209371c9d4SSatish Balay sum1 = y[0]; 10219371c9d4SSatish Balay sum2 = y[1]; 10229371c9d4SSatish Balay sum3 = y[2]; 10239371c9d4SSatish Balay sum4 = y[3]; 10249371c9d4SSatish Balay sum5 = y[4]; 10259371c9d4SSatish Balay sum6 = y[5]; 10269371c9d4SSatish Balay sum7 = y[6]; 10279371c9d4SSatish Balay sum8 = y[7]; 10289371c9d4SSatish Balay sum9 = y[8]; 10299371c9d4SSatish Balay sum10 = y[9]; 10309371c9d4SSatish Balay sum11 = y[10]; 10319371c9d4SSatish Balay sum12 = y[11]; 10326679dcc1SBarry Smith 10336679dcc1SBarry Smith for (j = 0; j < n; j++) { 10346679dcc1SBarry Smith xb = x + 12 * (idx[j]); 10356679dcc1SBarry Smith 10366679dcc1SBarry Smith for (k = 0; k < 12; k++) { 10376679dcc1SBarry Smith xv = xb[k]; 10386679dcc1SBarry Smith sum1 += v[0] * xv; 10396679dcc1SBarry Smith sum2 += v[1] * xv; 
10406679dcc1SBarry Smith sum3 += v[2] * xv; 10416679dcc1SBarry Smith sum4 += v[3] * xv; 10426679dcc1SBarry Smith sum5 += v[4] * xv; 10436679dcc1SBarry Smith sum6 += v[5] * xv; 10446679dcc1SBarry Smith sum7 += v[6] * xv; 10456679dcc1SBarry Smith sum8 += v[7] * xv; 10466679dcc1SBarry Smith sum9 += v[8] * xv; 10476679dcc1SBarry Smith sum10 += v[9] * xv; 10486679dcc1SBarry Smith sum11 += v[10] * xv; 10496679dcc1SBarry Smith sum12 += v[11] * xv; 10506679dcc1SBarry Smith v += 12; 10516679dcc1SBarry Smith } 10526679dcc1SBarry Smith } 10536679dcc1SBarry Smith 10549371c9d4SSatish Balay z[0] = sum1; 10559371c9d4SSatish Balay z[1] = sum2; 10569371c9d4SSatish Balay z[2] = sum3; 10579371c9d4SSatish Balay z[3] = sum4; 10589371c9d4SSatish Balay z[4] = sum5; 10599371c9d4SSatish Balay z[5] = sum6; 10609371c9d4SSatish Balay z[6] = sum7; 10619371c9d4SSatish Balay z[7] = sum8; 10629371c9d4SSatish Balay z[8] = sum9; 10639371c9d4SSatish Balay z[9] = sum10; 10649371c9d4SSatish Balay z[10] = sum11; 10659371c9d4SSatish Balay z[11] = sum12; 10666679dcc1SBarry Smith if (!usecprow) { 10676679dcc1SBarry Smith y += 12; 10686679dcc1SBarry Smith z += 12; 10696679dcc1SBarry Smith } 10706679dcc1SBarry Smith } 10719566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10729566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 10739566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 10743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10756679dcc1SBarry Smith } 10766679dcc1SBarry Smith 10776679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1078d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) 1079d71ae5a4SJacob Faibussowitsch { 10806679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 10816679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, 
sum11, sum12; 10826679dcc1SBarry Smith const PetscScalar *x, *xb; 10836679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray; 10846679dcc1SBarry Smith const MatScalar *v; 10856679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 10866679dcc1SBarry Smith PetscInt mbs, i, j, n; 10876679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 10886679dcc1SBarry Smith 10896679dcc1SBarry Smith PetscFunctionBegin; 10909566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10919566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 10926679dcc1SBarry Smith 10936679dcc1SBarry Smith v = a->a; 10946679dcc1SBarry Smith if (usecprow) { 10956679dcc1SBarry Smith mbs = a->compressedrow.nrows; 10966679dcc1SBarry Smith ii = a->compressedrow.i; 10976679dcc1SBarry Smith ridx = a->compressedrow.rindex; 10989566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 10996679dcc1SBarry Smith } else { 11006679dcc1SBarry Smith mbs = a->mbs; 11016679dcc1SBarry Smith ii = a->i; 11026679dcc1SBarry Smith z = zarray; 11036679dcc1SBarry Smith } 11046679dcc1SBarry Smith 11056679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 11066679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 11076679dcc1SBarry Smith idx = ij + ii[i]; 11086679dcc1SBarry Smith 11096679dcc1SBarry Smith sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 11106679dcc1SBarry Smith for (j = 0; j < n; j++) { 11116679dcc1SBarry Smith xb = x + 12 * (idx[j]); 11129371c9d4SSatish Balay x1 = xb[0]; 11139371c9d4SSatish Balay x2 = xb[1]; 11149371c9d4SSatish Balay x3 = xb[2]; 11159371c9d4SSatish Balay x4 = xb[3]; 11166679dcc1SBarry Smith 11176679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11186679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11196679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11206679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] 
* x3 + v[39] * x4; 11216679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11226679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11236679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11246679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11256679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11266679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11276679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11286679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11296679dcc1SBarry Smith v += 48; 11306679dcc1SBarry Smith 11319371c9d4SSatish Balay x1 = xb[4]; 11329371c9d4SSatish Balay x2 = xb[5]; 11339371c9d4SSatish Balay x3 = xb[6]; 11349371c9d4SSatish Balay x4 = xb[7]; 11356679dcc1SBarry Smith 11366679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11376679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11386679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11396679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11406679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11416679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11426679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11436679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11446679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11456679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11466679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11476679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11486679dcc1SBarry Smith v += 48; 11496679dcc1SBarry Smith 
11509371c9d4SSatish Balay x1 = xb[8]; 11519371c9d4SSatish Balay x2 = xb[9]; 11529371c9d4SSatish Balay x3 = xb[10]; 11539371c9d4SSatish Balay x4 = xb[11]; 11546679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11556679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11566679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11576679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11586679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11596679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11606679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11616679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11626679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11636679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11646679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11656679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11666679dcc1SBarry Smith v += 48; 11676679dcc1SBarry Smith } 11686679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 11699371c9d4SSatish Balay z[0] = sum1; 11709371c9d4SSatish Balay z[1] = sum2; 11719371c9d4SSatish Balay z[2] = sum3; 11729371c9d4SSatish Balay z[3] = sum4; 11739371c9d4SSatish Balay z[4] = sum5; 11749371c9d4SSatish Balay z[5] = sum6; 11759371c9d4SSatish Balay z[6] = sum7; 11769371c9d4SSatish Balay z[7] = sum8; 11779371c9d4SSatish Balay z[8] = sum9; 11789371c9d4SSatish Balay z[9] = sum10; 11799371c9d4SSatish Balay z[10] = sum11; 11809371c9d4SSatish Balay z[11] = sum12; 11816679dcc1SBarry Smith if (!usecprow) z += 12; 11826679dcc1SBarry Smith } 11839566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11849566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 
11859566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 11863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 11876679dcc1SBarry Smith } 11886679dcc1SBarry Smith 11896679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1190d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) 1191d71ae5a4SJacob Faibussowitsch { 11926679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 11936679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 11946679dcc1SBarry Smith const PetscScalar *x, *xb; 11956679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray, *yarray; 11966679dcc1SBarry Smith const MatScalar *v; 11976679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 11986679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, n; 11996679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 12006679dcc1SBarry Smith 12016679dcc1SBarry Smith PetscFunctionBegin; 12029566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 12039566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 12046679dcc1SBarry Smith 12056679dcc1SBarry Smith v = a->a; 12066679dcc1SBarry Smith if (usecprow) { 120748a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 12086679dcc1SBarry Smith mbs = a->compressedrow.nrows; 12096679dcc1SBarry Smith ii = a->compressedrow.i; 12106679dcc1SBarry Smith ridx = a->compressedrow.rindex; 12116679dcc1SBarry Smith } else { 12126679dcc1SBarry Smith ii = a->i; 12136679dcc1SBarry Smith y = yarray; 12146679dcc1SBarry Smith z = zarray; 12156679dcc1SBarry Smith } 12166679dcc1SBarry Smith 12176679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 12186679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 12196679dcc1SBarry Smith idx = ij + ii[i]; 12206679dcc1SBarry Smith 
12216679dcc1SBarry Smith if (usecprow) { 12226679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 12236679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 12246679dcc1SBarry Smith } 12259371c9d4SSatish Balay sum1 = y[0]; 12269371c9d4SSatish Balay sum2 = y[1]; 12279371c9d4SSatish Balay sum3 = y[2]; 12289371c9d4SSatish Balay sum4 = y[3]; 12299371c9d4SSatish Balay sum5 = y[4]; 12309371c9d4SSatish Balay sum6 = y[5]; 12319371c9d4SSatish Balay sum7 = y[6]; 12329371c9d4SSatish Balay sum8 = y[7]; 12339371c9d4SSatish Balay sum9 = y[8]; 12349371c9d4SSatish Balay sum10 = y[9]; 12359371c9d4SSatish Balay sum11 = y[10]; 12369371c9d4SSatish Balay sum12 = y[11]; 12376679dcc1SBarry Smith 12386679dcc1SBarry Smith for (j = 0; j < n; j++) { 12396679dcc1SBarry Smith xb = x + 12 * (idx[j]); 12409371c9d4SSatish Balay x1 = xb[0]; 12419371c9d4SSatish Balay x2 = xb[1]; 12429371c9d4SSatish Balay x3 = xb[2]; 12439371c9d4SSatish Balay x4 = xb[3]; 12446679dcc1SBarry Smith 12456679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12466679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12476679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12486679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12496679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12506679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12516679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12526679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12536679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12546679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12556679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12566679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12576679dcc1SBarry Smith v += 48; 
12586679dcc1SBarry Smith 12599371c9d4SSatish Balay x1 = xb[4]; 12609371c9d4SSatish Balay x2 = xb[5]; 12619371c9d4SSatish Balay x3 = xb[6]; 12629371c9d4SSatish Balay x4 = xb[7]; 12636679dcc1SBarry Smith 12646679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12656679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12666679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12676679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12686679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12696679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12706679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12716679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12726679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12736679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12746679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12756679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12766679dcc1SBarry Smith v += 48; 12776679dcc1SBarry Smith 12789371c9d4SSatish Balay x1 = xb[8]; 12799371c9d4SSatish Balay x2 = xb[9]; 12809371c9d4SSatish Balay x3 = xb[10]; 12819371c9d4SSatish Balay x4 = xb[11]; 12826679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12836679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12846679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12856679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12866679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12876679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12886679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 
12896679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12906679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12916679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12926679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12936679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12946679dcc1SBarry Smith v += 48; 12956679dcc1SBarry Smith } 12969371c9d4SSatish Balay z[0] = sum1; 12979371c9d4SSatish Balay z[1] = sum2; 12989371c9d4SSatish Balay z[2] = sum3; 12999371c9d4SSatish Balay z[3] = sum4; 13009371c9d4SSatish Balay z[4] = sum5; 13019371c9d4SSatish Balay z[5] = sum6; 13029371c9d4SSatish Balay z[6] = sum7; 13039371c9d4SSatish Balay z[7] = sum8; 13049371c9d4SSatish Balay z[8] = sum9; 13059371c9d4SSatish Balay z[9] = sum10; 13069371c9d4SSatish Balay z[10] = sum11; 13079371c9d4SSatish Balay z[11] = sum12; 13086679dcc1SBarry Smith if (!usecprow) { 13096679dcc1SBarry Smith y += 12; 13106679dcc1SBarry Smith z += 12; 13116679dcc1SBarry Smith } 13126679dcc1SBarry Smith } 13139566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 13149566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 13159566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 13163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13176679dcc1SBarry Smith } 13186679dcc1SBarry Smith 13196679dcc1SBarry Smith #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1320d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) 1321d71ae5a4SJacob Faibussowitsch { 13226679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 13236679dcc1SBarry Smith PetscScalar *z = NULL, *zarray; 13246679dcc1SBarry Smith const 
PetscScalar *x, *work; 13256679dcc1SBarry Smith const MatScalar *v = a->a; 13266679dcc1SBarry Smith PetscInt mbs, i, j, n; 13276679dcc1SBarry Smith const PetscInt *idx = a->j, *ii, *ridx = NULL; 13286679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 13296679dcc1SBarry Smith const PetscInt bs = 12, bs2 = 144; 13306679dcc1SBarry Smith 13316679dcc1SBarry Smith __m256d a0, a1, a2, a3, a4, a5; 13326679dcc1SBarry Smith __m256d w0, w1, w2, w3; 13336679dcc1SBarry Smith __m256d z0, z1, z2; 13346679dcc1SBarry Smith 13356679dcc1SBarry Smith PetscFunctionBegin; 13369566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 13379566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 13386679dcc1SBarry Smith 13396679dcc1SBarry Smith if (usecprow) { 13406679dcc1SBarry Smith mbs = a->compressedrow.nrows; 13416679dcc1SBarry Smith ii = a->compressedrow.i; 13426679dcc1SBarry Smith ridx = a->compressedrow.rindex; 13439566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 13446679dcc1SBarry Smith } else { 13456679dcc1SBarry Smith mbs = a->mbs; 13466679dcc1SBarry Smith ii = a->i; 13476679dcc1SBarry Smith z = zarray; 13486679dcc1SBarry Smith } 13496679dcc1SBarry Smith 13506679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 13519371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 13529371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 13539371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 13546679dcc1SBarry Smith 13559371c9d4SSatish Balay n = ii[1] - ii[0]; 13569371c9d4SSatish Balay ii++; 13576679dcc1SBarry Smith for (j = 0; j < n; j++) { 13586679dcc1SBarry Smith work = x + bs * (*idx++); 13596679dcc1SBarry Smith 13606679dcc1SBarry Smith /* first column of a */ 13616679dcc1SBarry Smith w0 = _mm256_set1_pd(work[0]); 13629371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 0); 13639371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 13649371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 4); 13659371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 
13669371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 8); 13679371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 13686679dcc1SBarry Smith 13696679dcc1SBarry Smith /* second column of a */ 13706679dcc1SBarry Smith w1 = _mm256_set1_pd(work[1]); 13719371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 12); 13729371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 13739371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 16); 13749371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 13759371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 20); 13769371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 13776679dcc1SBarry Smith 13786679dcc1SBarry Smith /* third column of a */ 13796679dcc1SBarry Smith w2 = _mm256_set1_pd(work[2]); 13809371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 24); 13819371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 13829371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 28); 13839371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 13849371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 32); 13859371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 13866679dcc1SBarry Smith 13876679dcc1SBarry Smith /* fourth column of a */ 13886679dcc1SBarry Smith w3 = _mm256_set1_pd(work[3]); 13899371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 36); 13909371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 13919371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 40); 13929371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 13939371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 44); 13949371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 13956679dcc1SBarry Smith 13966679dcc1SBarry Smith /* fifth column of a */ 13976679dcc1SBarry Smith w0 = _mm256_set1_pd(work[4]); 13989371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 48); 13999371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14009371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 52); 14019371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14029371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 56); 14039371c9d4SSatish Balay z2 = 
_mm256_fmadd_pd(a2, w0, z2); 14046679dcc1SBarry Smith 14056679dcc1SBarry Smith /* sixth column of a */ 14066679dcc1SBarry Smith w1 = _mm256_set1_pd(work[5]); 14079371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 60); 14089371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14099371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 64); 14109371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14119371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 68); 14129371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14136679dcc1SBarry Smith 14146679dcc1SBarry Smith /* seventh column of a */ 14156679dcc1SBarry Smith w2 = _mm256_set1_pd(work[6]); 14169371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 72); 14179371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14189371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 76); 14199371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14209371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 80); 14219371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14226679dcc1SBarry Smith 14236aad120cSJose E. 
Roman /* eighth column of a */ 14246679dcc1SBarry Smith w3 = _mm256_set1_pd(work[7]); 14259371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 84); 14269371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14279371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 88); 14289371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14299371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 92); 14309371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14316679dcc1SBarry Smith 14326679dcc1SBarry Smith /* ninth column of a */ 14336679dcc1SBarry Smith w0 = _mm256_set1_pd(work[8]); 14349371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 96); 14359371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14369371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 100); 14379371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14389371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 104); 14399371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14406679dcc1SBarry Smith 14416679dcc1SBarry Smith /* tenth column of a */ 14426679dcc1SBarry Smith w1 = _mm256_set1_pd(work[9]); 14439371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 108); 14449371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14459371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 112); 14469371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14479371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 116); 14489371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14496679dcc1SBarry Smith 14506679dcc1SBarry Smith /* eleventh column of a */ 14516679dcc1SBarry Smith w2 = _mm256_set1_pd(work[10]); 14529371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 120); 14539371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14549371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 124); 14559371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14569371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 128); 14579371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14586679dcc1SBarry Smith 14596679dcc1SBarry Smith /* twelveth column of a */ 14606679dcc1SBarry Smith w3 = 
_mm256_set1_pd(work[11]); 14619371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 132); 14629371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14639371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 136); 14649371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14659371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 140); 14669371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14676679dcc1SBarry Smith 14686679dcc1SBarry Smith v += bs2; 14696679dcc1SBarry Smith } 14706679dcc1SBarry Smith if (usecprow) z = zarray + bs * ridx[i]; 14719371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 14729371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 14739371c9d4SSatish Balay _mm256_storeu_pd(&z[8], z2); 14746679dcc1SBarry Smith if (!usecprow) z += bs; 14756679dcc1SBarry Smith } 14769566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 14779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 14789566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 14793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14806679dcc1SBarry Smith } 14816679dcc1SBarry Smith #endif 14826679dcc1SBarry Smith 14838ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1484832cc040SShri Abhyankar /* Default MatMult for block size 15 */ 1485d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) 1486d71ae5a4SJacob Faibussowitsch { 14878ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1488f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 14898ab949d8SShri Abhyankar const PetscScalar *x, *xb; 149053ef36baSBarry Smith PetscScalar *zarray, xv; 14918ab949d8SShri Abhyankar const MatScalar *v; 14928ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 14937c565772SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 
1494ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 14958ab949d8SShri Abhyankar 14968ab949d8SShri Abhyankar PetscFunctionBegin; 14979566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 14989566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 14998ab949d8SShri Abhyankar 15008ab949d8SShri Abhyankar v = a->a; 15018ab949d8SShri Abhyankar if (usecprow) { 15028ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 15038ab949d8SShri Abhyankar ii = a->compressedrow.i; 15048ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 15059566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 15068ab949d8SShri Abhyankar } else { 15078ab949d8SShri Abhyankar mbs = a->mbs; 15088ab949d8SShri Abhyankar ii = a->i; 15098ab949d8SShri Abhyankar z = zarray; 15108ab949d8SShri Abhyankar } 15118ab949d8SShri Abhyankar 15128ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 15138ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 15148ab949d8SShri Abhyankar idx = ij + ii[i]; 15159371c9d4SSatish Balay sum1 = 0.0; 15169371c9d4SSatish Balay sum2 = 0.0; 15179371c9d4SSatish Balay sum3 = 0.0; 15189371c9d4SSatish Balay sum4 = 0.0; 15199371c9d4SSatish Balay sum5 = 0.0; 15209371c9d4SSatish Balay sum6 = 0.0; 15219371c9d4SSatish Balay sum7 = 0.0; 15229371c9d4SSatish Balay sum8 = 0.0; 15239371c9d4SSatish Balay sum9 = 0.0; 15249371c9d4SSatish Balay sum10 = 0.0; 15259371c9d4SSatish Balay sum11 = 0.0; 15269371c9d4SSatish Balay sum12 = 0.0; 15279371c9d4SSatish Balay sum13 = 0.0; 15289371c9d4SSatish Balay sum14 = 0.0; 15299371c9d4SSatish Balay sum15 = 0.0; 15308ab949d8SShri Abhyankar 15318ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 15328ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 15338ab949d8SShri Abhyankar 15348ab949d8SShri Abhyankar for (k = 0; k < 15; k++) { 153553ef36baSBarry Smith xv = xb[k]; 153653ef36baSBarry Smith sum1 += v[0] * xv; 153753ef36baSBarry Smith sum2 += v[1] * xv; 153853ef36baSBarry Smith sum3 += v[2] * xv; 
153953ef36baSBarry Smith sum4 += v[3] * xv; 154053ef36baSBarry Smith sum5 += v[4] * xv; 154153ef36baSBarry Smith sum6 += v[5] * xv; 154253ef36baSBarry Smith sum7 += v[6] * xv; 154353ef36baSBarry Smith sum8 += v[7] * xv; 154453ef36baSBarry Smith sum9 += v[8] * xv; 154553ef36baSBarry Smith sum10 += v[9] * xv; 154653ef36baSBarry Smith sum11 += v[10] * xv; 154753ef36baSBarry Smith sum12 += v[11] * xv; 154853ef36baSBarry Smith sum13 += v[12] * xv; 154953ef36baSBarry Smith sum14 += v[13] * xv; 155053ef36baSBarry Smith sum15 += v[14] * xv; 15518ab949d8SShri Abhyankar v += 15; 15528ab949d8SShri Abhyankar } 15538ab949d8SShri Abhyankar } 15548ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 15559371c9d4SSatish Balay z[0] = sum1; 15569371c9d4SSatish Balay z[1] = sum2; 15579371c9d4SSatish Balay z[2] = sum3; 15589371c9d4SSatish Balay z[3] = sum4; 15599371c9d4SSatish Balay z[4] = sum5; 15609371c9d4SSatish Balay z[5] = sum6; 15619371c9d4SSatish Balay z[6] = sum7; 15629371c9d4SSatish Balay z[7] = sum8; 15639371c9d4SSatish Balay z[8] = sum9; 15649371c9d4SSatish Balay z[9] = sum10; 15659371c9d4SSatish Balay z[10] = sum11; 15669371c9d4SSatish Balay z[11] = sum12; 15679371c9d4SSatish Balay z[12] = sum13; 15689371c9d4SSatish Balay z[13] = sum14; 15699371c9d4SSatish Balay z[14] = sum15; 15708ab949d8SShri Abhyankar 15718ab949d8SShri Abhyankar if (!usecprow) z += 15; 15728ab949d8SShri Abhyankar } 15738ab949d8SShri Abhyankar 15749566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 15759566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 15769566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 15773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 15788ab949d8SShri Abhyankar } 15798ab949d8SShri Abhyankar 15808ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1581d71ae5a4SJacob Faibussowitsch PetscErrorCode 
MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) 1582d71ae5a4SJacob Faibussowitsch { 15838ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1584f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 15858ab949d8SShri Abhyankar const PetscScalar *x, *xb; 15860b8f6341SShri Abhyankar PetscScalar x1, x2, x3, x4, *zarray; 15878ab949d8SShri Abhyankar const MatScalar *v; 15888ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 15897c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1590ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 15918ab949d8SShri Abhyankar 15928ab949d8SShri Abhyankar PetscFunctionBegin; 15939566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 15949566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 15958ab949d8SShri Abhyankar 15968ab949d8SShri Abhyankar v = a->a; 15978ab949d8SShri Abhyankar if (usecprow) { 15988ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 15998ab949d8SShri Abhyankar ii = a->compressedrow.i; 16008ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 16019566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 16028ab949d8SShri Abhyankar } else { 16038ab949d8SShri Abhyankar mbs = a->mbs; 16048ab949d8SShri Abhyankar ii = a->i; 16058ab949d8SShri Abhyankar z = zarray; 16068ab949d8SShri Abhyankar } 16078ab949d8SShri Abhyankar 16088ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 16098ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 16108ab949d8SShri Abhyankar idx = ij + ii[i]; 16119371c9d4SSatish Balay sum1 = 0.0; 16129371c9d4SSatish Balay sum2 = 0.0; 16139371c9d4SSatish Balay sum3 = 0.0; 16149371c9d4SSatish Balay sum4 = 0.0; 16159371c9d4SSatish Balay sum5 = 0.0; 16169371c9d4SSatish Balay sum6 = 0.0; 16179371c9d4SSatish Balay sum7 = 0.0; 16189371c9d4SSatish Balay sum8 = 0.0; 16199371c9d4SSatish Balay sum9 = 0.0; 16209371c9d4SSatish Balay sum10 = 0.0; 
16219371c9d4SSatish Balay sum11 = 0.0; 16229371c9d4SSatish Balay sum12 = 0.0; 16239371c9d4SSatish Balay sum13 = 0.0; 16249371c9d4SSatish Balay sum14 = 0.0; 16259371c9d4SSatish Balay sum15 = 0.0; 16268ab949d8SShri Abhyankar 16278ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 16288ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 16299371c9d4SSatish Balay x1 = xb[0]; 16309371c9d4SSatish Balay x2 = xb[1]; 16319371c9d4SSatish Balay x3 = xb[2]; 16329371c9d4SSatish Balay x4 = xb[3]; 16338ab949d8SShri Abhyankar 16348ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16358ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16368ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16378ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16388ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16398ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16408ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16418ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16428ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16438ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16448ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16458ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16468ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16478ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16488ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16498ab949d8SShri Abhyankar 16508ab949d8SShri Abhyankar v += 60; 16518ab949d8SShri Abhyankar 16529371c9d4SSatish Balay x1 = xb[4]; 16539371c9d4SSatish Balay x2 = xb[5]; 16549371c9d4SSatish 
Balay x3 = xb[6]; 16559371c9d4SSatish Balay x4 = xb[7]; 16568ab949d8SShri Abhyankar 16578ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16588ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16598ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16608ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16618ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16628ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16638ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16648ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16658ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16668ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16678ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16688ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16698ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16708ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16718ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16728ab949d8SShri Abhyankar v += 60; 16738ab949d8SShri Abhyankar 16749371c9d4SSatish Balay x1 = xb[8]; 16759371c9d4SSatish Balay x2 = xb[9]; 16769371c9d4SSatish Balay x3 = xb[10]; 16779371c9d4SSatish Balay x4 = xb[11]; 16780b8f6341SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16790b8f6341SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16800b8f6341SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16810b8f6341SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16820b8f6341SShri Abhyankar sum5 += v[4] * x1 + 
v[19] * x2 + v[34] * x3 + v[49] * x4; 16830b8f6341SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16840b8f6341SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16850b8f6341SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16860b8f6341SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16870b8f6341SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16880b8f6341SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16890b8f6341SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16900b8f6341SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16910b8f6341SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16920b8f6341SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16930b8f6341SShri Abhyankar v += 60; 16940b8f6341SShri Abhyankar 16959371c9d4SSatish Balay x1 = xb[12]; 16969371c9d4SSatish Balay x2 = xb[13]; 16979371c9d4SSatish Balay x3 = xb[14]; 16988ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 16998ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 17008ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 17018ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 17028ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 17038ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 17048ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 17058ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 17068ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 17078ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 17088ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 17098ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 17108ab949d8SShri Abhyankar 
sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 17118ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 17128ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 17138ab949d8SShri Abhyankar v += 45; 17148ab949d8SShri Abhyankar } 17158ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 17169371c9d4SSatish Balay z[0] = sum1; 17179371c9d4SSatish Balay z[1] = sum2; 17189371c9d4SSatish Balay z[2] = sum3; 17199371c9d4SSatish Balay z[3] = sum4; 17209371c9d4SSatish Balay z[4] = sum5; 17219371c9d4SSatish Balay z[5] = sum6; 17229371c9d4SSatish Balay z[6] = sum7; 17239371c9d4SSatish Balay z[7] = sum8; 17249371c9d4SSatish Balay z[8] = sum9; 17259371c9d4SSatish Balay z[9] = sum10; 17269371c9d4SSatish Balay z[10] = sum11; 17279371c9d4SSatish Balay z[11] = sum12; 17289371c9d4SSatish Balay z[12] = sum13; 17299371c9d4SSatish Balay z[13] = sum14; 17309371c9d4SSatish Balay z[14] = sum15; 17318ab949d8SShri Abhyankar 17328ab949d8SShri Abhyankar if (!usecprow) z += 15; 17338ab949d8SShri Abhyankar } 17348ab949d8SShri Abhyankar 17359566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 17369566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 17379566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 17383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17398ab949d8SShri Abhyankar } 17408ab949d8SShri Abhyankar 17418ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1742d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) 1743d71ae5a4SJacob Faibussowitsch { 17448ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1745f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 17468ab949d8SShri Abhyankar const PetscScalar *x, *xb; 17470b8f6341SShri Abhyankar 
PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 17488ab949d8SShri Abhyankar const MatScalar *v; 17498ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 17507c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1751ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 17528ab949d8SShri Abhyankar 17538ab949d8SShri Abhyankar PetscFunctionBegin; 17549566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 17559566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 17568ab949d8SShri Abhyankar 17578ab949d8SShri Abhyankar v = a->a; 17588ab949d8SShri Abhyankar if (usecprow) { 17598ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 17608ab949d8SShri Abhyankar ii = a->compressedrow.i; 17618ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 17629566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 17638ab949d8SShri Abhyankar } else { 17648ab949d8SShri Abhyankar mbs = a->mbs; 17658ab949d8SShri Abhyankar ii = a->i; 17668ab949d8SShri Abhyankar z = zarray; 17678ab949d8SShri Abhyankar } 17688ab949d8SShri Abhyankar 17698ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 17708ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 17718ab949d8SShri Abhyankar idx = ij + ii[i]; 17729371c9d4SSatish Balay sum1 = 0.0; 17739371c9d4SSatish Balay sum2 = 0.0; 17749371c9d4SSatish Balay sum3 = 0.0; 17759371c9d4SSatish Balay sum4 = 0.0; 17769371c9d4SSatish Balay sum5 = 0.0; 17779371c9d4SSatish Balay sum6 = 0.0; 17789371c9d4SSatish Balay sum7 = 0.0; 17799371c9d4SSatish Balay sum8 = 0.0; 17809371c9d4SSatish Balay sum9 = 0.0; 17819371c9d4SSatish Balay sum10 = 0.0; 17829371c9d4SSatish Balay sum11 = 0.0; 17839371c9d4SSatish Balay sum12 = 0.0; 17849371c9d4SSatish Balay sum13 = 0.0; 17859371c9d4SSatish Balay sum14 = 0.0; 17869371c9d4SSatish Balay sum15 = 0.0; 17878ab949d8SShri Abhyankar 17888ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 17898ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 17909371c9d4SSatish Balay x1 = xb[0]; 
17919371c9d4SSatish Balay x2 = xb[1]; 17929371c9d4SSatish Balay x3 = xb[2]; 17939371c9d4SSatish Balay x4 = xb[3]; 17949371c9d4SSatish Balay x5 = xb[4]; 17959371c9d4SSatish Balay x6 = xb[5]; 17969371c9d4SSatish Balay x7 = xb[6]; 17970b8f6341SShri Abhyankar x8 = xb[7]; 17988ab949d8SShri Abhyankar 17998ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 18008ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 18018ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 18028ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 18038ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 18048ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 18058ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 18068ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 18078ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 18088ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 18098ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 18108ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 18118ab949d8SShri Abhyankar 
sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 18128ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; 18138ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 18148ab949d8SShri Abhyankar v += 120; 18158ab949d8SShri Abhyankar 18169371c9d4SSatish Balay x1 = xb[8]; 18179371c9d4SSatish Balay x2 = xb[9]; 18189371c9d4SSatish Balay x3 = xb[10]; 18199371c9d4SSatish Balay x4 = xb[11]; 18209371c9d4SSatish Balay x5 = xb[12]; 18219371c9d4SSatish Balay x6 = xb[13]; 18229371c9d4SSatish Balay x7 = xb[14]; 18230b8f6341SShri Abhyankar 18248ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 18258ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 18268ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 18278ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 18288ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 18298ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 18308ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 18318ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 18328ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 18338ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 
+ v[99] * x7; 18348ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 18358ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 18368ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 18378ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 18388ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 18398ab949d8SShri Abhyankar v += 105; 18408ab949d8SShri Abhyankar } 18418ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 18429371c9d4SSatish Balay z[0] = sum1; 18439371c9d4SSatish Balay z[1] = sum2; 18449371c9d4SSatish Balay z[2] = sum3; 18459371c9d4SSatish Balay z[3] = sum4; 18469371c9d4SSatish Balay z[4] = sum5; 18479371c9d4SSatish Balay z[5] = sum6; 18489371c9d4SSatish Balay z[6] = sum7; 18499371c9d4SSatish Balay z[7] = sum8; 18509371c9d4SSatish Balay z[8] = sum9; 18519371c9d4SSatish Balay z[9] = sum10; 18529371c9d4SSatish Balay z[10] = sum11; 18539371c9d4SSatish Balay z[11] = sum12; 18549371c9d4SSatish Balay z[12] = sum13; 18559371c9d4SSatish Balay z[13] = sum14; 18569371c9d4SSatish Balay z[14] = sum15; 18578ab949d8SShri Abhyankar 18588ab949d8SShri Abhyankar if (!usecprow) z += 15; 18598ab949d8SShri Abhyankar } 18608ab949d8SShri Abhyankar 18619566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 18629566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 18639566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 18643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18658ab949d8SShri Abhyankar } 18668ab949d8SShri Abhyankar 18678ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are 
accessed at once */
/*
   MatMult_SeqBAIJ_15_ver4 - zz = A * xx for a SeqBAIJ matrix with block size 15.

   Each stored 15x15 block is applied with fifteen fully unrolled 15-term sums; the
   v[r + 15*c] indexing below treats the 225 block entries as column-major.

   When a->compressedrow.use is set, only the block rows listed in
   a->compressedrow.rindex are visited, so the whole output array is zeroed first and
   each result is scattered to its real block row. Otherwise block rows are written
   consecutively.

   Input:  A  - the matrix (A->data is read as Mat_SeqBAIJ)
           xx - vector to multiply
   Output: zz - the product (overwritten)
*/
PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
{
  Mat_SeqBAIJ       *baij     = (Mat_SeqBAIJ *)A->data;
  const PetscBool    usecprow = baij->compressedrow.use;
  const MatScalar   *v        = baij->a; /* walks the block values; advanced 225 entries per block */
  const PetscInt    *rowptr, *rowmap = NULL;
  const PetscScalar *x;
  PetscScalar       *zarray;
  PetscInt           nrows, brow;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  if (!usecprow) {
    nrows  = baij->mbs;
    rowptr = baij->i;
  } else {
    nrows  = baij->compressedrow.nrows;
    rowptr = baij->compressedrow.i;
    rowmap = baij->compressedrow.rindex;
    /* block rows that are not in the compressed list keep a zero result */
    PetscCall(PetscArrayzero(zarray, 15 * baij->mbs));
  }

  for (brow = 0; brow < nrows; brow++) {
    const PetscInt  nblk = rowptr[brow + 1] - rowptr[brow];
    const PetscInt *cols = baij->j + rowptr[brow];
    /* destination of this block row: mapped through rindex when rows are compressed */
    PetscScalar    *z    = zarray + 15 * (usecprow ? rowmap[brow] : brow);
    PetscScalar     sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, sum6 = 0.0, sum7 = 0.0, sum8 = 0.0;
    PetscScalar     sum9 = 0.0, sum10 = 0.0, sum11 = 0.0, sum12 = 0.0, sum13 = 0.0, sum14 = 0.0, sum15 = 0.0;
    PetscInt        k;

    for (k = 0; k < nblk; k++, v += 225) {
      const PetscScalar *xb = x + 15 * cols[k];
      const PetscScalar  x1 = xb[0], x2 = xb[1], x3 = xb[2], x4 = xb[3], x5 = xb[4];
      const PetscScalar  x6 = xb[5], x7 = xb[6], x8 = xb[7], x9 = xb[8], x10 = xb[9];
      const PetscScalar  x11 = xb[10], x12 = xb[11], x13 = xb[12], x14 = xb[13], x15 = xb[14];

      sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
      sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
      sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
      sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
      sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
      sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
      sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
      sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
      sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
      sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
      sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
      sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
      sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
      sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
      sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
    }

    z[0]  = sum1;
    z[1]  = sum2;
    z[2]  = sum3;
    z[3]  = sum4;
    z[4]  = sum5;
    z[5]  = sum6;
    z[6]  = sum7;
    z[7]  = sum8;
    z[8]  = sum9;
    z[9]  = sum10;
    z[10] = sum11;
    z[11] = sum12;
    z[12] = sum13;
    z[13] = sum14;
    z[14] = sum15;
  }

  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  /* 2 * 225 flops per block, minus the 15 additions saved by starting each row's sums at zero */
  PetscCall(PetscLogFlops(450.0 * baij->nz - 15.0 * baij->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* 19763f1db9ecSBarry Smith This will not work with MatScalar == float because it calls the BLAS 19773f1db9ecSBarry Smith */ 1978d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) 1979d71ae5a4SJacob Faibussowitsch { 19802d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1981f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 1982d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1983d9ca1df4SBarry Smith const MatScalar *v; 1984d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1985d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 1986d9ca1df4SBarry Smith PetscInt ncols, k; 1987ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 19882d61bbb3SSatish Balay 19892d61bbb3SSatish Balay PetscFunctionBegin; 19909566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 19919566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 19922d61bbb3SSatish Balay 19932d61bbb3SSatish Balay idx = a->j; 19942d61bbb3SSatish Balay v = a->a; 199526e093fcSHong Zhang if (usecprow) { 199626e093fcSHong Zhang mbs = a->compressedrow.nrows; 199726e093fcSHong Zhang ii = a->compressedrow.i; 19987b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 19999566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 200026e093fcSHong Zhang } else { 200126e093fcSHong Zhang mbs = a->mbs; 20022d61bbb3SSatish Balay ii = a->i; 200326e093fcSHong Zhang z = zarray; 200426e093fcSHong Zhang } 2005218c64b6SSatish Balay 20062d61bbb3SSatish Balay if (!a->mult_work) { 2007d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 20089566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 20092d61bbb3SSatish Balay } 20102d61bbb3SSatish Balay work = a->mult_work; 20112d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 20129371c9d4SSatish Balay n = ii[1] - ii[0]; 20139371c9d4SSatish Balay ii++; 20142d61bbb3SSatish Balay ncols = n * bs; 
20152d61bbb3SSatish Balay workt = work; 20162d61bbb3SSatish Balay for (j = 0; j < n; j++) { 20172d61bbb3SSatish Balay xb = x + bs * (*idx++); 20182d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 20192d61bbb3SSatish Balay workt += bs; 20202d61bbb3SSatish Balay } 20217b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * ridx[i]; 202296b95a6bSBarry Smith PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 20232d61bbb3SSatish Balay v += n * bs2; 202426e093fcSHong Zhang if (!usecprow) z += bs; 20252d61bbb3SSatish Balay } 20269566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 20279566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 20289566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 20293ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20302d61bbb3SSatish Balay } 20312d61bbb3SSatish Balay 2032d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 2033d71ae5a4SJacob Faibussowitsch { 20342d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2035122f12eaSBarry Smith const PetscScalar *x; 2036122f12eaSBarry Smith PetscScalar *y, *z, sum; 2037122f12eaSBarry Smith const MatScalar *v; 20387c565772SBarry Smith PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2039122f12eaSBarry Smith const PetscInt *idx, *ii; 2040ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 20412d61bbb3SSatish Balay 20422d61bbb3SSatish Balay PetscFunctionBegin; 20439566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 20449566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 20452d61bbb3SSatish Balay 20462d61bbb3SSatish Balay idx = a->j; 20472d61bbb3SSatish Balay v = a->a; 204826e093fcSHong Zhang if (usecprow) { 204948a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 205026e093fcSHong Zhang mbs = a->compressedrow.nrows; 205126e093fcSHong Zhang ii = a->compressedrow.i; 
20527b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 205326e093fcSHong Zhang } else { 20542d61bbb3SSatish Balay ii = a->i; 205526e093fcSHong Zhang } 20562d61bbb3SSatish Balay 20572d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 2058122f12eaSBarry Smith n = ii[1] - ii[0]; 2059122f12eaSBarry Smith ii++; 206026e093fcSHong Zhang if (!usecprow) { 2061122f12eaSBarry Smith sum = y[i]; 2062122f12eaSBarry Smith } else { 2063122f12eaSBarry Smith sum = y[ridx[i]]; 2064122f12eaSBarry Smith } 2065444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2066444d8c10SJed Brown PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2067122f12eaSBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 2068122f12eaSBarry Smith v += n; 2069122f12eaSBarry Smith idx += n; 2070122f12eaSBarry Smith if (usecprow) { 2071122f12eaSBarry Smith z[ridx[i]] = sum; 2072122f12eaSBarry Smith } else { 2073122f12eaSBarry Smith z[i] = sum; 207426e093fcSHong Zhang } 20752d61bbb3SSatish Balay } 20769566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 20779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 20789566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 20793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20802d61bbb3SSatish Balay } 20812d61bbb3SSatish Balay 2082d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 2083d71ae5a4SJacob Faibussowitsch { 20842d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2085f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2086d9ca1df4SBarry Smith const PetscScalar *x, *xb; 208726e093fcSHong Zhang PetscScalar x1, x2, *yarray, *zarray; 2088d9ca1df4SBarry Smith const MatScalar *v; 2089d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, j; 2090d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = 
NULL; 2091ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 20922d61bbb3SSatish Balay 20932d61bbb3SSatish Balay PetscFunctionBegin; 20949566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 20959566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 20962d61bbb3SSatish Balay 20972d61bbb3SSatish Balay idx = a->j; 20982d61bbb3SSatish Balay v = a->a; 209926e093fcSHong Zhang if (usecprow) { 210048a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 210126e093fcSHong Zhang mbs = a->compressedrow.nrows; 210226e093fcSHong Zhang ii = a->compressedrow.i; 21037b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 210426e093fcSHong Zhang } else { 21052d61bbb3SSatish Balay ii = a->i; 210626e093fcSHong Zhang y = yarray; 210726e093fcSHong Zhang z = zarray; 210826e093fcSHong Zhang } 21092d61bbb3SSatish Balay 21102d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 21119371c9d4SSatish Balay n = ii[1] - ii[0]; 21129371c9d4SSatish Balay ii++; 211326e093fcSHong Zhang if (usecprow) { 21147b2bb3b9SHong Zhang z = zarray + 2 * ridx[i]; 21157b2bb3b9SHong Zhang y = yarray + 2 * ridx[i]; 211626e093fcSHong Zhang } 21179371c9d4SSatish Balay sum1 = y[0]; 21189371c9d4SSatish Balay sum2 = y[1]; 2119444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2120444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 21212d61bbb3SSatish Balay for (j = 0; j < n; j++) { 212226fbe8dcSKarl Rupp xb = x + 2 * (*idx++); 212326fbe8dcSKarl Rupp x1 = xb[0]; 212426fbe8dcSKarl Rupp x2 = xb[1]; 212526fbe8dcSKarl Rupp 21262d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 21272d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 21282d61bbb3SSatish Balay v += 4; 21292d61bbb3SSatish Balay } 21309371c9d4SSatish Balay z[0] = sum1; 21319371c9d4SSatish Balay z[1] = sum2; 213226e093fcSHong Zhang 
if (!usecprow) { 21339371c9d4SSatish Balay z += 2; 21349371c9d4SSatish Balay y += 2; 21352d61bbb3SSatish Balay } 213626e093fcSHong Zhang } 21379566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 21389566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 21399566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * a->nz)); 21403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21412d61bbb3SSatish Balay } 21422d61bbb3SSatish Balay 2143d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 2144d71ae5a4SJacob Faibussowitsch { 21452d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2146f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2147d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2148d9ca1df4SBarry Smith const MatScalar *v; 2149d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2150d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2151ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 21522d61bbb3SSatish Balay 21532d61bbb3SSatish Balay PetscFunctionBegin; 21549566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 21559566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 21562d61bbb3SSatish Balay 21572d61bbb3SSatish Balay idx = a->j; 21582d61bbb3SSatish Balay v = a->a; 215926e093fcSHong Zhang if (usecprow) { 216048a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 216126e093fcSHong Zhang mbs = a->compressedrow.nrows; 216226e093fcSHong Zhang ii = a->compressedrow.i; 21637b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 216426e093fcSHong Zhang } else { 21652d61bbb3SSatish Balay ii = a->i; 216626e093fcSHong Zhang y = yarray; 216726e093fcSHong Zhang z = zarray; 216826e093fcSHong Zhang } 21692d61bbb3SSatish Balay 21702d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 21719371c9d4SSatish Balay 
n = ii[1] - ii[0]; 21729371c9d4SSatish Balay ii++; 217326e093fcSHong Zhang if (usecprow) { 21747b2bb3b9SHong Zhang z = zarray + 3 * ridx[i]; 21757b2bb3b9SHong Zhang y = yarray + 3 * ridx[i]; 217626e093fcSHong Zhang } 21779371c9d4SSatish Balay sum1 = y[0]; 21789371c9d4SSatish Balay sum2 = y[1]; 21799371c9d4SSatish Balay sum3 = y[2]; 2180444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2181444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 21822d61bbb3SSatish Balay for (j = 0; j < n; j++) { 21839371c9d4SSatish Balay xb = x + 3 * (*idx++); 21849371c9d4SSatish Balay x1 = xb[0]; 21859371c9d4SSatish Balay x2 = xb[1]; 21869371c9d4SSatish Balay x3 = xb[2]; 21872d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 21882d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 21892d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 21902d61bbb3SSatish Balay v += 9; 21912d61bbb3SSatish Balay } 21929371c9d4SSatish Balay z[0] = sum1; 21939371c9d4SSatish Balay z[1] = sum2; 21949371c9d4SSatish Balay z[2] = sum3; 219526e093fcSHong Zhang if (!usecprow) { 21969371c9d4SSatish Balay z += 3; 21979371c9d4SSatish Balay y += 3; 21982d61bbb3SSatish Balay } 219926e093fcSHong Zhang } 22009566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 22019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 22029566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz)); 22033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 22042d61bbb3SSatish Balay } 22052d61bbb3SSatish Balay 2206d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 2207d71ae5a4SJacob Faibussowitsch { 22082d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2209f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, 
sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2210d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2211d9ca1df4SBarry Smith const MatScalar *v; 2212d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2213d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2214ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 22152d61bbb3SSatish Balay 22162d61bbb3SSatish Balay PetscFunctionBegin; 22179566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 22189566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 22192d61bbb3SSatish Balay 22202d61bbb3SSatish Balay idx = a->j; 22212d61bbb3SSatish Balay v = a->a; 222226e093fcSHong Zhang if (usecprow) { 222348a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 222426e093fcSHong Zhang mbs = a->compressedrow.nrows; 222526e093fcSHong Zhang ii = a->compressedrow.i; 22267b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 222726e093fcSHong Zhang } else { 22282d61bbb3SSatish Balay ii = a->i; 222926e093fcSHong Zhang y = yarray; 223026e093fcSHong Zhang z = zarray; 223126e093fcSHong Zhang } 22322d61bbb3SSatish Balay 22332d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 22349371c9d4SSatish Balay n = ii[1] - ii[0]; 22359371c9d4SSatish Balay ii++; 223626e093fcSHong Zhang if (usecprow) { 22377b2bb3b9SHong Zhang z = zarray + 4 * ridx[i]; 22387b2bb3b9SHong Zhang y = yarray + 4 * ridx[i]; 223926e093fcSHong Zhang } 22409371c9d4SSatish Balay sum1 = y[0]; 22419371c9d4SSatish Balay sum2 = y[1]; 22429371c9d4SSatish Balay sum3 = y[2]; 22439371c9d4SSatish Balay sum4 = y[3]; 2244444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2245444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 22462d61bbb3SSatish Balay for (j = 0; j < n; j++) { 22472d61bbb3SSatish Balay xb = x + 4 * (*idx++); 
22489371c9d4SSatish Balay x1 = xb[0]; 22499371c9d4SSatish Balay x2 = xb[1]; 22509371c9d4SSatish Balay x3 = xb[2]; 22519371c9d4SSatish Balay x4 = xb[3]; 22522d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 22532d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 22542d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 22552d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 22562d61bbb3SSatish Balay v += 16; 22572d61bbb3SSatish Balay } 22589371c9d4SSatish Balay z[0] = sum1; 22599371c9d4SSatish Balay z[1] = sum2; 22609371c9d4SSatish Balay z[2] = sum3; 22619371c9d4SSatish Balay z[3] = sum4; 226226e093fcSHong Zhang if (!usecprow) { 22639371c9d4SSatish Balay z += 4; 22649371c9d4SSatish Balay y += 4; 22652d61bbb3SSatish Balay } 226626e093fcSHong Zhang } 22679566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 22689566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 22699566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz)); 22703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 22712d61bbb3SSatish Balay } 22722d61bbb3SSatish Balay 2273d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 2274d71ae5a4SJacob Faibussowitsch { 22752d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2276f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2277d9ca1df4SBarry Smith const PetscScalar *x, *xb; 227826e093fcSHong Zhang PetscScalar *yarray, *zarray; 2279d9ca1df4SBarry Smith const MatScalar *v; 2280d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2281d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2282ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 22832d61bbb3SSatish Balay 22842d61bbb3SSatish Balay PetscFunctionBegin; 22859566063dSJacob Faibussowitsch 
PetscCall(VecGetArrayRead(xx, &x)); 22869566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 22872d61bbb3SSatish Balay 22882d61bbb3SSatish Balay idx = a->j; 22892d61bbb3SSatish Balay v = a->a; 229026e093fcSHong Zhang if (usecprow) { 229148a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 229226e093fcSHong Zhang mbs = a->compressedrow.nrows; 229326e093fcSHong Zhang ii = a->compressedrow.i; 22947b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 229526e093fcSHong Zhang } else { 22962d61bbb3SSatish Balay ii = a->i; 229726e093fcSHong Zhang y = yarray; 229826e093fcSHong Zhang z = zarray; 229926e093fcSHong Zhang } 23002d61bbb3SSatish Balay 23012d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 23029371c9d4SSatish Balay n = ii[1] - ii[0]; 23039371c9d4SSatish Balay ii++; 230426e093fcSHong Zhang if (usecprow) { 23057b2bb3b9SHong Zhang z = zarray + 5 * ridx[i]; 23067b2bb3b9SHong Zhang y = yarray + 5 * ridx[i]; 230726e093fcSHong Zhang } 23089371c9d4SSatish Balay sum1 = y[0]; 23099371c9d4SSatish Balay sum2 = y[1]; 23109371c9d4SSatish Balay sum3 = y[2]; 23119371c9d4SSatish Balay sum4 = y[3]; 23129371c9d4SSatish Balay sum5 = y[4]; 2313444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2314444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 23152d61bbb3SSatish Balay for (j = 0; j < n; j++) { 23162d61bbb3SSatish Balay xb = x + 5 * (*idx++); 23179371c9d4SSatish Balay x1 = xb[0]; 23189371c9d4SSatish Balay x2 = xb[1]; 23199371c9d4SSatish Balay x3 = xb[2]; 23209371c9d4SSatish Balay x4 = xb[3]; 23219371c9d4SSatish Balay x5 = xb[4]; 23222d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 23232d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 23242d61bbb3SSatish Balay sum3 += v[2] * x1 + 
v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 23252d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 23262d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 23272d61bbb3SSatish Balay v += 25; 23282d61bbb3SSatish Balay } 23299371c9d4SSatish Balay z[0] = sum1; 23309371c9d4SSatish Balay z[1] = sum2; 23319371c9d4SSatish Balay z[2] = sum3; 23329371c9d4SSatish Balay z[3] = sum4; 23339371c9d4SSatish Balay z[4] = sum5; 233426e093fcSHong Zhang if (!usecprow) { 23359371c9d4SSatish Balay z += 5; 23369371c9d4SSatish Balay y += 5; 23372d61bbb3SSatish Balay } 233826e093fcSHong Zhang } 23399566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 23409566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 23419566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz)); 23423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 23432d61bbb3SSatish Balay } 2344c2916339SPierre Jolivet 2345d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 2346d71ae5a4SJacob Faibussowitsch { 234715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2348f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2349d9ca1df4SBarry Smith const PetscScalar *x, *xb; 235026e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2351d9ca1df4SBarry Smith const MatScalar *v; 2352d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2353d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2354ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 235515091d37SBarry Smith 235615091d37SBarry Smith PetscFunctionBegin; 23579566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 23589566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 235915091d37SBarry Smith 236015091d37SBarry Smith idx = a->j; 
236115091d37SBarry Smith v = a->a; 236226e093fcSHong Zhang if (usecprow) { 236348a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 236426e093fcSHong Zhang mbs = a->compressedrow.nrows; 236526e093fcSHong Zhang ii = a->compressedrow.i; 23667b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 236726e093fcSHong Zhang } else { 236815091d37SBarry Smith ii = a->i; 236926e093fcSHong Zhang y = yarray; 237026e093fcSHong Zhang z = zarray; 237126e093fcSHong Zhang } 237215091d37SBarry Smith 237315091d37SBarry Smith for (i = 0; i < mbs; i++) { 23749371c9d4SSatish Balay n = ii[1] - ii[0]; 23759371c9d4SSatish Balay ii++; 237626e093fcSHong Zhang if (usecprow) { 23777b2bb3b9SHong Zhang z = zarray + 6 * ridx[i]; 23787b2bb3b9SHong Zhang y = yarray + 6 * ridx[i]; 237926e093fcSHong Zhang } 23809371c9d4SSatish Balay sum1 = y[0]; 23819371c9d4SSatish Balay sum2 = y[1]; 23829371c9d4SSatish Balay sum3 = y[2]; 23839371c9d4SSatish Balay sum4 = y[3]; 23849371c9d4SSatish Balay sum5 = y[4]; 23859371c9d4SSatish Balay sum6 = y[5]; 2386444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2387444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 238815091d37SBarry Smith for (j = 0; j < n; j++) { 23893b95cb0eSSatish Balay xb = x + 6 * (*idx++); 23909371c9d4SSatish Balay x1 = xb[0]; 23919371c9d4SSatish Balay x2 = xb[1]; 23929371c9d4SSatish Balay x3 = xb[2]; 23939371c9d4SSatish Balay x4 = xb[3]; 23949371c9d4SSatish Balay x5 = xb[4]; 23959371c9d4SSatish Balay x6 = xb[5]; 239615091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 239715091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 239815091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 239915091d37SBarry Smith sum4 
+= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 240015091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 240115091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 240215091d37SBarry Smith v += 36; 240315091d37SBarry Smith } 24049371c9d4SSatish Balay z[0] = sum1; 24059371c9d4SSatish Balay z[1] = sum2; 24069371c9d4SSatish Balay z[2] = sum3; 24079371c9d4SSatish Balay z[3] = sum4; 24089371c9d4SSatish Balay z[4] = sum5; 24099371c9d4SSatish Balay z[5] = sum6; 241026e093fcSHong Zhang if (!usecprow) { 24119371c9d4SSatish Balay z += 6; 24129371c9d4SSatish Balay y += 6; 241315091d37SBarry Smith } 241426e093fcSHong Zhang } 24159566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 24169566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 24179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * a->nz)); 24183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 241915091d37SBarry Smith } 24202d61bbb3SSatish Balay 2421d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 2422d71ae5a4SJacob Faibussowitsch { 24232d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2424f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2425d9ca1df4SBarry Smith const PetscScalar *x, *xb; 242626e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2427d9ca1df4SBarry Smith const MatScalar *v; 2428d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2429d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2430ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 24312d61bbb3SSatish Balay 24322d61bbb3SSatish Balay PetscFunctionBegin; 24339566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 24349566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, 
zz, &yarray, &zarray)); 24352d61bbb3SSatish Balay 24362d61bbb3SSatish Balay idx = a->j; 24372d61bbb3SSatish Balay v = a->a; 243826e093fcSHong Zhang if (usecprow) { 243948a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 244026e093fcSHong Zhang mbs = a->compressedrow.nrows; 244126e093fcSHong Zhang ii = a->compressedrow.i; 24427b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 244326e093fcSHong Zhang } else { 24442d61bbb3SSatish Balay ii = a->i; 244526e093fcSHong Zhang y = yarray; 244626e093fcSHong Zhang z = zarray; 244726e093fcSHong Zhang } 24482d61bbb3SSatish Balay 24492d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 24509371c9d4SSatish Balay n = ii[1] - ii[0]; 24519371c9d4SSatish Balay ii++; 245226e093fcSHong Zhang if (usecprow) { 24537b2bb3b9SHong Zhang z = zarray + 7 * ridx[i]; 24547b2bb3b9SHong Zhang y = yarray + 7 * ridx[i]; 245526e093fcSHong Zhang } 24569371c9d4SSatish Balay sum1 = y[0]; 24579371c9d4SSatish Balay sum2 = y[1]; 24589371c9d4SSatish Balay sum3 = y[2]; 24599371c9d4SSatish Balay sum4 = y[3]; 24609371c9d4SSatish Balay sum5 = y[4]; 24619371c9d4SSatish Balay sum6 = y[5]; 24629371c9d4SSatish Balay sum7 = y[6]; 2463444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2464444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 24652d61bbb3SSatish Balay for (j = 0; j < n; j++) { 24662d61bbb3SSatish Balay xb = x + 7 * (*idx++); 24679371c9d4SSatish Balay x1 = xb[0]; 24689371c9d4SSatish Balay x2 = xb[1]; 24699371c9d4SSatish Balay x3 = xb[2]; 24709371c9d4SSatish Balay x4 = xb[3]; 24719371c9d4SSatish Balay x5 = xb[4]; 24729371c9d4SSatish Balay x6 = xb[5]; 24739371c9d4SSatish Balay x7 = xb[6]; 24742d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 24752d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + 
v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 24762d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 24772d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 24782d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 24792d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 24802d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 24812d61bbb3SSatish Balay v += 49; 24822d61bbb3SSatish Balay } 24839371c9d4SSatish Balay z[0] = sum1; 24849371c9d4SSatish Balay z[1] = sum2; 24859371c9d4SSatish Balay z[2] = sum3; 24869371c9d4SSatish Balay z[3] = sum4; 24879371c9d4SSatish Balay z[4] = sum5; 24889371c9d4SSatish Balay z[5] = sum6; 24899371c9d4SSatish Balay z[6] = sum7; 249026e093fcSHong Zhang if (!usecprow) { 24919371c9d4SSatish Balay z += 7; 24929371c9d4SSatish Balay y += 7; 24932d61bbb3SSatish Balay } 249426e093fcSHong Zhang } 24959566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 24969566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 24979566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz)); 24983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 24992d61bbb3SSatish Balay } 2500218c64b6SSatish Balay 25015f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2502d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) 2503d71ae5a4SJacob Faibussowitsch { 250496e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2505f4259b30SLisandro Dalcin PetscScalar *z = 
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
/*
  MatMultAdd_SeqBAIJ_9_AVX2 - Computes zz = yy + A * xx for a sequential BAIJ matrix with
  9x9 blocks using AVX2/FMA intrinsics.

  Input Parameters:
+ A  - the matrix
. xx - the vector multiplied by A
- yy - the vector added to the product

  Output Parameter:
. zz - the result

  Notes:
  yy is copied into zz first and the products are then accumulated into zz in place.

  For every block row the needed pieces of xx are gathered into a->mult_work (allocated here
  on first use). The nine entries of a block row of zz are held in three registers: z0 holds
  entries 0-3, z1 holds entries 4-7, and only lane 0 of z2 (entry 8) is meaningful.

  The loads at column offset +8 read four values, so lanes 1-3 of z2 pick up the first three
  entries of the following column; those lanes are never stored. The last column uses a masked
  load so that nothing beyond the end of the block is read.
*/
PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *z = NULL, *work, *workt, *zarray;
  const PetscScalar *x, *xb;
  const MatScalar   *v;
  PetscInt           mbs, i, j, n;
  PetscInt           k;
  PetscBool          usecprow = a->compressedrow.use;
  const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;

  __m256d a0, a1, a2, a3, a4, a5;
  __m256d w0, w1, w2, w3;
  __m256d z0, z1, z2;
  /* Select lane 0 only. The masked load/store intrinsics look at the most significant bit of
     each 64-bit element, so -1LL (all bits set) selects the lane; the former 1LL << 63 shifted
     into the sign bit of a signed type, which is undefined behaviour in C. */
  __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, -1LL);

  PetscFunctionBegin;
  PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    mbs = a->mbs;
    ii  = a->i;
    z   = zarray;
  }

  if (!a->mult_work) {
    k = PetscMax(A->rmap->n, A->cmap->n);
    PetscCall(PetscMalloc1(k + 1, &a->mult_work));
  }

  work = a->mult_work;
  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;

    /* gather the 9-entry pieces of xx addressed by this block row's column indices */
    workt = work;
    for (j = 0; j < n; j++) {
      xb = x + bs * (*idx++);
      for (k = 0; k < bs; k++) workt[k] = xb[k];
      workt += bs;
    }
    if (usecprow) z = zarray + bs * ridx[i];

    z0 = _mm256_loadu_pd(&z[0]);
    z1 = _mm256_loadu_pd(&z[4]);
    z2 = _mm256_set1_pd(z[8]);

    for (j = 0; j < n; j++) {
      /* first column of a */
      w0 = _mm256_set1_pd(work[j * 9]);
      a0 = _mm256_loadu_pd(&v[j * 81]);
      z0 = _mm256_fmadd_pd(a0, w0, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
      z1 = _mm256_fmadd_pd(a1, w0, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
      z2 = _mm256_fmadd_pd(a2, w0, z2);

      /* second column of a */
      w1 = _mm256_set1_pd(work[j * 9 + 1]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
      z0 = _mm256_fmadd_pd(a0, w1, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
      z1 = _mm256_fmadd_pd(a1, w1, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
      z2 = _mm256_fmadd_pd(a2, w1, z2);

      /* third column of a */
      w2 = _mm256_set1_pd(work[j * 9 + 2]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
      z0 = _mm256_fmadd_pd(a3, w2, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
      z1 = _mm256_fmadd_pd(a4, w2, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
      z2 = _mm256_fmadd_pd(a5, w2, z2);

      /* fourth column of a */
      w3 = _mm256_set1_pd(work[j * 9 + 3]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
      z0 = _mm256_fmadd_pd(a0, w3, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
      z1 = _mm256_fmadd_pd(a1, w3, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
      z2 = _mm256_fmadd_pd(a2, w3, z2);

      /* fifth column of a */
      w0 = _mm256_set1_pd(work[j * 9 + 4]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
      z0 = _mm256_fmadd_pd(a3, w0, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
      z1 = _mm256_fmadd_pd(a4, w0, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
      z2 = _mm256_fmadd_pd(a5, w0, z2);

      /* sixth column of a */
      w1 = _mm256_set1_pd(work[j * 9 + 5]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
      z0 = _mm256_fmadd_pd(a0, w1, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
      z1 = _mm256_fmadd_pd(a1, w1, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
      z2 = _mm256_fmadd_pd(a2, w1, z2);

      /* seventh column of a */
      w2 = _mm256_set1_pd(work[j * 9 + 6]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
      z0 = _mm256_fmadd_pd(a0, w2, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
      z1 = _mm256_fmadd_pd(a1, w2, z1);
      a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
      z2 = _mm256_fmadd_pd(a2, w2, z2);

      /* eighth column of a */
      w3 = _mm256_set1_pd(work[j * 9 + 7]);
      a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
      z0 = _mm256_fmadd_pd(a3, w3, z0);
      a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
      z1 = _mm256_fmadd_pd(a4, w3, z1);
      a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
      z2 = _mm256_fmadd_pd(a5, w3, z2);

      /* ninth column of a; the masked load keeps the read inside the block */
      w0 = _mm256_set1_pd(work[j * 9 + 8]);
      a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
      z0 = _mm256_fmadd_pd(a0, w0, z0);
      a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
      z1 = _mm256_fmadd_pd(a1, w0, z1);
      a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
      z2 = _mm256_fmadd_pd(a2, w0, z2);
    }

    _mm256_storeu_pd(&z[0], z0);
    _mm256_storeu_pd(&z[4], z1);
    _mm256_maskstore_pd(&z[8], mask1, z2); /* write entry 8 only */

    v += n * bs2;
    if (!usecprow) z += bs;
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &zarray));
  PetscCall(PetscLogFlops(162.0 * a->nz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
Vec zz) 2654d71ae5a4SJacob Faibussowitsch { 2655ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2656f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2657ebada01fSBarry Smith const PetscScalar *x, *xb; 2658ebada01fSBarry Smith PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2659ebada01fSBarry Smith const MatScalar *v; 2660ebada01fSBarry Smith PetscInt mbs = a->mbs, i, j, n; 2661ebada01fSBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2662ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 2663ebada01fSBarry Smith 2664ebada01fSBarry Smith PetscFunctionBegin; 26659566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 26669566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2667ebada01fSBarry Smith 2668ebada01fSBarry Smith idx = a->j; 2669ebada01fSBarry Smith v = a->a; 2670ebada01fSBarry Smith if (usecprow) { 267148a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2672ebada01fSBarry Smith mbs = a->compressedrow.nrows; 2673ebada01fSBarry Smith ii = a->compressedrow.i; 2674ebada01fSBarry Smith ridx = a->compressedrow.rindex; 2675ebada01fSBarry Smith } else { 2676ebada01fSBarry Smith ii = a->i; 2677ebada01fSBarry Smith y = yarray; 2678ebada01fSBarry Smith z = zarray; 2679ebada01fSBarry Smith } 2680ebada01fSBarry Smith 2681ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 26829371c9d4SSatish Balay n = ii[1] - ii[0]; 26839371c9d4SSatish Balay ii++; 2684ebada01fSBarry Smith if (usecprow) { 2685ebada01fSBarry Smith z = zarray + 11 * ridx[i]; 2686ebada01fSBarry Smith y = yarray + 11 * ridx[i]; 2687ebada01fSBarry Smith } 26889371c9d4SSatish Balay sum1 = y[0]; 26899371c9d4SSatish Balay sum2 = y[1]; 26909371c9d4SSatish Balay sum3 = y[2]; 26919371c9d4SSatish Balay sum4 = y[3]; 26929371c9d4SSatish Balay sum5 = y[4]; 26939371c9d4SSatish Balay sum6 = y[5]; 
26949371c9d4SSatish Balay sum7 = y[6]; 26959371c9d4SSatish Balay sum8 = y[7]; 26969371c9d4SSatish Balay sum9 = y[8]; 26979371c9d4SSatish Balay sum10 = y[9]; 26989371c9d4SSatish Balay sum11 = y[10]; 2699ebada01fSBarry Smith PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2700ebada01fSBarry Smith PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2701ebada01fSBarry Smith for (j = 0; j < n; j++) { 2702ebada01fSBarry Smith xb = x + 11 * (*idx++); 27039371c9d4SSatish Balay x1 = xb[0]; 27049371c9d4SSatish Balay x2 = xb[1]; 27059371c9d4SSatish Balay x3 = xb[2]; 27069371c9d4SSatish Balay x4 = xb[3]; 27079371c9d4SSatish Balay x5 = xb[4]; 27089371c9d4SSatish Balay x6 = xb[5]; 27099371c9d4SSatish Balay x7 = xb[6]; 27109371c9d4SSatish Balay x8 = xb[7]; 27119371c9d4SSatish Balay x9 = xb[8]; 27129371c9d4SSatish Balay x10 = xb[9]; 27139371c9d4SSatish Balay x11 = xb[10]; 2714ebada01fSBarry Smith sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2715ebada01fSBarry Smith sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2716ebada01fSBarry Smith sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2717ebada01fSBarry Smith sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2718ebada01fSBarry 
Smith sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2719ebada01fSBarry Smith sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2720ebada01fSBarry Smith sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2721ebada01fSBarry Smith sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2722ebada01fSBarry Smith sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2723ebada01fSBarry Smith sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2724ebada01fSBarry Smith sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2725ebada01fSBarry Smith v += 121; 2726ebada01fSBarry Smith } 27279371c9d4SSatish Balay z[0] = sum1; 27289371c9d4SSatish Balay z[1] = sum2; 27299371c9d4SSatish Balay z[2] = sum3; 27309371c9d4SSatish Balay 
z[3] = sum4; 27319371c9d4SSatish Balay z[4] = sum5; 27329371c9d4SSatish Balay z[5] = sum6; 27339371c9d4SSatish Balay z[6] = sum7; 27349371c9d4SSatish Balay z[7] = sum8; 27359371c9d4SSatish Balay z[8] = sum9; 27369371c9d4SSatish Balay z[9] = sum10; 27379371c9d4SSatish Balay z[10] = sum11; 2738ebada01fSBarry Smith if (!usecprow) { 27399371c9d4SSatish Balay z += 11; 27409371c9d4SSatish Balay y += 11; 2741ebada01fSBarry Smith } 2742ebada01fSBarry Smith } 27439566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 27449566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 27459566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz)); 27463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2747ebada01fSBarry Smith } 2748ebada01fSBarry Smith 2749d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 2750d71ae5a4SJacob Faibussowitsch { 27512d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2752f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 2753d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2754d9ca1df4SBarry Smith const MatScalar *v; 2755d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2756d9ca1df4SBarry Smith PetscInt ncols, k; 2757d9ca1df4SBarry Smith const PetscInt *ridx = NULL, *idx, *ii; 2758ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2759218c64b6SSatish Balay 27602d61bbb3SSatish Balay PetscFunctionBegin; 27619566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 27629566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 27639566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &zarray)); 27642d61bbb3SSatish Balay 27652d61bbb3SSatish Balay idx = a->j; 27662d61bbb3SSatish Balay v = a->a; 276726e093fcSHong Zhang if (usecprow) { 276826e093fcSHong Zhang mbs = a->compressedrow.nrows; 276926e093fcSHong Zhang ii = a->compressedrow.i; 27707b2bb3b9SHong Zhang ridx 
= a->compressedrow.rindex; 277126e093fcSHong Zhang } else { 277226e093fcSHong Zhang mbs = a->mbs; 27732d61bbb3SSatish Balay ii = a->i; 277426e093fcSHong Zhang z = zarray; 277526e093fcSHong Zhang } 27762d61bbb3SSatish Balay 27772d61bbb3SSatish Balay if (!a->mult_work) { 2778d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 27799566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 27802d61bbb3SSatish Balay } 27812d61bbb3SSatish Balay work = a->mult_work; 27822d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 27839371c9d4SSatish Balay n = ii[1] - ii[0]; 27849371c9d4SSatish Balay ii++; 27852d61bbb3SSatish Balay ncols = n * bs; 27862d61bbb3SSatish Balay workt = work; 27872d61bbb3SSatish Balay for (j = 0; j < n; j++) { 27882d61bbb3SSatish Balay xb = x + bs * (*idx++); 27892d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 27902d61bbb3SSatish Balay workt += bs; 27912d61bbb3SSatish Balay } 27927b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * ridx[i]; 279396b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 27942d61bbb3SSatish Balay v += n * bs2; 279526fbe8dcSKarl Rupp if (!usecprow) z += bs; 279626e093fcSHong Zhang } 27979566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 27989566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &zarray)); 27999566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 28003ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28012d61bbb3SSatish Balay } 28022d61bbb3SSatish Balay 2803d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2804d71ae5a4SJacob Faibussowitsch { 2805547795f9SHong Zhang PetscScalar zero = 0.0; 2806547795f9SHong Zhang 2807547795f9SHong Zhang PetscFunctionBegin; 28089566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 28099566063dSJacob Faibussowitsch PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 28103ba16761SJacob 
Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2811547795f9SHong Zhang } 2812547795f9SHong Zhang 2813d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2814d71ae5a4SJacob Faibussowitsch { 28153447b6efSHong Zhang PetscScalar zero = 0.0; 28162d61bbb3SSatish Balay 28172d61bbb3SSatish Balay PetscFunctionBegin; 28189566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 28199566063dSJacob Faibussowitsch PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 28203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28212d61bbb3SSatish Balay } 28222d61bbb3SSatish Balay 2823d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2824d71ae5a4SJacob Faibussowitsch { 2825547795f9SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2826b8c08b77SHong Zhang PetscScalar *z, x1, x2, x3, x4, x5; 2827d9ca1df4SBarry Smith const PetscScalar *x, *xb = NULL; 2828d9ca1df4SBarry Smith const MatScalar *v; 2829b8c08b77SHong Zhang PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2830d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 2831547795f9SHong Zhang Mat_CompressedRow cprow = a->compressedrow; 2832ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 2833547795f9SHong Zhang 2834547795f9SHong Zhang PetscFunctionBegin; 28359566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 28369566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 28379566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 2838547795f9SHong Zhang 2839547795f9SHong Zhang idx = a->j; 2840547795f9SHong Zhang v = a->a; 2841547795f9SHong Zhang if (usecprow) { 2842547795f9SHong Zhang mbs = cprow.nrows; 2843547795f9SHong Zhang ii = cprow.i; 2844547795f9SHong Zhang ridx = cprow.rindex; 2845547795f9SHong Zhang } else { 2846547795f9SHong Zhang mbs = a->mbs; 2847547795f9SHong Zhang ii = a->i; 2848547795f9SHong Zhang xb = x; 2849547795f9SHong Zhang } 2850547795f9SHong 
Zhang 2851547795f9SHong Zhang switch (bs) { 2852547795f9SHong Zhang case 1: 2853547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2854547795f9SHong Zhang if (usecprow) xb = x + ridx[i]; 2855547795f9SHong Zhang x1 = xb[0]; 2856547795f9SHong Zhang ib = idx + ii[0]; 28579371c9d4SSatish Balay n = ii[1] - ii[0]; 28589371c9d4SSatish Balay ii++; 2859547795f9SHong Zhang for (j = 0; j < n; j++) { 2860547795f9SHong Zhang rval = ib[j]; 2861547795f9SHong Zhang z[rval] += PetscConj(*v) * x1; 2862547795f9SHong Zhang v++; 2863547795f9SHong Zhang } 2864547795f9SHong Zhang if (!usecprow) xb++; 2865547795f9SHong Zhang } 2866547795f9SHong Zhang break; 2867547795f9SHong Zhang case 2: 2868547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2869547795f9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 28709371c9d4SSatish Balay x1 = xb[0]; 28719371c9d4SSatish Balay x2 = xb[1]; 2872547795f9SHong Zhang ib = idx + ii[0]; 28739371c9d4SSatish Balay n = ii[1] - ii[0]; 28749371c9d4SSatish Balay ii++; 2875547795f9SHong Zhang for (j = 0; j < n; j++) { 2876547795f9SHong Zhang rval = ib[j] * 2; 2877547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2878547795f9SHong Zhang z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2879547795f9SHong Zhang v += 4; 2880547795f9SHong Zhang } 2881547795f9SHong Zhang if (!usecprow) xb += 2; 2882547795f9SHong Zhang } 2883547795f9SHong Zhang break; 2884547795f9SHong Zhang case 3: 2885547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2886547795f9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 28879371c9d4SSatish Balay x1 = xb[0]; 28889371c9d4SSatish Balay x2 = xb[1]; 28899371c9d4SSatish Balay x3 = xb[2]; 2890547795f9SHong Zhang ib = idx + ii[0]; 28919371c9d4SSatish Balay n = ii[1] - ii[0]; 28929371c9d4SSatish Balay ii++; 2893547795f9SHong Zhang for (j = 0; j < n; j++) { 2894547795f9SHong Zhang rval = ib[j] * 3; 2895547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2896547795f9SHong Zhang 
z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2897547795f9SHong Zhang z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2898547795f9SHong Zhang v += 9; 2899547795f9SHong Zhang } 2900547795f9SHong Zhang if (!usecprow) xb += 3; 2901547795f9SHong Zhang } 2902547795f9SHong Zhang break; 2903547795f9SHong Zhang case 4: 2904547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2905547795f9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 29069371c9d4SSatish Balay x1 = xb[0]; 29079371c9d4SSatish Balay x2 = xb[1]; 29089371c9d4SSatish Balay x3 = xb[2]; 29099371c9d4SSatish Balay x4 = xb[3]; 2910547795f9SHong Zhang ib = idx + ii[0]; 29119371c9d4SSatish Balay n = ii[1] - ii[0]; 29129371c9d4SSatish Balay ii++; 2913547795f9SHong Zhang for (j = 0; j < n; j++) { 2914547795f9SHong Zhang rval = ib[j] * 4; 2915547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2916547795f9SHong Zhang z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2917547795f9SHong Zhang z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2918547795f9SHong Zhang z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2919547795f9SHong Zhang v += 16; 2920547795f9SHong Zhang } 2921547795f9SHong Zhang if (!usecprow) xb += 4; 2922547795f9SHong Zhang } 2923547795f9SHong Zhang break; 2924547795f9SHong Zhang case 5: 2925547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2926547795f9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 29279371c9d4SSatish Balay x1 = xb[0]; 29289371c9d4SSatish Balay x2 = xb[1]; 29299371c9d4SSatish Balay x3 = xb[2]; 29309371c9d4SSatish Balay x4 = xb[3]; 29319371c9d4SSatish Balay x5 = xb[4]; 2932547795f9SHong Zhang ib = idx + ii[0]; 29339371c9d4SSatish Balay n = ii[1] - ii[0]; 29349371c9d4SSatish Balay ii++; 
2935547795f9SHong Zhang for (j = 0; j < n; j++) { 2936547795f9SHong Zhang rval = ib[j] * 5; 2937547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2938547795f9SHong Zhang z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2939547795f9SHong Zhang z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2940547795f9SHong Zhang z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2941547795f9SHong Zhang z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2942547795f9SHong Zhang v += 25; 2943547795f9SHong Zhang } 2944547795f9SHong Zhang if (!usecprow) xb += 5; 2945547795f9SHong Zhang } 2946547795f9SHong Zhang break; 2947d71ae5a4SJacob Faibussowitsch default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2948d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2949968ae2c8SSatish Balay #if 0 2950968ae2c8SSatish Balay { 2951b8c08b77SHong Zhang PetscInt ncols,k,bs2=a->bs2; 2952b8c08b77SHong Zhang PetscScalar *work,*workt,zb; 2953d9ca1df4SBarry Smith const PetscScalar *xtmp; 2954547795f9SHong Zhang if (!a->mult_work) { 2955547795f9SHong Zhang k = PetscMax(A->rmap->n,A->cmap->n); 29569566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2957547795f9SHong Zhang } 2958547795f9SHong Zhang work = a->mult_work; 2959547795f9SHong Zhang xtmp = x; 2960547795f9SHong Zhang for (i=0; i<mbs; i++) { 2961547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 2962547795f9SHong Zhang ncols = n*bs; 29639566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work,ncols)); 296426fbe8dcSKarl Rupp if (usecprow) 
xtmp = x + bs*ridx[i]; 296596b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2966547795f9SHong Zhang v += n*bs2; 2967547795f9SHong Zhang if (!usecprow) xtmp += bs; 2968547795f9SHong Zhang workt = work; 2969547795f9SHong Zhang for (j=0; j<n; j++) { 2970547795f9SHong Zhang zb = z + bs*(*idx++); 2971547795f9SHong Zhang for (k=0; k<bs; k++) zb[k] += workt[k] ; 2972547795f9SHong Zhang workt += bs; 2973547795f9SHong Zhang } 2974547795f9SHong Zhang } 2975547795f9SHong Zhang } 2976968ae2c8SSatish Balay #endif 2977547795f9SHong Zhang } 29789566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 29799566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 29809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 29813ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2982547795f9SHong Zhang } 2983547795f9SHong Zhang 2984d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2985d71ae5a4SJacob Faibussowitsch { 29862d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2987d9ca1df4SBarry Smith PetscScalar *zb, *z, x1, x2, x3, x4, x5; 2988f4259b30SLisandro Dalcin const PetscScalar *x, *xb = NULL; 2989d9ca1df4SBarry Smith const MatScalar *v; 2990d9ca1df4SBarry Smith PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2991d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 29923447b6efSHong Zhang Mat_CompressedRow cprow = a->compressedrow; 2993ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 29942d61bbb3SSatish Balay 29952d61bbb3SSatish Balay PetscFunctionBegin; 29969566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 29979566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 29989566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 29992d61bbb3SSatish Balay 30002d61bbb3SSatish Balay idx = a->j; 30012d61bbb3SSatish Balay v = a->a; 30023447b6efSHong Zhang if 
(usecprow) { 30033447b6efSHong Zhang mbs = cprow.nrows; 30043447b6efSHong Zhang ii = cprow.i; 30057b2bb3b9SHong Zhang ridx = cprow.rindex; 30063447b6efSHong Zhang } else { 30073447b6efSHong Zhang mbs = a->mbs; 30082d61bbb3SSatish Balay ii = a->i; 3009f1af5d2fSBarry Smith xb = x; 30103447b6efSHong Zhang } 30112d61bbb3SSatish Balay 30122d61bbb3SSatish Balay switch (bs) { 30132d61bbb3SSatish Balay case 1: 30142d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30157b2bb3b9SHong Zhang if (usecprow) xb = x + ridx[i]; 3016f1af5d2fSBarry Smith x1 = xb[0]; 30173447b6efSHong Zhang ib = idx + ii[0]; 30189371c9d4SSatish Balay n = ii[1] - ii[0]; 30199371c9d4SSatish Balay ii++; 30202d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30212d61bbb3SSatish Balay rval = ib[j]; 3022f1af5d2fSBarry Smith z[rval] += *v * x1; 3023f1af5d2fSBarry Smith v++; 30242d61bbb3SSatish Balay } 30253447b6efSHong Zhang if (!usecprow) xb++; 30262d61bbb3SSatish Balay } 30272d61bbb3SSatish Balay break; 30282d61bbb3SSatish Balay case 2: 30292d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30307b2bb3b9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 30319371c9d4SSatish Balay x1 = xb[0]; 30329371c9d4SSatish Balay x2 = xb[1]; 30333447b6efSHong Zhang ib = idx + ii[0]; 30349371c9d4SSatish Balay n = ii[1] - ii[0]; 30359371c9d4SSatish Balay ii++; 30362d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30372d61bbb3SSatish Balay rval = ib[j] * 2; 30382d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2; 30392d61bbb3SSatish Balay z[rval++] += v[2] * x1 + v[3] * x2; 30402d61bbb3SSatish Balay v += 4; 30412d61bbb3SSatish Balay } 30423447b6efSHong Zhang if (!usecprow) xb += 2; 30432d61bbb3SSatish Balay } 30442d61bbb3SSatish Balay break; 30452d61bbb3SSatish Balay case 3: 30462d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30477b2bb3b9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 30489371c9d4SSatish Balay x1 = xb[0]; 30499371c9d4SSatish Balay x2 = xb[1]; 30509371c9d4SSatish Balay x3 = xb[2]; 30513447b6efSHong Zhang ib = 
idx + ii[0]; 30529371c9d4SSatish Balay n = ii[1] - ii[0]; 30539371c9d4SSatish Balay ii++; 30542d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30552d61bbb3SSatish Balay rval = ib[j] * 3; 30562d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 30572d61bbb3SSatish Balay z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 30582d61bbb3SSatish Balay z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 30592d61bbb3SSatish Balay v += 9; 30602d61bbb3SSatish Balay } 30613447b6efSHong Zhang if (!usecprow) xb += 3; 30622d61bbb3SSatish Balay } 30632d61bbb3SSatish Balay break; 30642d61bbb3SSatish Balay case 4: 30652d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30667b2bb3b9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 30679371c9d4SSatish Balay x1 = xb[0]; 30689371c9d4SSatish Balay x2 = xb[1]; 30699371c9d4SSatish Balay x3 = xb[2]; 30709371c9d4SSatish Balay x4 = xb[3]; 30713447b6efSHong Zhang ib = idx + ii[0]; 30729371c9d4SSatish Balay n = ii[1] - ii[0]; 30739371c9d4SSatish Balay ii++; 30742d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30752d61bbb3SSatish Balay rval = ib[j] * 4; 30762d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 30772d61bbb3SSatish Balay z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 30782d61bbb3SSatish Balay z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 30792d61bbb3SSatish Balay z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 30802d61bbb3SSatish Balay v += 16; 30812d61bbb3SSatish Balay } 30823447b6efSHong Zhang if (!usecprow) xb += 4; 30832d61bbb3SSatish Balay } 30842d61bbb3SSatish Balay break; 30852d61bbb3SSatish Balay case 5: 30862d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30877b2bb3b9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 30889371c9d4SSatish Balay x1 = xb[0]; 30899371c9d4SSatish Balay x2 = xb[1]; 30909371c9d4SSatish Balay x3 = xb[2]; 30919371c9d4SSatish Balay x4 = xb[3]; 30929371c9d4SSatish Balay x5 = xb[4]; 30933447b6efSHong Zhang ib = idx + ii[0]; 
30949371c9d4SSatish Balay n = ii[1] - ii[0]; 30959371c9d4SSatish Balay ii++; 30962d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30972d61bbb3SSatish Balay rval = ib[j] * 5; 30982d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 30992d61bbb3SSatish Balay z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 31002d61bbb3SSatish Balay z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 31012d61bbb3SSatish Balay z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 31022d61bbb3SSatish Balay z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 31032d61bbb3SSatish Balay v += 25; 31042d61bbb3SSatish Balay } 31053447b6efSHong Zhang if (!usecprow) xb += 5; 31062d61bbb3SSatish Balay } 31072d61bbb3SSatish Balay break; 3108f1af5d2fSBarry Smith default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3109690b6cddSBarry Smith PetscInt ncols, k; 3110d9ca1df4SBarry Smith PetscScalar *work, *workt; 3111d9ca1df4SBarry Smith const PetscScalar *xtmp; 31122d61bbb3SSatish Balay if (!a->mult_work) { 3113d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 31149566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 31152d61bbb3SSatish Balay } 31162d61bbb3SSatish Balay work = a->mult_work; 31173447b6efSHong Zhang xtmp = x; 31182d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 31199371c9d4SSatish Balay n = ii[1] - ii[0]; 31209371c9d4SSatish Balay ii++; 31212d61bbb3SSatish Balay ncols = n * bs; 31229566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work, ncols)); 312326fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs * ridx[i]; 312496b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 31252d61bbb3SSatish Balay v += n * bs2; 31263447b6efSHong Zhang if (!usecprow) xtmp += bs; 31272d61bbb3SSatish Balay workt = work; 31282d61bbb3SSatish Balay for (j = 0; j < n; j++) { 31292d61bbb3SSatish 
Balay zb = z + bs * (*idx++); 31302d61bbb3SSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 31312d61bbb3SSatish Balay workt += bs; 31322d61bbb3SSatish Balay } 31332d61bbb3SSatish Balay } 31342d61bbb3SSatish Balay } 31352d61bbb3SSatish Balay } 31369566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 31379566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 31389566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 31393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 31402d61bbb3SSatish Balay } 31412d61bbb3SSatish Balay 3142d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) 3143d71ae5a4SJacob Faibussowitsch { 31442d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3145690b6cddSBarry Smith PetscInt totalnz = a->bs2 * a->nz; 3146f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 3147c5df96a5SBarry Smith PetscBLASInt one = 1, tnz; 31482d61bbb3SSatish Balay 31492d61bbb3SSatish Balay PetscFunctionBegin; 31509566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3151792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 31529566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 31533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 31542d61bbb3SSatish Balay } 31552d61bbb3SSatish Balay 3156d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) 3157d71ae5a4SJacob Faibussowitsch { 31582d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 31593f1db9ecSBarry Smith MatScalar *v = a->a; 3160329f5518SBarry Smith PetscReal sum = 0.0; 3161d0f46423SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 31622d61bbb3SSatish Balay 31632d61bbb3SSatish Balay PetscFunctionBegin; 31642d61bbb3SSatish Balay if (type == NORM_FROBENIUS) { 3165570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16) 3166570b7f6dSBarry 
Smith PetscBLASInt one = 1, cnt = bs2 * nz; 3167792fecdfSBarry Smith PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3168570b7f6dSBarry Smith #else 31692d61bbb3SSatish Balay for (i = 0; i < bs2 * nz; i++) { 31709371c9d4SSatish Balay sum += PetscRealPart(PetscConj(*v) * (*v)); 31719371c9d4SSatish Balay v++; 31722d61bbb3SSatish Balay } 3173570b7f6dSBarry Smith #endif 31748f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum); 31759566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 31768a62d963SHong Zhang } else if (type == NORM_1) { /* maximum column sum */ 31778a62d963SHong Zhang PetscReal *tmp; 31788a62d963SHong Zhang PetscInt *bcol = a->j; 31799566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 31808a62d963SHong Zhang for (i = 0; i < nz; i++) { 31818a62d963SHong Zhang for (j = 0; j < bs; j++) { 31828a62d963SHong Zhang k1 = bs * (*bcol) + j; /* column index */ 31838a62d963SHong Zhang for (k = 0; k < bs; k++) { 31849371c9d4SSatish Balay tmp[k1] += PetscAbsScalar(*v); 31859371c9d4SSatish Balay v++; 31868a62d963SHong Zhang } 31878a62d963SHong Zhang } 31888a62d963SHong Zhang bcol++; 31898a62d963SHong Zhang } 31908a62d963SHong Zhang *norm = 0.0; 3191d0f46423SBarry Smith for (j = 0; j < A->cmap->n; j++) { 31928a62d963SHong Zhang if (tmp[j] > *norm) *norm = tmp[j]; 31938a62d963SHong Zhang } 31949566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp)); 31959566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3196596552b5SBarry Smith } else if (type == NORM_INFINITY) { /* maximum row sum */ 3197596552b5SBarry Smith *norm = 0.0; 3198596552b5SBarry Smith for (k = 0; k < bs; k++) { 319974f84c7bSSatish Balay for (j = 0; j < a->mbs; j++) { 3200596552b5SBarry Smith v = a->a + bs2 * a->i[j] + k; 3201596552b5SBarry Smith sum = 0.0; 3202596552b5SBarry Smith for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 32030e90e235SBarry Smith for (k1 = 0; k1 < bs; k1++) { 3204596552b5SBarry Smith sum += 
PetscAbsScalar(*v); 3205596552b5SBarry Smith v += bs; 32062d61bbb3SSatish Balay } 32070e90e235SBarry Smith } 3208596552b5SBarry Smith if (sum > *norm) *norm = sum; 3209596552b5SBarry Smith } 3210596552b5SBarry Smith } 32119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3212e7e72b3dSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 32133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 32142d61bbb3SSatish Balay } 32152d61bbb3SSatish Balay 3216d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) 3217d71ae5a4SJacob Faibussowitsch { 32182d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 32192d61bbb3SSatish Balay 32202d61bbb3SSatish Balay PetscFunctionBegin; 32212d61bbb3SSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3222d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3223273d9f13SBarry Smith *flg = PETSC_FALSE; 32243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 32252d61bbb3SSatish Balay } 32262d61bbb3SSatish Balay 32272d61bbb3SSatish Balay /* if the a->i are the same */ 32289566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 32293ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 32302d61bbb3SSatish Balay 32312d61bbb3SSatish Balay /* if a->j are the same */ 32329566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 32333ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 323426fbe8dcSKarl Rupp 32352d61bbb3SSatish Balay /* if a->a are the same */ 32369566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 32373ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
32382d61bbb3SSatish Balay } 32392d61bbb3SSatish Balay 3240d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) 3241d71ae5a4SJacob Faibussowitsch { 32422d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3243690b6cddSBarry Smith PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 324487828ca2SBarry Smith PetscScalar *x, zero = 0.0; 32453f1db9ecSBarry Smith MatScalar *aa, *aa_j; 32462d61bbb3SSatish Balay 32472d61bbb3SSatish Balay PetscFunctionBegin; 324828b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3249d0f46423SBarry Smith bs = A->rmap->bs; 32502d61bbb3SSatish Balay aa = a->a; 32512d61bbb3SSatish Balay ai = a->i; 32522d61bbb3SSatish Balay aj = a->j; 32532d61bbb3SSatish Balay ambs = a->mbs; 32542d61bbb3SSatish Balay bs2 = a->bs2; 32552d61bbb3SSatish Balay 32569566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 32579566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 32589566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 325908401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 32602d61bbb3SSatish Balay for (i = 0; i < ambs; i++) { 32612d61bbb3SSatish Balay for (j = ai[i]; j < ai[i + 1]; j++) { 32622d61bbb3SSatish Balay if (aj[j] == i) { 32632d61bbb3SSatish Balay row = i * bs; 32642d61bbb3SSatish Balay aa_j = aa + j * bs2; 32652d61bbb3SSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 32662d61bbb3SSatish Balay break; 32672d61bbb3SSatish Balay } 32682d61bbb3SSatish Balay } 32692d61bbb3SSatish Balay } 32709566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 32713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 32722d61bbb3SSatish Balay } 32732d61bbb3SSatish Balay 3274d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) 3275d71ae5a4SJacob Faibussowitsch { 
32762d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 327753ef36baSBarry Smith const PetscScalar *l, *r, *li, *ri; 327853ef36baSBarry Smith PetscScalar x; 32793f1db9ecSBarry Smith MatScalar *aa, *v; 328053ef36baSBarry Smith PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 328153ef36baSBarry Smith const PetscInt *ai, *aj; 32822d61bbb3SSatish Balay 32832d61bbb3SSatish Balay PetscFunctionBegin; 32842d61bbb3SSatish Balay ai = a->i; 32852d61bbb3SSatish Balay aj = a->j; 32862d61bbb3SSatish Balay aa = a->a; 3287d0f46423SBarry Smith m = A->rmap->n; 3288d0f46423SBarry Smith n = A->cmap->n; 3289d0f46423SBarry Smith bs = A->rmap->bs; 32902d61bbb3SSatish Balay mbs = a->mbs; 32912d61bbb3SSatish Balay bs2 = a->bs2; 32922d61bbb3SSatish Balay if (ll) { 32939566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 32949566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 329508401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 32962d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 32972d61bbb3SSatish Balay M = ai[i + 1] - ai[i]; 32982d61bbb3SSatish Balay li = l + i * bs; 32992d61bbb3SSatish Balay v = aa + bs2 * ai[i]; 33002d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 3301ad540459SPierre Jolivet for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 33022d61bbb3SSatish Balay } 33032d61bbb3SSatish Balay } 33049566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 33059566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33062d61bbb3SSatish Balay } 33072d61bbb3SSatish Balay 33082d61bbb3SSatish Balay if (rr) { 33099566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(rr, &r)); 33109566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(rr, &rn)); 331108401ef6SPierre Jolivet PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 33122d61bbb3SSatish Balay for (i = 0; i < mbs; 
i++) { /* for each block row */ 331353ef36baSBarry Smith iai = ai[i]; 331453ef36baSBarry Smith M = ai[i + 1] - iai; 331553ef36baSBarry Smith v = aa + bs2 * iai; 33162d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 331753ef36baSBarry Smith ri = r + bs * aj[iai + j]; 33182d61bbb3SSatish Balay for (k = 0; k < bs; k++) { 33192d61bbb3SSatish Balay x = ri[k]; 332053ef36baSBarry Smith for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 332153ef36baSBarry Smith v += bs; 33222d61bbb3SSatish Balay } 33232d61bbb3SSatish Balay } 33242d61bbb3SSatish Balay } 33259566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(rr, &r)); 33269566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33272d61bbb3SSatish Balay } 33283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33292d61bbb3SSatish Balay } 33302d61bbb3SSatish Balay 3331d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) 3332d71ae5a4SJacob Faibussowitsch { 33332d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33342d61bbb3SSatish Balay 33352d61bbb3SSatish Balay PetscFunctionBegin; 33362d61bbb3SSatish Balay info->block_size = a->bs2; 3337ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; 33382d61bbb3SSatish Balay info->nz_used = a->bs2 * a->nz; 33393966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 33402d61bbb3SSatish Balay info->assemblies = A->num_ass; 33418e58a170SBarry Smith info->mallocs = A->info.mallocs; 33424dfa11a4SJacob Faibussowitsch info->memory = 0; /* REVIEW ME */ 3343d5f3da31SBarry Smith if (A->factortype) { 33442d61bbb3SSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 33452d61bbb3SSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 33462d61bbb3SSatish Balay info->factor_mallocs = A->info.factor_mallocs; 33472d61bbb3SSatish Balay } else { 33482d61bbb3SSatish Balay info->fill_ratio_given = 0; 33492d61bbb3SSatish Balay info->fill_ratio_needed = 0; 
33502d61bbb3SSatish Balay info->factor_mallocs = 0; 33512d61bbb3SSatish Balay } 33523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33532d61bbb3SSatish Balay } 33542d61bbb3SSatish Balay 3355d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 3356d71ae5a4SJacob Faibussowitsch { 33572d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33582d61bbb3SSatish Balay 33592d61bbb3SSatish Balay PetscFunctionBegin; 33609566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 33613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33622d61bbb3SSatish Balay } 3363a001520aSPierre Jolivet 3364d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 3365d71ae5a4SJacob Faibussowitsch { 3366a001520aSPierre Jolivet PetscFunctionBegin; 33679566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 33684222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 33693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3370a001520aSPierre Jolivet } 3371a001520aSPierre Jolivet 3372d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3373d71ae5a4SJacob Faibussowitsch { 337474eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3375f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1; 3376bcf10a7aSPierre Jolivet const PetscScalar *xb; 337774eeabc5SPierre Jolivet PetscScalar x1; 337874eeabc5SPierre Jolivet const MatScalar *v, *vv; 337974eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 338074eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 338174eeabc5SPierre Jolivet 338274eeabc5SPierre Jolivet PetscFunctionBegin; 338374eeabc5SPierre Jolivet idx = a->j; 338474eeabc5SPierre Jolivet v = a->a; 338574eeabc5SPierre Jolivet if 
(usecprow) { 338674eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 338774eeabc5SPierre Jolivet ii = a->compressedrow.i; 338874eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 338974eeabc5SPierre Jolivet } else { 339074eeabc5SPierre Jolivet mbs = a->mbs; 339174eeabc5SPierre Jolivet ii = a->i; 339274eeabc5SPierre Jolivet z = c; 339374eeabc5SPierre Jolivet } 339474eeabc5SPierre Jolivet 339574eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 33969371c9d4SSatish Balay n = ii[1] - ii[0]; 33979371c9d4SSatish Balay ii++; 339874eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 339974eeabc5SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 340074eeabc5SPierre Jolivet if (usecprow) z = c + ridx[i]; 340174eeabc5SPierre Jolivet jj = idx; 340274eeabc5SPierre Jolivet vv = v; 340374eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 340474eeabc5SPierre Jolivet idx = jj; 340574eeabc5SPierre Jolivet v = vv; 340674eeabc5SPierre Jolivet sum1 = 0.0; 340774eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 34089371c9d4SSatish Balay xb = b + (*idx++); 34099371c9d4SSatish Balay x1 = xb[0 + k * bm]; 341074eeabc5SPierre Jolivet sum1 += v[0] * x1; 341174eeabc5SPierre Jolivet v += 1; 341274eeabc5SPierre Jolivet } 3413feb237baSPierre Jolivet z[0 + k * cm] = sum1; 341474eeabc5SPierre Jolivet } 341574eeabc5SPierre Jolivet if (!usecprow) z += 1; 341674eeabc5SPierre Jolivet } 34173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 341874eeabc5SPierre Jolivet } 341974eeabc5SPierre Jolivet 3420d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3421d71ae5a4SJacob Faibussowitsch { 34224b7054f4SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3423f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2; 3424bcf10a7aSPierre Jolivet 
const PetscScalar *xb; 34254b7054f4SPierre Jolivet PetscScalar x1, x2; 34264b7054f4SPierre Jolivet const MatScalar *v, *vv; 34274b7054f4SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 34284b7054f4SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 34294b7054f4SPierre Jolivet 34304b7054f4SPierre Jolivet PetscFunctionBegin; 34314b7054f4SPierre Jolivet idx = a->j; 34324b7054f4SPierre Jolivet v = a->a; 34334b7054f4SPierre Jolivet if (usecprow) { 34344b7054f4SPierre Jolivet mbs = a->compressedrow.nrows; 34354b7054f4SPierre Jolivet ii = a->compressedrow.i; 34364b7054f4SPierre Jolivet ridx = a->compressedrow.rindex; 34374b7054f4SPierre Jolivet } else { 34384b7054f4SPierre Jolivet mbs = a->mbs; 34394b7054f4SPierre Jolivet ii = a->i; 34404b7054f4SPierre Jolivet z = c; 34414b7054f4SPierre Jolivet } 34424b7054f4SPierre Jolivet 34434b7054f4SPierre Jolivet for (i = 0; i < mbs; i++) { 34449371c9d4SSatish Balay n = ii[1] - ii[0]; 34459371c9d4SSatish Balay ii++; 34464b7054f4SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 34474b7054f4SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 34484b7054f4SPierre Jolivet if (usecprow) z = c + 2 * ridx[i]; 34494b7054f4SPierre Jolivet jj = idx; 34504b7054f4SPierre Jolivet vv = v; 34514b7054f4SPierre Jolivet for (k = 0; k < cn; k++) { 34524b7054f4SPierre Jolivet idx = jj; 34534b7054f4SPierre Jolivet v = vv; 34549371c9d4SSatish Balay sum1 = 0.0; 34559371c9d4SSatish Balay sum2 = 0.0; 34564b7054f4SPierre Jolivet for (j = 0; j < n; j++) { 34579371c9d4SSatish Balay xb = b + 2 * (*idx++); 34589371c9d4SSatish Balay x1 = xb[0 + k * bm]; 34599371c9d4SSatish Balay x2 = xb[1 + k * bm]; 34604b7054f4SPierre Jolivet sum1 += v[0] * x1 + v[2] * x2; 34614b7054f4SPierre Jolivet sum2 += v[1] * x1 + v[3] * x2; 34624b7054f4SPierre Jolivet v += 4; 34634b7054f4SPierre Jolivet } 
34649371c9d4SSatish Balay z[0 + k * cm] = sum1; 34659371c9d4SSatish Balay z[1 + k * cm] = sum2; 34664b7054f4SPierre Jolivet } 34674b7054f4SPierre Jolivet if (!usecprow) z += 2; 34684b7054f4SPierre Jolivet } 34693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 34704b7054f4SPierre Jolivet } 34714b7054f4SPierre Jolivet 3472d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3473d71ae5a4SJacob Faibussowitsch { 347474eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3475f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3; 3476bcf10a7aSPierre Jolivet const PetscScalar *xb; 347774eeabc5SPierre Jolivet PetscScalar x1, x2, x3; 347874eeabc5SPierre Jolivet const MatScalar *v, *vv; 347974eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 348074eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 348174eeabc5SPierre Jolivet 348274eeabc5SPierre Jolivet PetscFunctionBegin; 348374eeabc5SPierre Jolivet idx = a->j; 348474eeabc5SPierre Jolivet v = a->a; 348574eeabc5SPierre Jolivet if (usecprow) { 348674eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 348774eeabc5SPierre Jolivet ii = a->compressedrow.i; 348874eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 348974eeabc5SPierre Jolivet } else { 349074eeabc5SPierre Jolivet mbs = a->mbs; 349174eeabc5SPierre Jolivet ii = a->i; 349274eeabc5SPierre Jolivet z = c; 349374eeabc5SPierre Jolivet } 349474eeabc5SPierre Jolivet 349574eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 34969371c9d4SSatish Balay n = ii[1] - ii[0]; 34979371c9d4SSatish Balay ii++; 349874eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 349974eeabc5SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 350074eeabc5SPierre Jolivet if 
(usecprow) z = c + 3 * ridx[i]; 350174eeabc5SPierre Jolivet jj = idx; 350274eeabc5SPierre Jolivet vv = v; 350374eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 350474eeabc5SPierre Jolivet idx = jj; 350574eeabc5SPierre Jolivet v = vv; 35069371c9d4SSatish Balay sum1 = 0.0; 35079371c9d4SSatish Balay sum2 = 0.0; 35089371c9d4SSatish Balay sum3 = 0.0; 350974eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 35109371c9d4SSatish Balay xb = b + 3 * (*idx++); 35119371c9d4SSatish Balay x1 = xb[0 + k * bm]; 35129371c9d4SSatish Balay x2 = xb[1 + k * bm]; 35139371c9d4SSatish Balay x3 = xb[2 + k * bm]; 351474eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 351574eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 351674eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 351774eeabc5SPierre Jolivet v += 9; 351874eeabc5SPierre Jolivet } 35199371c9d4SSatish Balay z[0 + k * cm] = sum1; 35209371c9d4SSatish Balay z[1 + k * cm] = sum2; 35219371c9d4SSatish Balay z[2 + k * cm] = sum3; 352274eeabc5SPierre Jolivet } 352374eeabc5SPierre Jolivet if (!usecprow) z += 3; 352474eeabc5SPierre Jolivet } 35253ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 352674eeabc5SPierre Jolivet } 352774eeabc5SPierre Jolivet 3528d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3529d71ae5a4SJacob Faibussowitsch { 353074eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3531f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3532bcf10a7aSPierre Jolivet const PetscScalar *xb; 353374eeabc5SPierre Jolivet PetscScalar x1, x2, x3, x4; 353474eeabc5SPierre Jolivet const MatScalar *v, *vv; 353574eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 353674eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 353774eeabc5SPierre Jolivet 353874eeabc5SPierre Jolivet PetscFunctionBegin; 
353974eeabc5SPierre Jolivet idx = a->j; 354074eeabc5SPierre Jolivet v = a->a; 354174eeabc5SPierre Jolivet if (usecprow) { 354274eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 354374eeabc5SPierre Jolivet ii = a->compressedrow.i; 354474eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 354574eeabc5SPierre Jolivet } else { 354674eeabc5SPierre Jolivet mbs = a->mbs; 354774eeabc5SPierre Jolivet ii = a->i; 354874eeabc5SPierre Jolivet z = c; 354974eeabc5SPierre Jolivet } 355074eeabc5SPierre Jolivet 355174eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 35529371c9d4SSatish Balay n = ii[1] - ii[0]; 35539371c9d4SSatish Balay ii++; 355474eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 355574eeabc5SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 355674eeabc5SPierre Jolivet if (usecprow) z = c + 4 * ridx[i]; 355774eeabc5SPierre Jolivet jj = idx; 355874eeabc5SPierre Jolivet vv = v; 355974eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 356074eeabc5SPierre Jolivet idx = jj; 356174eeabc5SPierre Jolivet v = vv; 35629371c9d4SSatish Balay sum1 = 0.0; 35639371c9d4SSatish Balay sum2 = 0.0; 35649371c9d4SSatish Balay sum3 = 0.0; 35659371c9d4SSatish Balay sum4 = 0.0; 356674eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 35679371c9d4SSatish Balay xb = b + 4 * (*idx++); 35689371c9d4SSatish Balay x1 = xb[0 + k * bm]; 35699371c9d4SSatish Balay x2 = xb[1 + k * bm]; 35709371c9d4SSatish Balay x3 = xb[2 + k * bm]; 35719371c9d4SSatish Balay x4 = xb[3 + k * bm]; 357274eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 357374eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 357474eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 357574eeabc5SPierre Jolivet sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 357674eeabc5SPierre Jolivet v += 16; 
357774eeabc5SPierre Jolivet } 35789371c9d4SSatish Balay z[0 + k * cm] = sum1; 35799371c9d4SSatish Balay z[1 + k * cm] = sum2; 35809371c9d4SSatish Balay z[2 + k * cm] = sum3; 35819371c9d4SSatish Balay z[3 + k * cm] = sum4; 358274eeabc5SPierre Jolivet } 358374eeabc5SPierre Jolivet if (!usecprow) z += 4; 358474eeabc5SPierre Jolivet } 35853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 358674eeabc5SPierre Jolivet } 358774eeabc5SPierre Jolivet 3588d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3589d71ae5a4SJacob Faibussowitsch { 359074eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3591f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3592bcf10a7aSPierre Jolivet const PetscScalar *xb; 359374eeabc5SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 359474eeabc5SPierre Jolivet const MatScalar *v, *vv; 359574eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 359674eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 359774eeabc5SPierre Jolivet 359874eeabc5SPierre Jolivet PetscFunctionBegin; 359974eeabc5SPierre Jolivet idx = a->j; 360074eeabc5SPierre Jolivet v = a->a; 360174eeabc5SPierre Jolivet if (usecprow) { 360274eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 360374eeabc5SPierre Jolivet ii = a->compressedrow.i; 360474eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 360574eeabc5SPierre Jolivet } else { 360674eeabc5SPierre Jolivet mbs = a->mbs; 360774eeabc5SPierre Jolivet ii = a->i; 360874eeabc5SPierre Jolivet z = c; 360974eeabc5SPierre Jolivet } 361074eeabc5SPierre Jolivet 361174eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 36129371c9d4SSatish Balay n = ii[1] - ii[0]; 36139371c9d4SSatish Balay ii++; 361474eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 
361574eeabc5SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 361674eeabc5SPierre Jolivet if (usecprow) z = c + 5 * ridx[i]; 361774eeabc5SPierre Jolivet jj = idx; 361874eeabc5SPierre Jolivet vv = v; 361974eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 362074eeabc5SPierre Jolivet idx = jj; 362174eeabc5SPierre Jolivet v = vv; 36229371c9d4SSatish Balay sum1 = 0.0; 36239371c9d4SSatish Balay sum2 = 0.0; 36249371c9d4SSatish Balay sum3 = 0.0; 36259371c9d4SSatish Balay sum4 = 0.0; 36269371c9d4SSatish Balay sum5 = 0.0; 362774eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 36289371c9d4SSatish Balay xb = b + 5 * (*idx++); 36299371c9d4SSatish Balay x1 = xb[0 + k * bm]; 36309371c9d4SSatish Balay x2 = xb[1 + k * bm]; 36319371c9d4SSatish Balay x3 = xb[2 + k * bm]; 36329371c9d4SSatish Balay x4 = xb[3 + k * bm]; 36339371c9d4SSatish Balay x5 = xb[4 + k * bm]; 363474eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 363574eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 363674eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 363774eeabc5SPierre Jolivet sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 363874eeabc5SPierre Jolivet sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 363974eeabc5SPierre Jolivet v += 25; 364074eeabc5SPierre Jolivet } 36419371c9d4SSatish Balay z[0 + k * cm] = sum1; 36429371c9d4SSatish Balay z[1 + k * cm] = sum2; 36439371c9d4SSatish Balay z[2 + k * cm] = sum3; 36449371c9d4SSatish Balay z[3 + k * cm] = sum4; 36459371c9d4SSatish Balay z[4 + k * cm] = sum5; 364674eeabc5SPierre Jolivet } 364774eeabc5SPierre Jolivet if (!usecprow) z += 5; 364874eeabc5SPierre Jolivet } 36493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 365074eeabc5SPierre Jolivet } 365174eeabc5SPierre Jolivet 3652d71ae5a4SJacob Faibussowitsch 
/*
  MatMatMultNumeric_SeqBAIJ_SeqDense - Numeric phase of C = A * B for a SeqBAIJ A and SeqDense B.

  Input Parameters:
+ A - the SeqBAIJ matrix
- B - the SeqDense matrix

  Output Parameter:
. C - the SeqDense product

  Notes:
  Returns immediately when C has a zero leading dimension or B has no columns.
  Block sizes 1 through 5 are dispatched to the unrolled MatMatMult_SeqBAIJ_<bs>_Private()
  kernels; larger blocks issue one GEMM per stored block.
*/
PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
{
  Mat_SeqBAIJ    *baij      = (Mat_SeqBAIJ *)A->data;
  Mat_SeqDense   *bdense    = (Mat_SeqDense *)B->data;
  Mat_SeqDense   *cdense    = (Mat_SeqDense *)C->data;
  const PetscInt  bs        = A->rmap->bs;
  const PetscInt  bs2       = baij->bs2;
  const PetscInt  ldb       = bdense->lda;
  const PetscInt  ldc       = cdense->lda;
  const PetscInt  ncols     = B->cmap->n;
  const PetscBool compressed = baij->compressedrow.use;
  PetscScalar    *barray, *carray;

  PetscFunctionBegin;
  if (!ldc || !ncols) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
  PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
  PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
  barray = bdense->v;
  /* rows that store no blocks are never written by the general path, so clear C first */
  if (baij->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
  PetscCall(MatDenseGetArray(C, &carray));
  if (bs == 1) {
    PetscCall(MatMatMult_SeqBAIJ_1_Private(A, barray, ldb, carray, ldc, ncols));
  } else if (bs == 2) {
    PetscCall(MatMatMult_SeqBAIJ_2_Private(A, barray, ldb, carray, ldc, ncols));
  } else if (bs == 3) {
    PetscCall(MatMatMult_SeqBAIJ_3_Private(A, barray, ldb, carray, ldc, ncols));
  } else if (bs == 4) {
    PetscCall(MatMatMult_SeqBAIJ_4_Private(A, barray, ldb, carray, ldc, ncols));
  } else if (bs == 5) {
    PetscCall(MatMatMult_SeqBAIJ_5_Private(A, barray, ldb, carray, ldc, ncols));
  } else { /* block sizes larger than 5 by 5 are handled by BLAS */
    PetscBLASInt     blas_bs, blas_cn, blas_ldb, blas_ldc;
    PetscScalar      beta_zero = 0.0, alpha_one = 1.0;
    const MatScalar *blk    = baij->a;
    const PetscInt  *col    = baij->j;
    const PetscInt  *rowptr = compressed ? baij->compressedrow.i : baij->i;
    const PetscInt  *rowidx = compressed ? baij->compressedrow.rindex : NULL;
    const PetscInt   nbrows = compressed ? baij->compressedrow.nrows : baij->mbs;
    PetscInt         row, p;

    PetscCall(PetscBLASIntCast(bs, &blas_bs));
    PetscCall(PetscBLASIntCast(ncols, &blas_cn));
    PetscCall(PetscBLASIntCast(ldb, &blas_ldb));
    PetscCall(PetscBLASIntCast(ldc, &blas_ldc));
    for (row = 0; row < nbrows; row++) {
      const PetscInt nblk = rowptr[row + 1] - rowptr[row];
      PetscScalar   *crow = carray + bs * (compressed ? rowidx[row] : row);

      for (p = 0; p < nblk; p++) {
        /* the first block of a row overwrites (beta = 0); later blocks accumulate (beta = 1) */
        PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &blas_bs, &blas_cn, &blas_bs, &alpha_one, blk, &blas_bs, barray + bs * (*col++), &blas_ldb, p ? &alpha_one : &beta_zero, crow, &blas_ldc));
        blk += bs2;
      }
    }
  }
  PetscCall(MatDenseRestoreArray(C, &carray));
  PetscCall(PetscLogFlops((2.0 * baij->nz * bs2 - bs * baij->nonzerorowcnt) * ncols));
  PetscFunctionReturn(PETSC_SUCCESS);
}