1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <petscblaslapack.h> 6cac129eeSSatish Balay 75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 896e086a2SDaniel Kokron #include <immintrin.h> 9fb56d528SJed Brown #elif defined(PETSC_HAVE_XMMINTRIN_H) 10fb56d528SJed Brown #include <xmmintrin.h> 1196e086a2SDaniel Kokron #endif 1296e086a2SDaniel Kokron 13d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 14d71ae5a4SJacob Faibussowitsch { 15a3192f15SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 165d0c19d7SBarry Smith PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 175d0c19d7SBarry Smith const PetscInt *idx; 187bede89fSBarry Smith PetscInt start, end, *ai, *aj, bs; 19f1af5d2fSBarry Smith PetscBT table; 20a3192f15SSatish Balay 213a40ed3dSBarry Smith PetscFunctionBegin; 22a3192f15SSatish Balay m = a->mbs; 23a3192f15SSatish Balay ai = a->i; 24a3192f15SSatish Balay aj = a->j; 25d0f46423SBarry Smith bs = A->rmap->bs; 26a3192f15SSatish Balay 2708401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 28a3192f15SSatish Balay 299566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(m, &table)); 309566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &nidx)); 31a3192f15SSatish Balay 32a3192f15SSatish Balay for (i = 0; i < is_max; i++) { 33a3192f15SSatish Balay /* Initialise the two local arrays */ 34a3192f15SSatish Balay isz = 0; 359566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(m, table)); 36a3192f15SSatish Balay 37a3192f15SSatish Balay /* Extract the indices, 
assume there can be duplicate entries */ 389566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 399566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 40a3192f15SSatish Balay 41a3192f15SSatish Balay /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 42a3192f15SSatish Balay for (j = 0; j < n; ++j) { 43218c64b6SSatish Balay ival = idx[j] / bs; /* convert the indices into block indices */ 4408401ef6SPierre Jolivet PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 4526fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 46a3192f15SSatish Balay } 479566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 489566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 49a3192f15SSatish Balay 50a3192f15SSatish Balay k = 0; 51a3192f15SSatish Balay for (j = 0; j < ov; j++) { /* for each overlap*/ 52a3192f15SSatish Balay n = isz; 53a3192f15SSatish Balay for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 54a3192f15SSatish Balay row = nidx[k]; 55a3192f15SSatish Balay start = ai[row]; 56a3192f15SSatish Balay end = ai[row + 1]; 57a3192f15SSatish Balay for (l = start; l < end; l++) { 58a3192f15SSatish Balay val = aj[l]; 5926fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 60a3192f15SSatish Balay } 61a3192f15SSatish Balay } 62a3192f15SSatish Balay } 637bede89fSBarry Smith PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 64a3192f15SSatish Balay } 659566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table)); 669566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 68a3192f15SSatish Balay } 691c351548SSatish Balay 7066976f2fSJacob Faibussowitsch static PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 71d71ae5a4SJacob Faibussowitsch 
{ 72736121d4SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 73690b6cddSBarry Smith PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 74690b6cddSBarry Smith PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 755d0c19d7SBarry Smith const PetscInt *irow, *icol; 765d0c19d7SBarry Smith PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 77690b6cddSBarry Smith PetscInt *aj = a->j, *ai = a->i; 783f1db9ecSBarry Smith MatScalar *mat_a; 79736121d4SSatish Balay Mat C; 806041f1b1SToby Isaac PetscBool flag; 81736121d4SSatish Balay 823a40ed3dSBarry Smith PetscFunctionBegin; 839566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 849566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 859566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 869566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 87736121d4SSatish Balay 889566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 89736121d4SSatish Balay ssmap = smap; 909566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 91736121d4SSatish Balay for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 92736121d4SSatish Balay /* determine lens of each row */ 93736121d4SSatish Balay for (i = 0; i < nrows; i++) { 94736121d4SSatish Balay kstart = ai[irow[i]]; 95736121d4SSatish Balay kend = kstart + a->ilen[irow[i]]; 96736121d4SSatish Balay lens[i] = 0; 97736121d4SSatish Balay for (k = kstart; k < kend; k++) { 9826fbe8dcSKarl Rupp if (ssmap[aj[k]]) lens[i]++; 99736121d4SSatish Balay } 100736121d4SSatish Balay } 101736121d4SSatish Balay /* Create and fill new matrix */ 102736121d4SSatish Balay if (scall == MAT_REUSE_MATRIX) { 103736121d4SSatish Balay c = (Mat_SeqBAIJ *)((*B)->data); 104736121d4SSatish Balay 105aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1069566063dSJacob Faibussowitsch 
PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 107*fdfbdca6SPierre Jolivet PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 1089566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 109736121d4SSatish Balay C = *B; 1103a40ed3dSBarry Smith } else { 1119566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1129566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 1139566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1149566063dSJacob Faibussowitsch PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 115736121d4SSatish Balay } 116736121d4SSatish Balay c = (Mat_SeqBAIJ *)(C->data); 117736121d4SSatish Balay for (i = 0; i < nrows; i++) { 118736121d4SSatish Balay row = irow[i]; 119736121d4SSatish Balay kstart = ai[row]; 120736121d4SSatish Balay kend = kstart + a->ilen[row]; 121736121d4SSatish Balay mat_i = c->i[i]; 122d29f2997SMatthew Woehlke mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 123d29f2997SMatthew Woehlke mat_a = c->a ? 
c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 124736121d4SSatish Balay mat_ilen = c->ilen + i; 125736121d4SSatish Balay for (k = kstart; k < kend; k++) { 126736121d4SSatish Balay if ((tcol = ssmap[a->j[k]])) { 127736121d4SSatish Balay *mat_j++ = tcol - 1; 1289566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 129549d3d68SSatish Balay mat_a += bs2; 130736121d4SSatish Balay (*mat_ilen)++; 131736121d4SSatish Balay } 132736121d4SSatish Balay } 133736121d4SSatish Balay } 134cdc6f3adSToby Isaac /* sort */ 135d29f2997SMatthew Woehlke if (c->j && c->a) { 136cdc6f3adSToby Isaac MatScalar *work; 1379566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 138cdc6f3adSToby Isaac for (i = 0; i < nrows; i++) { 139cdc6f3adSToby Isaac PetscInt ilen; 140cdc6f3adSToby Isaac mat_i = c->i[i]; 141cdc6f3adSToby Isaac mat_j = c->j + mat_i; 142cdc6f3adSToby Isaac mat_a = c->a + mat_i * bs2; 143cdc6f3adSToby Isaac ilen = c->ilen[i]; 1449566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 145cdc6f3adSToby Isaac } 1469566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 147cdc6f3adSToby Isaac } 148218c64b6SSatish Balay 149736121d4SSatish Balay /* Free work space */ 1509566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1519566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1529566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1539566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1549566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 155736121d4SSatish Balay 1569566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 157736121d4SSatish Balay *B = C; 1583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 159736121d4SSatish Balay } 160736121d4SSatish Balay 161d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS 
isrow, IS iscol, MatReuse scall, Mat *B) 162d71ae5a4SJacob Faibussowitsch { 163218c64b6SSatish Balay IS is1, is2; 164218c64b6SSatish Balay 1653a40ed3dSBarry Smith PetscFunctionBegin; 166f9a48b90SPierre Jolivet PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 167f9a48b90SPierre Jolivet if (isrow == iscol) { 168f9a48b90SPierre Jolivet is2 = is1; 169f9a48b90SPierre Jolivet PetscCall(PetscObjectReference((PetscObject)is2)); 170f9a48b90SPierre Jolivet } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 1719566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 1729566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 1739566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 1743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 175218c64b6SSatish Balay } 176218c64b6SSatish Balay 177d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 178d71ae5a4SJacob Faibussowitsch { 17916b64355SHong Zhang Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 1805c39f6d9SHong Zhang Mat_SubSppt *submatj = c->submatis1; 18116b64355SHong Zhang 18216b64355SHong Zhang PetscFunctionBegin; 1839566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 1849566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 1853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18616b64355SHong Zhang } 18716b64355SHong Zhang 18889a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 189d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) 190d71ae5a4SJacob Faibussowitsch { 19186e85357SHong Zhang PetscInt i; 19286e85357SHong Zhang Mat C; 19386e85357SHong Zhang Mat_SeqBAIJ *c; 19486e85357SHong Zhang Mat_SubSppt *submatj; 19586e85357SHong Zhang 19686e85357SHong Zhang PetscFunctionBegin; 19786e85357SHong Zhang for (i = 0; i < 
n; i++) { 19886e85357SHong Zhang C = (*mat)[i]; 19986e85357SHong Zhang c = (Mat_SeqBAIJ *)C->data; 20086e85357SHong Zhang submatj = c->submatis1; 20186e85357SHong Zhang if (submatj) { 2027daefbafSJunchao Zhang if (--((PetscObject)C)->refct <= 0) { 20326cc229bSBarry Smith PetscCall(PetscFree(C->factorprefix)); 2049566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 2059566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 2069566063dSJacob Faibussowitsch PetscCall(PetscFree(C->defaultvectype)); 2073faff063SStefano Zampini PetscCall(PetscFree(C->defaultrandtype)); 2089566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->rmap)); 2099566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->cmap)); 2109566063dSJacob Faibussowitsch PetscCall(PetscHeaderDestroy(&C)); 2117daefbafSJunchao Zhang } 21286e85357SHong Zhang } else { 2139566063dSJacob Faibussowitsch PetscCall(MatDestroy(&C)); 21486e85357SHong Zhang } 21586e85357SHong Zhang } 2167daefbafSJunchao Zhang 2177daefbafSJunchao Zhang /* Destroy Dummy submatrices created for reuse */ 2189566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 2197daefbafSJunchao Zhang 2209566063dSJacob Faibussowitsch PetscCall(PetscFree(*mat)); 2213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 22286e85357SHong Zhang } 22386e85357SHong Zhang 224d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 225d71ae5a4SJacob Faibussowitsch { 226690b6cddSBarry Smith PetscInt i; 227736121d4SSatish Balay 2283a40ed3dSBarry Smith PetscFunctionBegin; 22948a46eb9SPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 230736121d4SSatish Balay 23148a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 2323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
233736121d4SSatish Balay } 234218c64b6SSatish Balay 2352d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */ 236d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) 237d71ae5a4SJacob Faibussowitsch { 2382d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 239d9fead3dSBarry Smith PetscScalar *z, sum; 240d9fead3dSBarry Smith const PetscScalar *x; 241d9fead3dSBarry Smith const MatScalar *v; 2427c565772SBarry Smith PetscInt mbs, i, n; 2430298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 244ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2452d61bbb3SSatish Balay 2462d61bbb3SSatish Balay PetscFunctionBegin; 2479566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2489566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &z)); 2492d61bbb3SSatish Balay 25026e093fcSHong Zhang if (usecprow) { 25126e093fcSHong Zhang mbs = a->compressedrow.nrows; 25226e093fcSHong Zhang ii = a->compressedrow.i; 2537b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 2549566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(z, a->mbs)); 25526e093fcSHong Zhang } else { 25626e093fcSHong Zhang mbs = a->mbs; 2572d61bbb3SSatish Balay ii = a->i; 25826e093fcSHong Zhang } 2592d61bbb3SSatish Balay 2602d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 261ee54c7eeSHong Zhang n = ii[1] - ii[0]; 262ee54c7eeSHong Zhang v = a->a + ii[0]; 263ee54c7eeSHong Zhang idx = a->j + ii[0]; 264ee54c7eeSHong Zhang ii++; 265444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 266444d8c10SJed Brown PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2672d61bbb3SSatish Balay sum = 0.0; 2682162cab8SBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 26926e093fcSHong Zhang if (usecprow) { 2707b2bb3b9SHong Zhang z[ridx[i]] = sum; 27126e093fcSHong Zhang } else { 2722d61bbb3SSatish 
Balay z[i] = sum; 2732d61bbb3SSatish Balay } 27426e093fcSHong Zhang } 2759566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 2769566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &z)); 2779566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 2783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2792d61bbb3SSatish Balay } 2802d61bbb3SSatish Balay 281d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) 282d71ae5a4SJacob Faibussowitsch { 2832d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 284f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, *zarray; 285d9fead3dSBarry Smith const PetscScalar *x, *xb; 28687828ca2SBarry Smith PetscScalar x1, x2; 287d9fead3dSBarry Smith const MatScalar *v; 2887c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 289ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2902d61bbb3SSatish Balay 2912d61bbb3SSatish Balay PetscFunctionBegin; 2929566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2939566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 2942d61bbb3SSatish Balay 2952d61bbb3SSatish Balay idx = a->j; 2962d61bbb3SSatish Balay v = a->a; 29726e093fcSHong Zhang if (usecprow) { 29826e093fcSHong Zhang mbs = a->compressedrow.nrows; 29926e093fcSHong Zhang ii = a->compressedrow.i; 3007b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3019566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 30226e093fcSHong Zhang } else { 30326e093fcSHong Zhang mbs = a->mbs; 3042d61bbb3SSatish Balay ii = a->i; 30526e093fcSHong Zhang z = zarray; 30626e093fcSHong Zhang } 3072d61bbb3SSatish Balay 3082d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 3099371c9d4SSatish Balay n = ii[1] - ii[0]; 3109371c9d4SSatish Balay ii++; 3119371c9d4SSatish Balay sum1 = 0.0; 3129371c9d4SSatish Balay sum2 = 0.0; 313444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 
0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 314444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3152d61bbb3SSatish Balay for (j = 0; j < n; j++) { 3169371c9d4SSatish Balay xb = x + 2 * (*idx++); 3179371c9d4SSatish Balay x1 = xb[0]; 3189371c9d4SSatish Balay x2 = xb[1]; 3192d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 3202d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 3212d61bbb3SSatish Balay v += 4; 3222d61bbb3SSatish Balay } 3237b2bb3b9SHong Zhang if (usecprow) z = zarray + 2 * ridx[i]; 3249371c9d4SSatish Balay z[0] = sum1; 3259371c9d4SSatish Balay z[1] = sum2; 32626e093fcSHong Zhang if (!usecprow) z += 2; 3272d61bbb3SSatish Balay } 3289566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3299566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 3309566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 3313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3322d61bbb3SSatish Balay } 3332d61bbb3SSatish Balay 334d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) 335d71ae5a4SJacob Faibussowitsch { 3362d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 337f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 338d9fead3dSBarry Smith const PetscScalar *x, *xb; 339d9fead3dSBarry Smith const MatScalar *v; 3407c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 341ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 34226e093fcSHong Zhang 343b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 344fee21e36SBarry Smith #pragma disjoint(*v, *z, *xb) 345fee21e36SBarry Smith #endif 346fee21e36SBarry Smith 3472d61bbb3SSatish Balay PetscFunctionBegin; 3489566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3499566063dSJacob Faibussowitsch 
PetscCall(VecGetArrayWrite(zz, &zarray)); 3502d61bbb3SSatish Balay 3512d61bbb3SSatish Balay idx = a->j; 3522d61bbb3SSatish Balay v = a->a; 35326e093fcSHong Zhang if (usecprow) { 35426e093fcSHong Zhang mbs = a->compressedrow.nrows; 35526e093fcSHong Zhang ii = a->compressedrow.i; 3567b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3579566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 35826e093fcSHong Zhang } else { 35926e093fcSHong Zhang mbs = a->mbs; 3602d61bbb3SSatish Balay ii = a->i; 36126e093fcSHong Zhang z = zarray; 36226e093fcSHong Zhang } 3632d61bbb3SSatish Balay 3642d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 3659371c9d4SSatish Balay n = ii[1] - ii[0]; 3669371c9d4SSatish Balay ii++; 3679371c9d4SSatish Balay sum1 = 0.0; 3689371c9d4SSatish Balay sum2 = 0.0; 3699371c9d4SSatish Balay sum3 = 0.0; 370444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 371444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3722d61bbb3SSatish Balay for (j = 0; j < n; j++) { 37326fbe8dcSKarl Rupp xb = x + 3 * (*idx++); 37426fbe8dcSKarl Rupp x1 = xb[0]; 37526fbe8dcSKarl Rupp x2 = xb[1]; 37626fbe8dcSKarl Rupp x3 = xb[2]; 37726fbe8dcSKarl Rupp 3782d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 3792d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 3802d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 3812d61bbb3SSatish Balay v += 9; 3822d61bbb3SSatish Balay } 3837b2bb3b9SHong Zhang if (usecprow) z = zarray + 3 * ridx[i]; 3849371c9d4SSatish Balay z[0] = sum1; 3859371c9d4SSatish Balay z[1] = sum2; 3869371c9d4SSatish Balay z[2] = sum3; 38726e093fcSHong Zhang if (!usecprow) z += 3; 3882d61bbb3SSatish Balay } 3899566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3909566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 
3919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 3923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3932d61bbb3SSatish Balay } 3942d61bbb3SSatish Balay 395d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) 396d71ae5a4SJacob Faibussowitsch { 3972d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 398f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 399d9fead3dSBarry Smith const PetscScalar *x, *xb; 400d9fead3dSBarry Smith const MatScalar *v; 4017c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 402ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4032d61bbb3SSatish Balay 4042d61bbb3SSatish Balay PetscFunctionBegin; 4059566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4069566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 4072d61bbb3SSatish Balay 4082d61bbb3SSatish Balay idx = a->j; 4092d61bbb3SSatish Balay v = a->a; 41026e093fcSHong Zhang if (usecprow) { 41126e093fcSHong Zhang mbs = a->compressedrow.nrows; 41226e093fcSHong Zhang ii = a->compressedrow.i; 4137b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 4149566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 41526e093fcSHong Zhang } else { 41626e093fcSHong Zhang mbs = a->mbs; 4172d61bbb3SSatish Balay ii = a->i; 41826e093fcSHong Zhang z = zarray; 41926e093fcSHong Zhang } 4202d61bbb3SSatish Balay 4212d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 42226fbe8dcSKarl Rupp n = ii[1] - ii[0]; 42326fbe8dcSKarl Rupp ii++; 42426fbe8dcSKarl Rupp sum1 = 0.0; 42526fbe8dcSKarl Rupp sum2 = 0.0; 42626fbe8dcSKarl Rupp sum3 = 0.0; 42726fbe8dcSKarl Rupp sum4 = 0.0; 42826fbe8dcSKarl Rupp 429444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 430444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 
* n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4312d61bbb3SSatish Balay for (j = 0; j < n; j++) { 4322d61bbb3SSatish Balay xb = x + 4 * (*idx++); 4339371c9d4SSatish Balay x1 = xb[0]; 4349371c9d4SSatish Balay x2 = xb[1]; 4359371c9d4SSatish Balay x3 = xb[2]; 4369371c9d4SSatish Balay x4 = xb[3]; 4372d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 4382d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 4392d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 4402d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 4412d61bbb3SSatish Balay v += 16; 4422d61bbb3SSatish Balay } 4437b2bb3b9SHong Zhang if (usecprow) z = zarray + 4 * ridx[i]; 4449371c9d4SSatish Balay z[0] = sum1; 4459371c9d4SSatish Balay z[1] = sum2; 4469371c9d4SSatish Balay z[2] = sum3; 4479371c9d4SSatish Balay z[3] = sum4; 44826e093fcSHong Zhang if (!usecprow) z += 4; 4492d61bbb3SSatish Balay } 4509566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4519566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 4529566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 4533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4542d61bbb3SSatish Balay } 4552d61bbb3SSatish Balay 456d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) 457d71ae5a4SJacob Faibussowitsch { 4582d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 459f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 460d9fead3dSBarry Smith const PetscScalar *xb, *x; 461d9fead3dSBarry Smith const MatScalar *v; 4620298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 4637c565772SBarry Smith PetscInt mbs, i, j, n; 464ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4652d61bbb3SSatish Balay 466433994e6SBarry Smith PetscFunctionBegin; 
4679566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4689566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 4692d61bbb3SSatish Balay 4702d61bbb3SSatish Balay idx = a->j; 4712d61bbb3SSatish Balay v = a->a; 47226e093fcSHong Zhang if (usecprow) { 47326e093fcSHong Zhang mbs = a->compressedrow.nrows; 47426e093fcSHong Zhang ii = a->compressedrow.i; 4757b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 4769566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 47726e093fcSHong Zhang } else { 47826e093fcSHong Zhang mbs = a->mbs; 4792d61bbb3SSatish Balay ii = a->i; 48026e093fcSHong Zhang z = zarray; 48126e093fcSHong Zhang } 4822d61bbb3SSatish Balay 4832d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 4849371c9d4SSatish Balay n = ii[1] - ii[0]; 4859371c9d4SSatish Balay ii++; 4869371c9d4SSatish Balay sum1 = 0.0; 4879371c9d4SSatish Balay sum2 = 0.0; 4889371c9d4SSatish Balay sum3 = 0.0; 4899371c9d4SSatish Balay sum4 = 0.0; 4909371c9d4SSatish Balay sum5 = 0.0; 491444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 492444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4932d61bbb3SSatish Balay for (j = 0; j < n; j++) { 4942d61bbb3SSatish Balay xb = x + 5 * (*idx++); 4959371c9d4SSatish Balay x1 = xb[0]; 4969371c9d4SSatish Balay x2 = xb[1]; 4979371c9d4SSatish Balay x3 = xb[2]; 4989371c9d4SSatish Balay x4 = xb[3]; 4999371c9d4SSatish Balay x5 = xb[4]; 5002d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 5012d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 5022d61bbb3SSatish Balay sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 5032d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 5042d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] 
* x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 5052d61bbb3SSatish Balay v += 25; 5062d61bbb3SSatish Balay } 5077b2bb3b9SHong Zhang if (usecprow) z = zarray + 5 * ridx[i]; 5089371c9d4SSatish Balay z[0] = sum1; 5099371c9d4SSatish Balay z[1] = sum2; 5109371c9d4SSatish Balay z[2] = sum3; 5119371c9d4SSatish Balay z[3] = sum4; 5129371c9d4SSatish Balay z[4] = sum5; 51326e093fcSHong Zhang if (!usecprow) z += 5; 5142d61bbb3SSatish Balay } 5159566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5169566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 5179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 5183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5192d61bbb3SSatish Balay } 5202d61bbb3SSatish Balay 521d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) 522d71ae5a4SJacob Faibussowitsch { 52315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 524f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 525d9fead3dSBarry Smith const PetscScalar *x, *xb; 52626e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 527d9fead3dSBarry Smith const MatScalar *v; 5287c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 529ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 53015091d37SBarry Smith 531433994e6SBarry Smith PetscFunctionBegin; 5329566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5339566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 53415091d37SBarry Smith 53515091d37SBarry Smith idx = a->j; 53615091d37SBarry Smith v = a->a; 53726e093fcSHong Zhang if (usecprow) { 53826e093fcSHong Zhang mbs = a->compressedrow.nrows; 53926e093fcSHong Zhang ii = a->compressedrow.i; 5407b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 5419566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 54226e093fcSHong Zhang } else { 
54326e093fcSHong Zhang mbs = a->mbs; 54415091d37SBarry Smith ii = a->i; 54526e093fcSHong Zhang z = zarray; 54626e093fcSHong Zhang } 54715091d37SBarry Smith 54815091d37SBarry Smith for (i = 0; i < mbs; i++) { 54926fbe8dcSKarl Rupp n = ii[1] - ii[0]; 55026fbe8dcSKarl Rupp ii++; 55126fbe8dcSKarl Rupp sum1 = 0.0; 55226fbe8dcSKarl Rupp sum2 = 0.0; 55326fbe8dcSKarl Rupp sum3 = 0.0; 55426fbe8dcSKarl Rupp sum4 = 0.0; 55526fbe8dcSKarl Rupp sum5 = 0.0; 55626fbe8dcSKarl Rupp sum6 = 0.0; 55726fbe8dcSKarl Rupp 558444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 559444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 56015091d37SBarry Smith for (j = 0; j < n; j++) { 56115091d37SBarry Smith xb = x + 6 * (*idx++); 5629371c9d4SSatish Balay x1 = xb[0]; 5639371c9d4SSatish Balay x2 = xb[1]; 5649371c9d4SSatish Balay x3 = xb[2]; 5659371c9d4SSatish Balay x4 = xb[3]; 5669371c9d4SSatish Balay x5 = xb[4]; 5679371c9d4SSatish Balay x6 = xb[5]; 56815091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 56915091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 57015091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 57115091d37SBarry Smith sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 57215091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 57315091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 57415091d37SBarry Smith v += 36; 57515091d37SBarry Smith } 5767b2bb3b9SHong Zhang if (usecprow) z = zarray + 6 * ridx[i]; 5779371c9d4SSatish Balay z[0] = sum1; 5789371c9d4SSatish Balay z[1] = sum2; 5799371c9d4SSatish Balay z[2] = sum3; 5809371c9d4SSatish Balay z[3] = 
sum4; 5819371c9d4SSatish Balay z[4] = sum5; 5829371c9d4SSatish Balay z[5] = sum6; 58326e093fcSHong Zhang if (!usecprow) z += 6; 58415091d37SBarry Smith } 58515091d37SBarry Smith 5869566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5879566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 5889566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 5893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 59015091d37SBarry Smith } 5918ab949d8SShri Abhyankar 592d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) 593d71ae5a4SJacob Faibussowitsch { 5942d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 595f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 596d9fead3dSBarry Smith const PetscScalar *x, *xb; 59726e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 598d9fead3dSBarry Smith const MatScalar *v; 5997c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 600ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 6012d61bbb3SSatish Balay 602433994e6SBarry Smith PetscFunctionBegin; 6039566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6049566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 6052d61bbb3SSatish Balay 6062d61bbb3SSatish Balay idx = a->j; 6072d61bbb3SSatish Balay v = a->a; 60826e093fcSHong Zhang if (usecprow) { 60926e093fcSHong Zhang mbs = a->compressedrow.nrows; 61026e093fcSHong Zhang ii = a->compressedrow.i; 6117b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 6129566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 61326e093fcSHong Zhang } else { 61426e093fcSHong Zhang mbs = a->mbs; 6152d61bbb3SSatish Balay ii = a->i; 61626e093fcSHong Zhang z = zarray; 61726e093fcSHong Zhang } 6182d61bbb3SSatish Balay 6192d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 62026fbe8dcSKarl Rupp n = ii[1] 
- ii[0]; 62126fbe8dcSKarl Rupp ii++; 62226fbe8dcSKarl Rupp sum1 = 0.0; 62326fbe8dcSKarl Rupp sum2 = 0.0; 62426fbe8dcSKarl Rupp sum3 = 0.0; 62526fbe8dcSKarl Rupp sum4 = 0.0; 62626fbe8dcSKarl Rupp sum5 = 0.0; 62726fbe8dcSKarl Rupp sum6 = 0.0; 62826fbe8dcSKarl Rupp sum7 = 0.0; 62926fbe8dcSKarl Rupp 630444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 631444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 6322d61bbb3SSatish Balay for (j = 0; j < n; j++) { 6332d61bbb3SSatish Balay xb = x + 7 * (*idx++); 6349371c9d4SSatish Balay x1 = xb[0]; 6359371c9d4SSatish Balay x2 = xb[1]; 6369371c9d4SSatish Balay x3 = xb[2]; 6379371c9d4SSatish Balay x4 = xb[3]; 6389371c9d4SSatish Balay x5 = xb[4]; 6399371c9d4SSatish Balay x6 = xb[5]; 6409371c9d4SSatish Balay x7 = xb[6]; 6412d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 6422d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 6432d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 6442d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 6452d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 6462d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 6472d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 6482d61bbb3SSatish Balay v += 49; 6492d61bbb3SSatish Balay } 6507b2bb3b9SHong Zhang if (usecprow) z = zarray + 7 * ridx[i]; 6519371c9d4SSatish Balay z[0] = sum1; 6529371c9d4SSatish Balay z[1] = sum2; 6539371c9d4SSatish Balay 
z[2] = sum3; 6549371c9d4SSatish Balay z[3] = sum4; 6559371c9d4SSatish Balay z[4] = sum5; 6569371c9d4SSatish Balay z[5] = sum6; 6579371c9d4SSatish Balay z[6] = sum7; 65826e093fcSHong Zhang if (!usecprow) z += 7; 6592d61bbb3SSatish Balay } 6602d61bbb3SSatish Balay 6619566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6629566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 6639566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 6643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6652d61bbb3SSatish Balay } 6662d61bbb3SSatish Balay 6675f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 668d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) 669d71ae5a4SJacob Faibussowitsch { 67096e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 671f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 67296e086a2SDaniel Kokron const PetscScalar *x, *xb; 67396e086a2SDaniel Kokron const MatScalar *v; 67496e086a2SDaniel Kokron PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 67596e086a2SDaniel Kokron const PetscInt *idx, *ii, *ridx = NULL; 676ce68d72fSJed Brown PetscInt k; 67796e086a2SDaniel Kokron PetscBool usecprow = a->compressedrow.use; 67896e086a2SDaniel Kokron 67996e086a2SDaniel Kokron __m256d a0, a1, a2, a3, a4, a5; 680ce68d72fSJed Brown __m256d w0, w1, w2, w3; 68196e086a2SDaniel Kokron __m256d z0, z1, z2; 68296e086a2SDaniel Kokron __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 68396e086a2SDaniel Kokron 68496e086a2SDaniel Kokron PetscFunctionBegin; 6859566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6869566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 68796e086a2SDaniel Kokron 68896e086a2SDaniel Kokron idx = a->j; 
68996e086a2SDaniel Kokron v = a->a; 69096e086a2SDaniel Kokron if (usecprow) { 69196e086a2SDaniel Kokron mbs = a->compressedrow.nrows; 69296e086a2SDaniel Kokron ii = a->compressedrow.i; 69396e086a2SDaniel Kokron ridx = a->compressedrow.rindex; 6949566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 69596e086a2SDaniel Kokron } else { 69696e086a2SDaniel Kokron mbs = a->mbs; 69796e086a2SDaniel Kokron ii = a->i; 69896e086a2SDaniel Kokron z = zarray; 69996e086a2SDaniel Kokron } 70096e086a2SDaniel Kokron 70196e086a2SDaniel Kokron if (!a->mult_work) { 70296e086a2SDaniel Kokron k = PetscMax(A->rmap->n, A->cmap->n); 7039566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 70496e086a2SDaniel Kokron } 70596e086a2SDaniel Kokron 70696e086a2SDaniel Kokron work = a->mult_work; 70796e086a2SDaniel Kokron for (i = 0; i < mbs; i++) { 7089371c9d4SSatish Balay n = ii[1] - ii[0]; 7099371c9d4SSatish Balay ii++; 71096e086a2SDaniel Kokron workt = work; 71196e086a2SDaniel Kokron for (j = 0; j < n; j++) { 71296e086a2SDaniel Kokron xb = x + bs * (*idx++); 71396e086a2SDaniel Kokron for (k = 0; k < bs; k++) workt[k] = xb[k]; 71496e086a2SDaniel Kokron workt += bs; 71596e086a2SDaniel Kokron } 71696e086a2SDaniel Kokron if (usecprow) z = zarray + bs * ridx[i]; 71796e086a2SDaniel Kokron 7189371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 7199371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 7209371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 72196e086a2SDaniel Kokron 72296e086a2SDaniel Kokron for (j = 0; j < n; j++) { 723c05b70c4SSatish Balay /* first column of a */ 72496e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9]); 7259371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81]); 7269371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 7279371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 7289371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 7299371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 7309371c9d4SSatish Balay z2 = 
_mm256_fmadd_pd(a2, w0, z2); 73196e086a2SDaniel Kokron 732c05b70c4SSatish Balay /* second column of a */ 73396e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 1]); 7349371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 7359371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 7369371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 7379371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 7389371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 7399371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 74096e086a2SDaniel Kokron 741c05b70c4SSatish Balay /* third column of a */ 74296e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 2]); 7439371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 7449371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w2, z0); 7459371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 7469371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w2, z1); 7479371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 7489371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w2, z2); 74996e086a2SDaniel Kokron 750c05b70c4SSatish Balay /* fourth column of a */ 75196e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 3]); 7529371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 7539371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w3, z0); 7549371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 7559371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w3, z1); 7569371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 7579371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w3, z2); 75896e086a2SDaniel Kokron 759c05b70c4SSatish Balay /* fifth column of a */ 76096e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 4]); 7619371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 7629371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w0, z0); 7639371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 7649371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w0, z1); 7659371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 
44]); 7669371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w0, z2); 76796e086a2SDaniel Kokron 768c05b70c4SSatish Balay /* sixth column of a */ 76996e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 5]); 7709371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 7719371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 7729371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 7739371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 7749371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 7759371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 77696e086a2SDaniel Kokron 777c05b70c4SSatish Balay /* seventh column of a */ 77896e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 6]); 7799371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 7809371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 7819371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 7829371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 7839371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 7849371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 78596e086a2SDaniel Kokron 7866aad120cSJose E. 
Roman /* eighth column of a */ 78796e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 7]); 7889371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 7899371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 7909371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 7919371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 7929371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 7939371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 79496e086a2SDaniel Kokron 795c05b70c4SSatish Balay /* ninth column of a */ 79696e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 8]); 7979371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 7989371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 7999371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 8009371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 8019371c9d4SSatish Balay a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 8029371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 80396e086a2SDaniel Kokron } 80496e086a2SDaniel Kokron 8059371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 8069371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 8079371c9d4SSatish Balay _mm256_maskstore_pd(&z[8], mask1, z2); 80896e086a2SDaniel Kokron 80996e086a2SDaniel Kokron v += n * bs2; 81096e086a2SDaniel Kokron if (!usecprow) z += bs; 81196e086a2SDaniel Kokron } 8129566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8139566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 8149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 8153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 81696e086a2SDaniel Kokron } 81796e086a2SDaniel Kokron #endif 81896e086a2SDaniel Kokron 819d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) 820d71ae5a4SJacob Faibussowitsch { 821ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 822f4259b30SLisandro Dalcin PetscScalar *z = 
NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 823ebada01fSBarry Smith const PetscScalar *x, *xb; 824ebada01fSBarry Smith PetscScalar *zarray, xv; 825ebada01fSBarry Smith const MatScalar *v; 826ebada01fSBarry Smith const PetscInt *ii, *ij = a->j, *idx; 827ebada01fSBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 828ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 829ebada01fSBarry Smith 830ebada01fSBarry Smith PetscFunctionBegin; 8319566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8329566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 833ebada01fSBarry Smith 834ebada01fSBarry Smith v = a->a; 835ebada01fSBarry Smith if (usecprow) { 836ebada01fSBarry Smith mbs = a->compressedrow.nrows; 837ebada01fSBarry Smith ii = a->compressedrow.i; 838ebada01fSBarry Smith ridx = a->compressedrow.rindex; 8399566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 840ebada01fSBarry Smith } else { 841ebada01fSBarry Smith mbs = a->mbs; 842ebada01fSBarry Smith ii = a->i; 843ebada01fSBarry Smith z = zarray; 844ebada01fSBarry Smith } 845ebada01fSBarry Smith 846ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 847ebada01fSBarry Smith n = ii[i + 1] - ii[i]; 848ebada01fSBarry Smith idx = ij + ii[i]; 8499371c9d4SSatish Balay sum1 = 0.0; 8509371c9d4SSatish Balay sum2 = 0.0; 8519371c9d4SSatish Balay sum3 = 0.0; 8529371c9d4SSatish Balay sum4 = 0.0; 8539371c9d4SSatish Balay sum5 = 0.0; 8549371c9d4SSatish Balay sum6 = 0.0; 8559371c9d4SSatish Balay sum7 = 0.0; 8569371c9d4SSatish Balay sum8 = 0.0; 8579371c9d4SSatish Balay sum9 = 0.0; 8589371c9d4SSatish Balay sum10 = 0.0; 8599371c9d4SSatish Balay sum11 = 0.0; 860ebada01fSBarry Smith 861ebada01fSBarry Smith for (j = 0; j < n; j++) { 862ebada01fSBarry Smith xb = x + 11 * (idx[j]); 863ebada01fSBarry Smith 864ebada01fSBarry Smith for (k = 0; k < 11; k++) { 865ebada01fSBarry Smith xv = xb[k]; 866ebada01fSBarry Smith sum1 += v[0] * xv; 867ebada01fSBarry 
Smith sum2 += v[1] * xv; 868ebada01fSBarry Smith sum3 += v[2] * xv; 869ebada01fSBarry Smith sum4 += v[3] * xv; 870ebada01fSBarry Smith sum5 += v[4] * xv; 871ebada01fSBarry Smith sum6 += v[5] * xv; 872ebada01fSBarry Smith sum7 += v[6] * xv; 873ebada01fSBarry Smith sum8 += v[7] * xv; 874ebada01fSBarry Smith sum9 += v[8] * xv; 875ebada01fSBarry Smith sum10 += v[9] * xv; 876ebada01fSBarry Smith sum11 += v[10] * xv; 877ebada01fSBarry Smith v += 11; 878ebada01fSBarry Smith } 879ebada01fSBarry Smith } 880ebada01fSBarry Smith if (usecprow) z = zarray + 11 * ridx[i]; 8819371c9d4SSatish Balay z[0] = sum1; 8829371c9d4SSatish Balay z[1] = sum2; 8839371c9d4SSatish Balay z[2] = sum3; 8849371c9d4SSatish Balay z[3] = sum4; 8859371c9d4SSatish Balay z[4] = sum5; 8869371c9d4SSatish Balay z[5] = sum6; 8879371c9d4SSatish Balay z[6] = sum7; 8889371c9d4SSatish Balay z[7] = sum8; 8899371c9d4SSatish Balay z[8] = sum9; 8909371c9d4SSatish Balay z[9] = sum10; 8919371c9d4SSatish Balay z[10] = sum11; 892ebada01fSBarry Smith 893ebada01fSBarry Smith if (!usecprow) z += 11; 894ebada01fSBarry Smith } 895ebada01fSBarry Smith 8969566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8979566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 8989566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 8993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 900ebada01fSBarry Smith } 901ebada01fSBarry Smith 9026679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 903d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) 904d71ae5a4SJacob Faibussowitsch { 9056679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 9066679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 9076679dcc1SBarry Smith const PetscScalar *x, *xb; 9086679dcc1SBarry Smith PetscScalar *zarray, 
xv; 9096679dcc1SBarry Smith const MatScalar *v; 9106679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 9116679dcc1SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 9126679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 9136679dcc1SBarry Smith 9146679dcc1SBarry Smith PetscFunctionBegin; 9159566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9169566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 9176679dcc1SBarry Smith 9186679dcc1SBarry Smith v = a->a; 9196679dcc1SBarry Smith if (usecprow) { 9206679dcc1SBarry Smith mbs = a->compressedrow.nrows; 9216679dcc1SBarry Smith ii = a->compressedrow.i; 9226679dcc1SBarry Smith ridx = a->compressedrow.rindex; 9239566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 9246679dcc1SBarry Smith } else { 9256679dcc1SBarry Smith mbs = a->mbs; 9266679dcc1SBarry Smith ii = a->i; 9276679dcc1SBarry Smith z = zarray; 9286679dcc1SBarry Smith } 9296679dcc1SBarry Smith 9306679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 9316679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 9326679dcc1SBarry Smith idx = ij + ii[i]; 9339371c9d4SSatish Balay sum1 = 0.0; 9349371c9d4SSatish Balay sum2 = 0.0; 9359371c9d4SSatish Balay sum3 = 0.0; 9369371c9d4SSatish Balay sum4 = 0.0; 9379371c9d4SSatish Balay sum5 = 0.0; 9389371c9d4SSatish Balay sum6 = 0.0; 9399371c9d4SSatish Balay sum7 = 0.0; 9409371c9d4SSatish Balay sum8 = 0.0; 9419371c9d4SSatish Balay sum9 = 0.0; 9429371c9d4SSatish Balay sum10 = 0.0; 9439371c9d4SSatish Balay sum11 = 0.0; 9449371c9d4SSatish Balay sum12 = 0.0; 9456679dcc1SBarry Smith 9466679dcc1SBarry Smith for (j = 0; j < n; j++) { 9476679dcc1SBarry Smith xb = x + 12 * (idx[j]); 9486679dcc1SBarry Smith 9496679dcc1SBarry Smith for (k = 0; k < 12; k++) { 9506679dcc1SBarry Smith xv = xb[k]; 9516679dcc1SBarry Smith sum1 += v[0] * xv; 9526679dcc1SBarry Smith sum2 += v[1] * xv; 9536679dcc1SBarry Smith sum3 += v[2] * xv; 9546679dcc1SBarry Smith sum4 += v[3] * xv; 9556679dcc1SBarry Smith 
sum5 += v[4] * xv; 9566679dcc1SBarry Smith sum6 += v[5] * xv; 9576679dcc1SBarry Smith sum7 += v[6] * xv; 9586679dcc1SBarry Smith sum8 += v[7] * xv; 9596679dcc1SBarry Smith sum9 += v[8] * xv; 9606679dcc1SBarry Smith sum10 += v[9] * xv; 9616679dcc1SBarry Smith sum11 += v[10] * xv; 9626679dcc1SBarry Smith sum12 += v[11] * xv; 9636679dcc1SBarry Smith v += 12; 9646679dcc1SBarry Smith } 9656679dcc1SBarry Smith } 9666679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 9679371c9d4SSatish Balay z[0] = sum1; 9689371c9d4SSatish Balay z[1] = sum2; 9699371c9d4SSatish Balay z[2] = sum3; 9709371c9d4SSatish Balay z[3] = sum4; 9719371c9d4SSatish Balay z[4] = sum5; 9729371c9d4SSatish Balay z[5] = sum6; 9739371c9d4SSatish Balay z[6] = sum7; 9749371c9d4SSatish Balay z[7] = sum8; 9759371c9d4SSatish Balay z[8] = sum9; 9769371c9d4SSatish Balay z[9] = sum10; 9779371c9d4SSatish Balay z[10] = sum11; 9789371c9d4SSatish Balay z[11] = sum12; 9796679dcc1SBarry Smith if (!usecprow) z += 12; 9806679dcc1SBarry Smith } 9819566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9829566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 9839566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 9843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 9856679dcc1SBarry Smith } 9866679dcc1SBarry Smith 987d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) 988d71ae5a4SJacob Faibussowitsch { 9896679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 9906679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 9916679dcc1SBarry Smith const PetscScalar *x, *xb; 9926679dcc1SBarry Smith PetscScalar *zarray, *yarray, xv; 9936679dcc1SBarry Smith const MatScalar *v; 9946679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 9956679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, k, n, *ridx 
= NULL; 9966679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 9976679dcc1SBarry Smith 9986679dcc1SBarry Smith PetscFunctionBegin; 9999566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10009566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 10016679dcc1SBarry Smith 10026679dcc1SBarry Smith v = a->a; 10036679dcc1SBarry Smith if (usecprow) { 100448a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 10056679dcc1SBarry Smith mbs = a->compressedrow.nrows; 10066679dcc1SBarry Smith ii = a->compressedrow.i; 10076679dcc1SBarry Smith ridx = a->compressedrow.rindex; 10086679dcc1SBarry Smith } else { 10096679dcc1SBarry Smith ii = a->i; 10106679dcc1SBarry Smith y = yarray; 10116679dcc1SBarry Smith z = zarray; 10126679dcc1SBarry Smith } 10136679dcc1SBarry Smith 10146679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 10156679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 10166679dcc1SBarry Smith idx = ij + ii[i]; 10176679dcc1SBarry Smith 10186679dcc1SBarry Smith if (usecprow) { 10196679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 10206679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 10216679dcc1SBarry Smith } 10229371c9d4SSatish Balay sum1 = y[0]; 10239371c9d4SSatish Balay sum2 = y[1]; 10249371c9d4SSatish Balay sum3 = y[2]; 10259371c9d4SSatish Balay sum4 = y[3]; 10269371c9d4SSatish Balay sum5 = y[4]; 10279371c9d4SSatish Balay sum6 = y[5]; 10289371c9d4SSatish Balay sum7 = y[6]; 10299371c9d4SSatish Balay sum8 = y[7]; 10309371c9d4SSatish Balay sum9 = y[8]; 10319371c9d4SSatish Balay sum10 = y[9]; 10329371c9d4SSatish Balay sum11 = y[10]; 10339371c9d4SSatish Balay sum12 = y[11]; 10346679dcc1SBarry Smith 10356679dcc1SBarry Smith for (j = 0; j < n; j++) { 10366679dcc1SBarry Smith xb = x + 12 * (idx[j]); 10376679dcc1SBarry Smith 10386679dcc1SBarry Smith for (k = 0; k < 12; k++) { 10396679dcc1SBarry Smith xv = xb[k]; 10406679dcc1SBarry Smith sum1 += v[0] * xv; 10416679dcc1SBarry Smith sum2 += v[1] * xv; 
10426679dcc1SBarry Smith sum3 += v[2] * xv; 10436679dcc1SBarry Smith sum4 += v[3] * xv; 10446679dcc1SBarry Smith sum5 += v[4] * xv; 10456679dcc1SBarry Smith sum6 += v[5] * xv; 10466679dcc1SBarry Smith sum7 += v[6] * xv; 10476679dcc1SBarry Smith sum8 += v[7] * xv; 10486679dcc1SBarry Smith sum9 += v[8] * xv; 10496679dcc1SBarry Smith sum10 += v[9] * xv; 10506679dcc1SBarry Smith sum11 += v[10] * xv; 10516679dcc1SBarry Smith sum12 += v[11] * xv; 10526679dcc1SBarry Smith v += 12; 10536679dcc1SBarry Smith } 10546679dcc1SBarry Smith } 10556679dcc1SBarry Smith 10569371c9d4SSatish Balay z[0] = sum1; 10579371c9d4SSatish Balay z[1] = sum2; 10589371c9d4SSatish Balay z[2] = sum3; 10599371c9d4SSatish Balay z[3] = sum4; 10609371c9d4SSatish Balay z[4] = sum5; 10619371c9d4SSatish Balay z[5] = sum6; 10629371c9d4SSatish Balay z[6] = sum7; 10639371c9d4SSatish Balay z[7] = sum8; 10649371c9d4SSatish Balay z[8] = sum9; 10659371c9d4SSatish Balay z[9] = sum10; 10669371c9d4SSatish Balay z[10] = sum11; 10679371c9d4SSatish Balay z[11] = sum12; 10686679dcc1SBarry Smith if (!usecprow) { 10696679dcc1SBarry Smith y += 12; 10706679dcc1SBarry Smith z += 12; 10716679dcc1SBarry Smith } 10726679dcc1SBarry Smith } 10739566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10749566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 10759566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 10763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10776679dcc1SBarry Smith } 10786679dcc1SBarry Smith 10796679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1080d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) 1081d71ae5a4SJacob Faibussowitsch { 10826679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 10836679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, 
sum11, sum12; 10846679dcc1SBarry Smith const PetscScalar *x, *xb; 10856679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray; 10866679dcc1SBarry Smith const MatScalar *v; 10876679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 10886679dcc1SBarry Smith PetscInt mbs, i, j, n; 10896679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 10906679dcc1SBarry Smith 10916679dcc1SBarry Smith PetscFunctionBegin; 10929566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10939566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 10946679dcc1SBarry Smith 10956679dcc1SBarry Smith v = a->a; 10966679dcc1SBarry Smith if (usecprow) { 10976679dcc1SBarry Smith mbs = a->compressedrow.nrows; 10986679dcc1SBarry Smith ii = a->compressedrow.i; 10996679dcc1SBarry Smith ridx = a->compressedrow.rindex; 11009566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 11016679dcc1SBarry Smith } else { 11026679dcc1SBarry Smith mbs = a->mbs; 11036679dcc1SBarry Smith ii = a->i; 11046679dcc1SBarry Smith z = zarray; 11056679dcc1SBarry Smith } 11066679dcc1SBarry Smith 11076679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 11086679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 11096679dcc1SBarry Smith idx = ij + ii[i]; 11106679dcc1SBarry Smith 11116679dcc1SBarry Smith sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 11126679dcc1SBarry Smith for (j = 0; j < n; j++) { 11136679dcc1SBarry Smith xb = x + 12 * (idx[j]); 11149371c9d4SSatish Balay x1 = xb[0]; 11159371c9d4SSatish Balay x2 = xb[1]; 11169371c9d4SSatish Balay x3 = xb[2]; 11179371c9d4SSatish Balay x4 = xb[3]; 11186679dcc1SBarry Smith 11196679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11206679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11216679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11226679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] 
* x3 + v[39] * x4; 11236679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11246679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11256679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11266679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11276679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11286679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11296679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11306679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11316679dcc1SBarry Smith v += 48; 11326679dcc1SBarry Smith 11339371c9d4SSatish Balay x1 = xb[4]; 11349371c9d4SSatish Balay x2 = xb[5]; 11359371c9d4SSatish Balay x3 = xb[6]; 11369371c9d4SSatish Balay x4 = xb[7]; 11376679dcc1SBarry Smith 11386679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11396679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11406679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11416679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11426679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11436679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11446679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11456679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11466679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11476679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11486679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11496679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11506679dcc1SBarry Smith v += 48; 11516679dcc1SBarry Smith 
11529371c9d4SSatish Balay x1 = xb[8]; 11539371c9d4SSatish Balay x2 = xb[9]; 11549371c9d4SSatish Balay x3 = xb[10]; 11559371c9d4SSatish Balay x4 = xb[11]; 11566679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11576679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11586679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11596679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11606679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11616679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11626679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11636679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11646679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11656679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11666679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11676679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11686679dcc1SBarry Smith v += 48; 11696679dcc1SBarry Smith } 11706679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 11719371c9d4SSatish Balay z[0] = sum1; 11729371c9d4SSatish Balay z[1] = sum2; 11739371c9d4SSatish Balay z[2] = sum3; 11749371c9d4SSatish Balay z[3] = sum4; 11759371c9d4SSatish Balay z[4] = sum5; 11769371c9d4SSatish Balay z[5] = sum6; 11779371c9d4SSatish Balay z[6] = sum7; 11789371c9d4SSatish Balay z[7] = sum8; 11799371c9d4SSatish Balay z[8] = sum9; 11809371c9d4SSatish Balay z[9] = sum10; 11819371c9d4SSatish Balay z[10] = sum11; 11829371c9d4SSatish Balay z[11] = sum12; 11836679dcc1SBarry Smith if (!usecprow) z += 12; 11846679dcc1SBarry Smith } 11859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11869566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 
11879566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 11883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 11896679dcc1SBarry Smith } 11906679dcc1SBarry Smith 11916679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1192d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) 1193d71ae5a4SJacob Faibussowitsch { 11946679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 11956679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 11966679dcc1SBarry Smith const PetscScalar *x, *xb; 11976679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray, *yarray; 11986679dcc1SBarry Smith const MatScalar *v; 11996679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 12006679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, n; 12016679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 12026679dcc1SBarry Smith 12036679dcc1SBarry Smith PetscFunctionBegin; 12049566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 12059566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 12066679dcc1SBarry Smith 12076679dcc1SBarry Smith v = a->a; 12086679dcc1SBarry Smith if (usecprow) { 120948a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 12106679dcc1SBarry Smith mbs = a->compressedrow.nrows; 12116679dcc1SBarry Smith ii = a->compressedrow.i; 12126679dcc1SBarry Smith ridx = a->compressedrow.rindex; 12136679dcc1SBarry Smith } else { 12146679dcc1SBarry Smith ii = a->i; 12156679dcc1SBarry Smith y = yarray; 12166679dcc1SBarry Smith z = zarray; 12176679dcc1SBarry Smith } 12186679dcc1SBarry Smith 12196679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 12206679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 12216679dcc1SBarry Smith idx = ij + ii[i]; 12226679dcc1SBarry Smith 
12236679dcc1SBarry Smith if (usecprow) { 12246679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 12256679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 12266679dcc1SBarry Smith } 12279371c9d4SSatish Balay sum1 = y[0]; 12289371c9d4SSatish Balay sum2 = y[1]; 12299371c9d4SSatish Balay sum3 = y[2]; 12309371c9d4SSatish Balay sum4 = y[3]; 12319371c9d4SSatish Balay sum5 = y[4]; 12329371c9d4SSatish Balay sum6 = y[5]; 12339371c9d4SSatish Balay sum7 = y[6]; 12349371c9d4SSatish Balay sum8 = y[7]; 12359371c9d4SSatish Balay sum9 = y[8]; 12369371c9d4SSatish Balay sum10 = y[9]; 12379371c9d4SSatish Balay sum11 = y[10]; 12389371c9d4SSatish Balay sum12 = y[11]; 12396679dcc1SBarry Smith 12406679dcc1SBarry Smith for (j = 0; j < n; j++) { 12416679dcc1SBarry Smith xb = x + 12 * (idx[j]); 12429371c9d4SSatish Balay x1 = xb[0]; 12439371c9d4SSatish Balay x2 = xb[1]; 12449371c9d4SSatish Balay x3 = xb[2]; 12459371c9d4SSatish Balay x4 = xb[3]; 12466679dcc1SBarry Smith 12476679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12486679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12496679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12506679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12516679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12526679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12536679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12546679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12556679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12566679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12576679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12586679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12596679dcc1SBarry Smith v += 48; 
12606679dcc1SBarry Smith 12619371c9d4SSatish Balay x1 = xb[4]; 12629371c9d4SSatish Balay x2 = xb[5]; 12639371c9d4SSatish Balay x3 = xb[6]; 12649371c9d4SSatish Balay x4 = xb[7]; 12656679dcc1SBarry Smith 12666679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12676679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12686679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12696679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12706679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12716679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12726679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12736679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12746679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12756679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12766679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12776679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12786679dcc1SBarry Smith v += 48; 12796679dcc1SBarry Smith 12809371c9d4SSatish Balay x1 = xb[8]; 12819371c9d4SSatish Balay x2 = xb[9]; 12829371c9d4SSatish Balay x3 = xb[10]; 12839371c9d4SSatish Balay x4 = xb[11]; 12846679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12856679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12866679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12876679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12886679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12896679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12906679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 
12916679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12926679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12936679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12946679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12956679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12966679dcc1SBarry Smith v += 48; 12976679dcc1SBarry Smith } 12989371c9d4SSatish Balay z[0] = sum1; 12999371c9d4SSatish Balay z[1] = sum2; 13009371c9d4SSatish Balay z[2] = sum3; 13019371c9d4SSatish Balay z[3] = sum4; 13029371c9d4SSatish Balay z[4] = sum5; 13039371c9d4SSatish Balay z[5] = sum6; 13049371c9d4SSatish Balay z[6] = sum7; 13059371c9d4SSatish Balay z[7] = sum8; 13069371c9d4SSatish Balay z[8] = sum9; 13079371c9d4SSatish Balay z[9] = sum10; 13089371c9d4SSatish Balay z[10] = sum11; 13099371c9d4SSatish Balay z[11] = sum12; 13106679dcc1SBarry Smith if (!usecprow) { 13116679dcc1SBarry Smith y += 12; 13126679dcc1SBarry Smith z += 12; 13136679dcc1SBarry Smith } 13146679dcc1SBarry Smith } 13159566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 13169566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 13179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 13183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13196679dcc1SBarry Smith } 13206679dcc1SBarry Smith 13216679dcc1SBarry Smith #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1322d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) 1323d71ae5a4SJacob Faibussowitsch { 13246679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 13256679dcc1SBarry Smith PetscScalar *z = NULL, *zarray; 13266679dcc1SBarry Smith const 
PetscScalar *x, *work; 13276679dcc1SBarry Smith const MatScalar *v = a->a; 13286679dcc1SBarry Smith PetscInt mbs, i, j, n; 13296679dcc1SBarry Smith const PetscInt *idx = a->j, *ii, *ridx = NULL; 13306679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 13316679dcc1SBarry Smith const PetscInt bs = 12, bs2 = 144; 13326679dcc1SBarry Smith 13336679dcc1SBarry Smith __m256d a0, a1, a2, a3, a4, a5; 13346679dcc1SBarry Smith __m256d w0, w1, w2, w3; 13356679dcc1SBarry Smith __m256d z0, z1, z2; 13366679dcc1SBarry Smith 13376679dcc1SBarry Smith PetscFunctionBegin; 13389566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 13399566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 13406679dcc1SBarry Smith 13416679dcc1SBarry Smith if (usecprow) { 13426679dcc1SBarry Smith mbs = a->compressedrow.nrows; 13436679dcc1SBarry Smith ii = a->compressedrow.i; 13446679dcc1SBarry Smith ridx = a->compressedrow.rindex; 13459566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 13466679dcc1SBarry Smith } else { 13476679dcc1SBarry Smith mbs = a->mbs; 13486679dcc1SBarry Smith ii = a->i; 13496679dcc1SBarry Smith z = zarray; 13506679dcc1SBarry Smith } 13516679dcc1SBarry Smith 13526679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 13539371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 13549371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 13559371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 13566679dcc1SBarry Smith 13579371c9d4SSatish Balay n = ii[1] - ii[0]; 13589371c9d4SSatish Balay ii++; 13596679dcc1SBarry Smith for (j = 0; j < n; j++) { 13606679dcc1SBarry Smith work = x + bs * (*idx++); 13616679dcc1SBarry Smith 13626679dcc1SBarry Smith /* first column of a */ 13636679dcc1SBarry Smith w0 = _mm256_set1_pd(work[0]); 13649371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 0); 13659371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 13669371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 4); 13679371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 
13689371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 8); 13699371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 13706679dcc1SBarry Smith 13716679dcc1SBarry Smith /* second column of a */ 13726679dcc1SBarry Smith w1 = _mm256_set1_pd(work[1]); 13739371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 12); 13749371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 13759371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 16); 13769371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 13779371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 20); 13789371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 13796679dcc1SBarry Smith 13806679dcc1SBarry Smith /* third column of a */ 13816679dcc1SBarry Smith w2 = _mm256_set1_pd(work[2]); 13829371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 24); 13839371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 13849371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 28); 13859371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 13869371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 32); 13879371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 13886679dcc1SBarry Smith 13896679dcc1SBarry Smith /* fourth column of a */ 13906679dcc1SBarry Smith w3 = _mm256_set1_pd(work[3]); 13919371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 36); 13929371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 13939371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 40); 13949371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 13959371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 44); 13969371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 13976679dcc1SBarry Smith 13986679dcc1SBarry Smith /* fifth column of a */ 13996679dcc1SBarry Smith w0 = _mm256_set1_pd(work[4]); 14009371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 48); 14019371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14029371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 52); 14039371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14049371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 56); 14059371c9d4SSatish Balay z2 = 
_mm256_fmadd_pd(a2, w0, z2); 14066679dcc1SBarry Smith 14076679dcc1SBarry Smith /* sixth column of a */ 14086679dcc1SBarry Smith w1 = _mm256_set1_pd(work[5]); 14099371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 60); 14109371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14119371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 64); 14129371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14139371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 68); 14149371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14156679dcc1SBarry Smith 14166679dcc1SBarry Smith /* seventh column of a */ 14176679dcc1SBarry Smith w2 = _mm256_set1_pd(work[6]); 14189371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 72); 14199371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14209371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 76); 14219371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14229371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 80); 14239371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14246679dcc1SBarry Smith 14256aad120cSJose E. 
Roman /* eighth column of a */ 14266679dcc1SBarry Smith w3 = _mm256_set1_pd(work[7]); 14279371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 84); 14289371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14299371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 88); 14309371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14319371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 92); 14329371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14336679dcc1SBarry Smith 14346679dcc1SBarry Smith /* ninth column of a */ 14356679dcc1SBarry Smith w0 = _mm256_set1_pd(work[8]); 14369371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 96); 14379371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14389371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 100); 14399371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14409371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 104); 14419371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14426679dcc1SBarry Smith 14436679dcc1SBarry Smith /* tenth column of a */ 14446679dcc1SBarry Smith w1 = _mm256_set1_pd(work[9]); 14459371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 108); 14469371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14479371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 112); 14489371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14499371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 116); 14509371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14516679dcc1SBarry Smith 14526679dcc1SBarry Smith /* eleventh column of a */ 14536679dcc1SBarry Smith w2 = _mm256_set1_pd(work[10]); 14549371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 120); 14559371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14569371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 124); 14579371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14589371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 128); 14599371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14606679dcc1SBarry Smith 14616679dcc1SBarry Smith /* twelfth column of a */ 14626679dcc1SBarry Smith w3 = 
_mm256_set1_pd(work[11]); 14639371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 132); 14649371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14659371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 136); 14669371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14679371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 140); 14689371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14696679dcc1SBarry Smith 14706679dcc1SBarry Smith v += bs2; 14716679dcc1SBarry Smith } 14726679dcc1SBarry Smith if (usecprow) z = zarray + bs * ridx[i]; 14739371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 14749371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 14759371c9d4SSatish Balay _mm256_storeu_pd(&z[8], z2); 14766679dcc1SBarry Smith if (!usecprow) z += bs; 14776679dcc1SBarry Smith } 14789566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 14799566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 14809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 14813ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14826679dcc1SBarry Smith } 14836679dcc1SBarry Smith #endif 14846679dcc1SBarry Smith
/*
  MatMult_SeqBAIJ_15_ver1 - Computes zz = A * xx for a SeqBAIJ matrix with 15 x 15 blocks.
  Default MatMult for block size 15.

  Version 1: the columns of every block are consumed one at a time. Within a block, the
  value multiplying input entry c and accumulated into output entry r is read from
  position r + 15 * c of the block's 225 stored values.

  Input Parameters:
    A  - the matrix; A->data is used as a Mat_SeqBAIJ
    xx - the vector multiplied by A

  Output Parameter:
    zz - the product
*/
PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
{
  Mat_SeqBAIJ       *baij     = (Mat_SeqBAIJ *)A->data;
  const PetscBool    compress = baij->compressedrow.use;
  const MatScalar   *blk      = baij->a; /* walks the stored block values from first to last */
  const PetscInt    *colidx   = baij->j;
  const PetscInt    *rowptr, *rowmap = NULL;
  const PetscScalar *xarr;
  PetscScalar       *zarr, *zdst;
  PetscScalar        acc[15];
  PetscInt           nrows, row, b, c, r;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &xarr));
  PetscCall(VecGetArrayWrite(zz, &zarr));

  if (compress) {
    nrows  = baij->compressedrow.nrows;
    rowptr = baij->compressedrow.i;
    rowmap = baij->compressedrow.rindex;
    /* only the block rows listed in rindex are stored below; all other entries of zz stay zero */
    PetscCall(PetscArrayzero(zarr, 15 * baij->mbs));
  } else {
    nrows  = baij->mbs;
    rowptr = baij->i;
  }

  for (row = 0; row < nrows; row++) {
    const PetscInt  nblk = rowptr[row + 1] - rowptr[row];
    const PetscInt *cols = colidx + rowptr[row];

    for (r = 0; r < 15; r++) acc[r] = 0.0;

    for (b = 0; b < nblk; b++) {
      const PetscScalar *xb = xarr + 15 * cols[b];

      /* one block column per pass: scale it by the matching entry of xx and accumulate */
      for (c = 0; c < 15; c++) {
        const PetscScalar xv = xb[c];

        for (r = 0; r < 15; r++) acc[r] += blk[r] * xv;
        blk += 15;
      }
    }

    zdst = zarr + 15 * (compress ? rowmap[row] : row);
    for (r = 0; r < 15; r++) zdst[r] = acc[r];
  }

  PetscCall(VecRestoreArrayRead(xx, &xarr));
  PetscCall(VecRestoreArrayWrite(zz, &zarr));
  PetscCall(PetscLogFlops(450.0 * baij->nz - 15.0 * baij->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
15818ab949d8SShri Abhyankar 15828ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1583d71ae5a4SJacob Faibussowitsch PetscErrorCode
MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) 1584d71ae5a4SJacob Faibussowitsch { 15858ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1586f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 15878ab949d8SShri Abhyankar const PetscScalar *x, *xb; 15880b8f6341SShri Abhyankar PetscScalar x1, x2, x3, x4, *zarray; 15898ab949d8SShri Abhyankar const MatScalar *v; 15908ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 15917c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1592ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 15938ab949d8SShri Abhyankar 15948ab949d8SShri Abhyankar PetscFunctionBegin; 15959566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 15969566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 15978ab949d8SShri Abhyankar 15988ab949d8SShri Abhyankar v = a->a; 15998ab949d8SShri Abhyankar if (usecprow) { 16008ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 16018ab949d8SShri Abhyankar ii = a->compressedrow.i; 16028ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 16039566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 16048ab949d8SShri Abhyankar } else { 16058ab949d8SShri Abhyankar mbs = a->mbs; 16068ab949d8SShri Abhyankar ii = a->i; 16078ab949d8SShri Abhyankar z = zarray; 16088ab949d8SShri Abhyankar } 16098ab949d8SShri Abhyankar 16108ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 16118ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 16128ab949d8SShri Abhyankar idx = ij + ii[i]; 16139371c9d4SSatish Balay sum1 = 0.0; 16149371c9d4SSatish Balay sum2 = 0.0; 16159371c9d4SSatish Balay sum3 = 0.0; 16169371c9d4SSatish Balay sum4 = 0.0; 16179371c9d4SSatish Balay sum5 = 0.0; 16189371c9d4SSatish Balay sum6 = 0.0; 16199371c9d4SSatish Balay sum7 = 0.0; 16209371c9d4SSatish Balay sum8 = 0.0; 16219371c9d4SSatish Balay sum9 = 0.0; 16229371c9d4SSatish Balay sum10 = 0.0; 
16239371c9d4SSatish Balay sum11 = 0.0; 16249371c9d4SSatish Balay sum12 = 0.0; 16259371c9d4SSatish Balay sum13 = 0.0; 16269371c9d4SSatish Balay sum14 = 0.0; 16279371c9d4SSatish Balay sum15 = 0.0; 16288ab949d8SShri Abhyankar 16298ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 16308ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 16319371c9d4SSatish Balay x1 = xb[0]; 16329371c9d4SSatish Balay x2 = xb[1]; 16339371c9d4SSatish Balay x3 = xb[2]; 16349371c9d4SSatish Balay x4 = xb[3]; 16358ab949d8SShri Abhyankar 16368ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16378ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16388ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16398ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16408ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16418ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16428ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16438ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16448ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16458ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16468ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16478ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16488ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16498ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16508ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16518ab949d8SShri Abhyankar 16528ab949d8SShri Abhyankar v += 60; 16538ab949d8SShri Abhyankar 16549371c9d4SSatish Balay x1 = xb[4]; 16559371c9d4SSatish Balay x2 = xb[5]; 16569371c9d4SSatish 
Balay x3 = xb[6]; 16579371c9d4SSatish Balay x4 = xb[7]; 16588ab949d8SShri Abhyankar 16598ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16608ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16618ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16628ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16638ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16648ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16658ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16668ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16678ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16688ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16698ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16708ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16718ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16728ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16738ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16748ab949d8SShri Abhyankar v += 60; 16758ab949d8SShri Abhyankar 16769371c9d4SSatish Balay x1 = xb[8]; 16779371c9d4SSatish Balay x2 = xb[9]; 16789371c9d4SSatish Balay x3 = xb[10]; 16799371c9d4SSatish Balay x4 = xb[11]; 16800b8f6341SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16810b8f6341SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16820b8f6341SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16830b8f6341SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16840b8f6341SShri Abhyankar sum5 += v[4] * x1 + 
v[19] * x2 + v[34] * x3 + v[49] * x4; 16850b8f6341SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16860b8f6341SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16870b8f6341SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16880b8f6341SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16890b8f6341SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16900b8f6341SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16910b8f6341SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16920b8f6341SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16930b8f6341SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16940b8f6341SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16950b8f6341SShri Abhyankar v += 60; 16960b8f6341SShri Abhyankar 16979371c9d4SSatish Balay x1 = xb[12]; 16989371c9d4SSatish Balay x2 = xb[13]; 16999371c9d4SSatish Balay x3 = xb[14]; 17008ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 17018ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 17028ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 17038ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 17048ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 17058ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 17068ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 17078ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 17088ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 17098ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 17108ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 17118ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 17128ab949d8SShri Abhyankar 
sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 17138ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 17148ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 17158ab949d8SShri Abhyankar v += 45; 17168ab949d8SShri Abhyankar } 17178ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 17189371c9d4SSatish Balay z[0] = sum1; 17199371c9d4SSatish Balay z[1] = sum2; 17209371c9d4SSatish Balay z[2] = sum3; 17219371c9d4SSatish Balay z[3] = sum4; 17229371c9d4SSatish Balay z[4] = sum5; 17239371c9d4SSatish Balay z[5] = sum6; 17249371c9d4SSatish Balay z[6] = sum7; 17259371c9d4SSatish Balay z[7] = sum8; 17269371c9d4SSatish Balay z[8] = sum9; 17279371c9d4SSatish Balay z[9] = sum10; 17289371c9d4SSatish Balay z[10] = sum11; 17299371c9d4SSatish Balay z[11] = sum12; 17309371c9d4SSatish Balay z[12] = sum13; 17319371c9d4SSatish Balay z[13] = sum14; 17329371c9d4SSatish Balay z[14] = sum15; 17338ab949d8SShri Abhyankar 17348ab949d8SShri Abhyankar if (!usecprow) z += 15; 17358ab949d8SShri Abhyankar } 17368ab949d8SShri Abhyankar 17379566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 17389566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 17399566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 17403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17418ab949d8SShri Abhyankar } 17428ab949d8SShri Abhyankar
/*
  MatMult_SeqBAIJ_15_ver3 - Computes zz = A * xx for a SeqBAIJ matrix with 15 x 15 blocks.

  Version 3: the columns of every block are consumed in two sets, the first eight and
  then the remaining seven (8,7). Within a block, the value multiplying input entry c and
  accumulated into output entry r is read from position r + 15 * c of the block's 225
  stored values.

  Input Parameters:
    A  - the matrix; A->data is used as a Mat_SeqBAIJ
    xx - the vector multiplied by A

  Output Parameter:
    zz - the product
*/
PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
{
  Mat_SeqBAIJ       *baij     = (Mat_SeqBAIJ *)A->data;
  const PetscBool    compress = baij->compressedrow.use;
  const MatScalar   *blk      = baij->a; /* walks the stored block values from first to last */
  const PetscInt    *colidx   = baij->j;
  const PetscInt    *rowptr, *rowmap = NULL;
  const PetscScalar *xarr;
  PetscScalar       *zarr, *zdst;
  PetscScalar        acc[15];
  PetscInt           nrows, row, b, r;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &xarr));
  PetscCall(VecGetArrayWrite(zz, &zarr));

  if (compress) {
    nrows  = baij->compressedrow.nrows;
    rowptr = baij->compressedrow.i;
    rowmap = baij->compressedrow.rindex;
    /* only the block rows listed in rindex are stored below; all other entries of zz stay zero */
    PetscCall(PetscArrayzero(zarr, 15 * baij->mbs));
  } else {
    nrows  = baij->mbs;
    rowptr = baij->i;
  }

  for (row = 0; row < nrows; row++) {
    const PetscInt  nblk = rowptr[row + 1] - rowptr[row];
    const PetscInt *cols = colidx + rowptr[row];

    for (r = 0; r < 15; r++) acc[r] = 0.0;

    for (b = 0; b < nblk; b++) {
      const PetscScalar *xb = xarr + 15 * cols[b];
      PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8;

      /* first set: block columns 0 through 7 */
      x1 = xb[0];
      x2 = xb[1];
      x3 = xb[2];
      x4 = xb[3];
      x5 = xb[4];
      x6 = xb[5];
      x7 = xb[6];
      x8 = xb[7];
      for (r = 0; r < 15; r++) acc[r] += blk[r] * x1 + blk[r + 15] * x2 + blk[r + 30] * x3 + blk[r + 45] * x4 + blk[r + 60] * x5 + blk[r + 75] * x6 + blk[r + 90] * x7 + blk[r + 105] * x8;
      blk += 120; /* 8 columns x 15 rows consumed */

      /* second set: block columns 8 through 14 */
      x1 = xb[8];
      x2 = xb[9];
      x3 = xb[10];
      x4 = xb[11];
      x5 = xb[12];
      x6 = xb[13];
      x7 = xb[14];
      for (r = 0; r < 15; r++) acc[r] += blk[r] * x1 + blk[r + 15] * x2 + blk[r + 30] * x3 + blk[r + 45] * x4 + blk[r + 60] * x5 + blk[r + 75] * x6 + blk[r + 90] * x7;
      blk += 105; /* 7 columns x 15 rows consumed */
    }

    zdst = zarr + 15 * (compress ? rowmap[row] : row);
    for (r = 0; r < 15; r++) zdst[r] = acc[r];
  }

  PetscCall(VecRestoreArrayRead(xx, &xarr));
  PetscCall(VecRestoreArrayWrite(zz, &zarr));
  PetscCall(PetscLogFlops(450.0 * baij->nz - 15.0 * baij->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
18688ab949d8SShri Abhyankar 18698ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are
accessed at once */
/*
  Computes zz = A * xx for a SeqBAIJ matrix whose blocks are 15 x 15.

  For every stored block the 15 entries of the matching piece of xx are loaded
  once and all 15 result rows are updated from them; entry (r, c) of a block is
  read from v[r + 15 * c].

  When a->compressedrow.use is set, only the block rows listed in
  a->compressedrow.rindex are visited, so zz is zeroed up front.
*/
PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
{
  Mat_SeqBAIJ       *baij     = (Mat_SeqBAIJ *)A->data;
  PetscBool          usecprow = baij->compressedrow.use;
  const PetscInt    *rowptr, *cols, *ridx = NULL;
  const MatScalar   *v = baij->a;
  const PetscScalar *x, *xb;
  PetscScalar       *zarray, *z;
  PetscScalar        sum[15], xv[15];
  PetscInt           nrows, i, j, k, n;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  if (!usecprow) {
    nrows  = baij->mbs;
    rowptr = baij->i;
  } else {
    nrows  = baij->compressedrow.nrows;
    rowptr = baij->compressedrow.i;
    ridx   = baij->compressedrow.rindex;
    /* rows that are skipped below must still end up as zero */
    PetscCall(PetscArrayzero(zarray, 15 * baij->mbs));
  }

  for (i = 0; i < nrows; i++) {
    n    = rowptr[i + 1] - rowptr[i]; /* number of blocks in this block row */
    cols = baij->j + rowptr[i];       /* block column index of each of them */
    for (k = 0; k < 15; k++) sum[k] = 0.0;

    for (j = 0; j < n; j++, v += 225) {
      xb = x + 15 * cols[j];
      for (k = 0; k < 15; k++) xv[k] = xb[k];

      sum[0] += v[0] * xv[0] + v[15] * xv[1] + v[30] * xv[2] + v[45] * xv[3] + v[60] * xv[4] + v[75] * xv[5] + v[90] * xv[6] + v[105] * xv[7] + v[120] * xv[8] + v[135] * xv[9] + v[150] * xv[10] + v[165] * xv[11] + v[180] * xv[12] + v[195] * xv[13] + v[210] * xv[14];
      sum[1] += v[1] * xv[0] + v[16] * xv[1] + v[31] * xv[2] + v[46] * xv[3] + v[61] * xv[4] + v[76] * xv[5] + v[91] * xv[6] + v[106] * xv[7] + v[121] * xv[8] + v[136] * xv[9] + v[151] * xv[10] + v[166] * xv[11] + v[181] * xv[12] + v[196] * xv[13] + v[211] * xv[14];
      sum[2] += v[2] * xv[0] + v[17] * xv[1] + v[32] * xv[2] + v[47] * xv[3] + v[62] * xv[4] + v[77] * xv[5] + v[92] * xv[6] + v[107] * xv[7] + v[122] * xv[8] + v[137] * xv[9] + v[152] * xv[10] + v[167] * xv[11] + v[182] * xv[12] + v[197] * xv[13] + v[212] * xv[14];
      sum[3] += v[3] * xv[0] + v[18] * xv[1] + v[33] * xv[2] + v[48] * xv[3] + v[63] * xv[4] + v[78] * xv[5] + v[93] * xv[6] + v[108] * xv[7] + v[123] * xv[8] + v[138] * xv[9] + v[153] * xv[10] + v[168] * xv[11] + v[183] * xv[12] + v[198] * xv[13] + v[213] * xv[14];
      sum[4] += v[4] * xv[0] + v[19] * xv[1] + v[34] * xv[2] + v[49] * xv[3] + v[64] * xv[4] + v[79] * xv[5] + v[94] * xv[6] + v[109] * xv[7] + v[124] * xv[8] + v[139] * xv[9] + v[154] * xv[10] + v[169] * xv[11] + v[184] * xv[12] + v[199] * xv[13] + v[214] * xv[14];
      sum[5] += v[5] * xv[0] + v[20] * xv[1] + v[35] * xv[2] + v[50] * xv[3] + v[65] * xv[4] + v[80] * xv[5] + v[95] * xv[6] + v[110] * xv[7] + v[125] * xv[8] + v[140] * xv[9] + v[155] * xv[10] + v[170] * xv[11] + v[185] * xv[12] + v[200] * xv[13] + v[215] * xv[14];
      sum[6] += v[6] * xv[0] + v[21] * xv[1] + v[36] * xv[2] + v[51] * xv[3] + v[66] * xv[4] + v[81] * xv[5] + v[96] * xv[6] + v[111] * xv[7] + v[126] * xv[8] + v[141] * xv[9] + v[156] * xv[10] + v[171] * xv[11] + v[186] * xv[12] + v[201] * xv[13] + v[216] * xv[14];
      sum[7] += v[7] * xv[0] + v[22] * xv[1] + v[37] * xv[2] + v[52] * xv[3] + v[67] * xv[4] + v[82] * xv[5] + v[97] * xv[6] + v[112] * xv[7] + v[127] * xv[8] + v[142] * xv[9] + v[157] * xv[10] + v[172] * xv[11] + v[187] * xv[12] + v[202] * xv[13] + v[217] * xv[14];
      sum[8] += v[8] * xv[0] + v[23] * xv[1] + v[38] * xv[2] + v[53] * xv[3] + v[68] * xv[4] + v[83] * xv[5] + v[98] * xv[6] + v[113] * xv[7] + v[128] * xv[8] + v[143] * xv[9] + v[158] * xv[10] + v[173] * xv[11] + v[188] * xv[12] + v[203] * xv[13] + v[218] * xv[14];
      sum[9] += v[9] * xv[0] + v[24] * xv[1] + v[39] * xv[2] + v[54] * xv[3] + v[69] * xv[4] + v[84] * xv[5] + v[99] * xv[6] + v[114] * xv[7] + v[129] * xv[8] + v[144] * xv[9] + v[159] * xv[10] + v[174] * xv[11] + v[189] * xv[12] + v[204] * xv[13] + v[219] * xv[14];
      sum[10] += v[10] * xv[0] + v[25] * xv[1] + v[40] * xv[2] + v[55] * xv[3] + v[70] * xv[4] + v[85] * xv[5] + v[100] * xv[6] + v[115] * xv[7] + v[130] * xv[8] + v[145] * xv[9] + v[160] * xv[10] + v[175] * xv[11] + v[190] * xv[12] + v[205] * xv[13] + v[220] * xv[14];
      sum[11] += v[11] * xv[0] + v[26] * xv[1] + v[41] * xv[2] + v[56] * xv[3] + v[71] * xv[4] + v[86] * xv[5] + v[101] * xv[6] + v[116] * xv[7] + v[131] * xv[8] + v[146] * xv[9] + v[161] * xv[10] + v[176] * xv[11] + v[191] * xv[12] + v[206] * xv[13] + v[221] * xv[14];
      sum[12] += v[12] * xv[0] + v[27] * xv[1] + v[42] * xv[2] + v[57] * xv[3] + v[72] * xv[4] + v[87] * xv[5] + v[102] * xv[6] + v[117] * xv[7] + v[132] * xv[8] + v[147] * xv[9] + v[162] * xv[10] + v[177] * xv[11] + v[192] * xv[12] + v[207] * xv[13] + v[222] * xv[14];
      sum[13] += v[13] * xv[0] + v[28] * xv[1] + v[43] * xv[2] + v[58] * xv[3] + v[73] * xv[4] + v[88] * xv[5] + v[103] * xv[6] + v[118] * xv[7] + v[133] * xv[8] + v[148] * xv[9] + v[163] * xv[10] + v[178] * xv[11] + v[193] * xv[12] + v[208] * xv[13] + v[223] * xv[14];
      sum[14] += v[14] * xv[0] + v[29] * xv[1] + v[44] * xv[2] + v[59] * xv[3] + v[74] * xv[4] + v[89] * xv[5] + v[104] * xv[6] + v[119] * xv[7] + v[134] * xv[8] + v[149] * xv[9] + v[164] * xv[10] + v[179] * xv[11] + v[194] * xv[12] + v[209] * xv[13] + v[224] * xv[14];
    }

    /* compressed rows scatter to their real block row; otherwise rows are consecutive */
    z = zarray + 15 * (usecprow ? ridx[i] : i);
    for (k = 0; k < 15; k++) z[k] = sum[k];
  }

  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  PetscCall(PetscLogFlops(450.0 * baij->nz - 15.0 * baij->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* 19783f1db9ecSBarry Smith This will not work with MatScalar == float because it calls the BLAS 19793f1db9ecSBarry Smith */ 1980d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) 1981d71ae5a4SJacob Faibussowitsch { 19822d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1983f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 1984d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1985d9ca1df4SBarry Smith const MatScalar *v; 1986d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1987d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 1988d9ca1df4SBarry Smith PetscInt ncols, k; 1989ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 19902d61bbb3SSatish Balay 19912d61bbb3SSatish Balay PetscFunctionBegin; 19929566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 19939566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 19942d61bbb3SSatish Balay 19952d61bbb3SSatish Balay idx = a->j; 19962d61bbb3SSatish Balay v = a->a; 199726e093fcSHong Zhang if (usecprow) { 199826e093fcSHong Zhang mbs = a->compressedrow.nrows; 199926e093fcSHong Zhang ii = a->compressedrow.i; 20007b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 20019566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 200226e093fcSHong Zhang } else { 200326e093fcSHong Zhang mbs = a->mbs; 20042d61bbb3SSatish Balay ii = a->i; 200526e093fcSHong Zhang z = zarray; 200626e093fcSHong Zhang } 2007218c64b6SSatish Balay 20082d61bbb3SSatish Balay if (!a->mult_work) { 2009d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 20109566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 20112d61bbb3SSatish Balay } 20122d61bbb3SSatish Balay work = a->mult_work; 20132d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 20149371c9d4SSatish Balay n = ii[1] - ii[0]; 20159371c9d4SSatish Balay ii++; 20162d61bbb3SSatish Balay ncols = n * bs; 
20172d61bbb3SSatish Balay workt = work; 20182d61bbb3SSatish Balay for (j = 0; j < n; j++) { 20192d61bbb3SSatish Balay xb = x + bs * (*idx++); 20202d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 20212d61bbb3SSatish Balay workt += bs; 20222d61bbb3SSatish Balay } 20237b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * ridx[i]; 202496b95a6bSBarry Smith PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 20252d61bbb3SSatish Balay v += n * bs2; 202626e093fcSHong Zhang if (!usecprow) z += bs; 20272d61bbb3SSatish Balay } 20289566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 20299566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 20309566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 20313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20322d61bbb3SSatish Balay } 20332d61bbb3SSatish Balay 2034d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 2035d71ae5a4SJacob Faibussowitsch { 20362d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2037122f12eaSBarry Smith const PetscScalar *x; 2038122f12eaSBarry Smith PetscScalar *y, *z, sum; 2039122f12eaSBarry Smith const MatScalar *v; 20407c565772SBarry Smith PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2041122f12eaSBarry Smith const PetscInt *idx, *ii; 2042ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 20432d61bbb3SSatish Balay 20442d61bbb3SSatish Balay PetscFunctionBegin; 20459566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 20469566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 20472d61bbb3SSatish Balay 20482d61bbb3SSatish Balay idx = a->j; 20492d61bbb3SSatish Balay v = a->a; 205026e093fcSHong Zhang if (usecprow) { 205148a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 205226e093fcSHong Zhang mbs = a->compressedrow.nrows; 205326e093fcSHong Zhang ii = a->compressedrow.i; 
20547b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 205526e093fcSHong Zhang } else { 20562d61bbb3SSatish Balay ii = a->i; 205726e093fcSHong Zhang } 20582d61bbb3SSatish Balay 20592d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 2060122f12eaSBarry Smith n = ii[1] - ii[0]; 2061122f12eaSBarry Smith ii++; 206226e093fcSHong Zhang if (!usecprow) { 2063122f12eaSBarry Smith sum = y[i]; 2064122f12eaSBarry Smith } else { 2065122f12eaSBarry Smith sum = y[ridx[i]]; 2066122f12eaSBarry Smith } 2067444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2068444d8c10SJed Brown PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2069122f12eaSBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 2070122f12eaSBarry Smith v += n; 2071122f12eaSBarry Smith idx += n; 2072122f12eaSBarry Smith if (usecprow) { 2073122f12eaSBarry Smith z[ridx[i]] = sum; 2074122f12eaSBarry Smith } else { 2075122f12eaSBarry Smith z[i] = sum; 207626e093fcSHong Zhang } 20772d61bbb3SSatish Balay } 20789566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 20799566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 20809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 20813ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20822d61bbb3SSatish Balay } 20832d61bbb3SSatish Balay 2084d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 2085d71ae5a4SJacob Faibussowitsch { 20862d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2087f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2088d9ca1df4SBarry Smith const PetscScalar *x, *xb; 208926e093fcSHong Zhang PetscScalar x1, x2, *yarray, *zarray; 2090d9ca1df4SBarry Smith const MatScalar *v; 2091d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, j; 2092d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = 
NULL; 2093ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 20942d61bbb3SSatish Balay 20952d61bbb3SSatish Balay PetscFunctionBegin; 20969566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 20979566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 20982d61bbb3SSatish Balay 20992d61bbb3SSatish Balay idx = a->j; 21002d61bbb3SSatish Balay v = a->a; 210126e093fcSHong Zhang if (usecprow) { 210248a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 210326e093fcSHong Zhang mbs = a->compressedrow.nrows; 210426e093fcSHong Zhang ii = a->compressedrow.i; 21057b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 210626e093fcSHong Zhang } else { 21072d61bbb3SSatish Balay ii = a->i; 210826e093fcSHong Zhang y = yarray; 210926e093fcSHong Zhang z = zarray; 211026e093fcSHong Zhang } 21112d61bbb3SSatish Balay 21122d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 21139371c9d4SSatish Balay n = ii[1] - ii[0]; 21149371c9d4SSatish Balay ii++; 211526e093fcSHong Zhang if (usecprow) { 21167b2bb3b9SHong Zhang z = zarray + 2 * ridx[i]; 21177b2bb3b9SHong Zhang y = yarray + 2 * ridx[i]; 211826e093fcSHong Zhang } 21199371c9d4SSatish Balay sum1 = y[0]; 21209371c9d4SSatish Balay sum2 = y[1]; 2121444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2122444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 21232d61bbb3SSatish Balay for (j = 0; j < n; j++) { 212426fbe8dcSKarl Rupp xb = x + 2 * (*idx++); 212526fbe8dcSKarl Rupp x1 = xb[0]; 212626fbe8dcSKarl Rupp x2 = xb[1]; 212726fbe8dcSKarl Rupp 21282d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 21292d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 21302d61bbb3SSatish Balay v += 4; 21312d61bbb3SSatish Balay } 21329371c9d4SSatish Balay z[0] = sum1; 21339371c9d4SSatish Balay z[1] = sum2; 213426e093fcSHong Zhang 
if (!usecprow) { 21359371c9d4SSatish Balay z += 2; 21369371c9d4SSatish Balay y += 2; 21372d61bbb3SSatish Balay } 213826e093fcSHong Zhang } 21399566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 21409566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 21419566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * a->nz)); 21423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21432d61bbb3SSatish Balay } 21442d61bbb3SSatish Balay 2145d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 2146d71ae5a4SJacob Faibussowitsch { 21472d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2148f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2149d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2150d9ca1df4SBarry Smith const MatScalar *v; 2151d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2152d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2153ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 21542d61bbb3SSatish Balay 21552d61bbb3SSatish Balay PetscFunctionBegin; 21569566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 21579566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 21582d61bbb3SSatish Balay 21592d61bbb3SSatish Balay idx = a->j; 21602d61bbb3SSatish Balay v = a->a; 216126e093fcSHong Zhang if (usecprow) { 216248a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 216326e093fcSHong Zhang mbs = a->compressedrow.nrows; 216426e093fcSHong Zhang ii = a->compressedrow.i; 21657b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 216626e093fcSHong Zhang } else { 21672d61bbb3SSatish Balay ii = a->i; 216826e093fcSHong Zhang y = yarray; 216926e093fcSHong Zhang z = zarray; 217026e093fcSHong Zhang } 21712d61bbb3SSatish Balay 21722d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 21739371c9d4SSatish Balay 
n = ii[1] - ii[0]; 21749371c9d4SSatish Balay ii++; 217526e093fcSHong Zhang if (usecprow) { 21767b2bb3b9SHong Zhang z = zarray + 3 * ridx[i]; 21777b2bb3b9SHong Zhang y = yarray + 3 * ridx[i]; 217826e093fcSHong Zhang } 21799371c9d4SSatish Balay sum1 = y[0]; 21809371c9d4SSatish Balay sum2 = y[1]; 21819371c9d4SSatish Balay sum3 = y[2]; 2182444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2183444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 21842d61bbb3SSatish Balay for (j = 0; j < n; j++) { 21859371c9d4SSatish Balay xb = x + 3 * (*idx++); 21869371c9d4SSatish Balay x1 = xb[0]; 21879371c9d4SSatish Balay x2 = xb[1]; 21889371c9d4SSatish Balay x3 = xb[2]; 21892d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 21902d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 21912d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 21922d61bbb3SSatish Balay v += 9; 21932d61bbb3SSatish Balay } 21949371c9d4SSatish Balay z[0] = sum1; 21959371c9d4SSatish Balay z[1] = sum2; 21969371c9d4SSatish Balay z[2] = sum3; 219726e093fcSHong Zhang if (!usecprow) { 21989371c9d4SSatish Balay z += 3; 21999371c9d4SSatish Balay y += 3; 22002d61bbb3SSatish Balay } 220126e093fcSHong Zhang } 22029566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 22039566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 22049566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz)); 22053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 22062d61bbb3SSatish Balay } 22072d61bbb3SSatish Balay 2208d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 2209d71ae5a4SJacob Faibussowitsch { 22102d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2211f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, 
sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2212d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2213d9ca1df4SBarry Smith const MatScalar *v; 2214d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2215d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2216ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 22172d61bbb3SSatish Balay 22182d61bbb3SSatish Balay PetscFunctionBegin; 22199566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 22209566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 22212d61bbb3SSatish Balay 22222d61bbb3SSatish Balay idx = a->j; 22232d61bbb3SSatish Balay v = a->a; 222426e093fcSHong Zhang if (usecprow) { 222548a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 222626e093fcSHong Zhang mbs = a->compressedrow.nrows; 222726e093fcSHong Zhang ii = a->compressedrow.i; 22287b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 222926e093fcSHong Zhang } else { 22302d61bbb3SSatish Balay ii = a->i; 223126e093fcSHong Zhang y = yarray; 223226e093fcSHong Zhang z = zarray; 223326e093fcSHong Zhang } 22342d61bbb3SSatish Balay 22352d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 22369371c9d4SSatish Balay n = ii[1] - ii[0]; 22379371c9d4SSatish Balay ii++; 223826e093fcSHong Zhang if (usecprow) { 22397b2bb3b9SHong Zhang z = zarray + 4 * ridx[i]; 22407b2bb3b9SHong Zhang y = yarray + 4 * ridx[i]; 224126e093fcSHong Zhang } 22429371c9d4SSatish Balay sum1 = y[0]; 22439371c9d4SSatish Balay sum2 = y[1]; 22449371c9d4SSatish Balay sum3 = y[2]; 22459371c9d4SSatish Balay sum4 = y[3]; 2246444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2247444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 22482d61bbb3SSatish Balay for (j = 0; j < n; j++) { 22492d61bbb3SSatish Balay xb = x + 4 * (*idx++); 
22509371c9d4SSatish Balay x1 = xb[0]; 22519371c9d4SSatish Balay x2 = xb[1]; 22529371c9d4SSatish Balay x3 = xb[2]; 22539371c9d4SSatish Balay x4 = xb[3]; 22542d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 22552d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 22562d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 22572d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 22582d61bbb3SSatish Balay v += 16; 22592d61bbb3SSatish Balay } 22609371c9d4SSatish Balay z[0] = sum1; 22619371c9d4SSatish Balay z[1] = sum2; 22629371c9d4SSatish Balay z[2] = sum3; 22639371c9d4SSatish Balay z[3] = sum4; 226426e093fcSHong Zhang if (!usecprow) { 22659371c9d4SSatish Balay z += 4; 22669371c9d4SSatish Balay y += 4; 22672d61bbb3SSatish Balay } 226826e093fcSHong Zhang } 22699566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 22709566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 22719566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz)); 22723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 22732d61bbb3SSatish Balay } 22742d61bbb3SSatish Balay 2275d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 2276d71ae5a4SJacob Faibussowitsch { 22772d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2278f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2279d9ca1df4SBarry Smith const PetscScalar *x, *xb; 228026e093fcSHong Zhang PetscScalar *yarray, *zarray; 2281d9ca1df4SBarry Smith const MatScalar *v; 2282d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2283d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2284ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 22852d61bbb3SSatish Balay 22862d61bbb3SSatish Balay PetscFunctionBegin; 22879566063dSJacob Faibussowitsch 
PetscCall(VecGetArrayRead(xx, &x)); 22889566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 22892d61bbb3SSatish Balay 22902d61bbb3SSatish Balay idx = a->j; 22912d61bbb3SSatish Balay v = a->a; 229226e093fcSHong Zhang if (usecprow) { 229348a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 229426e093fcSHong Zhang mbs = a->compressedrow.nrows; 229526e093fcSHong Zhang ii = a->compressedrow.i; 22967b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 229726e093fcSHong Zhang } else { 22982d61bbb3SSatish Balay ii = a->i; 229926e093fcSHong Zhang y = yarray; 230026e093fcSHong Zhang z = zarray; 230126e093fcSHong Zhang } 23022d61bbb3SSatish Balay 23032d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 23049371c9d4SSatish Balay n = ii[1] - ii[0]; 23059371c9d4SSatish Balay ii++; 230626e093fcSHong Zhang if (usecprow) { 23077b2bb3b9SHong Zhang z = zarray + 5 * ridx[i]; 23087b2bb3b9SHong Zhang y = yarray + 5 * ridx[i]; 230926e093fcSHong Zhang } 23109371c9d4SSatish Balay sum1 = y[0]; 23119371c9d4SSatish Balay sum2 = y[1]; 23129371c9d4SSatish Balay sum3 = y[2]; 23139371c9d4SSatish Balay sum4 = y[3]; 23149371c9d4SSatish Balay sum5 = y[4]; 2315444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2316444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 23172d61bbb3SSatish Balay for (j = 0; j < n; j++) { 23182d61bbb3SSatish Balay xb = x + 5 * (*idx++); 23199371c9d4SSatish Balay x1 = xb[0]; 23209371c9d4SSatish Balay x2 = xb[1]; 23219371c9d4SSatish Balay x3 = xb[2]; 23229371c9d4SSatish Balay x4 = xb[3]; 23239371c9d4SSatish Balay x5 = xb[4]; 23242d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 23252d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 23262d61bbb3SSatish Balay sum3 += v[2] * x1 + 
v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 23272d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 23282d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 23292d61bbb3SSatish Balay v += 25; 23302d61bbb3SSatish Balay } 23319371c9d4SSatish Balay z[0] = sum1; 23329371c9d4SSatish Balay z[1] = sum2; 23339371c9d4SSatish Balay z[2] = sum3; 23349371c9d4SSatish Balay z[3] = sum4; 23359371c9d4SSatish Balay z[4] = sum5; 233626e093fcSHong Zhang if (!usecprow) { 23379371c9d4SSatish Balay z += 5; 23389371c9d4SSatish Balay y += 5; 23392d61bbb3SSatish Balay } 234026e093fcSHong Zhang } 23419566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 23429566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 23439566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz)); 23443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 23452d61bbb3SSatish Balay } 2346c2916339SPierre Jolivet 2347d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 2348d71ae5a4SJacob Faibussowitsch { 234915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2350f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2351d9ca1df4SBarry Smith const PetscScalar *x, *xb; 235226e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2353d9ca1df4SBarry Smith const MatScalar *v; 2354d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2355d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2356ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 235715091d37SBarry Smith 235815091d37SBarry Smith PetscFunctionBegin; 23599566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 23609566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 236115091d37SBarry Smith 236215091d37SBarry Smith idx = a->j; 
236315091d37SBarry Smith v = a->a; 236426e093fcSHong Zhang if (usecprow) { 236548a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 236626e093fcSHong Zhang mbs = a->compressedrow.nrows; 236726e093fcSHong Zhang ii = a->compressedrow.i; 23687b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 236926e093fcSHong Zhang } else { 237015091d37SBarry Smith ii = a->i; 237126e093fcSHong Zhang y = yarray; 237226e093fcSHong Zhang z = zarray; 237326e093fcSHong Zhang } 237415091d37SBarry Smith 237515091d37SBarry Smith for (i = 0; i < mbs; i++) { 23769371c9d4SSatish Balay n = ii[1] - ii[0]; 23779371c9d4SSatish Balay ii++; 237826e093fcSHong Zhang if (usecprow) { 23797b2bb3b9SHong Zhang z = zarray + 6 * ridx[i]; 23807b2bb3b9SHong Zhang y = yarray + 6 * ridx[i]; 238126e093fcSHong Zhang } 23829371c9d4SSatish Balay sum1 = y[0]; 23839371c9d4SSatish Balay sum2 = y[1]; 23849371c9d4SSatish Balay sum3 = y[2]; 23859371c9d4SSatish Balay sum4 = y[3]; 23869371c9d4SSatish Balay sum5 = y[4]; 23879371c9d4SSatish Balay sum6 = y[5]; 2388444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2389444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 239015091d37SBarry Smith for (j = 0; j < n; j++) { 23913b95cb0eSSatish Balay xb = x + 6 * (*idx++); 23929371c9d4SSatish Balay x1 = xb[0]; 23939371c9d4SSatish Balay x2 = xb[1]; 23949371c9d4SSatish Balay x3 = xb[2]; 23959371c9d4SSatish Balay x4 = xb[3]; 23969371c9d4SSatish Balay x5 = xb[4]; 23979371c9d4SSatish Balay x6 = xb[5]; 239815091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 239915091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 240015091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 240115091d37SBarry Smith sum4 
+= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 240215091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 240315091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 240415091d37SBarry Smith v += 36; 240515091d37SBarry Smith } 24069371c9d4SSatish Balay z[0] = sum1; 24079371c9d4SSatish Balay z[1] = sum2; 24089371c9d4SSatish Balay z[2] = sum3; 24099371c9d4SSatish Balay z[3] = sum4; 24109371c9d4SSatish Balay z[4] = sum5; 24119371c9d4SSatish Balay z[5] = sum6; 241226e093fcSHong Zhang if (!usecprow) { 24139371c9d4SSatish Balay z += 6; 24149371c9d4SSatish Balay y += 6; 241515091d37SBarry Smith } 241626e093fcSHong Zhang } 24179566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 24189566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 24199566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * a->nz)); 24203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 242115091d37SBarry Smith } 24222d61bbb3SSatish Balay 2423d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 2424d71ae5a4SJacob Faibussowitsch { 24252d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2426f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2427d9ca1df4SBarry Smith const PetscScalar *x, *xb; 242826e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2429d9ca1df4SBarry Smith const MatScalar *v; 2430d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2431d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2432ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 24332d61bbb3SSatish Balay 24342d61bbb3SSatish Balay PetscFunctionBegin; 24359566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 24369566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, 
zz, &yarray, &zarray)); 24372d61bbb3SSatish Balay 24382d61bbb3SSatish Balay idx = a->j; 24392d61bbb3SSatish Balay v = a->a; 244026e093fcSHong Zhang if (usecprow) { 244148a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 244226e093fcSHong Zhang mbs = a->compressedrow.nrows; 244326e093fcSHong Zhang ii = a->compressedrow.i; 24447b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 244526e093fcSHong Zhang } else { 24462d61bbb3SSatish Balay ii = a->i; 244726e093fcSHong Zhang y = yarray; 244826e093fcSHong Zhang z = zarray; 244926e093fcSHong Zhang } 24502d61bbb3SSatish Balay 24512d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 24529371c9d4SSatish Balay n = ii[1] - ii[0]; 24539371c9d4SSatish Balay ii++; 245426e093fcSHong Zhang if (usecprow) { 24557b2bb3b9SHong Zhang z = zarray + 7 * ridx[i]; 24567b2bb3b9SHong Zhang y = yarray + 7 * ridx[i]; 245726e093fcSHong Zhang } 24589371c9d4SSatish Balay sum1 = y[0]; 24599371c9d4SSatish Balay sum2 = y[1]; 24609371c9d4SSatish Balay sum3 = y[2]; 24619371c9d4SSatish Balay sum4 = y[3]; 24629371c9d4SSatish Balay sum5 = y[4]; 24639371c9d4SSatish Balay sum6 = y[5]; 24649371c9d4SSatish Balay sum7 = y[6]; 2465444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2466444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 24672d61bbb3SSatish Balay for (j = 0; j < n; j++) { 24682d61bbb3SSatish Balay xb = x + 7 * (*idx++); 24699371c9d4SSatish Balay x1 = xb[0]; 24709371c9d4SSatish Balay x2 = xb[1]; 24719371c9d4SSatish Balay x3 = xb[2]; 24729371c9d4SSatish Balay x4 = xb[3]; 24739371c9d4SSatish Balay x5 = xb[4]; 24749371c9d4SSatish Balay x6 = xb[5]; 24759371c9d4SSatish Balay x7 = xb[6]; 24762d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 24772d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + 
v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 24782d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 24792d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 24802d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 24812d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 24822d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 24832d61bbb3SSatish Balay v += 49; 24842d61bbb3SSatish Balay } 24859371c9d4SSatish Balay z[0] = sum1; 24869371c9d4SSatish Balay z[1] = sum2; 24879371c9d4SSatish Balay z[2] = sum3; 24889371c9d4SSatish Balay z[3] = sum4; 24899371c9d4SSatish Balay z[4] = sum5; 24909371c9d4SSatish Balay z[5] = sum6; 24919371c9d4SSatish Balay z[6] = sum7; 249226e093fcSHong Zhang if (!usecprow) { 24939371c9d4SSatish Balay z += 7; 24949371c9d4SSatish Balay y += 7; 24952d61bbb3SSatish Balay } 249626e093fcSHong Zhang } 24979566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 24989566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 24999566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz)); 25003ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 25012d61bbb3SSatish Balay } 2502218c64b6SSatish Balay 25035f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2504d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) 2505d71ae5a4SJacob Faibussowitsch { 250696e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2507f4259b30SLisandro Dalcin PetscScalar *z = 
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
/*
  Adds column c (0 <= c <= 7) of the 9x9 block at blk, scaled by wj[c], to the accumulators
  z0 (rows 0-3), z1 (rows 4-7) and z2 (row 8 in its lowest lane). Uses the locals w, blk, wj,
  z0, z1 and z2 of MatMultAdd_SeqBAIJ_9_AVX2(). The third load reads blk[9c+8 .. 9c+11], which
  stays inside the 81-entry block for every c <= 7; the ninth column is handled separately
  with a masked load.
*/
  #define MatMultAdd_9_AVX2_Column(c) \
    do { \
      w  = _mm256_set1_pd(wj[(c)]); \
      z0 = _mm256_fmadd_pd(_mm256_loadu_pd(blk + 9 * (c)), w, z0); \
      z1 = _mm256_fmadd_pd(_mm256_loadu_pd(blk + 9 * (c) + 4), w, z1); \
      z2 = _mm256_fmadd_pd(_mm256_loadu_pd(blk + 9 * (c) + 8), w, z2); \
    } while (0)

/*
  MatMultAdd_SeqBAIJ_9_AVX2 - Computes zz = yy + A * xx for a SeqBAIJ matrix stored in 9x9
  blocks, using AVX2/FMA intrinsics.

  Input Parameters:
+ A  - the matrix
. xx - vector multiplied by A
- yy - vector added to the product

  Output Parameter:
. zz - the result

  Notes:
  yy is copied into zz up front and the products are then accumulated into zz in place.

  For every block row the needed pieces of xx are first gathered contiguously into
  a->mult_work, which is allocated on first use and kept on the matrix.

  Only the lowest lane of z2 is meaningful (row 8 of the block row); the other lanes are
  scratch and are never written back because the final store is masked.
*/
PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *z = NULL, *work, *workt, *zarray;
  const PetscScalar *x, *xb, *wj;
  const MatScalar   *v, *blk;
  PetscInt           mbs, i, j, n;
  PetscInt           k;
  PetscBool          usecprow = a->compressedrow.use;
  const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
  __m256d            w, z0, z1, z2;
  /*
    Selects lane 0 only. The masked load/store look solely at the most significant bit of each
    64-bit lane, so -1LL is equivalent to a lone sign bit while avoiding the signed-overflow
    shift 1LL << 63, which is undefined behaviour in C.
  */
  const __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, -1LL);

  PetscFunctionBegin;
  PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    mbs = a->mbs;
    ii  = a->i;
    z   = zarray;
  }

  if (!a->mult_work) {
    k = PetscMax(A->rmap->n, A->cmap->n);
    PetscCall(PetscMalloc1(k + 1, &a->mult_work));
  }

  work = a->mult_work;
  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;

    /* gather the x entries that this block row multiplies */
    workt = work;
    for (j = 0; j < n; j++) {
      xb = x + bs * (*idx++);
      for (k = 0; k < bs; k++) workt[k] = xb[k];
      workt += bs;
    }
    if (usecprow) z = zarray + bs * ridx[i];

    z0 = _mm256_loadu_pd(&z[0]);
    z1 = _mm256_loadu_pd(&z[4]);
    z2 = _mm256_set1_pd(z[8]);

    for (j = 0; j < n; j++) {
      blk = v + j * bs2;
      wj  = work + j * bs;

      MatMultAdd_9_AVX2_Column(0);
      MatMultAdd_9_AVX2_Column(1);
      MatMultAdd_9_AVX2_Column(2);
      MatMultAdd_9_AVX2_Column(3);
      MatMultAdd_9_AVX2_Column(4);
      MatMultAdd_9_AVX2_Column(5);
      MatMultAdd_9_AVX2_Column(6);
      MatMultAdd_9_AVX2_Column(7);

      /* ninth column: blk[80] is the last entry of the block, so only that one value is loaded */
      w  = _mm256_set1_pd(wj[8]);
      z0 = _mm256_fmadd_pd(_mm256_loadu_pd(blk + 72), w, z0);
      z1 = _mm256_fmadd_pd(_mm256_loadu_pd(blk + 76), w, z1);
      z2 = _mm256_fmadd_pd(_mm256_maskload_pd(blk + 80, mask1), w, z2);
    }

    _mm256_storeu_pd(&z[0], z0);
    _mm256_storeu_pd(&z[4], z1);
    _mm256_maskstore_pd(&z[8], mask1, z2);

    v += n * bs2;
    if (!usecprow) z += bs;
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &zarray));
  PetscCall(PetscLogFlops(162.0 * a->nz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
  #undef MatMultAdd_9_AVX2_Column
#endif
Vec zz) 2656d71ae5a4SJacob Faibussowitsch { 2657ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2658f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2659ebada01fSBarry Smith const PetscScalar *x, *xb; 2660ebada01fSBarry Smith PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2661ebada01fSBarry Smith const MatScalar *v; 2662ebada01fSBarry Smith PetscInt mbs = a->mbs, i, j, n; 2663ebada01fSBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2664ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 2665ebada01fSBarry Smith 2666ebada01fSBarry Smith PetscFunctionBegin; 26679566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 26689566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2669ebada01fSBarry Smith 2670ebada01fSBarry Smith idx = a->j; 2671ebada01fSBarry Smith v = a->a; 2672ebada01fSBarry Smith if (usecprow) { 267348a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2674ebada01fSBarry Smith mbs = a->compressedrow.nrows; 2675ebada01fSBarry Smith ii = a->compressedrow.i; 2676ebada01fSBarry Smith ridx = a->compressedrow.rindex; 2677ebada01fSBarry Smith } else { 2678ebada01fSBarry Smith ii = a->i; 2679ebada01fSBarry Smith y = yarray; 2680ebada01fSBarry Smith z = zarray; 2681ebada01fSBarry Smith } 2682ebada01fSBarry Smith 2683ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 26849371c9d4SSatish Balay n = ii[1] - ii[0]; 26859371c9d4SSatish Balay ii++; 2686ebada01fSBarry Smith if (usecprow) { 2687ebada01fSBarry Smith z = zarray + 11 * ridx[i]; 2688ebada01fSBarry Smith y = yarray + 11 * ridx[i]; 2689ebada01fSBarry Smith } 26909371c9d4SSatish Balay sum1 = y[0]; 26919371c9d4SSatish Balay sum2 = y[1]; 26929371c9d4SSatish Balay sum3 = y[2]; 26939371c9d4SSatish Balay sum4 = y[3]; 26949371c9d4SSatish Balay sum5 = y[4]; 26959371c9d4SSatish Balay sum6 = y[5]; 
26969371c9d4SSatish Balay sum7 = y[6]; 26979371c9d4SSatish Balay sum8 = y[7]; 26989371c9d4SSatish Balay sum9 = y[8]; 26999371c9d4SSatish Balay sum10 = y[9]; 27009371c9d4SSatish Balay sum11 = y[10]; 2701ebada01fSBarry Smith PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2702ebada01fSBarry Smith PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2703ebada01fSBarry Smith for (j = 0; j < n; j++) { 2704ebada01fSBarry Smith xb = x + 11 * (*idx++); 27059371c9d4SSatish Balay x1 = xb[0]; 27069371c9d4SSatish Balay x2 = xb[1]; 27079371c9d4SSatish Balay x3 = xb[2]; 27089371c9d4SSatish Balay x4 = xb[3]; 27099371c9d4SSatish Balay x5 = xb[4]; 27109371c9d4SSatish Balay x6 = xb[5]; 27119371c9d4SSatish Balay x7 = xb[6]; 27129371c9d4SSatish Balay x8 = xb[7]; 27139371c9d4SSatish Balay x9 = xb[8]; 27149371c9d4SSatish Balay x10 = xb[9]; 27159371c9d4SSatish Balay x11 = xb[10]; 2716ebada01fSBarry Smith sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2717ebada01fSBarry Smith sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2718ebada01fSBarry Smith sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2719ebada01fSBarry Smith sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2720ebada01fSBarry 
Smith sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2721ebada01fSBarry Smith sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2722ebada01fSBarry Smith sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2723ebada01fSBarry Smith sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2724ebada01fSBarry Smith sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2725ebada01fSBarry Smith sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2726ebada01fSBarry Smith sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2727ebada01fSBarry Smith v += 121; 2728ebada01fSBarry Smith } 27299371c9d4SSatish Balay z[0] = sum1; 27309371c9d4SSatish Balay z[1] = sum2; 27319371c9d4SSatish Balay z[2] = sum3; 27329371c9d4SSatish Balay 
z[3] = sum4; 27339371c9d4SSatish Balay z[4] = sum5; 27349371c9d4SSatish Balay z[5] = sum6; 27359371c9d4SSatish Balay z[6] = sum7; 27369371c9d4SSatish Balay z[7] = sum8; 27379371c9d4SSatish Balay z[8] = sum9; 27389371c9d4SSatish Balay z[9] = sum10; 27399371c9d4SSatish Balay z[10] = sum11; 2740ebada01fSBarry Smith if (!usecprow) { 27419371c9d4SSatish Balay z += 11; 27429371c9d4SSatish Balay y += 11; 2743ebada01fSBarry Smith } 2744ebada01fSBarry Smith } 27459566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 27469566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 27479566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz)); 27483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2749ebada01fSBarry Smith } 2750ebada01fSBarry Smith 2751d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 2752d71ae5a4SJacob Faibussowitsch { 27532d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2754f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 2755d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2756d9ca1df4SBarry Smith const MatScalar *v; 2757d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2758d9ca1df4SBarry Smith PetscInt ncols, k; 2759d9ca1df4SBarry Smith const PetscInt *ridx = NULL, *idx, *ii; 2760ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2761218c64b6SSatish Balay 27622d61bbb3SSatish Balay PetscFunctionBegin; 27639566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 27649566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 27659566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &zarray)); 27662d61bbb3SSatish Balay 27672d61bbb3SSatish Balay idx = a->j; 27682d61bbb3SSatish Balay v = a->a; 276926e093fcSHong Zhang if (usecprow) { 277026e093fcSHong Zhang mbs = a->compressedrow.nrows; 277126e093fcSHong Zhang ii = a->compressedrow.i; 27727b2bb3b9SHong Zhang ridx 
= a->compressedrow.rindex; 277326e093fcSHong Zhang } else { 277426e093fcSHong Zhang mbs = a->mbs; 27752d61bbb3SSatish Balay ii = a->i; 277626e093fcSHong Zhang z = zarray; 277726e093fcSHong Zhang } 27782d61bbb3SSatish Balay 27792d61bbb3SSatish Balay if (!a->mult_work) { 2780d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 27819566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 27822d61bbb3SSatish Balay } 27832d61bbb3SSatish Balay work = a->mult_work; 27842d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 27859371c9d4SSatish Balay n = ii[1] - ii[0]; 27869371c9d4SSatish Balay ii++; 27872d61bbb3SSatish Balay ncols = n * bs; 27882d61bbb3SSatish Balay workt = work; 27892d61bbb3SSatish Balay for (j = 0; j < n; j++) { 27902d61bbb3SSatish Balay xb = x + bs * (*idx++); 27912d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 27922d61bbb3SSatish Balay workt += bs; 27932d61bbb3SSatish Balay } 27947b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * ridx[i]; 279596b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 27962d61bbb3SSatish Balay v += n * bs2; 279726fbe8dcSKarl Rupp if (!usecprow) z += bs; 279826e093fcSHong Zhang } 27999566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 28009566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &zarray)); 28019566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 28023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28032d61bbb3SSatish Balay } 28042d61bbb3SSatish Balay 2805d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2806d71ae5a4SJacob Faibussowitsch { 2807547795f9SHong Zhang PetscScalar zero = 0.0; 2808547795f9SHong Zhang 2809547795f9SHong Zhang PetscFunctionBegin; 28109566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 28119566063dSJacob Faibussowitsch PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 28123ba16761SJacob 
Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2813547795f9SHong Zhang } 2814547795f9SHong Zhang 2815d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2816d71ae5a4SJacob Faibussowitsch { 28173447b6efSHong Zhang PetscScalar zero = 0.0; 28182d61bbb3SSatish Balay 28192d61bbb3SSatish Balay PetscFunctionBegin; 28209566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 28219566063dSJacob Faibussowitsch PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 28223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28232d61bbb3SSatish Balay } 28242d61bbb3SSatish Balay 2825d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2826d71ae5a4SJacob Faibussowitsch { 2827547795f9SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2828b8c08b77SHong Zhang PetscScalar *z, x1, x2, x3, x4, x5; 2829d9ca1df4SBarry Smith const PetscScalar *x, *xb = NULL; 2830d9ca1df4SBarry Smith const MatScalar *v; 2831b8c08b77SHong Zhang PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2832d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 2833547795f9SHong Zhang Mat_CompressedRow cprow = a->compressedrow; 2834ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 2835547795f9SHong Zhang 2836547795f9SHong Zhang PetscFunctionBegin; 28379566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 28389566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 28399566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 2840547795f9SHong Zhang 2841547795f9SHong Zhang idx = a->j; 2842547795f9SHong Zhang v = a->a; 2843547795f9SHong Zhang if (usecprow) { 2844547795f9SHong Zhang mbs = cprow.nrows; 2845547795f9SHong Zhang ii = cprow.i; 2846547795f9SHong Zhang ridx = cprow.rindex; 2847547795f9SHong Zhang } else { 2848547795f9SHong Zhang mbs = a->mbs; 2849547795f9SHong Zhang ii = a->i; 2850547795f9SHong Zhang xb = x; 2851547795f9SHong Zhang } 2852547795f9SHong 
Zhang 2853547795f9SHong Zhang switch (bs) { 2854547795f9SHong Zhang case 1: 2855547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2856547795f9SHong Zhang if (usecprow) xb = x + ridx[i]; 2857547795f9SHong Zhang x1 = xb[0]; 2858547795f9SHong Zhang ib = idx + ii[0]; 28599371c9d4SSatish Balay n = ii[1] - ii[0]; 28609371c9d4SSatish Balay ii++; 2861547795f9SHong Zhang for (j = 0; j < n; j++) { 2862547795f9SHong Zhang rval = ib[j]; 2863547795f9SHong Zhang z[rval] += PetscConj(*v) * x1; 2864547795f9SHong Zhang v++; 2865547795f9SHong Zhang } 2866547795f9SHong Zhang if (!usecprow) xb++; 2867547795f9SHong Zhang } 2868547795f9SHong Zhang break; 2869547795f9SHong Zhang case 2: 2870547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2871547795f9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 28729371c9d4SSatish Balay x1 = xb[0]; 28739371c9d4SSatish Balay x2 = xb[1]; 2874547795f9SHong Zhang ib = idx + ii[0]; 28759371c9d4SSatish Balay n = ii[1] - ii[0]; 28769371c9d4SSatish Balay ii++; 2877547795f9SHong Zhang for (j = 0; j < n; j++) { 2878547795f9SHong Zhang rval = ib[j] * 2; 2879547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2880547795f9SHong Zhang z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2881547795f9SHong Zhang v += 4; 2882547795f9SHong Zhang } 2883547795f9SHong Zhang if (!usecprow) xb += 2; 2884547795f9SHong Zhang } 2885547795f9SHong Zhang break; 2886547795f9SHong Zhang case 3: 2887547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2888547795f9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 28899371c9d4SSatish Balay x1 = xb[0]; 28909371c9d4SSatish Balay x2 = xb[1]; 28919371c9d4SSatish Balay x3 = xb[2]; 2892547795f9SHong Zhang ib = idx + ii[0]; 28939371c9d4SSatish Balay n = ii[1] - ii[0]; 28949371c9d4SSatish Balay ii++; 2895547795f9SHong Zhang for (j = 0; j < n; j++) { 2896547795f9SHong Zhang rval = ib[j] * 3; 2897547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2898547795f9SHong Zhang 
z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2899547795f9SHong Zhang z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2900547795f9SHong Zhang v += 9; 2901547795f9SHong Zhang } 2902547795f9SHong Zhang if (!usecprow) xb += 3; 2903547795f9SHong Zhang } 2904547795f9SHong Zhang break; 2905547795f9SHong Zhang case 4: 2906547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2907547795f9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 29089371c9d4SSatish Balay x1 = xb[0]; 29099371c9d4SSatish Balay x2 = xb[1]; 29109371c9d4SSatish Balay x3 = xb[2]; 29119371c9d4SSatish Balay x4 = xb[3]; 2912547795f9SHong Zhang ib = idx + ii[0]; 29139371c9d4SSatish Balay n = ii[1] - ii[0]; 29149371c9d4SSatish Balay ii++; 2915547795f9SHong Zhang for (j = 0; j < n; j++) { 2916547795f9SHong Zhang rval = ib[j] * 4; 2917547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2918547795f9SHong Zhang z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2919547795f9SHong Zhang z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2920547795f9SHong Zhang z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2921547795f9SHong Zhang v += 16; 2922547795f9SHong Zhang } 2923547795f9SHong Zhang if (!usecprow) xb += 4; 2924547795f9SHong Zhang } 2925547795f9SHong Zhang break; 2926547795f9SHong Zhang case 5: 2927547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2928547795f9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 29299371c9d4SSatish Balay x1 = xb[0]; 29309371c9d4SSatish Balay x2 = xb[1]; 29319371c9d4SSatish Balay x3 = xb[2]; 29329371c9d4SSatish Balay x4 = xb[3]; 29339371c9d4SSatish Balay x5 = xb[4]; 2934547795f9SHong Zhang ib = idx + ii[0]; 29359371c9d4SSatish Balay n = ii[1] - ii[0]; 29369371c9d4SSatish Balay ii++; 
2937547795f9SHong Zhang for (j = 0; j < n; j++) { 2938547795f9SHong Zhang rval = ib[j] * 5; 2939547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2940547795f9SHong Zhang z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2941547795f9SHong Zhang z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2942547795f9SHong Zhang z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2943547795f9SHong Zhang z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2944547795f9SHong Zhang v += 25; 2945547795f9SHong Zhang } 2946547795f9SHong Zhang if (!usecprow) xb += 5; 2947547795f9SHong Zhang } 2948547795f9SHong Zhang break; 2949d71ae5a4SJacob Faibussowitsch default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2950d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2951968ae2c8SSatish Balay #if 0 2952968ae2c8SSatish Balay { 2953b8c08b77SHong Zhang PetscInt ncols,k,bs2=a->bs2; 2954b8c08b77SHong Zhang PetscScalar *work,*workt,zb; 2955d9ca1df4SBarry Smith const PetscScalar *xtmp; 2956547795f9SHong Zhang if (!a->mult_work) { 2957547795f9SHong Zhang k = PetscMax(A->rmap->n,A->cmap->n); 29589566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2959547795f9SHong Zhang } 2960547795f9SHong Zhang work = a->mult_work; 2961547795f9SHong Zhang xtmp = x; 2962547795f9SHong Zhang for (i=0; i<mbs; i++) { 2963547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 2964547795f9SHong Zhang ncols = n*bs; 29659566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work,ncols)); 296626fbe8dcSKarl Rupp if (usecprow) 
xtmp = x + bs*ridx[i]; 296796b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2968547795f9SHong Zhang v += n*bs2; 2969547795f9SHong Zhang if (!usecprow) xtmp += bs; 2970547795f9SHong Zhang workt = work; 2971547795f9SHong Zhang for (j=0; j<n; j++) { 2972547795f9SHong Zhang zb = z + bs*(*idx++); 2973547795f9SHong Zhang for (k=0; k<bs; k++) zb[k] += workt[k] ; 2974547795f9SHong Zhang workt += bs; 2975547795f9SHong Zhang } 2976547795f9SHong Zhang } 2977547795f9SHong Zhang } 2978968ae2c8SSatish Balay #endif 2979547795f9SHong Zhang } 29809566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 29819566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 29829566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 29833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2984547795f9SHong Zhang } 2985547795f9SHong Zhang 2986d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2987d71ae5a4SJacob Faibussowitsch { 29882d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2989d9ca1df4SBarry Smith PetscScalar *zb, *z, x1, x2, x3, x4, x5; 2990f4259b30SLisandro Dalcin const PetscScalar *x, *xb = NULL; 2991d9ca1df4SBarry Smith const MatScalar *v; 2992d9ca1df4SBarry Smith PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2993d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 29943447b6efSHong Zhang Mat_CompressedRow cprow = a->compressedrow; 2995ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 29962d61bbb3SSatish Balay 29972d61bbb3SSatish Balay PetscFunctionBegin; 29989566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 29999566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 30009566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 30012d61bbb3SSatish Balay 30022d61bbb3SSatish Balay idx = a->j; 30032d61bbb3SSatish Balay v = a->a; 30043447b6efSHong Zhang if 
(usecprow) { 30053447b6efSHong Zhang mbs = cprow.nrows; 30063447b6efSHong Zhang ii = cprow.i; 30077b2bb3b9SHong Zhang ridx = cprow.rindex; 30083447b6efSHong Zhang } else { 30093447b6efSHong Zhang mbs = a->mbs; 30102d61bbb3SSatish Balay ii = a->i; 3011f1af5d2fSBarry Smith xb = x; 30123447b6efSHong Zhang } 30132d61bbb3SSatish Balay 30142d61bbb3SSatish Balay switch (bs) { 30152d61bbb3SSatish Balay case 1: 30162d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30177b2bb3b9SHong Zhang if (usecprow) xb = x + ridx[i]; 3018f1af5d2fSBarry Smith x1 = xb[0]; 30193447b6efSHong Zhang ib = idx + ii[0]; 30209371c9d4SSatish Balay n = ii[1] - ii[0]; 30219371c9d4SSatish Balay ii++; 30222d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30232d61bbb3SSatish Balay rval = ib[j]; 3024f1af5d2fSBarry Smith z[rval] += *v * x1; 3025f1af5d2fSBarry Smith v++; 30262d61bbb3SSatish Balay } 30273447b6efSHong Zhang if (!usecprow) xb++; 30282d61bbb3SSatish Balay } 30292d61bbb3SSatish Balay break; 30302d61bbb3SSatish Balay case 2: 30312d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30327b2bb3b9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 30339371c9d4SSatish Balay x1 = xb[0]; 30349371c9d4SSatish Balay x2 = xb[1]; 30353447b6efSHong Zhang ib = idx + ii[0]; 30369371c9d4SSatish Balay n = ii[1] - ii[0]; 30379371c9d4SSatish Balay ii++; 30382d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30392d61bbb3SSatish Balay rval = ib[j] * 2; 30402d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2; 30412d61bbb3SSatish Balay z[rval++] += v[2] * x1 + v[3] * x2; 30422d61bbb3SSatish Balay v += 4; 30432d61bbb3SSatish Balay } 30443447b6efSHong Zhang if (!usecprow) xb += 2; 30452d61bbb3SSatish Balay } 30462d61bbb3SSatish Balay break; 30472d61bbb3SSatish Balay case 3: 30482d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30497b2bb3b9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 30509371c9d4SSatish Balay x1 = xb[0]; 30519371c9d4SSatish Balay x2 = xb[1]; 30529371c9d4SSatish Balay x3 = xb[2]; 30533447b6efSHong Zhang ib = 
idx + ii[0]; 30549371c9d4SSatish Balay n = ii[1] - ii[0]; 30559371c9d4SSatish Balay ii++; 30562d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30572d61bbb3SSatish Balay rval = ib[j] * 3; 30582d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 30592d61bbb3SSatish Balay z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 30602d61bbb3SSatish Balay z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 30612d61bbb3SSatish Balay v += 9; 30622d61bbb3SSatish Balay } 30633447b6efSHong Zhang if (!usecprow) xb += 3; 30642d61bbb3SSatish Balay } 30652d61bbb3SSatish Balay break; 30662d61bbb3SSatish Balay case 4: 30672d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30687b2bb3b9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 30699371c9d4SSatish Balay x1 = xb[0]; 30709371c9d4SSatish Balay x2 = xb[1]; 30719371c9d4SSatish Balay x3 = xb[2]; 30729371c9d4SSatish Balay x4 = xb[3]; 30733447b6efSHong Zhang ib = idx + ii[0]; 30749371c9d4SSatish Balay n = ii[1] - ii[0]; 30759371c9d4SSatish Balay ii++; 30762d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30772d61bbb3SSatish Balay rval = ib[j] * 4; 30782d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 30792d61bbb3SSatish Balay z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 30802d61bbb3SSatish Balay z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 30812d61bbb3SSatish Balay z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 30822d61bbb3SSatish Balay v += 16; 30832d61bbb3SSatish Balay } 30843447b6efSHong Zhang if (!usecprow) xb += 4; 30852d61bbb3SSatish Balay } 30862d61bbb3SSatish Balay break; 30872d61bbb3SSatish Balay case 5: 30882d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30897b2bb3b9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 30909371c9d4SSatish Balay x1 = xb[0]; 30919371c9d4SSatish Balay x2 = xb[1]; 30929371c9d4SSatish Balay x3 = xb[2]; 30939371c9d4SSatish Balay x4 = xb[3]; 30949371c9d4SSatish Balay x5 = xb[4]; 30953447b6efSHong Zhang ib = idx + ii[0]; 
30969371c9d4SSatish Balay n = ii[1] - ii[0]; 30979371c9d4SSatish Balay ii++; 30982d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30992d61bbb3SSatish Balay rval = ib[j] * 5; 31002d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 31012d61bbb3SSatish Balay z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 31022d61bbb3SSatish Balay z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 31032d61bbb3SSatish Balay z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 31042d61bbb3SSatish Balay z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 31052d61bbb3SSatish Balay v += 25; 31062d61bbb3SSatish Balay } 31073447b6efSHong Zhang if (!usecprow) xb += 5; 31082d61bbb3SSatish Balay } 31092d61bbb3SSatish Balay break; 3110f1af5d2fSBarry Smith default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3111690b6cddSBarry Smith PetscInt ncols, k; 3112d9ca1df4SBarry Smith PetscScalar *work, *workt; 3113d9ca1df4SBarry Smith const PetscScalar *xtmp; 31142d61bbb3SSatish Balay if (!a->mult_work) { 3115d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 31169566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 31172d61bbb3SSatish Balay } 31182d61bbb3SSatish Balay work = a->mult_work; 31193447b6efSHong Zhang xtmp = x; 31202d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 31219371c9d4SSatish Balay n = ii[1] - ii[0]; 31229371c9d4SSatish Balay ii++; 31232d61bbb3SSatish Balay ncols = n * bs; 31249566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work, ncols)); 312526fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs * ridx[i]; 312696b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 31272d61bbb3SSatish Balay v += n * bs2; 31283447b6efSHong Zhang if (!usecprow) xtmp += bs; 31292d61bbb3SSatish Balay workt = work; 31302d61bbb3SSatish Balay for (j = 0; j < n; j++) { 31312d61bbb3SSatish 
Balay zb = z + bs * (*idx++); 31322d61bbb3SSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 31332d61bbb3SSatish Balay workt += bs; 31342d61bbb3SSatish Balay } 31352d61bbb3SSatish Balay } 31362d61bbb3SSatish Balay } 31372d61bbb3SSatish Balay } 31389566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 31399566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 31409566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 31413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 31422d61bbb3SSatish Balay } 31432d61bbb3SSatish Balay 3144d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) 3145d71ae5a4SJacob Faibussowitsch { 31462d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3147690b6cddSBarry Smith PetscInt totalnz = a->bs2 * a->nz; 3148f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 3149c5df96a5SBarry Smith PetscBLASInt one = 1, tnz; 31502d61bbb3SSatish Balay 31512d61bbb3SSatish Balay PetscFunctionBegin; 31529566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3153792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 31549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 31553ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 31562d61bbb3SSatish Balay } 31572d61bbb3SSatish Balay 3158d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) 3159d71ae5a4SJacob Faibussowitsch { 31602d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 31613f1db9ecSBarry Smith MatScalar *v = a->a; 3162329f5518SBarry Smith PetscReal sum = 0.0; 3163d0f46423SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 31642d61bbb3SSatish Balay 31652d61bbb3SSatish Balay PetscFunctionBegin; 31662d61bbb3SSatish Balay if (type == NORM_FROBENIUS) { 3167570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16) 3168570b7f6dSBarry 
Smith PetscBLASInt one = 1, cnt = bs2 * nz; 3169792fecdfSBarry Smith PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3170570b7f6dSBarry Smith #else 31712d61bbb3SSatish Balay for (i = 0; i < bs2 * nz; i++) { 31729371c9d4SSatish Balay sum += PetscRealPart(PetscConj(*v) * (*v)); 31739371c9d4SSatish Balay v++; 31742d61bbb3SSatish Balay } 3175570b7f6dSBarry Smith #endif 31768f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum); 31779566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 31788a62d963SHong Zhang } else if (type == NORM_1) { /* maximum column sum */ 31798a62d963SHong Zhang PetscReal *tmp; 31808a62d963SHong Zhang PetscInt *bcol = a->j; 31819566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 31828a62d963SHong Zhang for (i = 0; i < nz; i++) { 31838a62d963SHong Zhang for (j = 0; j < bs; j++) { 31848a62d963SHong Zhang k1 = bs * (*bcol) + j; /* column index */ 31858a62d963SHong Zhang for (k = 0; k < bs; k++) { 31869371c9d4SSatish Balay tmp[k1] += PetscAbsScalar(*v); 31879371c9d4SSatish Balay v++; 31888a62d963SHong Zhang } 31898a62d963SHong Zhang } 31908a62d963SHong Zhang bcol++; 31918a62d963SHong Zhang } 31928a62d963SHong Zhang *norm = 0.0; 3193d0f46423SBarry Smith for (j = 0; j < A->cmap->n; j++) { 31948a62d963SHong Zhang if (tmp[j] > *norm) *norm = tmp[j]; 31958a62d963SHong Zhang } 31969566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp)); 31979566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3198596552b5SBarry Smith } else if (type == NORM_INFINITY) { /* maximum row sum */ 3199596552b5SBarry Smith *norm = 0.0; 3200596552b5SBarry Smith for (k = 0; k < bs; k++) { 320174f84c7bSSatish Balay for (j = 0; j < a->mbs; j++) { 3202596552b5SBarry Smith v = a->a + bs2 * a->i[j] + k; 3203596552b5SBarry Smith sum = 0.0; 3204596552b5SBarry Smith for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 32050e90e235SBarry Smith for (k1 = 0; k1 < bs; k1++) { 3206596552b5SBarry Smith sum += 
PetscAbsScalar(*v); 3207596552b5SBarry Smith v += bs; 32082d61bbb3SSatish Balay } 32090e90e235SBarry Smith } 3210596552b5SBarry Smith if (sum > *norm) *norm = sum; 3211596552b5SBarry Smith } 3212596552b5SBarry Smith } 32139566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3214e7e72b3dSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 32153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 32162d61bbb3SSatish Balay } 32172d61bbb3SSatish Balay 3218d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) 3219d71ae5a4SJacob Faibussowitsch { 32202d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 32212d61bbb3SSatish Balay 32222d61bbb3SSatish Balay PetscFunctionBegin; 32232d61bbb3SSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3224d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3225273d9f13SBarry Smith *flg = PETSC_FALSE; 32263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 32272d61bbb3SSatish Balay } 32282d61bbb3SSatish Balay 32292d61bbb3SSatish Balay /* if the a->i are the same */ 32309566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 32313ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 32322d61bbb3SSatish Balay 32332d61bbb3SSatish Balay /* if a->j are the same */ 32349566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 32353ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 323626fbe8dcSKarl Rupp 32372d61bbb3SSatish Balay /* if a->a are the same */ 32389566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 32393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
32402d61bbb3SSatish Balay } 32412d61bbb3SSatish Balay 3242d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) 3243d71ae5a4SJacob Faibussowitsch { 32442d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3245690b6cddSBarry Smith PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 324687828ca2SBarry Smith PetscScalar *x, zero = 0.0; 32473f1db9ecSBarry Smith MatScalar *aa, *aa_j; 32482d61bbb3SSatish Balay 32492d61bbb3SSatish Balay PetscFunctionBegin; 325028b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3251d0f46423SBarry Smith bs = A->rmap->bs; 32522d61bbb3SSatish Balay aa = a->a; 32532d61bbb3SSatish Balay ai = a->i; 32542d61bbb3SSatish Balay aj = a->j; 32552d61bbb3SSatish Balay ambs = a->mbs; 32562d61bbb3SSatish Balay bs2 = a->bs2; 32572d61bbb3SSatish Balay 32589566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 32599566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 32609566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 326108401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 32622d61bbb3SSatish Balay for (i = 0; i < ambs; i++) { 32632d61bbb3SSatish Balay for (j = ai[i]; j < ai[i + 1]; j++) { 32642d61bbb3SSatish Balay if (aj[j] == i) { 32652d61bbb3SSatish Balay row = i * bs; 32662d61bbb3SSatish Balay aa_j = aa + j * bs2; 32672d61bbb3SSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 32682d61bbb3SSatish Balay break; 32692d61bbb3SSatish Balay } 32702d61bbb3SSatish Balay } 32712d61bbb3SSatish Balay } 32729566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 32733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 32742d61bbb3SSatish Balay } 32752d61bbb3SSatish Balay 3276d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) 3277d71ae5a4SJacob Faibussowitsch { 
32782d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 327953ef36baSBarry Smith const PetscScalar *l, *r, *li, *ri; 328053ef36baSBarry Smith PetscScalar x; 32813f1db9ecSBarry Smith MatScalar *aa, *v; 328253ef36baSBarry Smith PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 328353ef36baSBarry Smith const PetscInt *ai, *aj; 32842d61bbb3SSatish Balay 32852d61bbb3SSatish Balay PetscFunctionBegin; 32862d61bbb3SSatish Balay ai = a->i; 32872d61bbb3SSatish Balay aj = a->j; 32882d61bbb3SSatish Balay aa = a->a; 3289d0f46423SBarry Smith m = A->rmap->n; 3290d0f46423SBarry Smith n = A->cmap->n; 3291d0f46423SBarry Smith bs = A->rmap->bs; 32922d61bbb3SSatish Balay mbs = a->mbs; 32932d61bbb3SSatish Balay bs2 = a->bs2; 32942d61bbb3SSatish Balay if (ll) { 32959566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 32969566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 329708401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 32982d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 32992d61bbb3SSatish Balay M = ai[i + 1] - ai[i]; 33002d61bbb3SSatish Balay li = l + i * bs; 33012d61bbb3SSatish Balay v = aa + bs2 * ai[i]; 33022d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 3303ad540459SPierre Jolivet for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 33042d61bbb3SSatish Balay } 33052d61bbb3SSatish Balay } 33069566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 33079566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33082d61bbb3SSatish Balay } 33092d61bbb3SSatish Balay 33102d61bbb3SSatish Balay if (rr) { 33119566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(rr, &r)); 33129566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(rr, &rn)); 331308401ef6SPierre Jolivet PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 33142d61bbb3SSatish Balay for (i = 0; i < mbs; 
i++) { /* for each block row */ 331553ef36baSBarry Smith iai = ai[i]; 331653ef36baSBarry Smith M = ai[i + 1] - iai; 331753ef36baSBarry Smith v = aa + bs2 * iai; 33182d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 331953ef36baSBarry Smith ri = r + bs * aj[iai + j]; 33202d61bbb3SSatish Balay for (k = 0; k < bs; k++) { 33212d61bbb3SSatish Balay x = ri[k]; 332253ef36baSBarry Smith for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 332353ef36baSBarry Smith v += bs; 33242d61bbb3SSatish Balay } 33252d61bbb3SSatish Balay } 33262d61bbb3SSatish Balay } 33279566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(rr, &r)); 33289566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33292d61bbb3SSatish Balay } 33303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33312d61bbb3SSatish Balay } 33322d61bbb3SSatish Balay 3333d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) 3334d71ae5a4SJacob Faibussowitsch { 33352d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33362d61bbb3SSatish Balay 33372d61bbb3SSatish Balay PetscFunctionBegin; 33382d61bbb3SSatish Balay info->block_size = a->bs2; 3339ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; 33402d61bbb3SSatish Balay info->nz_used = a->bs2 * a->nz; 33413966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 33422d61bbb3SSatish Balay info->assemblies = A->num_ass; 33438e58a170SBarry Smith info->mallocs = A->info.mallocs; 33444dfa11a4SJacob Faibussowitsch info->memory = 0; /* REVIEW ME */ 3345d5f3da31SBarry Smith if (A->factortype) { 33462d61bbb3SSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 33472d61bbb3SSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 33482d61bbb3SSatish Balay info->factor_mallocs = A->info.factor_mallocs; 33492d61bbb3SSatish Balay } else { 33502d61bbb3SSatish Balay info->fill_ratio_given = 0; 33512d61bbb3SSatish Balay info->fill_ratio_needed = 0; 
33522d61bbb3SSatish Balay info->factor_mallocs = 0; 33532d61bbb3SSatish Balay } 33543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33552d61bbb3SSatish Balay } 33562d61bbb3SSatish Balay 3357d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 3358d71ae5a4SJacob Faibussowitsch { 33592d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33602d61bbb3SSatish Balay 33612d61bbb3SSatish Balay PetscFunctionBegin; 33629566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 33633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33642d61bbb3SSatish Balay } 3365a001520aSPierre Jolivet 3366d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 3367d71ae5a4SJacob Faibussowitsch { 3368a001520aSPierre Jolivet PetscFunctionBegin; 33699566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 33704222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 33713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3372a001520aSPierre Jolivet } 3373a001520aSPierre Jolivet 337466976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3375d71ae5a4SJacob Faibussowitsch { 337674eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3377f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1; 3378bcf10a7aSPierre Jolivet const PetscScalar *xb; 337974eeabc5SPierre Jolivet PetscScalar x1; 338074eeabc5SPierre Jolivet const MatScalar *v, *vv; 338174eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 338274eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 338374eeabc5SPierre Jolivet 338474eeabc5SPierre Jolivet PetscFunctionBegin; 338574eeabc5SPierre Jolivet idx = a->j; 338674eeabc5SPierre Jolivet v = a->a; 338774eeabc5SPierre 
Jolivet if (usecprow) { 338874eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 338974eeabc5SPierre Jolivet ii = a->compressedrow.i; 339074eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 339174eeabc5SPierre Jolivet } else { 339274eeabc5SPierre Jolivet mbs = a->mbs; 339374eeabc5SPierre Jolivet ii = a->i; 339474eeabc5SPierre Jolivet z = c; 339574eeabc5SPierre Jolivet } 339674eeabc5SPierre Jolivet 339774eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 33989371c9d4SSatish Balay n = ii[1] - ii[0]; 33999371c9d4SSatish Balay ii++; 340074eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 340174eeabc5SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 340274eeabc5SPierre Jolivet if (usecprow) z = c + ridx[i]; 340374eeabc5SPierre Jolivet jj = idx; 340474eeabc5SPierre Jolivet vv = v; 340574eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 340674eeabc5SPierre Jolivet idx = jj; 340774eeabc5SPierre Jolivet v = vv; 340874eeabc5SPierre Jolivet sum1 = 0.0; 340974eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 34109371c9d4SSatish Balay xb = b + (*idx++); 34119371c9d4SSatish Balay x1 = xb[0 + k * bm]; 341274eeabc5SPierre Jolivet sum1 += v[0] * x1; 341374eeabc5SPierre Jolivet v += 1; 341474eeabc5SPierre Jolivet } 3415feb237baSPierre Jolivet z[0 + k * cm] = sum1; 341674eeabc5SPierre Jolivet } 341774eeabc5SPierre Jolivet if (!usecprow) z += 1; 341874eeabc5SPierre Jolivet } 34193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 342074eeabc5SPierre Jolivet } 342174eeabc5SPierre Jolivet 342266976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3423d71ae5a4SJacob Faibussowitsch { 34244b7054f4SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3425f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2; 
3426bcf10a7aSPierre Jolivet const PetscScalar *xb; 34274b7054f4SPierre Jolivet PetscScalar x1, x2; 34284b7054f4SPierre Jolivet const MatScalar *v, *vv; 34294b7054f4SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 34304b7054f4SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 34314b7054f4SPierre Jolivet 34324b7054f4SPierre Jolivet PetscFunctionBegin; 34334b7054f4SPierre Jolivet idx = a->j; 34344b7054f4SPierre Jolivet v = a->a; 34354b7054f4SPierre Jolivet if (usecprow) { 34364b7054f4SPierre Jolivet mbs = a->compressedrow.nrows; 34374b7054f4SPierre Jolivet ii = a->compressedrow.i; 34384b7054f4SPierre Jolivet ridx = a->compressedrow.rindex; 34394b7054f4SPierre Jolivet } else { 34404b7054f4SPierre Jolivet mbs = a->mbs; 34414b7054f4SPierre Jolivet ii = a->i; 34424b7054f4SPierre Jolivet z = c; 34434b7054f4SPierre Jolivet } 34444b7054f4SPierre Jolivet 34454b7054f4SPierre Jolivet for (i = 0; i < mbs; i++) { 34469371c9d4SSatish Balay n = ii[1] - ii[0]; 34479371c9d4SSatish Balay ii++; 34484b7054f4SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 34494b7054f4SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 34504b7054f4SPierre Jolivet if (usecprow) z = c + 2 * ridx[i]; 34514b7054f4SPierre Jolivet jj = idx; 34524b7054f4SPierre Jolivet vv = v; 34534b7054f4SPierre Jolivet for (k = 0; k < cn; k++) { 34544b7054f4SPierre Jolivet idx = jj; 34554b7054f4SPierre Jolivet v = vv; 34569371c9d4SSatish Balay sum1 = 0.0; 34579371c9d4SSatish Balay sum2 = 0.0; 34584b7054f4SPierre Jolivet for (j = 0; j < n; j++) { 34599371c9d4SSatish Balay xb = b + 2 * (*idx++); 34609371c9d4SSatish Balay x1 = xb[0 + k * bm]; 34619371c9d4SSatish Balay x2 = xb[1 + k * bm]; 34624b7054f4SPierre Jolivet sum1 += v[0] * x1 + v[2] * x2; 34634b7054f4SPierre Jolivet sum2 += v[1] * x1 + v[3] * x2; 34644b7054f4SPierre Jolivet v += 4; 
34654b7054f4SPierre Jolivet } 34669371c9d4SSatish Balay z[0 + k * cm] = sum1; 34679371c9d4SSatish Balay z[1 + k * cm] = sum2; 34684b7054f4SPierre Jolivet } 34694b7054f4SPierre Jolivet if (!usecprow) z += 2; 34704b7054f4SPierre Jolivet } 34713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 34724b7054f4SPierre Jolivet } 34734b7054f4SPierre Jolivet 347466976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3475d71ae5a4SJacob Faibussowitsch { 347674eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3477f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3; 3478bcf10a7aSPierre Jolivet const PetscScalar *xb; 347974eeabc5SPierre Jolivet PetscScalar x1, x2, x3; 348074eeabc5SPierre Jolivet const MatScalar *v, *vv; 348174eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 348274eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 348374eeabc5SPierre Jolivet 348474eeabc5SPierre Jolivet PetscFunctionBegin; 348574eeabc5SPierre Jolivet idx = a->j; 348674eeabc5SPierre Jolivet v = a->a; 348774eeabc5SPierre Jolivet if (usecprow) { 348874eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 348974eeabc5SPierre Jolivet ii = a->compressedrow.i; 349074eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 349174eeabc5SPierre Jolivet } else { 349274eeabc5SPierre Jolivet mbs = a->mbs; 349374eeabc5SPierre Jolivet ii = a->i; 349474eeabc5SPierre Jolivet z = c; 349574eeabc5SPierre Jolivet } 349674eeabc5SPierre Jolivet 349774eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 34989371c9d4SSatish Balay n = ii[1] - ii[0]; 34999371c9d4SSatish Balay ii++; 350074eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 350174eeabc5SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row 
*/ 350274eeabc5SPierre Jolivet if (usecprow) z = c + 3 * ridx[i]; 350374eeabc5SPierre Jolivet jj = idx; 350474eeabc5SPierre Jolivet vv = v; 350574eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 350674eeabc5SPierre Jolivet idx = jj; 350774eeabc5SPierre Jolivet v = vv; 35089371c9d4SSatish Balay sum1 = 0.0; 35099371c9d4SSatish Balay sum2 = 0.0; 35109371c9d4SSatish Balay sum3 = 0.0; 351174eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 35129371c9d4SSatish Balay xb = b + 3 * (*idx++); 35139371c9d4SSatish Balay x1 = xb[0 + k * bm]; 35149371c9d4SSatish Balay x2 = xb[1 + k * bm]; 35159371c9d4SSatish Balay x3 = xb[2 + k * bm]; 351674eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 351774eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 351874eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 351974eeabc5SPierre Jolivet v += 9; 352074eeabc5SPierre Jolivet } 35219371c9d4SSatish Balay z[0 + k * cm] = sum1; 35229371c9d4SSatish Balay z[1 + k * cm] = sum2; 35239371c9d4SSatish Balay z[2 + k * cm] = sum3; 352474eeabc5SPierre Jolivet } 352574eeabc5SPierre Jolivet if (!usecprow) z += 3; 352674eeabc5SPierre Jolivet } 35273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 352874eeabc5SPierre Jolivet } 352974eeabc5SPierre Jolivet 353066976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3531d71ae5a4SJacob Faibussowitsch { 353274eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3533f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3534bcf10a7aSPierre Jolivet const PetscScalar *xb; 353574eeabc5SPierre Jolivet PetscScalar x1, x2, x3, x4; 353674eeabc5SPierre Jolivet const MatScalar *v, *vv; 353774eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 353874eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 353974eeabc5SPierre Jolivet 354074eeabc5SPierre 
Jolivet PetscFunctionBegin; 354174eeabc5SPierre Jolivet idx = a->j; 354274eeabc5SPierre Jolivet v = a->a; 354374eeabc5SPierre Jolivet if (usecprow) { 354474eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 354574eeabc5SPierre Jolivet ii = a->compressedrow.i; 354674eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 354774eeabc5SPierre Jolivet } else { 354874eeabc5SPierre Jolivet mbs = a->mbs; 354974eeabc5SPierre Jolivet ii = a->i; 355074eeabc5SPierre Jolivet z = c; 355174eeabc5SPierre Jolivet } 355274eeabc5SPierre Jolivet 355374eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 35549371c9d4SSatish Balay n = ii[1] - ii[0]; 35559371c9d4SSatish Balay ii++; 355674eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 355774eeabc5SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 355874eeabc5SPierre Jolivet if (usecprow) z = c + 4 * ridx[i]; 355974eeabc5SPierre Jolivet jj = idx; 356074eeabc5SPierre Jolivet vv = v; 356174eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 356274eeabc5SPierre Jolivet idx = jj; 356374eeabc5SPierre Jolivet v = vv; 35649371c9d4SSatish Balay sum1 = 0.0; 35659371c9d4SSatish Balay sum2 = 0.0; 35669371c9d4SSatish Balay sum3 = 0.0; 35679371c9d4SSatish Balay sum4 = 0.0; 356874eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 35699371c9d4SSatish Balay xb = b + 4 * (*idx++); 35709371c9d4SSatish Balay x1 = xb[0 + k * bm]; 35719371c9d4SSatish Balay x2 = xb[1 + k * bm]; 35729371c9d4SSatish Balay x3 = xb[2 + k * bm]; 35739371c9d4SSatish Balay x4 = xb[3 + k * bm]; 357474eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 357574eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 357674eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 357774eeabc5SPierre Jolivet sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 
357874eeabc5SPierre Jolivet v += 16; 357974eeabc5SPierre Jolivet } 35809371c9d4SSatish Balay z[0 + k * cm] = sum1; 35819371c9d4SSatish Balay z[1 + k * cm] = sum2; 35829371c9d4SSatish Balay z[2 + k * cm] = sum3; 35839371c9d4SSatish Balay z[3 + k * cm] = sum4; 358474eeabc5SPierre Jolivet } 358574eeabc5SPierre Jolivet if (!usecprow) z += 4; 358674eeabc5SPierre Jolivet } 35873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 358874eeabc5SPierre Jolivet } 358974eeabc5SPierre Jolivet 359066976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3591d71ae5a4SJacob Faibussowitsch { 359274eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3593f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3594bcf10a7aSPierre Jolivet const PetscScalar *xb; 359574eeabc5SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 359674eeabc5SPierre Jolivet const MatScalar *v, *vv; 359774eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 359874eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 359974eeabc5SPierre Jolivet 360074eeabc5SPierre Jolivet PetscFunctionBegin; 360174eeabc5SPierre Jolivet idx = a->j; 360274eeabc5SPierre Jolivet v = a->a; 360374eeabc5SPierre Jolivet if (usecprow) { 360474eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 360574eeabc5SPierre Jolivet ii = a->compressedrow.i; 360674eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 360774eeabc5SPierre Jolivet } else { 360874eeabc5SPierre Jolivet mbs = a->mbs; 360974eeabc5SPierre Jolivet ii = a->i; 361074eeabc5SPierre Jolivet z = c; 361174eeabc5SPierre Jolivet } 361274eeabc5SPierre Jolivet 361374eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 36149371c9d4SSatish Balay n = ii[1] - ii[0]; 36159371c9d4SSatish Balay ii++; 361674eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row 
(assumes same size as this one) */ 361774eeabc5SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 361874eeabc5SPierre Jolivet if (usecprow) z = c + 5 * ridx[i]; 361974eeabc5SPierre Jolivet jj = idx; 362074eeabc5SPierre Jolivet vv = v; 362174eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 362274eeabc5SPierre Jolivet idx = jj; 362374eeabc5SPierre Jolivet v = vv; 36249371c9d4SSatish Balay sum1 = 0.0; 36259371c9d4SSatish Balay sum2 = 0.0; 36269371c9d4SSatish Balay sum3 = 0.0; 36279371c9d4SSatish Balay sum4 = 0.0; 36289371c9d4SSatish Balay sum5 = 0.0; 362974eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 36309371c9d4SSatish Balay xb = b + 5 * (*idx++); 36319371c9d4SSatish Balay x1 = xb[0 + k * bm]; 36329371c9d4SSatish Balay x2 = xb[1 + k * bm]; 36339371c9d4SSatish Balay x3 = xb[2 + k * bm]; 36349371c9d4SSatish Balay x4 = xb[3 + k * bm]; 36359371c9d4SSatish Balay x5 = xb[4 + k * bm]; 363674eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 363774eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 363874eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 363974eeabc5SPierre Jolivet sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 364074eeabc5SPierre Jolivet sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 364174eeabc5SPierre Jolivet v += 25; 364274eeabc5SPierre Jolivet } 36439371c9d4SSatish Balay z[0 + k * cm] = sum1; 36449371c9d4SSatish Balay z[1 + k * cm] = sum2; 36459371c9d4SSatish Balay z[2 + k * cm] = sum3; 36469371c9d4SSatish Balay z[3 + k * cm] = sum4; 36479371c9d4SSatish Balay z[4 + k * cm] = sum5; 364874eeabc5SPierre Jolivet } 364974eeabc5SPierre Jolivet if (!usecprow) z += 5; 365074eeabc5SPierre Jolivet } 36513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 365274eeabc5SPierre Jolivet } 365374eeabc5SPierre Jolivet 
3654d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) 3655d71ae5a4SJacob Faibussowitsch { 3656a001520aSPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3657a001520aSPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 3658910cf402Sprj- Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 3659bcf10a7aSPierre Jolivet PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 3660a001520aSPierre Jolivet PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3661a001520aSPierre Jolivet PetscBLASInt bbs, bcn, bbm, bcm; 3662f4259b30SLisandro Dalcin PetscScalar *z = NULL; 3663a001520aSPierre Jolivet PetscScalar *c, *b; 3664a001520aSPierre Jolivet const MatScalar *v; 3665a001520aSPierre Jolivet const PetscInt *idx, *ii, *ridx = NULL; 36664b7054f4SPierre Jolivet PetscScalar _DZero = 0.0, _DOne = 1.0; 3667a001520aSPierre Jolivet PetscBool usecprow = a->compressedrow.use; 3668a001520aSPierre Jolivet 3669a001520aSPierre Jolivet PetscFunctionBegin; 36703ba16761SJacob Faibussowitsch if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 367108401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 367208401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 367308401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 3674a001520aSPierre Jolivet b = bd->v; 367548a46eb9SPierre Jolivet if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C)); 36769566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 367774eeabc5SPierre Jolivet switch (bs) { 3678d71ae5a4SJacob Faibussowitsch case 1: 3679d71ae5a4SJacob 
Faibussowitsch PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); 3680d71ae5a4SJacob Faibussowitsch break; 3681d71ae5a4SJacob Faibussowitsch case 2: 3682d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); 3683d71ae5a4SJacob Faibussowitsch break; 3684d71ae5a4SJacob Faibussowitsch case 3: 3685d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); 3686d71ae5a4SJacob Faibussowitsch break; 3687d71ae5a4SJacob Faibussowitsch case 4: 3688d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); 3689d71ae5a4SJacob Faibussowitsch break; 3690d71ae5a4SJacob Faibussowitsch case 5: 3691d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); 3692d71ae5a4SJacob Faibussowitsch break; 369374eeabc5SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 36949566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bs, &bbs)); 36959566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cn, &bcn)); 36969566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bm, &bbm)); 36979566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cm, &bcm)); 3698a001520aSPierre Jolivet idx = a->j; 3699a001520aSPierre Jolivet v = a->a; 3700a001520aSPierre Jolivet if (usecprow) { 3701a001520aSPierre Jolivet mbs = a->compressedrow.nrows; 3702a001520aSPierre Jolivet ii = a->compressedrow.i; 3703a001520aSPierre Jolivet ridx = a->compressedrow.rindex; 3704a001520aSPierre Jolivet } else { 3705a001520aSPierre Jolivet mbs = a->mbs; 3706a001520aSPierre Jolivet ii = a->i; 3707a001520aSPierre Jolivet z = c; 3708a001520aSPierre Jolivet } 3709a001520aSPierre Jolivet for (i = 0; i < mbs; i++) { 37109371c9d4SSatish Balay n = ii[1] - ii[0]; 37119371c9d4SSatish Balay ii++; 3712a001520aSPierre Jolivet if (usecprow) z = c + bs * ridx[i]; 37134b7054f4SPierre Jolivet if (n) { 3714792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", 
"N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm)); 37154b7054f4SPierre Jolivet v += bs2; 37164b7054f4SPierre Jolivet } 37174b7054f4SPierre Jolivet for (j = 1; j < n; j++) { 3718792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 3719a001520aSPierre Jolivet v += bs2; 3720a001520aSPierre Jolivet } 3721a001520aSPierre Jolivet if (!usecprow) z += bs; 3722a001520aSPierre Jolivet } 37234b7054f4SPierre Jolivet } 37249566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 37259566063dSJacob Faibussowitsch PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); 37263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3727a001520aSPierre Jolivet } 3728