1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <petscblaslapack.h> 6cac129eeSSatish Balay 75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 896e086a2SDaniel Kokron #include <immintrin.h> 996e086a2SDaniel Kokron #endif 1096e086a2SDaniel Kokron 119371c9d4SSatish Balay PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) { 12a3192f15SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 135d0c19d7SBarry Smith PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 145d0c19d7SBarry Smith const PetscInt *idx; 15690b6cddSBarry Smith PetscInt start, end, *ai, *aj, bs, *nidx2; 16f1af5d2fSBarry Smith PetscBT table; 17a3192f15SSatish Balay 183a40ed3dSBarry Smith PetscFunctionBegin; 19a3192f15SSatish Balay m = a->mbs; 20a3192f15SSatish Balay ai = a->i; 21a3192f15SSatish Balay aj = a->j; 22d0f46423SBarry Smith bs = A->rmap->bs; 23a3192f15SSatish Balay 2408401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 25a3192f15SSatish Balay 269566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(m, &table)); 279566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &nidx)); 289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2)); 29a3192f15SSatish Balay 30a3192f15SSatish Balay for (i = 0; i < is_max; i++) { 31a3192f15SSatish Balay /* Initialise the two local arrays */ 32a3192f15SSatish Balay isz = 0; 339566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(m, table)); 34a3192f15SSatish Balay 35a3192f15SSatish Balay /* Extract the indices, assume there can be duplicate entries */ 
369566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 379566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 38a3192f15SSatish Balay 39a3192f15SSatish Balay /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40a3192f15SSatish Balay for (j = 0; j < n; ++j) { 41218c64b6SSatish Balay ival = idx[j] / bs; /* convert the indices into block indices */ 4208401ef6SPierre Jolivet PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 4326fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 44a3192f15SSatish Balay } 459566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 469566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 47a3192f15SSatish Balay 48a3192f15SSatish Balay k = 0; 49a3192f15SSatish Balay for (j = 0; j < ov; j++) { /* for each overlap*/ 50a3192f15SSatish Balay n = isz; 51a3192f15SSatish Balay for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52a3192f15SSatish Balay row = nidx[k]; 53a3192f15SSatish Balay start = ai[row]; 54a3192f15SSatish Balay end = ai[row + 1]; 55a3192f15SSatish Balay for (l = start; l < end; l++) { 56a3192f15SSatish Balay val = aj[l]; 5726fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 58a3192f15SSatish Balay } 59a3192f15SSatish Balay } 60a3192f15SSatish Balay } 61218c64b6SSatish Balay /* expand the Index Set */ 62218c64b6SSatish Balay for (j = 0; j < isz; j++) { 6326fbe8dcSKarl Rupp for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k; 64218c64b6SSatish Balay } 659566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i)); 66a3192f15SSatish Balay } 679566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table)); 689566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 699566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx2)); 703a40ed3dSBarry Smith 
PetscFunctionReturn(0); 71a3192f15SSatish Balay } 721c351548SSatish Balay 739371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { 74736121d4SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 75690b6cddSBarry Smith PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 76690b6cddSBarry Smith PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 775d0c19d7SBarry Smith const PetscInt *irow, *icol; 785d0c19d7SBarry Smith PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 79690b6cddSBarry Smith PetscInt *aj = a->j, *ai = a->i; 803f1db9ecSBarry Smith MatScalar *mat_a; 81736121d4SSatish Balay Mat C; 826041f1b1SToby Isaac PetscBool flag; 83736121d4SSatish Balay 843a40ed3dSBarry Smith PetscFunctionBegin; 859566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 869566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 879566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 889566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 89736121d4SSatish Balay 909566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 91736121d4SSatish Balay ssmap = smap; 929566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 93736121d4SSatish Balay for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 94736121d4SSatish Balay /* determine lens of each row */ 95736121d4SSatish Balay for (i = 0; i < nrows; i++) { 96736121d4SSatish Balay kstart = ai[irow[i]]; 97736121d4SSatish Balay kend = kstart + a->ilen[irow[i]]; 98736121d4SSatish Balay lens[i] = 0; 99736121d4SSatish Balay for (k = kstart; k < kend; k++) { 10026fbe8dcSKarl Rupp if (ssmap[aj[k]]) lens[i]++; 101736121d4SSatish Balay } 102736121d4SSatish Balay } 103736121d4SSatish Balay /* Create and fill new matrix */ 104736121d4SSatish Balay if (scall == MAT_REUSE_MATRIX) { 105736121d4SSatish Balay c = (Mat_SeqBAIJ *)((*B)->data); 106736121d4SSatish Balay 
107aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1089566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 10928b400f6SJacob Faibussowitsch PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 1109566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 111736121d4SSatish Balay C = *B; 1123a40ed3dSBarry Smith } else { 1139566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1149566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 1159566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1169566063dSJacob Faibussowitsch PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 117736121d4SSatish Balay } 118736121d4SSatish Balay c = (Mat_SeqBAIJ *)(C->data); 119736121d4SSatish Balay for (i = 0; i < nrows; i++) { 120736121d4SSatish Balay row = irow[i]; 121736121d4SSatish Balay kstart = ai[row]; 122736121d4SSatish Balay kend = kstart + a->ilen[row]; 123736121d4SSatish Balay mat_i = c->i[i]; 124d29f2997SMatthew Woehlke mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 125d29f2997SMatthew Woehlke mat_a = c->a ? 
c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 126736121d4SSatish Balay mat_ilen = c->ilen + i; 127736121d4SSatish Balay for (k = kstart; k < kend; k++) { 128736121d4SSatish Balay if ((tcol = ssmap[a->j[k]])) { 129736121d4SSatish Balay *mat_j++ = tcol - 1; 1309566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 131549d3d68SSatish Balay mat_a += bs2; 132736121d4SSatish Balay (*mat_ilen)++; 133736121d4SSatish Balay } 134736121d4SSatish Balay } 135736121d4SSatish Balay } 136cdc6f3adSToby Isaac /* sort */ 137d29f2997SMatthew Woehlke if (c->j && c->a) { 138cdc6f3adSToby Isaac MatScalar *work; 1399566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 140cdc6f3adSToby Isaac for (i = 0; i < nrows; i++) { 141cdc6f3adSToby Isaac PetscInt ilen; 142cdc6f3adSToby Isaac mat_i = c->i[i]; 143cdc6f3adSToby Isaac mat_j = c->j + mat_i; 144cdc6f3adSToby Isaac mat_a = c->a + mat_i * bs2; 145cdc6f3adSToby Isaac ilen = c->ilen[i]; 1469566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 147cdc6f3adSToby Isaac } 1489566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 149cdc6f3adSToby Isaac } 150218c64b6SSatish Balay 151736121d4SSatish Balay /* Free work space */ 1529566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1539566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1549566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1559566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1569566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 157736121d4SSatish Balay 1589566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 159736121d4SSatish Balay *B = C; 1603a40ed3dSBarry Smith PetscFunctionReturn(0); 161736121d4SSatish Balay } 162736121d4SSatish Balay 1639371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, 
Mat *B) { 164218c64b6SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 165218c64b6SSatish Balay IS is1, is2; 166afebec48SHong Zhang PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j; 1675d0c19d7SBarry Smith const PetscInt *irow, *icol; 168218c64b6SSatish Balay 1693a40ed3dSBarry Smith PetscFunctionBegin; 1709566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1719566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1729566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1739566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 174218c64b6SSatish Balay 175218c64b6SSatish Balay /* Verify if the indices corespond to each element in a block 176218c64b6SSatish Balay and form the IS with compressed IS */ 177f8ecb639SStefano Zampini maxmnbs = PetscMax(a->mbs, a->nbs); 1789566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 1799566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->mbs)); 180218c64b6SSatish Balay for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 181ad540459SPierre Jolivet for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 1826041f1b1SToby Isaac count = 0; 1836041f1b1SToby Isaac for (i = 0; i < nrows; i++) { 184afebec48SHong Zhang j = irow[i] / bs; 1856041f1b1SToby Isaac if ((vary[j]--) == bs) iary[count++] = j; 186218c64b6SSatish Balay } 1879566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 188218c64b6SSatish Balay 1899566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->nbs)); 190218c64b6SSatish Balay for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 191ad540459SPierre Jolivet for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 1926041f1b1SToby Isaac count = 0; 1936041f1b1SToby 
Isaac for (i = 0; i < ncols; i++) { 194afebec48SHong Zhang j = icol[i] / bs; 1956041f1b1SToby Isaac if ((vary[j]--) == bs) iary[count++] = j; 1966041f1b1SToby Isaac } 1979566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 1989566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 1999566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 2009566063dSJacob Faibussowitsch PetscCall(PetscFree2(vary, iary)); 201218c64b6SSatish Balay 2029566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 2039566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2049566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 2053a40ed3dSBarry Smith PetscFunctionReturn(0); 206218c64b6SSatish Balay } 207218c64b6SSatish Balay 2089371c9d4SSatish Balay PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) { 20916b64355SHong Zhang Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 2105c39f6d9SHong Zhang Mat_SubSppt *submatj = c->submatis1; 21116b64355SHong Zhang 21216b64355SHong Zhang PetscFunctionBegin; 2139566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 2149566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 21516b64355SHong Zhang PetscFunctionReturn(0); 21616b64355SHong Zhang } 21716b64355SHong Zhang 21889a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 2199371c9d4SSatish Balay PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) { 22086e85357SHong Zhang PetscInt i; 22186e85357SHong Zhang Mat C; 22286e85357SHong Zhang Mat_SeqBAIJ *c; 22386e85357SHong Zhang Mat_SubSppt *submatj; 22486e85357SHong Zhang 22586e85357SHong Zhang PetscFunctionBegin; 22686e85357SHong Zhang for (i = 0; i < n; i++) { 22786e85357SHong Zhang C = (*mat)[i]; 22886e85357SHong Zhang c = (Mat_SeqBAIJ *)C->data; 22986e85357SHong Zhang submatj = c->submatis1; 23086e85357SHong Zhang if (submatj) { 
2317daefbafSJunchao Zhang if (--((PetscObject)C)->refct <= 0) { 23226cc229bSBarry Smith PetscCall(PetscFree(C->factorprefix)); 2339566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 2349566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 2359566063dSJacob Faibussowitsch PetscCall(PetscFree(C->defaultvectype)); 236*3faff063SStefano Zampini PetscCall(PetscFree(C->defaultrandtype)); 2379566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->rmap)); 2389566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->cmap)); 2399566063dSJacob Faibussowitsch PetscCall(PetscHeaderDestroy(&C)); 2407daefbafSJunchao Zhang } 24186e85357SHong Zhang } else { 2429566063dSJacob Faibussowitsch PetscCall(MatDestroy(&C)); 24386e85357SHong Zhang } 24486e85357SHong Zhang } 2457daefbafSJunchao Zhang 2467daefbafSJunchao Zhang /* Destroy Dummy submatrices created for reuse */ 2479566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 2487daefbafSJunchao Zhang 2499566063dSJacob Faibussowitsch PetscCall(PetscFree(*mat)); 25086e85357SHong Zhang PetscFunctionReturn(0); 25186e85357SHong Zhang } 25286e85357SHong Zhang 2539371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) { 254690b6cddSBarry Smith PetscInt i; 255736121d4SSatish Balay 2563a40ed3dSBarry Smith PetscFunctionBegin; 25748a46eb9SPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 258736121d4SSatish Balay 25948a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 2603a40ed3dSBarry Smith PetscFunctionReturn(0); 261736121d4SSatish Balay } 262218c64b6SSatish Balay 2632d61bbb3SSatish Balay /* -------------------------------------------------------*/ 2642d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */ 2652d61bbb3SSatish Balay /* 
-------------------------------------------------------*/ 2662d61bbb3SSatish Balay 2679371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) { 2682d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 269d9fead3dSBarry Smith PetscScalar *z, sum; 270d9fead3dSBarry Smith const PetscScalar *x; 271d9fead3dSBarry Smith const MatScalar *v; 2727c565772SBarry Smith PetscInt mbs, i, n; 2730298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 274ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2752d61bbb3SSatish Balay 2762d61bbb3SSatish Balay PetscFunctionBegin; 2779566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2789566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &z)); 2792d61bbb3SSatish Balay 28026e093fcSHong Zhang if (usecprow) { 28126e093fcSHong Zhang mbs = a->compressedrow.nrows; 28226e093fcSHong Zhang ii = a->compressedrow.i; 2837b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 2849566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(z, a->mbs)); 28526e093fcSHong Zhang } else { 28626e093fcSHong Zhang mbs = a->mbs; 2872d61bbb3SSatish Balay ii = a->i; 28826e093fcSHong Zhang } 2892d61bbb3SSatish Balay 2902d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 291ee54c7eeSHong Zhang n = ii[1] - ii[0]; 292ee54c7eeSHong Zhang v = a->a + ii[0]; 293ee54c7eeSHong Zhang idx = a->j + ii[0]; 294ee54c7eeSHong Zhang ii++; 295444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 296444d8c10SJed Brown PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2972d61bbb3SSatish Balay sum = 0.0; 2982162cab8SBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 29926e093fcSHong Zhang if (usecprow) { 3007b2bb3b9SHong Zhang z[ridx[i]] = sum; 30126e093fcSHong Zhang } else { 3022d61bbb3SSatish Balay z[i] = sum; 3032d61bbb3SSatish Balay } 30426e093fcSHong Zhang } 3059566063dSJacob 
Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3069566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &z)); 3079566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 3082d61bbb3SSatish Balay PetscFunctionReturn(0); 3092d61bbb3SSatish Balay } 3102d61bbb3SSatish Balay 3119371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) { 3122d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 313f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, *zarray; 314d9fead3dSBarry Smith const PetscScalar *x, *xb; 31587828ca2SBarry Smith PetscScalar x1, x2; 316d9fead3dSBarry Smith const MatScalar *v; 3177c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 318ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 3192d61bbb3SSatish Balay 3202d61bbb3SSatish Balay PetscFunctionBegin; 3219566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3229566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 3232d61bbb3SSatish Balay 3242d61bbb3SSatish Balay idx = a->j; 3252d61bbb3SSatish Balay v = a->a; 32626e093fcSHong Zhang if (usecprow) { 32726e093fcSHong Zhang mbs = a->compressedrow.nrows; 32826e093fcSHong Zhang ii = a->compressedrow.i; 3297b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3309566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 33126e093fcSHong Zhang } else { 33226e093fcSHong Zhang mbs = a->mbs; 3332d61bbb3SSatish Balay ii = a->i; 33426e093fcSHong Zhang z = zarray; 33526e093fcSHong Zhang } 3362d61bbb3SSatish Balay 3372d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 3389371c9d4SSatish Balay n = ii[1] - ii[0]; 3399371c9d4SSatish Balay ii++; 3409371c9d4SSatish Balay sum1 = 0.0; 3419371c9d4SSatish Balay sum2 = 0.0; 342444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 343444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * 
n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3442d61bbb3SSatish Balay for (j = 0; j < n; j++) { 3459371c9d4SSatish Balay xb = x + 2 * (*idx++); 3469371c9d4SSatish Balay x1 = xb[0]; 3479371c9d4SSatish Balay x2 = xb[1]; 3482d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 3492d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 3502d61bbb3SSatish Balay v += 4; 3512d61bbb3SSatish Balay } 3527b2bb3b9SHong Zhang if (usecprow) z = zarray + 2 * ridx[i]; 3539371c9d4SSatish Balay z[0] = sum1; 3549371c9d4SSatish Balay z[1] = sum2; 35526e093fcSHong Zhang if (!usecprow) z += 2; 3562d61bbb3SSatish Balay } 3579566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3589566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 3599566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 3602d61bbb3SSatish Balay PetscFunctionReturn(0); 3612d61bbb3SSatish Balay } 3622d61bbb3SSatish Balay 3639371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) { 3642d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 365f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 366d9fead3dSBarry Smith const PetscScalar *x, *xb; 367d9fead3dSBarry Smith const MatScalar *v; 3687c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 369ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 37026e093fcSHong Zhang 371b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 372fee21e36SBarry Smith #pragma disjoint(*v, *z, *xb) 373fee21e36SBarry Smith #endif 374fee21e36SBarry Smith 3752d61bbb3SSatish Balay PetscFunctionBegin; 3769566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3779566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 3782d61bbb3SSatish Balay 3792d61bbb3SSatish Balay idx = a->j; 3802d61bbb3SSatish Balay v = a->a; 38126e093fcSHong Zhang if (usecprow) { 38226e093fcSHong Zhang mbs = 
a->compressedrow.nrows; 38326e093fcSHong Zhang ii = a->compressedrow.i; 3847b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3859566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 38626e093fcSHong Zhang } else { 38726e093fcSHong Zhang mbs = a->mbs; 3882d61bbb3SSatish Balay ii = a->i; 38926e093fcSHong Zhang z = zarray; 39026e093fcSHong Zhang } 3912d61bbb3SSatish Balay 3922d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 3939371c9d4SSatish Balay n = ii[1] - ii[0]; 3949371c9d4SSatish Balay ii++; 3959371c9d4SSatish Balay sum1 = 0.0; 3969371c9d4SSatish Balay sum2 = 0.0; 3979371c9d4SSatish Balay sum3 = 0.0; 398444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 399444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4002d61bbb3SSatish Balay for (j = 0; j < n; j++) { 40126fbe8dcSKarl Rupp xb = x + 3 * (*idx++); 40226fbe8dcSKarl Rupp x1 = xb[0]; 40326fbe8dcSKarl Rupp x2 = xb[1]; 40426fbe8dcSKarl Rupp x3 = xb[2]; 40526fbe8dcSKarl Rupp 4062d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 4072d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 4082d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 4092d61bbb3SSatish Balay v += 9; 4102d61bbb3SSatish Balay } 4117b2bb3b9SHong Zhang if (usecprow) z = zarray + 3 * ridx[i]; 4129371c9d4SSatish Balay z[0] = sum1; 4139371c9d4SSatish Balay z[1] = sum2; 4149371c9d4SSatish Balay z[2] = sum3; 41526e093fcSHong Zhang if (!usecprow) z += 3; 4162d61bbb3SSatish Balay } 4179566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4189566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 4199566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 4202d61bbb3SSatish Balay PetscFunctionReturn(0); 4212d61bbb3SSatish Balay } 4222d61bbb3SSatish Balay 
4239371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) { 4242d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 425f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 426d9fead3dSBarry Smith const PetscScalar *x, *xb; 427d9fead3dSBarry Smith const MatScalar *v; 4287c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 429ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4302d61bbb3SSatish Balay 4312d61bbb3SSatish Balay PetscFunctionBegin; 4329566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4339566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 4342d61bbb3SSatish Balay 4352d61bbb3SSatish Balay idx = a->j; 4362d61bbb3SSatish Balay v = a->a; 43726e093fcSHong Zhang if (usecprow) { 43826e093fcSHong Zhang mbs = a->compressedrow.nrows; 43926e093fcSHong Zhang ii = a->compressedrow.i; 4407b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 4419566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 44226e093fcSHong Zhang } else { 44326e093fcSHong Zhang mbs = a->mbs; 4442d61bbb3SSatish Balay ii = a->i; 44526e093fcSHong Zhang z = zarray; 44626e093fcSHong Zhang } 4472d61bbb3SSatish Balay 4482d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 44926fbe8dcSKarl Rupp n = ii[1] - ii[0]; 45026fbe8dcSKarl Rupp ii++; 45126fbe8dcSKarl Rupp sum1 = 0.0; 45226fbe8dcSKarl Rupp sum2 = 0.0; 45326fbe8dcSKarl Rupp sum3 = 0.0; 45426fbe8dcSKarl Rupp sum4 = 0.0; 45526fbe8dcSKarl Rupp 456444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 457444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4582d61bbb3SSatish Balay for (j = 0; j < n; j++) { 4592d61bbb3SSatish Balay xb = x + 4 * (*idx++); 4609371c9d4SSatish Balay x1 = xb[0]; 4619371c9d4SSatish Balay x2 = xb[1]; 4629371c9d4SSatish 
Balay x3 = xb[2]; 4639371c9d4SSatish Balay x4 = xb[3]; 4642d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 4652d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 4662d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 4672d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 4682d61bbb3SSatish Balay v += 16; 4692d61bbb3SSatish Balay } 4707b2bb3b9SHong Zhang if (usecprow) z = zarray + 4 * ridx[i]; 4719371c9d4SSatish Balay z[0] = sum1; 4729371c9d4SSatish Balay z[1] = sum2; 4739371c9d4SSatish Balay z[2] = sum3; 4749371c9d4SSatish Balay z[3] = sum4; 47526e093fcSHong Zhang if (!usecprow) z += 4; 4762d61bbb3SSatish Balay } 4779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4789566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 4799566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 4802d61bbb3SSatish Balay PetscFunctionReturn(0); 4812d61bbb3SSatish Balay } 4822d61bbb3SSatish Balay 4839371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) { 4842d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 485f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 486d9fead3dSBarry Smith const PetscScalar *xb, *x; 487d9fead3dSBarry Smith const MatScalar *v; 4880298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 4897c565772SBarry Smith PetscInt mbs, i, j, n; 490ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4912d61bbb3SSatish Balay 492433994e6SBarry Smith PetscFunctionBegin; 4939566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4949566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 4952d61bbb3SSatish Balay 4962d61bbb3SSatish Balay idx = a->j; 4972d61bbb3SSatish Balay v = a->a; 49826e093fcSHong Zhang if (usecprow) { 49926e093fcSHong Zhang mbs = 
a->compressedrow.nrows; 50026e093fcSHong Zhang ii = a->compressedrow.i; 5017b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 5029566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 50326e093fcSHong Zhang } else { 50426e093fcSHong Zhang mbs = a->mbs; 5052d61bbb3SSatish Balay ii = a->i; 50626e093fcSHong Zhang z = zarray; 50726e093fcSHong Zhang } 5082d61bbb3SSatish Balay 5092d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 5109371c9d4SSatish Balay n = ii[1] - ii[0]; 5119371c9d4SSatish Balay ii++; 5129371c9d4SSatish Balay sum1 = 0.0; 5139371c9d4SSatish Balay sum2 = 0.0; 5149371c9d4SSatish Balay sum3 = 0.0; 5159371c9d4SSatish Balay sum4 = 0.0; 5169371c9d4SSatish Balay sum5 = 0.0; 517444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 518444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 5192d61bbb3SSatish Balay for (j = 0; j < n; j++) { 5202d61bbb3SSatish Balay xb = x + 5 * (*idx++); 5219371c9d4SSatish Balay x1 = xb[0]; 5229371c9d4SSatish Balay x2 = xb[1]; 5239371c9d4SSatish Balay x3 = xb[2]; 5249371c9d4SSatish Balay x4 = xb[3]; 5259371c9d4SSatish Balay x5 = xb[4]; 5262d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 5272d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 5282d61bbb3SSatish Balay sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 5292d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 5302d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 5312d61bbb3SSatish Balay v += 25; 5322d61bbb3SSatish Balay } 5337b2bb3b9SHong Zhang if (usecprow) z = zarray + 5 * ridx[i]; 5349371c9d4SSatish Balay z[0] = sum1; 5359371c9d4SSatish Balay z[1] = sum2; 5369371c9d4SSatish Balay z[2] = sum3; 5379371c9d4SSatish Balay 
z[3] = sum4; 5389371c9d4SSatish Balay z[4] = sum5; 53926e093fcSHong Zhang if (!usecprow) z += 5; 5402d61bbb3SSatish Balay } 5419566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5429566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 5439566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 5442d61bbb3SSatish Balay PetscFunctionReturn(0); 5452d61bbb3SSatish Balay } 5462d61bbb3SSatish Balay 5479371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) { 54815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 549f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 550d9fead3dSBarry Smith const PetscScalar *x, *xb; 55126e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 552d9fead3dSBarry Smith const MatScalar *v; 5537c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 554ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 55515091d37SBarry Smith 556433994e6SBarry Smith PetscFunctionBegin; 5579566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5589566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 55915091d37SBarry Smith 56015091d37SBarry Smith idx = a->j; 56115091d37SBarry Smith v = a->a; 56226e093fcSHong Zhang if (usecprow) { 56326e093fcSHong Zhang mbs = a->compressedrow.nrows; 56426e093fcSHong Zhang ii = a->compressedrow.i; 5657b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 5669566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 56726e093fcSHong Zhang } else { 56826e093fcSHong Zhang mbs = a->mbs; 56915091d37SBarry Smith ii = a->i; 57026e093fcSHong Zhang z = zarray; 57126e093fcSHong Zhang } 57215091d37SBarry Smith 57315091d37SBarry Smith for (i = 0; i < mbs; i++) { 57426fbe8dcSKarl Rupp n = ii[1] - ii[0]; 57526fbe8dcSKarl Rupp ii++; 57626fbe8dcSKarl Rupp sum1 = 0.0; 57726fbe8dcSKarl Rupp sum2 = 0.0; 57826fbe8dcSKarl Rupp sum3 = 
0.0; 57926fbe8dcSKarl Rupp sum4 = 0.0; 58026fbe8dcSKarl Rupp sum5 = 0.0; 58126fbe8dcSKarl Rupp sum6 = 0.0; 58226fbe8dcSKarl Rupp 583444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 584444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 58515091d37SBarry Smith for (j = 0; j < n; j++) { 58615091d37SBarry Smith xb = x + 6 * (*idx++); 5879371c9d4SSatish Balay x1 = xb[0]; 5889371c9d4SSatish Balay x2 = xb[1]; 5899371c9d4SSatish Balay x3 = xb[2]; 5909371c9d4SSatish Balay x4 = xb[3]; 5919371c9d4SSatish Balay x5 = xb[4]; 5929371c9d4SSatish Balay x6 = xb[5]; 59315091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 59415091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 59515091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 59615091d37SBarry Smith sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 59715091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 59815091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 59915091d37SBarry Smith v += 36; 60015091d37SBarry Smith } 6017b2bb3b9SHong Zhang if (usecprow) z = zarray + 6 * ridx[i]; 6029371c9d4SSatish Balay z[0] = sum1; 6039371c9d4SSatish Balay z[1] = sum2; 6049371c9d4SSatish Balay z[2] = sum3; 6059371c9d4SSatish Balay z[3] = sum4; 6069371c9d4SSatish Balay z[4] = sum5; 6079371c9d4SSatish Balay z[5] = sum6; 60826e093fcSHong Zhang if (!usecprow) z += 6; 60915091d37SBarry Smith } 61015091d37SBarry Smith 6119566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6129566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 6139566063dSJacob Faibussowitsch 
PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 61415091d37SBarry Smith PetscFunctionReturn(0); 61515091d37SBarry Smith } 6168ab949d8SShri Abhyankar 6179371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) { 6182d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 619f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 620d9fead3dSBarry Smith const PetscScalar *x, *xb; 62126e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 622d9fead3dSBarry Smith const MatScalar *v; 6237c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 624ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 6252d61bbb3SSatish Balay 626433994e6SBarry Smith PetscFunctionBegin; 6279566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6289566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 6292d61bbb3SSatish Balay 6302d61bbb3SSatish Balay idx = a->j; 6312d61bbb3SSatish Balay v = a->a; 63226e093fcSHong Zhang if (usecprow) { 63326e093fcSHong Zhang mbs = a->compressedrow.nrows; 63426e093fcSHong Zhang ii = a->compressedrow.i; 6357b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 6369566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 63726e093fcSHong Zhang } else { 63826e093fcSHong Zhang mbs = a->mbs; 6392d61bbb3SSatish Balay ii = a->i; 64026e093fcSHong Zhang z = zarray; 64126e093fcSHong Zhang } 6422d61bbb3SSatish Balay 6432d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 64426fbe8dcSKarl Rupp n = ii[1] - ii[0]; 64526fbe8dcSKarl Rupp ii++; 64626fbe8dcSKarl Rupp sum1 = 0.0; 64726fbe8dcSKarl Rupp sum2 = 0.0; 64826fbe8dcSKarl Rupp sum3 = 0.0; 64926fbe8dcSKarl Rupp sum4 = 0.0; 65026fbe8dcSKarl Rupp sum5 = 0.0; 65126fbe8dcSKarl Rupp sum6 = 0.0; 65226fbe8dcSKarl Rupp sum7 = 0.0; 65326fbe8dcSKarl Rupp 654444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size 
as this one) */ 655444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 6562d61bbb3SSatish Balay for (j = 0; j < n; j++) { 6572d61bbb3SSatish Balay xb = x + 7 * (*idx++); 6589371c9d4SSatish Balay x1 = xb[0]; 6599371c9d4SSatish Balay x2 = xb[1]; 6609371c9d4SSatish Balay x3 = xb[2]; 6619371c9d4SSatish Balay x4 = xb[3]; 6629371c9d4SSatish Balay x5 = xb[4]; 6639371c9d4SSatish Balay x6 = xb[5]; 6649371c9d4SSatish Balay x7 = xb[6]; 6652d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 6662d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 6672d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 6682d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 6692d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 6702d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 6712d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 6722d61bbb3SSatish Balay v += 49; 6732d61bbb3SSatish Balay } 6747b2bb3b9SHong Zhang if (usecprow) z = zarray + 7 * ridx[i]; 6759371c9d4SSatish Balay z[0] = sum1; 6769371c9d4SSatish Balay z[1] = sum2; 6779371c9d4SSatish Balay z[2] = sum3; 6789371c9d4SSatish Balay z[3] = sum4; 6799371c9d4SSatish Balay z[4] = sum5; 6809371c9d4SSatish Balay z[5] = sum6; 6819371c9d4SSatish Balay z[6] = sum7; 68226e093fcSHong Zhang if (!usecprow) z += 7; 6832d61bbb3SSatish Balay } 6842d61bbb3SSatish Balay 6859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6869566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 
6879566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 6882d61bbb3SSatish Balay PetscFunctionReturn(0); 6892d61bbb3SSatish Balay } 6902d61bbb3SSatish Balay 6915f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 6929371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) { 69396e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 694f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 69596e086a2SDaniel Kokron const PetscScalar *x, *xb; 69696e086a2SDaniel Kokron const MatScalar *v; 69796e086a2SDaniel Kokron PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 69896e086a2SDaniel Kokron const PetscInt *idx, *ii, *ridx = NULL; 699ce68d72fSJed Brown PetscInt k; 70096e086a2SDaniel Kokron PetscBool usecprow = a->compressedrow.use; 70196e086a2SDaniel Kokron 70296e086a2SDaniel Kokron __m256d a0, a1, a2, a3, a4, a5; 703ce68d72fSJed Brown __m256d w0, w1, w2, w3; 70496e086a2SDaniel Kokron __m256d z0, z1, z2; 70596e086a2SDaniel Kokron __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 70696e086a2SDaniel Kokron 70796e086a2SDaniel Kokron PetscFunctionBegin; 7089566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7099566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 71096e086a2SDaniel Kokron 71196e086a2SDaniel Kokron idx = a->j; 71296e086a2SDaniel Kokron v = a->a; 71396e086a2SDaniel Kokron if (usecprow) { 71496e086a2SDaniel Kokron mbs = a->compressedrow.nrows; 71596e086a2SDaniel Kokron ii = a->compressedrow.i; 71696e086a2SDaniel Kokron ridx = a->compressedrow.rindex; 7179566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 71896e086a2SDaniel Kokron } else { 71996e086a2SDaniel Kokron mbs = a->mbs; 72096e086a2SDaniel Kokron ii = a->i; 72196e086a2SDaniel Kokron z = zarray; 
72296e086a2SDaniel Kokron } 72396e086a2SDaniel Kokron 72496e086a2SDaniel Kokron if (!a->mult_work) { 72596e086a2SDaniel Kokron k = PetscMax(A->rmap->n, A->cmap->n); 7269566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 72796e086a2SDaniel Kokron } 72896e086a2SDaniel Kokron 72996e086a2SDaniel Kokron work = a->mult_work; 73096e086a2SDaniel Kokron for (i = 0; i < mbs; i++) { 7319371c9d4SSatish Balay n = ii[1] - ii[0]; 7329371c9d4SSatish Balay ii++; 73396e086a2SDaniel Kokron workt = work; 73496e086a2SDaniel Kokron for (j = 0; j < n; j++) { 73596e086a2SDaniel Kokron xb = x + bs * (*idx++); 73696e086a2SDaniel Kokron for (k = 0; k < bs; k++) workt[k] = xb[k]; 73796e086a2SDaniel Kokron workt += bs; 73896e086a2SDaniel Kokron } 73996e086a2SDaniel Kokron if (usecprow) z = zarray + bs * ridx[i]; 74096e086a2SDaniel Kokron 7419371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 7429371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 7439371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 74496e086a2SDaniel Kokron 74596e086a2SDaniel Kokron for (j = 0; j < n; j++) { 746c05b70c4SSatish Balay /* first column of a */ 74796e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9]); 7489371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81]); 7499371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 7509371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 7519371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 7529371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 7539371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 75496e086a2SDaniel Kokron 755c05b70c4SSatish Balay /* second column of a */ 75696e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 1]); 7579371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 7589371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 7599371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 7609371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 7619371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 
7629371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 76396e086a2SDaniel Kokron 764c05b70c4SSatish Balay /* third column of a */ 76596e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 2]); 7669371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 7679371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w2, z0); 7689371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 7699371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w2, z1); 7709371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 7719371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w2, z2); 77296e086a2SDaniel Kokron 773c05b70c4SSatish Balay /* fourth column of a */ 77496e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 3]); 7759371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 7769371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w3, z0); 7779371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 7789371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w3, z1); 7799371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 7809371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w3, z2); 78196e086a2SDaniel Kokron 782c05b70c4SSatish Balay /* fifth column of a */ 78396e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 4]); 7849371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 7859371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w0, z0); 7869371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 7879371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w0, z1); 7889371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 7899371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w0, z2); 79096e086a2SDaniel Kokron 791c05b70c4SSatish Balay /* sixth column of a */ 79296e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 5]); 7939371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 7949371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 7959371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 7969371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 7979371c9d4SSatish Balay a2 = 
_mm256_loadu_pd(&v[j * 81 + 53]); 7989371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 79996e086a2SDaniel Kokron 800c05b70c4SSatish Balay /* seventh column of a */ 80196e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 6]); 8029371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 8039371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 8049371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 8059371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 8069371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 8079371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 80896e086a2SDaniel Kokron 8096aad120cSJose E. Roman /* eighth column of a */ 81096e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 7]); 8119371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 8129371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 8139371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 8149371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 8159371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 8169371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 81796e086a2SDaniel Kokron 818c05b70c4SSatish Balay /* ninth column of a */ 81996e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 8]); 8209371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 8219371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 8229371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 8239371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 8249371c9d4SSatish Balay a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 8259371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 82696e086a2SDaniel Kokron } 82796e086a2SDaniel Kokron 8289371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 8299371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 8309371c9d4SSatish Balay _mm256_maskstore_pd(&z[8], mask1, z2); 83196e086a2SDaniel Kokron 83296e086a2SDaniel Kokron v += n * bs2; 83396e086a2SDaniel Kokron if (!usecprow) z += bs; 83496e086a2SDaniel Kokron 
} 8359566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8369566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 8379566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 83896e086a2SDaniel Kokron PetscFunctionReturn(0); 83996e086a2SDaniel Kokron } 84096e086a2SDaniel Kokron #endif 84196e086a2SDaniel Kokron 8429371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) { 843ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 844f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 845ebada01fSBarry Smith const PetscScalar *x, *xb; 846ebada01fSBarry Smith PetscScalar *zarray, xv; 847ebada01fSBarry Smith const MatScalar *v; 848ebada01fSBarry Smith const PetscInt *ii, *ij = a->j, *idx; 849ebada01fSBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 850ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 851ebada01fSBarry Smith 852ebada01fSBarry Smith PetscFunctionBegin; 8539566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8549566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 855ebada01fSBarry Smith 856ebada01fSBarry Smith v = a->a; 857ebada01fSBarry Smith if (usecprow) { 858ebada01fSBarry Smith mbs = a->compressedrow.nrows; 859ebada01fSBarry Smith ii = a->compressedrow.i; 860ebada01fSBarry Smith ridx = a->compressedrow.rindex; 8619566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 862ebada01fSBarry Smith } else { 863ebada01fSBarry Smith mbs = a->mbs; 864ebada01fSBarry Smith ii = a->i; 865ebada01fSBarry Smith z = zarray; 866ebada01fSBarry Smith } 867ebada01fSBarry Smith 868ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 869ebada01fSBarry Smith n = ii[i + 1] - ii[i]; 870ebada01fSBarry Smith idx = ij + ii[i]; 8719371c9d4SSatish Balay sum1 = 0.0; 8729371c9d4SSatish Balay sum2 = 0.0; 8739371c9d4SSatish Balay sum3 = 0.0; 
8749371c9d4SSatish Balay sum4 = 0.0; 8759371c9d4SSatish Balay sum5 = 0.0; 8769371c9d4SSatish Balay sum6 = 0.0; 8779371c9d4SSatish Balay sum7 = 0.0; 8789371c9d4SSatish Balay sum8 = 0.0; 8799371c9d4SSatish Balay sum9 = 0.0; 8809371c9d4SSatish Balay sum10 = 0.0; 8819371c9d4SSatish Balay sum11 = 0.0; 882ebada01fSBarry Smith 883ebada01fSBarry Smith for (j = 0; j < n; j++) { 884ebada01fSBarry Smith xb = x + 11 * (idx[j]); 885ebada01fSBarry Smith 886ebada01fSBarry Smith for (k = 0; k < 11; k++) { 887ebada01fSBarry Smith xv = xb[k]; 888ebada01fSBarry Smith sum1 += v[0] * xv; 889ebada01fSBarry Smith sum2 += v[1] * xv; 890ebada01fSBarry Smith sum3 += v[2] * xv; 891ebada01fSBarry Smith sum4 += v[3] * xv; 892ebada01fSBarry Smith sum5 += v[4] * xv; 893ebada01fSBarry Smith sum6 += v[5] * xv; 894ebada01fSBarry Smith sum7 += v[6] * xv; 895ebada01fSBarry Smith sum8 += v[7] * xv; 896ebada01fSBarry Smith sum9 += v[8] * xv; 897ebada01fSBarry Smith sum10 += v[9] * xv; 898ebada01fSBarry Smith sum11 += v[10] * xv; 899ebada01fSBarry Smith v += 11; 900ebada01fSBarry Smith } 901ebada01fSBarry Smith } 902ebada01fSBarry Smith if (usecprow) z = zarray + 11 * ridx[i]; 9039371c9d4SSatish Balay z[0] = sum1; 9049371c9d4SSatish Balay z[1] = sum2; 9059371c9d4SSatish Balay z[2] = sum3; 9069371c9d4SSatish Balay z[3] = sum4; 9079371c9d4SSatish Balay z[4] = sum5; 9089371c9d4SSatish Balay z[5] = sum6; 9099371c9d4SSatish Balay z[6] = sum7; 9109371c9d4SSatish Balay z[7] = sum8; 9119371c9d4SSatish Balay z[8] = sum9; 9129371c9d4SSatish Balay z[9] = sum10; 9139371c9d4SSatish Balay z[10] = sum11; 914ebada01fSBarry Smith 915ebada01fSBarry Smith if (!usecprow) z += 11; 916ebada01fSBarry Smith } 917ebada01fSBarry Smith 9189566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9199566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 9209566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 921ebada01fSBarry Smith PetscFunctionReturn(0); 
922ebada01fSBarry Smith } 923ebada01fSBarry Smith 9246679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 9259371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) { 9266679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 9276679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 9286679dcc1SBarry Smith const PetscScalar *x, *xb; 9296679dcc1SBarry Smith PetscScalar *zarray, xv; 9306679dcc1SBarry Smith const MatScalar *v; 9316679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 9326679dcc1SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 9336679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 9346679dcc1SBarry Smith 9356679dcc1SBarry Smith PetscFunctionBegin; 9369566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9379566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 9386679dcc1SBarry Smith 9396679dcc1SBarry Smith v = a->a; 9406679dcc1SBarry Smith if (usecprow) { 9416679dcc1SBarry Smith mbs = a->compressedrow.nrows; 9426679dcc1SBarry Smith ii = a->compressedrow.i; 9436679dcc1SBarry Smith ridx = a->compressedrow.rindex; 9449566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 9456679dcc1SBarry Smith } else { 9466679dcc1SBarry Smith mbs = a->mbs; 9476679dcc1SBarry Smith ii = a->i; 9486679dcc1SBarry Smith z = zarray; 9496679dcc1SBarry Smith } 9506679dcc1SBarry Smith 9516679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 9526679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 9536679dcc1SBarry Smith idx = ij + ii[i]; 9549371c9d4SSatish Balay sum1 = 0.0; 9559371c9d4SSatish Balay sum2 = 0.0; 9569371c9d4SSatish Balay sum3 = 0.0; 9579371c9d4SSatish Balay sum4 = 0.0; 9589371c9d4SSatish Balay sum5 = 0.0; 9599371c9d4SSatish Balay sum6 = 0.0; 9609371c9d4SSatish Balay sum7 = 0.0; 9619371c9d4SSatish Balay sum8 = 0.0; 9629371c9d4SSatish Balay sum9 = 0.0; 
9639371c9d4SSatish Balay sum10 = 0.0; 9649371c9d4SSatish Balay sum11 = 0.0; 9659371c9d4SSatish Balay sum12 = 0.0; 9666679dcc1SBarry Smith 9676679dcc1SBarry Smith for (j = 0; j < n; j++) { 9686679dcc1SBarry Smith xb = x + 12 * (idx[j]); 9696679dcc1SBarry Smith 9706679dcc1SBarry Smith for (k = 0; k < 12; k++) { 9716679dcc1SBarry Smith xv = xb[k]; 9726679dcc1SBarry Smith sum1 += v[0] * xv; 9736679dcc1SBarry Smith sum2 += v[1] * xv; 9746679dcc1SBarry Smith sum3 += v[2] * xv; 9756679dcc1SBarry Smith sum4 += v[3] * xv; 9766679dcc1SBarry Smith sum5 += v[4] * xv; 9776679dcc1SBarry Smith sum6 += v[5] * xv; 9786679dcc1SBarry Smith sum7 += v[6] * xv; 9796679dcc1SBarry Smith sum8 += v[7] * xv; 9806679dcc1SBarry Smith sum9 += v[8] * xv; 9816679dcc1SBarry Smith sum10 += v[9] * xv; 9826679dcc1SBarry Smith sum11 += v[10] * xv; 9836679dcc1SBarry Smith sum12 += v[11] * xv; 9846679dcc1SBarry Smith v += 12; 9856679dcc1SBarry Smith } 9866679dcc1SBarry Smith } 9876679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 9889371c9d4SSatish Balay z[0] = sum1; 9899371c9d4SSatish Balay z[1] = sum2; 9909371c9d4SSatish Balay z[2] = sum3; 9919371c9d4SSatish Balay z[3] = sum4; 9929371c9d4SSatish Balay z[4] = sum5; 9939371c9d4SSatish Balay z[5] = sum6; 9949371c9d4SSatish Balay z[6] = sum7; 9959371c9d4SSatish Balay z[7] = sum8; 9969371c9d4SSatish Balay z[8] = sum9; 9979371c9d4SSatish Balay z[9] = sum10; 9989371c9d4SSatish Balay z[10] = sum11; 9999371c9d4SSatish Balay z[11] = sum12; 10006679dcc1SBarry Smith if (!usecprow) z += 12; 10016679dcc1SBarry Smith } 10029566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10039566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 10049566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 10056679dcc1SBarry Smith PetscFunctionReturn(0); 10066679dcc1SBarry Smith } 10076679dcc1SBarry Smith 10089371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec 
yy, Vec zz) { 10096679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 10106679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 10116679dcc1SBarry Smith const PetscScalar *x, *xb; 10126679dcc1SBarry Smith PetscScalar *zarray, *yarray, xv; 10136679dcc1SBarry Smith const MatScalar *v; 10146679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 10156679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; 10166679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 10176679dcc1SBarry Smith 10186679dcc1SBarry Smith PetscFunctionBegin; 10199566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10209566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 10216679dcc1SBarry Smith 10226679dcc1SBarry Smith v = a->a; 10236679dcc1SBarry Smith if (usecprow) { 102448a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 10256679dcc1SBarry Smith mbs = a->compressedrow.nrows; 10266679dcc1SBarry Smith ii = a->compressedrow.i; 10276679dcc1SBarry Smith ridx = a->compressedrow.rindex; 10286679dcc1SBarry Smith } else { 10296679dcc1SBarry Smith ii = a->i; 10306679dcc1SBarry Smith y = yarray; 10316679dcc1SBarry Smith z = zarray; 10326679dcc1SBarry Smith } 10336679dcc1SBarry Smith 10346679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 10356679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 10366679dcc1SBarry Smith idx = ij + ii[i]; 10376679dcc1SBarry Smith 10386679dcc1SBarry Smith if (usecprow) { 10396679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 10406679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 10416679dcc1SBarry Smith } 10429371c9d4SSatish Balay sum1 = y[0]; 10439371c9d4SSatish Balay sum2 = y[1]; 10449371c9d4SSatish Balay sum3 = y[2]; 10459371c9d4SSatish Balay sum4 = y[3]; 10469371c9d4SSatish Balay sum5 = y[4]; 10479371c9d4SSatish Balay sum6 = y[5]; 10489371c9d4SSatish Balay sum7 = y[6]; 10499371c9d4SSatish Balay 
sum8 = y[7]; 10509371c9d4SSatish Balay sum9 = y[8]; 10519371c9d4SSatish Balay sum10 = y[9]; 10529371c9d4SSatish Balay sum11 = y[10]; 10539371c9d4SSatish Balay sum12 = y[11]; 10546679dcc1SBarry Smith 10556679dcc1SBarry Smith for (j = 0; j < n; j++) { 10566679dcc1SBarry Smith xb = x + 12 * (idx[j]); 10576679dcc1SBarry Smith 10586679dcc1SBarry Smith for (k = 0; k < 12; k++) { 10596679dcc1SBarry Smith xv = xb[k]; 10606679dcc1SBarry Smith sum1 += v[0] * xv; 10616679dcc1SBarry Smith sum2 += v[1] * xv; 10626679dcc1SBarry Smith sum3 += v[2] * xv; 10636679dcc1SBarry Smith sum4 += v[3] * xv; 10646679dcc1SBarry Smith sum5 += v[4] * xv; 10656679dcc1SBarry Smith sum6 += v[5] * xv; 10666679dcc1SBarry Smith sum7 += v[6] * xv; 10676679dcc1SBarry Smith sum8 += v[7] * xv; 10686679dcc1SBarry Smith sum9 += v[8] * xv; 10696679dcc1SBarry Smith sum10 += v[9] * xv; 10706679dcc1SBarry Smith sum11 += v[10] * xv; 10716679dcc1SBarry Smith sum12 += v[11] * xv; 10726679dcc1SBarry Smith v += 12; 10736679dcc1SBarry Smith } 10746679dcc1SBarry Smith } 10756679dcc1SBarry Smith 10769371c9d4SSatish Balay z[0] = sum1; 10779371c9d4SSatish Balay z[1] = sum2; 10789371c9d4SSatish Balay z[2] = sum3; 10799371c9d4SSatish Balay z[3] = sum4; 10809371c9d4SSatish Balay z[4] = sum5; 10819371c9d4SSatish Balay z[5] = sum6; 10829371c9d4SSatish Balay z[6] = sum7; 10839371c9d4SSatish Balay z[7] = sum8; 10849371c9d4SSatish Balay z[8] = sum9; 10859371c9d4SSatish Balay z[9] = sum10; 10869371c9d4SSatish Balay z[10] = sum11; 10879371c9d4SSatish Balay z[11] = sum12; 10886679dcc1SBarry Smith if (!usecprow) { 10896679dcc1SBarry Smith y += 12; 10906679dcc1SBarry Smith z += 12; 10916679dcc1SBarry Smith } 10926679dcc1SBarry Smith } 10939566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10949566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 10959566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 10966679dcc1SBarry Smith 
PetscFunctionReturn(0); 10976679dcc1SBarry Smith } 10986679dcc1SBarry Smith 10996679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 11009371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) { 11016679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 11026679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 11036679dcc1SBarry Smith const PetscScalar *x, *xb; 11046679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray; 11056679dcc1SBarry Smith const MatScalar *v; 11066679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 11076679dcc1SBarry Smith PetscInt mbs, i, j, n; 11086679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 11096679dcc1SBarry Smith 11106679dcc1SBarry Smith PetscFunctionBegin; 11119566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 11129566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 11136679dcc1SBarry Smith 11146679dcc1SBarry Smith v = a->a; 11156679dcc1SBarry Smith if (usecprow) { 11166679dcc1SBarry Smith mbs = a->compressedrow.nrows; 11176679dcc1SBarry Smith ii = a->compressedrow.i; 11186679dcc1SBarry Smith ridx = a->compressedrow.rindex; 11199566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 11206679dcc1SBarry Smith } else { 11216679dcc1SBarry Smith mbs = a->mbs; 11226679dcc1SBarry Smith ii = a->i; 11236679dcc1SBarry Smith z = zarray; 11246679dcc1SBarry Smith } 11256679dcc1SBarry Smith 11266679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 11276679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 11286679dcc1SBarry Smith idx = ij + ii[i]; 11296679dcc1SBarry Smith 11306679dcc1SBarry Smith sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 11316679dcc1SBarry Smith for (j = 0; j < n; j++) { 11326679dcc1SBarry Smith xb = x + 12 * (idx[j]); 11339371c9d4SSatish Balay x1 = xb[0]; 
11349371c9d4SSatish Balay x2 = xb[1]; 11359371c9d4SSatish Balay x3 = xb[2]; 11369371c9d4SSatish Balay x4 = xb[3]; 11376679dcc1SBarry Smith 11386679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11396679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11406679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11416679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11426679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11436679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11446679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11456679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11466679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11476679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11486679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11496679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11506679dcc1SBarry Smith v += 48; 11516679dcc1SBarry Smith 11529371c9d4SSatish Balay x1 = xb[4]; 11539371c9d4SSatish Balay x2 = xb[5]; 11549371c9d4SSatish Balay x3 = xb[6]; 11559371c9d4SSatish Balay x4 = xb[7]; 11566679dcc1SBarry Smith 11576679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11586679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11596679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11606679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11616679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11626679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11636679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11646679dcc1SBarry Smith sum8 += v[7] * x1 
+ v[19] * x2 + v[31] * x3 + v[43] * x4; 11656679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11666679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11676679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11686679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11696679dcc1SBarry Smith v += 48; 11706679dcc1SBarry Smith 11719371c9d4SSatish Balay x1 = xb[8]; 11729371c9d4SSatish Balay x2 = xb[9]; 11739371c9d4SSatish Balay x3 = xb[10]; 11749371c9d4SSatish Balay x4 = xb[11]; 11756679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11766679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11776679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11786679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11796679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11806679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11816679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11826679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11836679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11846679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11856679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11866679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11876679dcc1SBarry Smith v += 48; 11886679dcc1SBarry Smith } 11896679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 11909371c9d4SSatish Balay z[0] = sum1; 11919371c9d4SSatish Balay z[1] = sum2; 11929371c9d4SSatish Balay z[2] = sum3; 11939371c9d4SSatish Balay z[3] = sum4; 11949371c9d4SSatish Balay z[4] = sum5; 11959371c9d4SSatish Balay z[5] = sum6; 11969371c9d4SSatish Balay z[6] = sum7; 
11979371c9d4SSatish Balay z[7] = sum8; 11989371c9d4SSatish Balay z[8] = sum9; 11999371c9d4SSatish Balay z[9] = sum10; 12009371c9d4SSatish Balay z[10] = sum11; 12019371c9d4SSatish Balay z[11] = sum12; 12026679dcc1SBarry Smith if (!usecprow) z += 12; 12036679dcc1SBarry Smith } 12049566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12059566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 12069566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 12076679dcc1SBarry Smith PetscFunctionReturn(0); 12086679dcc1SBarry Smith } 12096679dcc1SBarry Smith 12106679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 12119371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) { 12126679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 12136679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 12146679dcc1SBarry Smith const PetscScalar *x, *xb; 12156679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray, *yarray; 12166679dcc1SBarry Smith const MatScalar *v; 12176679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 12186679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, n; 12196679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 12206679dcc1SBarry Smith 12216679dcc1SBarry Smith PetscFunctionBegin; 12229566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 12239566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 12246679dcc1SBarry Smith 12256679dcc1SBarry Smith v = a->a; 12266679dcc1SBarry Smith if (usecprow) { 122748a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 12286679dcc1SBarry Smith mbs = a->compressedrow.nrows; 12296679dcc1SBarry Smith ii = a->compressedrow.i; 12306679dcc1SBarry Smith ridx = a->compressedrow.rindex; 
12316679dcc1SBarry Smith } else { 12326679dcc1SBarry Smith ii = a->i; 12336679dcc1SBarry Smith y = yarray; 12346679dcc1SBarry Smith z = zarray; 12356679dcc1SBarry Smith } 12366679dcc1SBarry Smith 12376679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 12386679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 12396679dcc1SBarry Smith idx = ij + ii[i]; 12406679dcc1SBarry Smith 12416679dcc1SBarry Smith if (usecprow) { 12426679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 12436679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 12446679dcc1SBarry Smith } 12459371c9d4SSatish Balay sum1 = y[0]; 12469371c9d4SSatish Balay sum2 = y[1]; 12479371c9d4SSatish Balay sum3 = y[2]; 12489371c9d4SSatish Balay sum4 = y[3]; 12499371c9d4SSatish Balay sum5 = y[4]; 12509371c9d4SSatish Balay sum6 = y[5]; 12519371c9d4SSatish Balay sum7 = y[6]; 12529371c9d4SSatish Balay sum8 = y[7]; 12539371c9d4SSatish Balay sum9 = y[8]; 12549371c9d4SSatish Balay sum10 = y[9]; 12559371c9d4SSatish Balay sum11 = y[10]; 12569371c9d4SSatish Balay sum12 = y[11]; 12576679dcc1SBarry Smith 12586679dcc1SBarry Smith for (j = 0; j < n; j++) { 12596679dcc1SBarry Smith xb = x + 12 * (idx[j]); 12609371c9d4SSatish Balay x1 = xb[0]; 12619371c9d4SSatish Balay x2 = xb[1]; 12629371c9d4SSatish Balay x3 = xb[2]; 12639371c9d4SSatish Balay x4 = xb[3]; 12646679dcc1SBarry Smith 12656679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12666679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12676679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12686679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12696679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12706679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12716679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12726679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12736679dcc1SBarry 
Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12746679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12756679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12766679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12776679dcc1SBarry Smith v += 48; 12786679dcc1SBarry Smith 12799371c9d4SSatish Balay x1 = xb[4]; 12809371c9d4SSatish Balay x2 = xb[5]; 12819371c9d4SSatish Balay x3 = xb[6]; 12829371c9d4SSatish Balay x4 = xb[7]; 12836679dcc1SBarry Smith 12846679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12856679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12866679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12876679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12886679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12896679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12906679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12916679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12926679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12936679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12946679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12956679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12966679dcc1SBarry Smith v += 48; 12976679dcc1SBarry Smith 12989371c9d4SSatish Balay x1 = xb[8]; 12999371c9d4SSatish Balay x2 = xb[9]; 13009371c9d4SSatish Balay x3 = xb[10]; 13019371c9d4SSatish Balay x4 = xb[11]; 13026679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 13036679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 13046679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * 
x3 + v[38] * x4; 13056679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 13066679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 13076679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 13086679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 13096679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 13106679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 13116679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 13126679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 13136679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 13146679dcc1SBarry Smith v += 48; 13156679dcc1SBarry Smith } 13169371c9d4SSatish Balay z[0] = sum1; 13179371c9d4SSatish Balay z[1] = sum2; 13189371c9d4SSatish Balay z[2] = sum3; 13199371c9d4SSatish Balay z[3] = sum4; 13209371c9d4SSatish Balay z[4] = sum5; 13219371c9d4SSatish Balay z[5] = sum6; 13229371c9d4SSatish Balay z[6] = sum7; 13239371c9d4SSatish Balay z[7] = sum8; 13249371c9d4SSatish Balay z[8] = sum9; 13259371c9d4SSatish Balay z[9] = sum10; 13269371c9d4SSatish Balay z[10] = sum11; 13279371c9d4SSatish Balay z[11] = sum12; 13286679dcc1SBarry Smith if (!usecprow) { 13296679dcc1SBarry Smith y += 12; 13306679dcc1SBarry Smith z += 12; 13316679dcc1SBarry Smith } 13326679dcc1SBarry Smith } 13339566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 13349566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 13359566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 13366679dcc1SBarry Smith PetscFunctionReturn(0); 13376679dcc1SBarry Smith } 13386679dcc1SBarry Smith 13396679dcc1SBarry Smith #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && 
!defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 13409371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) { 13416679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 13426679dcc1SBarry Smith PetscScalar *z = NULL, *zarray; 13436679dcc1SBarry Smith const PetscScalar *x, *work; 13446679dcc1SBarry Smith const MatScalar *v = a->a; 13456679dcc1SBarry Smith PetscInt mbs, i, j, n; 13466679dcc1SBarry Smith const PetscInt *idx = a->j, *ii, *ridx = NULL; 13476679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 13486679dcc1SBarry Smith const PetscInt bs = 12, bs2 = 144; 13496679dcc1SBarry Smith 13506679dcc1SBarry Smith __m256d a0, a1, a2, a3, a4, a5; 13516679dcc1SBarry Smith __m256d w0, w1, w2, w3; 13526679dcc1SBarry Smith __m256d z0, z1, z2; 13536679dcc1SBarry Smith 13546679dcc1SBarry Smith PetscFunctionBegin; 13559566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 13569566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 13576679dcc1SBarry Smith 13586679dcc1SBarry Smith if (usecprow) { 13596679dcc1SBarry Smith mbs = a->compressedrow.nrows; 13606679dcc1SBarry Smith ii = a->compressedrow.i; 13616679dcc1SBarry Smith ridx = a->compressedrow.rindex; 13629566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 13636679dcc1SBarry Smith } else { 13646679dcc1SBarry Smith mbs = a->mbs; 13656679dcc1SBarry Smith ii = a->i; 13666679dcc1SBarry Smith z = zarray; 13676679dcc1SBarry Smith } 13686679dcc1SBarry Smith 13696679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 13709371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 13719371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 13729371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 13736679dcc1SBarry Smith 13749371c9d4SSatish Balay n = ii[1] - ii[0]; 13759371c9d4SSatish Balay ii++; 13766679dcc1SBarry Smith for (j = 0; j < n; j++) { 13776679dcc1SBarry Smith work = x + bs * (*idx++); 13786679dcc1SBarry Smith 13796679dcc1SBarry Smith /* 
first column of a */ 13806679dcc1SBarry Smith w0 = _mm256_set1_pd(work[0]); 13819371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 0); 13829371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 13839371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 4); 13849371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 13859371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 8); 13869371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 13876679dcc1SBarry Smith 13886679dcc1SBarry Smith /* second column of a */ 13896679dcc1SBarry Smith w1 = _mm256_set1_pd(work[1]); 13909371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 12); 13919371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 13929371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 16); 13939371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 13949371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 20); 13959371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 13966679dcc1SBarry Smith 13976679dcc1SBarry Smith /* third column of a */ 13986679dcc1SBarry Smith w2 = _mm256_set1_pd(work[2]); 13999371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 24); 14009371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14019371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 28); 14029371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14039371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 32); 14049371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14056679dcc1SBarry Smith 14066679dcc1SBarry Smith /* fourth column of a */ 14076679dcc1SBarry Smith w3 = _mm256_set1_pd(work[3]); 14089371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 36); 14099371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14109371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 40); 14119371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14129371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 44); 14139371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14146679dcc1SBarry Smith 14156679dcc1SBarry Smith /* fifth column of a */ 14166679dcc1SBarry Smith w0 = _mm256_set1_pd(work[4]); 
14179371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 48); 14189371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14199371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 52); 14209371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14219371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 56); 14229371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14236679dcc1SBarry Smith 14246679dcc1SBarry Smith /* sixth column of a */ 14256679dcc1SBarry Smith w1 = _mm256_set1_pd(work[5]); 14269371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 60); 14279371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14289371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 64); 14299371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14309371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 68); 14319371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14326679dcc1SBarry Smith 14336679dcc1SBarry Smith /* seventh column of a */ 14346679dcc1SBarry Smith w2 = _mm256_set1_pd(work[6]); 14359371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 72); 14369371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14379371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 76); 14389371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14399371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 80); 14409371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14416679dcc1SBarry Smith 14426aad120cSJose E. 
Roman /* eighth column of a */ 14436679dcc1SBarry Smith w3 = _mm256_set1_pd(work[7]); 14449371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 84); 14459371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14469371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 88); 14479371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14489371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 92); 14499371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14506679dcc1SBarry Smith 14516679dcc1SBarry Smith /* ninth column of a */ 14526679dcc1SBarry Smith w0 = _mm256_set1_pd(work[8]); 14539371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 96); 14549371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14559371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 100); 14569371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14579371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 104); 14589371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14596679dcc1SBarry Smith 14606679dcc1SBarry Smith /* tenth column of a */ 14616679dcc1SBarry Smith w1 = _mm256_set1_pd(work[9]); 14629371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 108); 14639371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14649371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 112); 14659371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14669371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 116); 14679371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14686679dcc1SBarry Smith 14696679dcc1SBarry Smith /* eleventh column of a */ 14706679dcc1SBarry Smith w2 = _mm256_set1_pd(work[10]); 14719371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 120); 14729371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14739371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 124); 14749371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14759371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 128); 14769371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14776679dcc1SBarry Smith 14786679dcc1SBarry Smith /* twelveth column of a */ 14796679dcc1SBarry Smith w3 = 
_mm256_set1_pd(work[11]); 14809371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 132); 14819371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14829371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 136); 14839371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14849371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 140); 14859371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14866679dcc1SBarry Smith 14876679dcc1SBarry Smith v += bs2; 14886679dcc1SBarry Smith } 14896679dcc1SBarry Smith if (usecprow) z = zarray + bs * ridx[i]; 14909371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 14919371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 14929371c9d4SSatish Balay _mm256_storeu_pd(&z[8], z2); 14936679dcc1SBarry Smith if (!usecprow) z += bs; 14946679dcc1SBarry Smith } 14959566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 14969566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 14979566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 14986679dcc1SBarry Smith PetscFunctionReturn(0); 14996679dcc1SBarry Smith } 15006679dcc1SBarry Smith #endif 15016679dcc1SBarry Smith 15028ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1503832cc040SShri Abhyankar /* Default MatMult for block size 15 */ 15049371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) { 15058ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1506f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 15078ab949d8SShri Abhyankar const PetscScalar *x, *xb; 150853ef36baSBarry Smith PetscScalar *zarray, xv; 15098ab949d8SShri Abhyankar const MatScalar *v; 15108ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 15117c565772SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 1512ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 
15138ab949d8SShri Abhyankar 15148ab949d8SShri Abhyankar PetscFunctionBegin; 15159566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 15169566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 15178ab949d8SShri Abhyankar 15188ab949d8SShri Abhyankar v = a->a; 15198ab949d8SShri Abhyankar if (usecprow) { 15208ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 15218ab949d8SShri Abhyankar ii = a->compressedrow.i; 15228ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 15239566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 15248ab949d8SShri Abhyankar } else { 15258ab949d8SShri Abhyankar mbs = a->mbs; 15268ab949d8SShri Abhyankar ii = a->i; 15278ab949d8SShri Abhyankar z = zarray; 15288ab949d8SShri Abhyankar } 15298ab949d8SShri Abhyankar 15308ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 15318ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 15328ab949d8SShri Abhyankar idx = ij + ii[i]; 15339371c9d4SSatish Balay sum1 = 0.0; 15349371c9d4SSatish Balay sum2 = 0.0; 15359371c9d4SSatish Balay sum3 = 0.0; 15369371c9d4SSatish Balay sum4 = 0.0; 15379371c9d4SSatish Balay sum5 = 0.0; 15389371c9d4SSatish Balay sum6 = 0.0; 15399371c9d4SSatish Balay sum7 = 0.0; 15409371c9d4SSatish Balay sum8 = 0.0; 15419371c9d4SSatish Balay sum9 = 0.0; 15429371c9d4SSatish Balay sum10 = 0.0; 15439371c9d4SSatish Balay sum11 = 0.0; 15449371c9d4SSatish Balay sum12 = 0.0; 15459371c9d4SSatish Balay sum13 = 0.0; 15469371c9d4SSatish Balay sum14 = 0.0; 15479371c9d4SSatish Balay sum15 = 0.0; 15488ab949d8SShri Abhyankar 15498ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 15508ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 15518ab949d8SShri Abhyankar 15528ab949d8SShri Abhyankar for (k = 0; k < 15; k++) { 155353ef36baSBarry Smith xv = xb[k]; 155453ef36baSBarry Smith sum1 += v[0] * xv; 155553ef36baSBarry Smith sum2 += v[1] * xv; 155653ef36baSBarry Smith sum3 += v[2] * xv; 155753ef36baSBarry Smith sum4 += v[3] * xv; 155853ef36baSBarry Smith sum5 += v[4] * 
xv; 155953ef36baSBarry Smith sum6 += v[5] * xv; 156053ef36baSBarry Smith sum7 += v[6] * xv; 156153ef36baSBarry Smith sum8 += v[7] * xv; 156253ef36baSBarry Smith sum9 += v[8] * xv; 156353ef36baSBarry Smith sum10 += v[9] * xv; 156453ef36baSBarry Smith sum11 += v[10] * xv; 156553ef36baSBarry Smith sum12 += v[11] * xv; 156653ef36baSBarry Smith sum13 += v[12] * xv; 156753ef36baSBarry Smith sum14 += v[13] * xv; 156853ef36baSBarry Smith sum15 += v[14] * xv; 15698ab949d8SShri Abhyankar v += 15; 15708ab949d8SShri Abhyankar } 15718ab949d8SShri Abhyankar } 15728ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 15739371c9d4SSatish Balay z[0] = sum1; 15749371c9d4SSatish Balay z[1] = sum2; 15759371c9d4SSatish Balay z[2] = sum3; 15769371c9d4SSatish Balay z[3] = sum4; 15779371c9d4SSatish Balay z[4] = sum5; 15789371c9d4SSatish Balay z[5] = sum6; 15799371c9d4SSatish Balay z[6] = sum7; 15809371c9d4SSatish Balay z[7] = sum8; 15819371c9d4SSatish Balay z[8] = sum9; 15829371c9d4SSatish Balay z[9] = sum10; 15839371c9d4SSatish Balay z[10] = sum11; 15849371c9d4SSatish Balay z[11] = sum12; 15859371c9d4SSatish Balay z[12] = sum13; 15869371c9d4SSatish Balay z[13] = sum14; 15879371c9d4SSatish Balay z[14] = sum15; 15888ab949d8SShri Abhyankar 15898ab949d8SShri Abhyankar if (!usecprow) z += 15; 15908ab949d8SShri Abhyankar } 15918ab949d8SShri Abhyankar 15929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 15939566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 15949566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 15958ab949d8SShri Abhyankar PetscFunctionReturn(0); 15968ab949d8SShri Abhyankar } 15978ab949d8SShri Abhyankar 15988ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 15999371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) { 16008ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 
1601f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 16028ab949d8SShri Abhyankar const PetscScalar *x, *xb; 16030b8f6341SShri Abhyankar PetscScalar x1, x2, x3, x4, *zarray; 16048ab949d8SShri Abhyankar const MatScalar *v; 16058ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 16067c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1607ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 16088ab949d8SShri Abhyankar 16098ab949d8SShri Abhyankar PetscFunctionBegin; 16109566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 16119566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 16128ab949d8SShri Abhyankar 16138ab949d8SShri Abhyankar v = a->a; 16148ab949d8SShri Abhyankar if (usecprow) { 16158ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 16168ab949d8SShri Abhyankar ii = a->compressedrow.i; 16178ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 16189566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 16198ab949d8SShri Abhyankar } else { 16208ab949d8SShri Abhyankar mbs = a->mbs; 16218ab949d8SShri Abhyankar ii = a->i; 16228ab949d8SShri Abhyankar z = zarray; 16238ab949d8SShri Abhyankar } 16248ab949d8SShri Abhyankar 16258ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 16268ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 16278ab949d8SShri Abhyankar idx = ij + ii[i]; 16289371c9d4SSatish Balay sum1 = 0.0; 16299371c9d4SSatish Balay sum2 = 0.0; 16309371c9d4SSatish Balay sum3 = 0.0; 16319371c9d4SSatish Balay sum4 = 0.0; 16329371c9d4SSatish Balay sum5 = 0.0; 16339371c9d4SSatish Balay sum6 = 0.0; 16349371c9d4SSatish Balay sum7 = 0.0; 16359371c9d4SSatish Balay sum8 = 0.0; 16369371c9d4SSatish Balay sum9 = 0.0; 16379371c9d4SSatish Balay sum10 = 0.0; 16389371c9d4SSatish Balay sum11 = 0.0; 16399371c9d4SSatish Balay sum12 = 0.0; 16409371c9d4SSatish Balay sum13 = 0.0; 16419371c9d4SSatish Balay sum14 = 
0.0; 16429371c9d4SSatish Balay sum15 = 0.0; 16438ab949d8SShri Abhyankar 16448ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 16458ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 16469371c9d4SSatish Balay x1 = xb[0]; 16479371c9d4SSatish Balay x2 = xb[1]; 16489371c9d4SSatish Balay x3 = xb[2]; 16499371c9d4SSatish Balay x4 = xb[3]; 16508ab949d8SShri Abhyankar 16518ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16528ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16538ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16548ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16558ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16568ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16578ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16588ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16598ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16608ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16618ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16628ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16638ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16648ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16658ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16668ab949d8SShri Abhyankar 16678ab949d8SShri Abhyankar v += 60; 16688ab949d8SShri Abhyankar 16699371c9d4SSatish Balay x1 = xb[4]; 16709371c9d4SSatish Balay x2 = xb[5]; 16719371c9d4SSatish Balay x3 = xb[6]; 16729371c9d4SSatish Balay x4 = xb[7]; 16738ab949d8SShri Abhyankar 16748ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] 
* x3 + v[45] * x4; 16758ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16768ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16778ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16788ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16798ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16808ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16818ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16828ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16838ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16848ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16858ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16868ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16878ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16888ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16898ab949d8SShri Abhyankar v += 60; 16908ab949d8SShri Abhyankar 16919371c9d4SSatish Balay x1 = xb[8]; 16929371c9d4SSatish Balay x2 = xb[9]; 16939371c9d4SSatish Balay x3 = xb[10]; 16949371c9d4SSatish Balay x4 = xb[11]; 16950b8f6341SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16960b8f6341SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16970b8f6341SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16980b8f6341SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16990b8f6341SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 17000b8f6341SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 17010b8f6341SShri Abhyankar 
sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 17020b8f6341SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 17030b8f6341SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 17040b8f6341SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 17050b8f6341SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 17060b8f6341SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 17070b8f6341SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 17080b8f6341SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 17090b8f6341SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 17100b8f6341SShri Abhyankar v += 60; 17110b8f6341SShri Abhyankar 17129371c9d4SSatish Balay x1 = xb[12]; 17139371c9d4SSatish Balay x2 = xb[13]; 17149371c9d4SSatish Balay x3 = xb[14]; 17158ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 17168ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 17178ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 17188ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 17198ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 17208ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 17218ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 17228ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 17238ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 17248ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 17258ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 17268ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 17278ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 17288ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 17298ab949d8SShri Abhyankar 
sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 17308ab949d8SShri Abhyankar v += 45; 17318ab949d8SShri Abhyankar } 17328ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 17339371c9d4SSatish Balay z[0] = sum1; 17349371c9d4SSatish Balay z[1] = sum2; 17359371c9d4SSatish Balay z[2] = sum3; 17369371c9d4SSatish Balay z[3] = sum4; 17379371c9d4SSatish Balay z[4] = sum5; 17389371c9d4SSatish Balay z[5] = sum6; 17399371c9d4SSatish Balay z[6] = sum7; 17409371c9d4SSatish Balay z[7] = sum8; 17419371c9d4SSatish Balay z[8] = sum9; 17429371c9d4SSatish Balay z[9] = sum10; 17439371c9d4SSatish Balay z[10] = sum11; 17449371c9d4SSatish Balay z[11] = sum12; 17459371c9d4SSatish Balay z[12] = sum13; 17469371c9d4SSatish Balay z[13] = sum14; 17479371c9d4SSatish Balay z[14] = sum15; 17488ab949d8SShri Abhyankar 17498ab949d8SShri Abhyankar if (!usecprow) z += 15; 17508ab949d8SShri Abhyankar } 17518ab949d8SShri Abhyankar 17529566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 17539566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 17549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 17558ab949d8SShri Abhyankar PetscFunctionReturn(0); 17568ab949d8SShri Abhyankar } 17578ab949d8SShri Abhyankar 17588ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 17599371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) { 17608ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1761f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 17628ab949d8SShri Abhyankar const PetscScalar *x, *xb; 17630b8f6341SShri Abhyankar PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 17648ab949d8SShri Abhyankar const MatScalar *v; 17658ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 17667c565772SBarry Smith PetscInt mbs, i, j, n, 
*ridx = NULL; 1767ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 17688ab949d8SShri Abhyankar 17698ab949d8SShri Abhyankar PetscFunctionBegin; 17709566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 17719566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 17728ab949d8SShri Abhyankar 17738ab949d8SShri Abhyankar v = a->a; 17748ab949d8SShri Abhyankar if (usecprow) { 17758ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 17768ab949d8SShri Abhyankar ii = a->compressedrow.i; 17778ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 17789566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 17798ab949d8SShri Abhyankar } else { 17808ab949d8SShri Abhyankar mbs = a->mbs; 17818ab949d8SShri Abhyankar ii = a->i; 17828ab949d8SShri Abhyankar z = zarray; 17838ab949d8SShri Abhyankar } 17848ab949d8SShri Abhyankar 17858ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 17868ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 17878ab949d8SShri Abhyankar idx = ij + ii[i]; 17889371c9d4SSatish Balay sum1 = 0.0; 17899371c9d4SSatish Balay sum2 = 0.0; 17909371c9d4SSatish Balay sum3 = 0.0; 17919371c9d4SSatish Balay sum4 = 0.0; 17929371c9d4SSatish Balay sum5 = 0.0; 17939371c9d4SSatish Balay sum6 = 0.0; 17949371c9d4SSatish Balay sum7 = 0.0; 17959371c9d4SSatish Balay sum8 = 0.0; 17969371c9d4SSatish Balay sum9 = 0.0; 17979371c9d4SSatish Balay sum10 = 0.0; 17989371c9d4SSatish Balay sum11 = 0.0; 17999371c9d4SSatish Balay sum12 = 0.0; 18009371c9d4SSatish Balay sum13 = 0.0; 18019371c9d4SSatish Balay sum14 = 0.0; 18029371c9d4SSatish Balay sum15 = 0.0; 18038ab949d8SShri Abhyankar 18048ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 18058ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 18069371c9d4SSatish Balay x1 = xb[0]; 18079371c9d4SSatish Balay x2 = xb[1]; 18089371c9d4SSatish Balay x3 = xb[2]; 18099371c9d4SSatish Balay x4 = xb[3]; 18109371c9d4SSatish Balay x5 = xb[4]; 18119371c9d4SSatish Balay x6 = xb[5]; 18129371c9d4SSatish Balay 
x7 = xb[6]; 18130b8f6341SShri Abhyankar x8 = xb[7]; 18148ab949d8SShri Abhyankar 18158ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 18168ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 18178ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 18188ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 18198ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 18208ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 18218ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 18228ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 18238ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 18248ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 18258ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 18268ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 18278ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 18288ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + 
v[88] * x6 + v[103] * x7 + v[118] * x8; 18298ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 18308ab949d8SShri Abhyankar v += 120; 18318ab949d8SShri Abhyankar 18329371c9d4SSatish Balay x1 = xb[8]; 18339371c9d4SSatish Balay x2 = xb[9]; 18349371c9d4SSatish Balay x3 = xb[10]; 18359371c9d4SSatish Balay x4 = xb[11]; 18369371c9d4SSatish Balay x5 = xb[12]; 18379371c9d4SSatish Balay x6 = xb[13]; 18389371c9d4SSatish Balay x7 = xb[14]; 18390b8f6341SShri Abhyankar 18408ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 18418ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 18428ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 18438ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 18448ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 18458ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 18468ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 18478ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 18488ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 18498ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; 18508ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 18518ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 
+ v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 18528ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 18538ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 18548ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 18558ab949d8SShri Abhyankar v += 105; 18568ab949d8SShri Abhyankar } 18578ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 18589371c9d4SSatish Balay z[0] = sum1; 18599371c9d4SSatish Balay z[1] = sum2; 18609371c9d4SSatish Balay z[2] = sum3; 18619371c9d4SSatish Balay z[3] = sum4; 18629371c9d4SSatish Balay z[4] = sum5; 18639371c9d4SSatish Balay z[5] = sum6; 18649371c9d4SSatish Balay z[6] = sum7; 18659371c9d4SSatish Balay z[7] = sum8; 18669371c9d4SSatish Balay z[8] = sum9; 18679371c9d4SSatish Balay z[9] = sum10; 18689371c9d4SSatish Balay z[10] = sum11; 18699371c9d4SSatish Balay z[11] = sum12; 18709371c9d4SSatish Balay z[12] = sum13; 18719371c9d4SSatish Balay z[13] = sum14; 18729371c9d4SSatish Balay z[14] = sum15; 18738ab949d8SShri Abhyankar 18748ab949d8SShri Abhyankar if (!usecprow) z += 15; 18758ab949d8SShri Abhyankar } 18768ab949d8SShri Abhyankar 18779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 18789566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 18799566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 18808ab949d8SShri Abhyankar PetscFunctionReturn(0); 18818ab949d8SShri Abhyankar } 18828ab949d8SShri Abhyankar

/*
   MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once

   Computes zz = A * xx for block size 15.  Each 15x15 block is read as
   v[r + 15 * c] (row r, column c), i.e. the 225 entries of a block are
   traversed column by column.  The 15 entries of the matching x sub-vector
   are loaded once per block and all 15 row sums are updated in one pass.

   When compressed-row storage is in use, zarray is zeroed first and only the
   block rows listed in compressedrow.rindex are written.
*/
PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
  const PetscScalar *x, *xb;
  PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
  const MatScalar   *v;
  const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
  PetscInt           mbs, i, j, n;
  PetscBool          usecprow = a->compressedrow.use;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  v = a->a;
  if (usecprow) {
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
    /* rows absent from the compressed list must still end up as zero */
    PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
  } else {
    mbs = a->mbs;
    ii  = a->i;
    z   = zarray;
  }

  for (i = 0; i < mbs; i++) {
    n     = ii[i + 1] - ii[i]; /* number of blocks in this block row */
    idx   = ij + ii[i];
    sum1  = 0.0;
    sum2  = 0.0;
    sum3  = 0.0;
    sum4  = 0.0;
    sum5  = 0.0;
    sum6  = 0.0;
    sum7  = 0.0;
    sum8  = 0.0;
    sum9  = 0.0;
    sum10 = 0.0;
    sum11 = 0.0;
    sum12 = 0.0;
    sum13 = 0.0;
    sum14 = 0.0;
    sum15 = 0.0;

    for (j = 0; j < n; j++) {
      xb  = x + 15 * (idx[j]);
      x1  = xb[0];
      x2  = xb[1];
      x3  = xb[2];
      x4  = xb[3];
      x5  = xb[4];
      x6  = xb[5];
      x7  = xb[6];
      x8  = xb[7];
      x9  = xb[8];
      x10 = xb[9];
      x11 = xb[10];
      x12 = xb[11];
      x13 = xb[12];
      x14 = xb[13];
      x15 = xb[14];

      sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
      sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
      sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
      sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
      sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
      sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
      sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
      sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
      sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
      sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
      sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
      sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
      sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
      sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
      sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
      v += 225;
    }
    if (usecprow) z = zarray + 15 * ridx[i];
    z[0]  = sum1;
    z[1]  = sum2;
    z[2]  = sum3;
    z[3]  = sum4;
    z[4]  = sum5;
    z[5]  = sum6;
    z[6]  = sum7;
    z[7]  = sum8;
    z[8]  = sum9;
    z[9]  = sum10;
    z[10] = sum11;
    z[11] = sum12;
    z[12] = sum13;
    z[13] = sum14;
    z[14] = sum15;

    if (!usecprow) z += 15;
  }

  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  /* 2 * 225 flops per block, minus one add per output entry of each nonzero row */
  PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
  PetscFunctionReturn(0);
}

/*
    This will not work with MatScalar == float because it calls the BLAS

    Computes zz = A * xx for an arbitrary block size bs.  For every block row
    the x sub-vectors addressed by that row's column indices are gathered
    contiguously into a->mult_work, then the whole block row (bs rows by
    n * bs columns) is applied to the gathered vector in a single kernel call.

    a->mult_work is allocated lazily on first use and kept on the matrix.
*/
PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *z = NULL, *work, *workt, *zarray;
  const PetscScalar *x, *xb;
  const MatScalar   *v;
  PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
  const PetscInt    *idx, *ii, *ridx = NULL;
  PetscInt           ncols, k;
  PetscBool          usecprow = a->compressedrow.use;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
    /* rows absent from the compressed list must still end up as zero */
    PetscCall(PetscArrayzero(zarray, bs * a->mbs));
  } else {
    mbs = a->mbs;
    ii  = a->i;
    z   = zarray;
  }

  if (!a->mult_work) {
    k = PetscMax(A->rmap->n, A->cmap->n);
    PetscCall(PetscMalloc1(k + 1, &a->mult_work));
  }
  work = a->mult_work;
  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0]; /* number of blocks in this block row */
    ii++;
    ncols = n * bs;
    workt = work;
    /* gather the x entries this block row touches into contiguous storage */
    for (j = 0; j < n; j++) {
      xb = x + bs * (*idx++);
      for (k = 0; k < bs; k++) workt[k] = xb[k];
      workt += bs;
    }
    if (usecprow) z = zarray + bs * ridx[i];
    /* NOTE(review): presumably z = (bs x ncols block row at v) * work -- confirm against blockinvert.h */
    PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
    v += n * bs2;
    if (!usecprow) z += bs;
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
  PetscFunctionReturn(0);
}

/*
    Computes zz = yy + A * xx for block size 1.

    With compressed-row storage, z is first filled with a copy of y (unless
    zz and yy are the same vector) so that rows not present in the compressed
    list keep the value of y; the listed rows are then overwritten.
*/
PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y, *z, sum;
  const MatScalar   *v;
  PetscInt           mbs = a->mbs, i, n;
  const PetscInt    *idx, *ii, *ridx = NULL;
  PetscBool          usecprow = a->compressedrow.use;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, zz, &y, &z));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    ii = a->i;
  }

  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;
    if (!usecprow) {
      sum = y[i];
    } else {
      sum = y[ridx[i]];
    }
    PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
    /* NOTE(review): presumably accumulates sum += v[k] * x[idx[k]] for k < n -- confirm against the macro */
    PetscSparseDensePlusDot(sum, x, v, idx, n);
    v += n;
    idx += n;
    if (usecprow) {
      z[ridx[i]] = sum;
    } else {
      z[i] = sum;
    }
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscFunctionReturn(0);
}

/*
    Computes zz = yy + A * xx for block size 2.

    Each 2x2 block is read as v[r + 2 * c] (row r, column c).  With
    compressed-row storage, zarray is first filled with a copy of yarray
    (unless zz and yy are the same vector) and only the block rows listed in
    compressedrow.rindex are recomputed.
*/
PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
  const PetscScalar *x, *xb;
  PetscScalar        x1, x2, *yarray, *zarray;
  const MatScalar   *v;
  PetscInt           mbs = a->mbs, i, n, j;
  const PetscInt    *idx, *ii, *ridx = NULL;
  PetscBool          usecprow = a->compressedrow.use;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    ii = a->i;
    y  = yarray;
    z  = zarray;
  }

  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;
    if (usecprow) {
      z = zarray + 2 * ridx[i];
      y = yarray + 2 * ridx[i];
    }
    sum1 = y[0];
    sum2 = y[1];
    PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    for (j = 0; j < n; j++) {
      xb = x + 2 * (*idx++);
      x1 = xb[0];
      x2 = xb[1];

      sum1 += v[0] * x1 + v[2] * x2;
      sum2 += v[1] * x1 + v[3] * x2;
      v += 4;
    }
    z[0] = sum1;
    z[1] = sum2;
    if (!usecprow) {
      z += 2;
      y += 2;
    }
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
  /* 4 multiplies + 4 adds per 2x2 block, i.e. 2 * bs^2 like the other block sizes */
  PetscCall(PetscLogFlops(8.0 * a->nz));
  PetscFunctionReturn(0);
}

/*
    Computes zz = yy + A * xx for block size 3.

    Each 3x3 block is read as v[r + 3 * c] (row r, column c).  Compressed-row
    handling is the same as in the other fixed-size MatMultAdd variants: copy
    y into z up front, then recompute only the listed block rows.
*/
PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
  const PetscScalar *x, *xb;
  const MatScalar   *v;
  PetscInt           mbs = a->mbs, i, j, n;
  const PetscInt    *idx, *ii, *ridx = NULL;
  PetscBool          usecprow = a->compressedrow.use;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    ii = a->i;
    y  = yarray;
    z  = zarray;
  }

  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;
    if (usecprow) {
      z = zarray + 3 * ridx[i];
      y = yarray + 3 * ridx[i];
    }
    sum1 = y[0];
    sum2 = y[1];
    sum3 = y[2];
    PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    for (j = 0; j < n; j++) {
      xb = x + 3 * (*idx++);
      x1 = xb[0];
      x2 = xb[1];
      x3 = xb[2];
      sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
      sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
      sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
      v += 9;
    }
    z[0] = sum1;
    z[1] = sum2;
    z[2] = sum3;
    if (!usecprow) {
      z += 3;
      y += 3;
    }
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
  PetscCall(PetscLogFlops(18.0 * a->nz));
  PetscFunctionReturn(0);
}

/*
    Computes zz = yy + A * xx for block size 4.

    Each 4x4 block is read as v[r + 4 * c] (row r, column c).
*/
PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
  const PetscScalar *x, *xb;
  const MatScalar   *v;
  PetscInt           mbs = a->mbs, i, j, n;
  const PetscInt    *idx, *ii, *ridx = NULL;
  PetscBool          usecprow = a->compressedrow.use;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    ii = a->i;
    y  = yarray;
    z  = zarray;
  }

  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;
    if (usecprow) {
      z = zarray + 4 * ridx[i];
      y = yarray + 4 * ridx[i];
    }
    sum1 = y[0];
    sum2 = y[1];
    sum3 = y[2];
    sum4 = y[3];
    PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    for (j = 0; j < n; j++) {
      xb = x + 4 * (*idx++);
      x1 = xb[0];
      x2 = xb[1];
      x3 = xb[2];
      x4 = xb[3];
      sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
      v += 16;
    }
    z[0] = sum1;
    z[1] = sum2;
    z[2] = sum3;
    z[3] = sum4;
    if (!usecprow) {
      z += 4;
      y += 4;
    }
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
  PetscCall(PetscLogFlops(32.0 * a->nz));
  PetscFunctionReturn(0);
}

/*
    Computes zz = yy + A * xx for block size 5.

    Each 5x5 block is read as v[r + 5 * c] (row r, column c).
*/
PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
  const PetscScalar *x, *xb;
  PetscScalar       *yarray, *zarray;
  const MatScalar   *v;
  PetscInt           mbs = a->mbs, i, j, n;
  const PetscInt    *idx, *ii, *ridx = NULL;
  PetscBool          usecprow = a->compressedrow.use;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    ii = a->i;
    y  = yarray;
    z  = zarray;
  }

  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;
    if (usecprow) {
      z = zarray + 5 * ridx[i];
      y = yarray + 5 * ridx[i];
    }
    sum1 = y[0];
    sum2 = y[1];
    sum3 = y[2];
    sum4 = y[3];
    sum5 = y[4];
    PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    for (j = 0; j < n; j++) {
      xb = x + 5 * (*idx++);
      x1 = xb[0];
      x2 = xb[1];
      x3 = xb[2];
      x4 = xb[3];
      x5 = xb[4];
      sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
      sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
      sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
      sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
      sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
      v += 25;
    }
    z[0] = sum1;
    z[1] = sum2;
    z[2] = sum3;
    z[3] = sum4;
    z[4] = sum5;
    if (!usecprow) {
      z += 5;
      y += 5;
    }
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
  PetscCall(PetscLogFlops(50.0 * a->nz));
  PetscFunctionReturn(0);
}

/*
    Computes zz = yy + A * xx for block size 6.

    Each 6x6 block is read as v[r + 6 * c] (row r, column c).
*/
PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
  const PetscScalar *x, *xb;
  PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
  const MatScalar   *v;
  PetscInt           mbs = a->mbs, i, j, n;
  const PetscInt    *idx, *ii, *ridx = NULL;
  PetscBool          usecprow = a->compressedrow.use;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

  idx = a->j;
  v   = a->a;
  if (usecprow) {
    if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    ii = a->i;
    y  = yarray;
    z  = zarray;
  }

  for (i = 0; i < mbs; i++) {
    n = ii[1] - ii[0];
    ii++;
    if (usecprow) {
      z = zarray + 6 * ridx[i];
      y = yarray + 6 * ridx[i];
    }
    sum1 = y[0];
    sum2 = y[1];
    sum3 = y[2];
    sum4 = y[3];
    sum5 = y[4];
    sum6 = y[5];
    PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    for (j = 0; j < n; j++) {
      xb = x + 6 * (*idx++);
      x1 = xb[0];
      x2 = xb[1];
      x3 = xb[2];
      x4 = xb[3];
      x5 = xb[4];
      x6 = xb[5];
      sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
      sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
      sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
      sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
      sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
      sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
      v += 36;
    }
    z[0] = sum1;
    z[1] = sum2;
    z[2] = sum3;
    z[3] = sum4;
    z[4] = sum5;
    z[5] = sum6;
    if (!usecprow) {
      z += 6;
      y += 6;
    }
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
  PetscCall(PetscLogFlops(72.0 * a->nz));
  PetscFunctionReturn(0);
}

24282d61bbb3SSatish Balay 24299371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) { 24302d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2431f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2432d9ca1df4SBarry Smith const PetscScalar *x, *xb; 243326e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2434d9ca1df4SBarry Smith const MatScalar *v; 2435d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2436d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2437ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 24382d61bbb3SSatish Balay 24392d61bbb3SSatish Balay PetscFunctionBegin; 24409566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 24419566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 24422d61bbb3SSatish Balay 24432d61bbb3SSatish Balay idx = a->j; 24442d61bbb3SSatish Balay v = a->a; 244526e093fcSHong Zhang if (usecprow) { 244648a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 244726e093fcSHong Zhang mbs = a->compressedrow.nrows; 244826e093fcSHong Zhang ii = a->compressedrow.i; 24497b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 245026e093fcSHong Zhang } else { 24512d61bbb3SSatish Balay ii = a->i; 245226e093fcSHong Zhang y = yarray; 245326e093fcSHong Zhang z = zarray; 245426e093fcSHong Zhang } 24552d61bbb3SSatish Balay 24562d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 24579371c9d4SSatish Balay n = ii[1] - ii[0]; 24589371c9d4SSatish
Balay ii++; 245926e093fcSHong Zhang if (usecprow) { 24607b2bb3b9SHong Zhang z = zarray + 7 * ridx[i]; 24617b2bb3b9SHong Zhang y = yarray + 7 * ridx[i]; 246226e093fcSHong Zhang } 24639371c9d4SSatish Balay sum1 = y[0]; 24649371c9d4SSatish Balay sum2 = y[1]; 24659371c9d4SSatish Balay sum3 = y[2]; 24669371c9d4SSatish Balay sum4 = y[3]; 24679371c9d4SSatish Balay sum5 = y[4]; 24689371c9d4SSatish Balay sum6 = y[5]; 24699371c9d4SSatish Balay sum7 = y[6]; 2470444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2471444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 24722d61bbb3SSatish Balay for (j = 0; j < n; j++) { 24732d61bbb3SSatish Balay xb = x + 7 * (*idx++); 24749371c9d4SSatish Balay x1 = xb[0]; 24759371c9d4SSatish Balay x2 = xb[1]; 24769371c9d4SSatish Balay x3 = xb[2]; 24779371c9d4SSatish Balay x4 = xb[3]; 24789371c9d4SSatish Balay x5 = xb[4]; 24799371c9d4SSatish Balay x6 = xb[5]; 24809371c9d4SSatish Balay x7 = xb[6]; 24812d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 24822d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 24832d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 24842d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 24852d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 24862d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 24872d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 24882d61bbb3SSatish Balay v += 49; 24892d61bbb3SSatish 
Balay } 24909371c9d4SSatish Balay z[0] = sum1; 24919371c9d4SSatish Balay z[1] = sum2; 24929371c9d4SSatish Balay z[2] = sum3; 24939371c9d4SSatish Balay z[3] = sum4; 24949371c9d4SSatish Balay z[4] = sum5; 24959371c9d4SSatish Balay z[5] = sum6; 24969371c9d4SSatish Balay z[6] = sum7; 249726e093fcSHong Zhang if (!usecprow) { 24989371c9d4SSatish Balay z += 7; 24999371c9d4SSatish Balay y += 7; 25002d61bbb3SSatish Balay } 250126e093fcSHong Zhang } 25029566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 25039566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 25049566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz)); 25052d61bbb3SSatish Balay PetscFunctionReturn(0); 25062d61bbb3SSatish Balay } 2507218c64b6SSatish Balay 25085f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 25099371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) { 251096e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2511f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 251296e086a2SDaniel Kokron const PetscScalar *x, *xb; 251396e086a2SDaniel Kokron const MatScalar *v; 25146679dcc1SBarry Smith PetscInt mbs, i, j, n; 2515ce68d72fSJed Brown PetscInt k; 251696e086a2SDaniel Kokron PetscBool usecprow = a->compressedrow.use; 25176679dcc1SBarry Smith const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 251896e086a2SDaniel Kokron 251996e086a2SDaniel Kokron __m256d a0, a1, a2, a3, a4, a5; 2520ce68d72fSJed Brown __m256d w0, w1, w2, w3; 252196e086a2SDaniel Kokron __m256d z0, z1, z2; 252296e086a2SDaniel Kokron __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 252396e086a2SDaniel Kokron 252496e086a2SDaniel Kokron PetscFunctionBegin; 25259566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 25269566063dSJacob 
Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 25279566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &zarray)); 252896e086a2SDaniel Kokron 252996e086a2SDaniel Kokron idx = a->j; 253096e086a2SDaniel Kokron v = a->a; 253196e086a2SDaniel Kokron if (usecprow) { 253296e086a2SDaniel Kokron mbs = a->compressedrow.nrows; 253396e086a2SDaniel Kokron ii = a->compressedrow.i; 253496e086a2SDaniel Kokron ridx = a->compressedrow.rindex; 253596e086a2SDaniel Kokron } else { 253696e086a2SDaniel Kokron mbs = a->mbs; 253796e086a2SDaniel Kokron ii = a->i; 253896e086a2SDaniel Kokron z = zarray; 253996e086a2SDaniel Kokron } 254096e086a2SDaniel Kokron 254196e086a2SDaniel Kokron if (!a->mult_work) { 254296e086a2SDaniel Kokron k = PetscMax(A->rmap->n, A->cmap->n); 25439566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 254496e086a2SDaniel Kokron } 254596e086a2SDaniel Kokron 254696e086a2SDaniel Kokron work = a->mult_work; 254796e086a2SDaniel Kokron for (i = 0; i < mbs; i++) { 25489371c9d4SSatish Balay n = ii[1] - ii[0]; 25499371c9d4SSatish Balay ii++; 255096e086a2SDaniel Kokron workt = work; 255196e086a2SDaniel Kokron for (j = 0; j < n; j++) { 255296e086a2SDaniel Kokron xb = x + bs * (*idx++); 255396e086a2SDaniel Kokron for (k = 0; k < bs; k++) workt[k] = xb[k]; 255496e086a2SDaniel Kokron workt += bs; 255596e086a2SDaniel Kokron } 255696e086a2SDaniel Kokron if (usecprow) z = zarray + bs * ridx[i]; 255796e086a2SDaniel Kokron 25589371c9d4SSatish Balay z0 = _mm256_loadu_pd(&z[0]); 25599371c9d4SSatish Balay z1 = _mm256_loadu_pd(&z[4]); 25609371c9d4SSatish Balay z2 = _mm256_set1_pd(z[8]); 256196e086a2SDaniel Kokron 256296e086a2SDaniel Kokron for (j = 0; j < n; j++) { 2563c05b70c4SSatish Balay /* first column of a */ 256496e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9]); 25659371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81]); 25669371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 25679371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 
25689371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 25699371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 25709371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 257196e086a2SDaniel Kokron 2572c05b70c4SSatish Balay /* second column of a */ 257396e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 1]); 25749371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 25759371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 25769371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 25779371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 25789371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 25799371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 258096e086a2SDaniel Kokron 2581c05b70c4SSatish Balay /* third column of a */ 258296e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 2]); 25839371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 25849371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w2, z0); 25859371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 25869371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w2, z1); 25879371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 25889371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w2, z2); 258996e086a2SDaniel Kokron 2590c05b70c4SSatish Balay /* fourth column of a */ 259196e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 3]); 25929371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 25939371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w3, z0); 25949371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 25959371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w3, z1); 25969371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 25979371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w3, z2); 259896e086a2SDaniel Kokron 2599c05b70c4SSatish Balay /* fifth column of a */ 260096e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 4]); 26019371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 26029371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w0, 
z0); 26039371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 26049371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w0, z1); 26059371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 26069371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w0, z2); 260796e086a2SDaniel Kokron 2608c05b70c4SSatish Balay /* sixth column of a */ 260996e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 5]); 26109371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 26119371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 26129371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 26139371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 26149371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 26159371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 261696e086a2SDaniel Kokron 2617c05b70c4SSatish Balay /* seventh column of a */ 261896e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 6]); 26199371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 26209371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 26219371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 26229371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 26239371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 26249371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 262596e086a2SDaniel Kokron 26266aad120cSJose E. 
Roman /* eighth column of a */ 262796e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 7]); 26289371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 26299371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 26309371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 26319371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 26329371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 26339371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 263496e086a2SDaniel Kokron 2635c05b70c4SSatish Balay /* ninth column of a */ 263696e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 8]); 26379371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 26389371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 26399371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 26409371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 26419371c9d4SSatish Balay a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 26429371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 264396e086a2SDaniel Kokron } 264496e086a2SDaniel Kokron 26459371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 26469371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 26479371c9d4SSatish Balay _mm256_maskstore_pd(&z[8], mask1, z2); 264896e086a2SDaniel Kokron 264996e086a2SDaniel Kokron v += n * bs2; 265096e086a2SDaniel Kokron if (!usecprow) z += bs; 265196e086a2SDaniel Kokron } 26529566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 26539566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &zarray)); 26549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(162.0 * a->nz)); 265596e086a2SDaniel Kokron PetscFunctionReturn(0); 265696e086a2SDaniel Kokron } 265796e086a2SDaniel Kokron #endif 265896e086a2SDaniel Kokron 26599371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) { 2660ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2661f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, 
sum6, sum7, sum8, sum9, sum10, sum11; 2662ebada01fSBarry Smith const PetscScalar *x, *xb; 2663ebada01fSBarry Smith PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2664ebada01fSBarry Smith const MatScalar *v; 2665ebada01fSBarry Smith PetscInt mbs = a->mbs, i, j, n; 2666ebada01fSBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2667ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 2668ebada01fSBarry Smith 2669ebada01fSBarry Smith PetscFunctionBegin; 26709566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 26719566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2672ebada01fSBarry Smith 2673ebada01fSBarry Smith idx = a->j; 2674ebada01fSBarry Smith v = a->a; 2675ebada01fSBarry Smith if (usecprow) { 267648a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2677ebada01fSBarry Smith mbs = a->compressedrow.nrows; 2678ebada01fSBarry Smith ii = a->compressedrow.i; 2679ebada01fSBarry Smith ridx = a->compressedrow.rindex; 2680ebada01fSBarry Smith } else { 2681ebada01fSBarry Smith ii = a->i; 2682ebada01fSBarry Smith y = yarray; 2683ebada01fSBarry Smith z = zarray; 2684ebada01fSBarry Smith } 2685ebada01fSBarry Smith 2686ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 26879371c9d4SSatish Balay n = ii[1] - ii[0]; 26889371c9d4SSatish Balay ii++; 2689ebada01fSBarry Smith if (usecprow) { 2690ebada01fSBarry Smith z = zarray + 11 * ridx[i]; 2691ebada01fSBarry Smith y = yarray + 11 * ridx[i]; 2692ebada01fSBarry Smith } 26939371c9d4SSatish Balay sum1 = y[0]; 26949371c9d4SSatish Balay sum2 = y[1]; 26959371c9d4SSatish Balay sum3 = y[2]; 26969371c9d4SSatish Balay sum4 = y[3]; 26979371c9d4SSatish Balay sum5 = y[4]; 26989371c9d4SSatish Balay sum6 = y[5]; 26999371c9d4SSatish Balay sum7 = y[6]; 27009371c9d4SSatish Balay sum8 = y[7]; 27019371c9d4SSatish Balay sum9 = y[8]; 27029371c9d4SSatish Balay sum10 = y[9]; 27039371c9d4SSatish Balay sum11 = y[10]; 2704ebada01fSBarry 
Smith PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2705ebada01fSBarry Smith PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2706ebada01fSBarry Smith for (j = 0; j < n; j++) { 2707ebada01fSBarry Smith xb = x + 11 * (*idx++); 27089371c9d4SSatish Balay x1 = xb[0]; 27099371c9d4SSatish Balay x2 = xb[1]; 27109371c9d4SSatish Balay x3 = xb[2]; 27119371c9d4SSatish Balay x4 = xb[3]; 27129371c9d4SSatish Balay x5 = xb[4]; 27139371c9d4SSatish Balay x6 = xb[5]; 27149371c9d4SSatish Balay x7 = xb[6]; 27159371c9d4SSatish Balay x8 = xb[7]; 27169371c9d4SSatish Balay x9 = xb[8]; 27179371c9d4SSatish Balay x10 = xb[9]; 27189371c9d4SSatish Balay x11 = xb[10]; 2719ebada01fSBarry Smith sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2720ebada01fSBarry Smith sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2721ebada01fSBarry Smith sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2722ebada01fSBarry Smith sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2723ebada01fSBarry Smith sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 
+ 10 * 11] * x11; 2724ebada01fSBarry Smith sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2725ebada01fSBarry Smith sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2726ebada01fSBarry Smith sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2727ebada01fSBarry Smith sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2728ebada01fSBarry Smith sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2729ebada01fSBarry Smith sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2730ebada01fSBarry Smith v += 121; 2731ebada01fSBarry Smith } 27329371c9d4SSatish Balay z[0] = sum1; 27339371c9d4SSatish Balay z[1] = sum2; 27349371c9d4SSatish Balay z[2] = sum3; 27359371c9d4SSatish Balay z[3] = sum4; 27369371c9d4SSatish Balay z[4] = sum5; 27379371c9d4SSatish Balay z[5] = sum6; 27389371c9d4SSatish Balay z[6] = sum7; 27399371c9d4SSatish Balay z[7] = sum8; 27409371c9d4SSatish Balay z[8] = sum9; 
27419371c9d4SSatish Balay z[9] = sum10; 27429371c9d4SSatish Balay z[10] = sum11; 2743ebada01fSBarry Smith if (!usecprow) { 27449371c9d4SSatish Balay z += 11; 27459371c9d4SSatish Balay y += 11; 2746ebada01fSBarry Smith } 2747ebada01fSBarry Smith } 27489566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 27499566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 27509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz)); 2751ebada01fSBarry Smith PetscFunctionReturn(0); 2752ebada01fSBarry Smith } 2753ebada01fSBarry Smith 27549371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) { 27552d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2756f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 2757d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2758d9ca1df4SBarry Smith const MatScalar *v; 2759d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2760d9ca1df4SBarry Smith PetscInt ncols, k; 2761d9ca1df4SBarry Smith const PetscInt *ridx = NULL, *idx, *ii; 2762ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2763218c64b6SSatish Balay 27642d61bbb3SSatish Balay PetscFunctionBegin; 27659566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 27669566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 27679566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &zarray)); 27682d61bbb3SSatish Balay 27692d61bbb3SSatish Balay idx = a->j; 27702d61bbb3SSatish Balay v = a->a; 277126e093fcSHong Zhang if (usecprow) { 277226e093fcSHong Zhang mbs = a->compressedrow.nrows; 277326e093fcSHong Zhang ii = a->compressedrow.i; 27747b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 277526e093fcSHong Zhang } else { 277626e093fcSHong Zhang mbs = a->mbs; 27772d61bbb3SSatish Balay ii = a->i; 277826e093fcSHong Zhang z = zarray; 277926e093fcSHong Zhang } 27802d61bbb3SSatish Balay 27812d61bbb3SSatish Balay if (!a->mult_work) { 
2782d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 27839566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 27842d61bbb3SSatish Balay } 27852d61bbb3SSatish Balay work = a->mult_work; 27862d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 27879371c9d4SSatish Balay n = ii[1] - ii[0]; 27889371c9d4SSatish Balay ii++; 27892d61bbb3SSatish Balay ncols = n * bs; 27902d61bbb3SSatish Balay workt = work; 27912d61bbb3SSatish Balay for (j = 0; j < n; j++) { 27922d61bbb3SSatish Balay xb = x + bs * (*idx++); 27932d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 27942d61bbb3SSatish Balay workt += bs; 27952d61bbb3SSatish Balay } 27967b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * ridx[i]; 279796b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 27982d61bbb3SSatish Balay v += n * bs2; 279926fbe8dcSKarl Rupp if (!usecprow) z += bs; 280026e093fcSHong Zhang } 28019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 28029566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &zarray)); 28039566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 28042d61bbb3SSatish Balay PetscFunctionReturn(0); 28052d61bbb3SSatish Balay } 28062d61bbb3SSatish Balay 28079371c9d4SSatish Balay PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) { 2808547795f9SHong Zhang PetscScalar zero = 0.0; 2809547795f9SHong Zhang 2810547795f9SHong Zhang PetscFunctionBegin; 28119566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 28129566063dSJacob Faibussowitsch PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2813547795f9SHong Zhang PetscFunctionReturn(0); 2814547795f9SHong Zhang } 2815547795f9SHong Zhang 28169371c9d4SSatish Balay PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) { 28173447b6efSHong Zhang PetscScalar zero = 0.0; 28182d61bbb3SSatish Balay 28192d61bbb3SSatish Balay PetscFunctionBegin; 28209566063dSJacob Faibussowitsch 
PetscCall(VecSet(zz, zero)); 28219566063dSJacob Faibussowitsch PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 28222d61bbb3SSatish Balay PetscFunctionReturn(0); 28232d61bbb3SSatish Balay } 28242d61bbb3SSatish Balay 28259371c9d4SSatish Balay PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) { 2826547795f9SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2827b8c08b77SHong Zhang PetscScalar *z, x1, x2, x3, x4, x5; 2828d9ca1df4SBarry Smith const PetscScalar *x, *xb = NULL; 2829d9ca1df4SBarry Smith const MatScalar *v; 2830b8c08b77SHong Zhang PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2831d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 2832547795f9SHong Zhang Mat_CompressedRow cprow = a->compressedrow; 2833ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 2834547795f9SHong Zhang 2835547795f9SHong Zhang PetscFunctionBegin; 28369566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 28379566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 28389566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 2839547795f9SHong Zhang 2840547795f9SHong Zhang idx = a->j; 2841547795f9SHong Zhang v = a->a; 2842547795f9SHong Zhang if (usecprow) { 2843547795f9SHong Zhang mbs = cprow.nrows; 2844547795f9SHong Zhang ii = cprow.i; 2845547795f9SHong Zhang ridx = cprow.rindex; 2846547795f9SHong Zhang } else { 2847547795f9SHong Zhang mbs = a->mbs; 2848547795f9SHong Zhang ii = a->i; 2849547795f9SHong Zhang xb = x; 2850547795f9SHong Zhang } 2851547795f9SHong Zhang 2852547795f9SHong Zhang switch (bs) { 2853547795f9SHong Zhang case 1: 2854547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2855547795f9SHong Zhang if (usecprow) xb = x + ridx[i]; 2856547795f9SHong Zhang x1 = xb[0]; 2857547795f9SHong Zhang ib = idx + ii[0]; 28589371c9d4SSatish Balay n = ii[1] - ii[0]; 28599371c9d4SSatish Balay ii++; 2860547795f9SHong Zhang for (j = 0; j < n; j++) { 2861547795f9SHong Zhang rval = ib[j]; 2862547795f9SHong 
Zhang z[rval] += PetscConj(*v) * x1; 2863547795f9SHong Zhang v++; 2864547795f9SHong Zhang } 2865547795f9SHong Zhang if (!usecprow) xb++; 2866547795f9SHong Zhang } 2867547795f9SHong Zhang break; 2868547795f9SHong Zhang case 2: 2869547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2870547795f9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 28719371c9d4SSatish Balay x1 = xb[0]; 28729371c9d4SSatish Balay x2 = xb[1]; 2873547795f9SHong Zhang ib = idx + ii[0]; 28749371c9d4SSatish Balay n = ii[1] - ii[0]; 28759371c9d4SSatish Balay ii++; 2876547795f9SHong Zhang for (j = 0; j < n; j++) { 2877547795f9SHong Zhang rval = ib[j] * 2; 2878547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2879547795f9SHong Zhang z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2880547795f9SHong Zhang v += 4; 2881547795f9SHong Zhang } 2882547795f9SHong Zhang if (!usecprow) xb += 2; 2883547795f9SHong Zhang } 2884547795f9SHong Zhang break; 2885547795f9SHong Zhang case 3: 2886547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2887547795f9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 28889371c9d4SSatish Balay x1 = xb[0]; 28899371c9d4SSatish Balay x2 = xb[1]; 28909371c9d4SSatish Balay x3 = xb[2]; 2891547795f9SHong Zhang ib = idx + ii[0]; 28929371c9d4SSatish Balay n = ii[1] - ii[0]; 28939371c9d4SSatish Balay ii++; 2894547795f9SHong Zhang for (j = 0; j < n; j++) { 2895547795f9SHong Zhang rval = ib[j] * 3; 2896547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2897547795f9SHong Zhang z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2898547795f9SHong Zhang z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2899547795f9SHong Zhang v += 9; 2900547795f9SHong Zhang } 2901547795f9SHong Zhang if (!usecprow) xb += 3; 2902547795f9SHong Zhang } 2903547795f9SHong Zhang break; 2904547795f9SHong Zhang case 4: 2905547795f9SHong Zhang for (i = 0; i < mbs; i++) { 
2906547795f9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 29079371c9d4SSatish Balay x1 = xb[0]; 29089371c9d4SSatish Balay x2 = xb[1]; 29099371c9d4SSatish Balay x3 = xb[2]; 29109371c9d4SSatish Balay x4 = xb[3]; 2911547795f9SHong Zhang ib = idx + ii[0]; 29129371c9d4SSatish Balay n = ii[1] - ii[0]; 29139371c9d4SSatish Balay ii++; 2914547795f9SHong Zhang for (j = 0; j < n; j++) { 2915547795f9SHong Zhang rval = ib[j] * 4; 2916547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2917547795f9SHong Zhang z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2918547795f9SHong Zhang z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2919547795f9SHong Zhang z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2920547795f9SHong Zhang v += 16; 2921547795f9SHong Zhang } 2922547795f9SHong Zhang if (!usecprow) xb += 4; 2923547795f9SHong Zhang } 2924547795f9SHong Zhang break; 2925547795f9SHong Zhang case 5: 2926547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2927547795f9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 29289371c9d4SSatish Balay x1 = xb[0]; 29299371c9d4SSatish Balay x2 = xb[1]; 29309371c9d4SSatish Balay x3 = xb[2]; 29319371c9d4SSatish Balay x4 = xb[3]; 29329371c9d4SSatish Balay x5 = xb[4]; 2933547795f9SHong Zhang ib = idx + ii[0]; 29349371c9d4SSatish Balay n = ii[1] - ii[0]; 29359371c9d4SSatish Balay ii++; 2936547795f9SHong Zhang for (j = 0; j < n; j++) { 2937547795f9SHong Zhang rval = ib[j] * 5; 2938547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2939547795f9SHong Zhang z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2940547795f9SHong Zhang z[rval++] += 
PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2941547795f9SHong Zhang z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2942547795f9SHong Zhang z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2943547795f9SHong Zhang v += 25; 2944547795f9SHong Zhang } 2945547795f9SHong Zhang if (!usecprow) xb += 5; 2946547795f9SHong Zhang } 2947547795f9SHong Zhang break; 29489371c9d4SSatish Balay default: /* block sizes larger than 5 by 5 are handled by BLAS */ SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2949968ae2c8SSatish Balay #if 0 2950968ae2c8SSatish Balay { 2951b8c08b77SHong Zhang PetscInt ncols,k,bs2=a->bs2; 2952b8c08b77SHong Zhang PetscScalar *work,*workt,zb; 2953d9ca1df4SBarry Smith const PetscScalar *xtmp; 2954547795f9SHong Zhang if (!a->mult_work) { 2955547795f9SHong Zhang k = PetscMax(A->rmap->n,A->cmap->n); 29569566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2957547795f9SHong Zhang } 2958547795f9SHong Zhang work = a->mult_work; 2959547795f9SHong Zhang xtmp = x; 2960547795f9SHong Zhang for (i=0; i<mbs; i++) { 2961547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 2962547795f9SHong Zhang ncols = n*bs; 29639566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work,ncols)); 296426fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs*ridx[i]; 296596b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2966547795f9SHong Zhang v += n*bs2; 2967547795f9SHong Zhang if (!usecprow) xtmp += bs; 2968547795f9SHong Zhang workt = work; 2969547795f9SHong Zhang for (j=0; j<n; j++) { 2970547795f9SHong Zhang zb = z + bs*(*idx++); 2971547795f9SHong Zhang for (k=0; k<bs; k++) zb[k] += workt[k] ; 2972547795f9SHong Zhang workt += bs; 2973547795f9SHong Zhang } 
2974547795f9SHong Zhang } 2975547795f9SHong Zhang } 2976968ae2c8SSatish Balay #endif 2977547795f9SHong Zhang } 29789566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 29799566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 29809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 2981547795f9SHong Zhang PetscFunctionReturn(0); 2982547795f9SHong Zhang } 2983547795f9SHong Zhang 29849371c9d4SSatish Balay PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) { 29852d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2986d9ca1df4SBarry Smith PetscScalar *zb, *z, x1, x2, x3, x4, x5; 2987f4259b30SLisandro Dalcin const PetscScalar *x, *xb = NULL; 2988d9ca1df4SBarry Smith const MatScalar *v; 2989d9ca1df4SBarry Smith PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2990d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 29913447b6efSHong Zhang Mat_CompressedRow cprow = a->compressedrow; 2992ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 29932d61bbb3SSatish Balay 29942d61bbb3SSatish Balay PetscFunctionBegin; 29959566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 29969566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 29979566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 29982d61bbb3SSatish Balay 29992d61bbb3SSatish Balay idx = a->j; 30002d61bbb3SSatish Balay v = a->a; 30013447b6efSHong Zhang if (usecprow) { 30023447b6efSHong Zhang mbs = cprow.nrows; 30033447b6efSHong Zhang ii = cprow.i; 30047b2bb3b9SHong Zhang ridx = cprow.rindex; 30053447b6efSHong Zhang } else { 30063447b6efSHong Zhang mbs = a->mbs; 30072d61bbb3SSatish Balay ii = a->i; 3008f1af5d2fSBarry Smith xb = x; 30093447b6efSHong Zhang } 30102d61bbb3SSatish Balay 30112d61bbb3SSatish Balay switch (bs) { 30122d61bbb3SSatish Balay case 1: 30132d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30147b2bb3b9SHong Zhang if (usecprow) xb = x + ridx[i]; 
3015f1af5d2fSBarry Smith x1 = xb[0]; 30163447b6efSHong Zhang ib = idx + ii[0]; 30179371c9d4SSatish Balay n = ii[1] - ii[0]; 30189371c9d4SSatish Balay ii++; 30192d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30202d61bbb3SSatish Balay rval = ib[j]; 3021f1af5d2fSBarry Smith z[rval] += *v * x1; 3022f1af5d2fSBarry Smith v++; 30232d61bbb3SSatish Balay } 30243447b6efSHong Zhang if (!usecprow) xb++; 30252d61bbb3SSatish Balay } 30262d61bbb3SSatish Balay break; 30272d61bbb3SSatish Balay case 2: 30282d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30297b2bb3b9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 30309371c9d4SSatish Balay x1 = xb[0]; 30319371c9d4SSatish Balay x2 = xb[1]; 30323447b6efSHong Zhang ib = idx + ii[0]; 30339371c9d4SSatish Balay n = ii[1] - ii[0]; 30349371c9d4SSatish Balay ii++; 30352d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30362d61bbb3SSatish Balay rval = ib[j] * 2; 30372d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2; 30382d61bbb3SSatish Balay z[rval++] += v[2] * x1 + v[3] * x2; 30392d61bbb3SSatish Balay v += 4; 30402d61bbb3SSatish Balay } 30413447b6efSHong Zhang if (!usecprow) xb += 2; 30422d61bbb3SSatish Balay } 30432d61bbb3SSatish Balay break; 30442d61bbb3SSatish Balay case 3: 30452d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30467b2bb3b9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 30479371c9d4SSatish Balay x1 = xb[0]; 30489371c9d4SSatish Balay x2 = xb[1]; 30499371c9d4SSatish Balay x3 = xb[2]; 30503447b6efSHong Zhang ib = idx + ii[0]; 30519371c9d4SSatish Balay n = ii[1] - ii[0]; 30529371c9d4SSatish Balay ii++; 30532d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30542d61bbb3SSatish Balay rval = ib[j] * 3; 30552d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 30562d61bbb3SSatish Balay z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 30572d61bbb3SSatish Balay z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 30582d61bbb3SSatish Balay v += 9; 30592d61bbb3SSatish Balay } 30603447b6efSHong Zhang if (!usecprow) xb += 3; 
30612d61bbb3SSatish Balay } 30622d61bbb3SSatish Balay break; 30632d61bbb3SSatish Balay case 4: 30642d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30657b2bb3b9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 30669371c9d4SSatish Balay x1 = xb[0]; 30679371c9d4SSatish Balay x2 = xb[1]; 30689371c9d4SSatish Balay x3 = xb[2]; 30699371c9d4SSatish Balay x4 = xb[3]; 30703447b6efSHong Zhang ib = idx + ii[0]; 30719371c9d4SSatish Balay n = ii[1] - ii[0]; 30729371c9d4SSatish Balay ii++; 30732d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30742d61bbb3SSatish Balay rval = ib[j] * 4; 30752d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 30762d61bbb3SSatish Balay z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 30772d61bbb3SSatish Balay z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 30782d61bbb3SSatish Balay z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 30792d61bbb3SSatish Balay v += 16; 30802d61bbb3SSatish Balay } 30813447b6efSHong Zhang if (!usecprow) xb += 4; 30822d61bbb3SSatish Balay } 30832d61bbb3SSatish Balay break; 30842d61bbb3SSatish Balay case 5: 30852d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30867b2bb3b9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 30879371c9d4SSatish Balay x1 = xb[0]; 30889371c9d4SSatish Balay x2 = xb[1]; 30899371c9d4SSatish Balay x3 = xb[2]; 30909371c9d4SSatish Balay x4 = xb[3]; 30919371c9d4SSatish Balay x5 = xb[4]; 30923447b6efSHong Zhang ib = idx + ii[0]; 30939371c9d4SSatish Balay n = ii[1] - ii[0]; 30949371c9d4SSatish Balay ii++; 30952d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30962d61bbb3SSatish Balay rval = ib[j] * 5; 30972d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 30982d61bbb3SSatish Balay z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 30992d61bbb3SSatish Balay z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 31002d61bbb3SSatish Balay z[rval++] += v[15] * x1 
+ v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 31012d61bbb3SSatish Balay z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 31022d61bbb3SSatish Balay v += 25; 31032d61bbb3SSatish Balay } 31043447b6efSHong Zhang if (!usecprow) xb += 5; 31052d61bbb3SSatish Balay } 31062d61bbb3SSatish Balay break; 3107f1af5d2fSBarry Smith default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3108690b6cddSBarry Smith PetscInt ncols, k; 3109d9ca1df4SBarry Smith PetscScalar *work, *workt; 3110d9ca1df4SBarry Smith const PetscScalar *xtmp; 31112d61bbb3SSatish Balay if (!a->mult_work) { 3112d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 31139566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 31142d61bbb3SSatish Balay } 31152d61bbb3SSatish Balay work = a->mult_work; 31163447b6efSHong Zhang xtmp = x; 31172d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 31189371c9d4SSatish Balay n = ii[1] - ii[0]; 31199371c9d4SSatish Balay ii++; 31202d61bbb3SSatish Balay ncols = n * bs; 31219566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work, ncols)); 312226fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs * ridx[i]; 312396b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 31242d61bbb3SSatish Balay v += n * bs2; 31253447b6efSHong Zhang if (!usecprow) xtmp += bs; 31262d61bbb3SSatish Balay workt = work; 31272d61bbb3SSatish Balay for (j = 0; j < n; j++) { 31282d61bbb3SSatish Balay zb = z + bs * (*idx++); 31292d61bbb3SSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 31302d61bbb3SSatish Balay workt += bs; 31312d61bbb3SSatish Balay } 31322d61bbb3SSatish Balay } 31332d61bbb3SSatish Balay } 31342d61bbb3SSatish Balay } 31359566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 31369566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 31379566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 31382d61bbb3SSatish Balay 
PetscFunctionReturn(0); 31392d61bbb3SSatish Balay } 31402d61bbb3SSatish Balay 31419371c9d4SSatish Balay PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) { 31422d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3143690b6cddSBarry Smith PetscInt totalnz = a->bs2 * a->nz; 3144f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 3145c5df96a5SBarry Smith PetscBLASInt one = 1, tnz; 31462d61bbb3SSatish Balay 31472d61bbb3SSatish Balay PetscFunctionBegin; 31489566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3149792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 31509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 31512d61bbb3SSatish Balay PetscFunctionReturn(0); 31522d61bbb3SSatish Balay } 31532d61bbb3SSatish Balay 31549371c9d4SSatish Balay PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) { 31552d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 31563f1db9ecSBarry Smith MatScalar *v = a->a; 3157329f5518SBarry Smith PetscReal sum = 0.0; 3158d0f46423SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 31592d61bbb3SSatish Balay 31602d61bbb3SSatish Balay PetscFunctionBegin; 31612d61bbb3SSatish Balay if (type == NORM_FROBENIUS) { 3162570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16) 3163570b7f6dSBarry Smith PetscBLASInt one = 1, cnt = bs2 * nz; 3164792fecdfSBarry Smith PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3165570b7f6dSBarry Smith #else 31662d61bbb3SSatish Balay for (i = 0; i < bs2 * nz; i++) { 31679371c9d4SSatish Balay sum += PetscRealPart(PetscConj(*v) * (*v)); 31689371c9d4SSatish Balay v++; 31692d61bbb3SSatish Balay } 3170570b7f6dSBarry Smith #endif 31718f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum); 31729566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 31738a62d963SHong Zhang } else if (type == NORM_1) { /* maximum column sum */ 31748a62d963SHong Zhang PetscReal *tmp; 
31758a62d963SHong Zhang PetscInt *bcol = a->j; 31769566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 31778a62d963SHong Zhang for (i = 0; i < nz; i++) { 31788a62d963SHong Zhang for (j = 0; j < bs; j++) { 31798a62d963SHong Zhang k1 = bs * (*bcol) + j; /* column index */ 31808a62d963SHong Zhang for (k = 0; k < bs; k++) { 31819371c9d4SSatish Balay tmp[k1] += PetscAbsScalar(*v); 31829371c9d4SSatish Balay v++; 31838a62d963SHong Zhang } 31848a62d963SHong Zhang } 31858a62d963SHong Zhang bcol++; 31868a62d963SHong Zhang } 31878a62d963SHong Zhang *norm = 0.0; 3188d0f46423SBarry Smith for (j = 0; j < A->cmap->n; j++) { 31898a62d963SHong Zhang if (tmp[j] > *norm) *norm = tmp[j]; 31908a62d963SHong Zhang } 31919566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp)); 31929566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3193596552b5SBarry Smith } else if (type == NORM_INFINITY) { /* maximum row sum */ 3194596552b5SBarry Smith *norm = 0.0; 3195596552b5SBarry Smith for (k = 0; k < bs; k++) { 319674f84c7bSSatish Balay for (j = 0; j < a->mbs; j++) { 3197596552b5SBarry Smith v = a->a + bs2 * a->i[j] + k; 3198596552b5SBarry Smith sum = 0.0; 3199596552b5SBarry Smith for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 32000e90e235SBarry Smith for (k1 = 0; k1 < bs; k1++) { 3201596552b5SBarry Smith sum += PetscAbsScalar(*v); 3202596552b5SBarry Smith v += bs; 32032d61bbb3SSatish Balay } 32040e90e235SBarry Smith } 3205596552b5SBarry Smith if (sum > *norm) *norm = sum; 3206596552b5SBarry Smith } 3207596552b5SBarry Smith } 32089566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3209e7e72b3dSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 32102d61bbb3SSatish Balay PetscFunctionReturn(0); 32112d61bbb3SSatish Balay } 32122d61bbb3SSatish Balay 32139371c9d4SSatish Balay PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) { 32142d61bbb3SSatish Balay Mat_SeqBAIJ *a 
= (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 32152d61bbb3SSatish Balay 32162d61bbb3SSatish Balay PetscFunctionBegin; 32172d61bbb3SSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3218d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3219273d9f13SBarry Smith *flg = PETSC_FALSE; 3220273d9f13SBarry Smith PetscFunctionReturn(0); 32212d61bbb3SSatish Balay } 32222d61bbb3SSatish Balay 32232d61bbb3SSatish Balay /* if the a->i are the same */ 32249566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 322526fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 32262d61bbb3SSatish Balay 32272d61bbb3SSatish Balay /* if a->j are the same */ 32289566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 322926fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 323026fbe8dcSKarl Rupp 32312d61bbb3SSatish Balay /* if a->a are the same */ 32329566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 32332d61bbb3SSatish Balay PetscFunctionReturn(0); 32342d61bbb3SSatish Balay } 32352d61bbb3SSatish Balay 32369371c9d4SSatish Balay PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) { 32372d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3238690b6cddSBarry Smith PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 323987828ca2SBarry Smith PetscScalar *x, zero = 0.0; 32403f1db9ecSBarry Smith MatScalar *aa, *aa_j; 32412d61bbb3SSatish Balay 32422d61bbb3SSatish Balay PetscFunctionBegin; 324328b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3244d0f46423SBarry Smith bs = A->rmap->bs; 32452d61bbb3SSatish Balay aa = a->a; 32462d61bbb3SSatish Balay ai = a->i; 32472d61bbb3SSatish Balay aj = a->j; 32482d61bbb3SSatish Balay ambs = a->mbs; 32492d61bbb3SSatish Balay bs2 = 
a->bs2; 32502d61bbb3SSatish Balay 32519566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 32529566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 32539566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 325408401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 32552d61bbb3SSatish Balay for (i = 0; i < ambs; i++) { 32562d61bbb3SSatish Balay for (j = ai[i]; j < ai[i + 1]; j++) { 32572d61bbb3SSatish Balay if (aj[j] == i) { 32582d61bbb3SSatish Balay row = i * bs; 32592d61bbb3SSatish Balay aa_j = aa + j * bs2; 32602d61bbb3SSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 32612d61bbb3SSatish Balay break; 32622d61bbb3SSatish Balay } 32632d61bbb3SSatish Balay } 32642d61bbb3SSatish Balay } 32659566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 32662d61bbb3SSatish Balay PetscFunctionReturn(0); 32672d61bbb3SSatish Balay } 32682d61bbb3SSatish Balay 32699371c9d4SSatish Balay PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) { 32702d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 327153ef36baSBarry Smith const PetscScalar *l, *r, *li, *ri; 327253ef36baSBarry Smith PetscScalar x; 32733f1db9ecSBarry Smith MatScalar *aa, *v; 327453ef36baSBarry Smith PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 327553ef36baSBarry Smith const PetscInt *ai, *aj; 32762d61bbb3SSatish Balay 32772d61bbb3SSatish Balay PetscFunctionBegin; 32782d61bbb3SSatish Balay ai = a->i; 32792d61bbb3SSatish Balay aj = a->j; 32802d61bbb3SSatish Balay aa = a->a; 3281d0f46423SBarry Smith m = A->rmap->n; 3282d0f46423SBarry Smith n = A->cmap->n; 3283d0f46423SBarry Smith bs = A->rmap->bs; 32842d61bbb3SSatish Balay mbs = a->mbs; 32852d61bbb3SSatish Balay bs2 = a->bs2; 32862d61bbb3SSatish Balay if (ll) { 32879566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 32889566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 
328908401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 32902d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 32912d61bbb3SSatish Balay M = ai[i + 1] - ai[i]; 32922d61bbb3SSatish Balay li = l + i * bs; 32932d61bbb3SSatish Balay v = aa + bs2 * ai[i]; 32942d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 3295ad540459SPierre Jolivet for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 32962d61bbb3SSatish Balay } 32972d61bbb3SSatish Balay } 32989566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 32999566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33002d61bbb3SSatish Balay } 33012d61bbb3SSatish Balay 33022d61bbb3SSatish Balay if (rr) { 33039566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(rr, &r)); 33049566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(rr, &rn)); 330508401ef6SPierre Jolivet PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 33062d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 330753ef36baSBarry Smith iai = ai[i]; 330853ef36baSBarry Smith M = ai[i + 1] - iai; 330953ef36baSBarry Smith v = aa + bs2 * iai; 33102d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 331153ef36baSBarry Smith ri = r + bs * aj[iai + j]; 33122d61bbb3SSatish Balay for (k = 0; k < bs; k++) { 33132d61bbb3SSatish Balay x = ri[k]; 331453ef36baSBarry Smith for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 331553ef36baSBarry Smith v += bs; 33162d61bbb3SSatish Balay } 33172d61bbb3SSatish Balay } 33182d61bbb3SSatish Balay } 33199566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(rr, &r)); 33209566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33212d61bbb3SSatish Balay } 33222d61bbb3SSatish Balay PetscFunctionReturn(0); 33232d61bbb3SSatish Balay } 33242d61bbb3SSatish Balay 33259371c9d4SSatish Balay PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, 
MatInfoType flag, MatInfo *info) { 33262d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33272d61bbb3SSatish Balay 33282d61bbb3SSatish Balay PetscFunctionBegin; 33292d61bbb3SSatish Balay info->block_size = a->bs2; 3330ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; 33312d61bbb3SSatish Balay info->nz_used = a->bs2 * a->nz; 33323966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 33332d61bbb3SSatish Balay info->assemblies = A->num_ass; 33348e58a170SBarry Smith info->mallocs = A->info.mallocs; 33357adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 3336d5f3da31SBarry Smith if (A->factortype) { 33372d61bbb3SSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 33382d61bbb3SSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 33392d61bbb3SSatish Balay info->factor_mallocs = A->info.factor_mallocs; 33402d61bbb3SSatish Balay } else { 33412d61bbb3SSatish Balay info->fill_ratio_given = 0; 33422d61bbb3SSatish Balay info->fill_ratio_needed = 0; 33432d61bbb3SSatish Balay info->factor_mallocs = 0; 33442d61bbb3SSatish Balay } 33452d61bbb3SSatish Balay PetscFunctionReturn(0); 33462d61bbb3SSatish Balay } 33472d61bbb3SSatish Balay 33489371c9d4SSatish Balay PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) { 33492d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33502d61bbb3SSatish Balay 33512d61bbb3SSatish Balay PetscFunctionBegin; 33529566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 33532d61bbb3SSatish Balay PetscFunctionReturn(0); 33542d61bbb3SSatish Balay } 3355a001520aSPierre Jolivet 33569371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) { 3357a001520aSPierre Jolivet PetscFunctionBegin; 33589566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 33594222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3360a001520aSPierre Jolivet 
PetscFunctionReturn(0); 3361a001520aSPierre Jolivet } 3362a001520aSPierre Jolivet 33639371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 336474eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3365f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1; 3366bcf10a7aSPierre Jolivet const PetscScalar *xb; 336774eeabc5SPierre Jolivet PetscScalar x1; 336874eeabc5SPierre Jolivet const MatScalar *v, *vv; 336974eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 337074eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 337174eeabc5SPierre Jolivet 337274eeabc5SPierre Jolivet PetscFunctionBegin; 337374eeabc5SPierre Jolivet idx = a->j; 337474eeabc5SPierre Jolivet v = a->a; 337574eeabc5SPierre Jolivet if (usecprow) { 337674eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 337774eeabc5SPierre Jolivet ii = a->compressedrow.i; 337874eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 337974eeabc5SPierre Jolivet } else { 338074eeabc5SPierre Jolivet mbs = a->mbs; 338174eeabc5SPierre Jolivet ii = a->i; 338274eeabc5SPierre Jolivet z = c; 338374eeabc5SPierre Jolivet } 338474eeabc5SPierre Jolivet 338574eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 33869371c9d4SSatish Balay n = ii[1] - ii[0]; 33879371c9d4SSatish Balay ii++; 338874eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 338974eeabc5SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 339074eeabc5SPierre Jolivet if (usecprow) z = c + ridx[i]; 339174eeabc5SPierre Jolivet jj = idx; 339274eeabc5SPierre Jolivet vv = v; 339374eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 339474eeabc5SPierre Jolivet idx = jj; 339574eeabc5SPierre Jolivet v = vv; 339674eeabc5SPierre Jolivet sum1 = 0.0; 339774eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 
33989371c9d4SSatish Balay xb = b + (*idx++); 33999371c9d4SSatish Balay x1 = xb[0 + k * bm]; 340074eeabc5SPierre Jolivet sum1 += v[0] * x1; 340174eeabc5SPierre Jolivet v += 1; 340274eeabc5SPierre Jolivet } 3403feb237baSPierre Jolivet z[0 + k * cm] = sum1; 340474eeabc5SPierre Jolivet } 340574eeabc5SPierre Jolivet if (!usecprow) z += 1; 340674eeabc5SPierre Jolivet } 340774eeabc5SPierre Jolivet PetscFunctionReturn(0); 340874eeabc5SPierre Jolivet } 340974eeabc5SPierre Jolivet 34109371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 34114b7054f4SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3412f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2; 3413bcf10a7aSPierre Jolivet const PetscScalar *xb; 34144b7054f4SPierre Jolivet PetscScalar x1, x2; 34154b7054f4SPierre Jolivet const MatScalar *v, *vv; 34164b7054f4SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 34174b7054f4SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 34184b7054f4SPierre Jolivet 34194b7054f4SPierre Jolivet PetscFunctionBegin; 34204b7054f4SPierre Jolivet idx = a->j; 34214b7054f4SPierre Jolivet v = a->a; 34224b7054f4SPierre Jolivet if (usecprow) { 34234b7054f4SPierre Jolivet mbs = a->compressedrow.nrows; 34244b7054f4SPierre Jolivet ii = a->compressedrow.i; 34254b7054f4SPierre Jolivet ridx = a->compressedrow.rindex; 34264b7054f4SPierre Jolivet } else { 34274b7054f4SPierre Jolivet mbs = a->mbs; 34284b7054f4SPierre Jolivet ii = a->i; 34294b7054f4SPierre Jolivet z = c; 34304b7054f4SPierre Jolivet } 34314b7054f4SPierre Jolivet 34324b7054f4SPierre Jolivet for (i = 0; i < mbs; i++) { 34339371c9d4SSatish Balay n = ii[1] - ii[0]; 34349371c9d4SSatish Balay ii++; 34354b7054f4SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 34364b7054f4SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, 
PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 34374b7054f4SPierre Jolivet if (usecprow) z = c + 2 * ridx[i]; 34384b7054f4SPierre Jolivet jj = idx; 34394b7054f4SPierre Jolivet vv = v; 34404b7054f4SPierre Jolivet for (k = 0; k < cn; k++) { 34414b7054f4SPierre Jolivet idx = jj; 34424b7054f4SPierre Jolivet v = vv; 34439371c9d4SSatish Balay sum1 = 0.0; 34449371c9d4SSatish Balay sum2 = 0.0; 34454b7054f4SPierre Jolivet for (j = 0; j < n; j++) { 34469371c9d4SSatish Balay xb = b + 2 * (*idx++); 34479371c9d4SSatish Balay x1 = xb[0 + k * bm]; 34489371c9d4SSatish Balay x2 = xb[1 + k * bm]; 34494b7054f4SPierre Jolivet sum1 += v[0] * x1 + v[2] * x2; 34504b7054f4SPierre Jolivet sum2 += v[1] * x1 + v[3] * x2; 34514b7054f4SPierre Jolivet v += 4; 34524b7054f4SPierre Jolivet } 34539371c9d4SSatish Balay z[0 + k * cm] = sum1; 34549371c9d4SSatish Balay z[1 + k * cm] = sum2; 34554b7054f4SPierre Jolivet } 34564b7054f4SPierre Jolivet if (!usecprow) z += 2; 34574b7054f4SPierre Jolivet } 34584b7054f4SPierre Jolivet PetscFunctionReturn(0); 34594b7054f4SPierre Jolivet } 34604b7054f4SPierre Jolivet 34619371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 346274eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3463f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3; 3464bcf10a7aSPierre Jolivet const PetscScalar *xb; 346574eeabc5SPierre Jolivet PetscScalar x1, x2, x3; 346674eeabc5SPierre Jolivet const MatScalar *v, *vv; 346774eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 346874eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 346974eeabc5SPierre Jolivet 347074eeabc5SPierre Jolivet PetscFunctionBegin; 347174eeabc5SPierre Jolivet idx = a->j; 347274eeabc5SPierre Jolivet v = a->a; 347374eeabc5SPierre Jolivet if (usecprow) { 347474eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 347574eeabc5SPierre Jolivet ii = 
a->compressedrow.i; 347674eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 347774eeabc5SPierre Jolivet } else { 347874eeabc5SPierre Jolivet mbs = a->mbs; 347974eeabc5SPierre Jolivet ii = a->i; 348074eeabc5SPierre Jolivet z = c; 348174eeabc5SPierre Jolivet } 348274eeabc5SPierre Jolivet 348374eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 34849371c9d4SSatish Balay n = ii[1] - ii[0]; 34859371c9d4SSatish Balay ii++; 348674eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 348774eeabc5SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 348874eeabc5SPierre Jolivet if (usecprow) z = c + 3 * ridx[i]; 348974eeabc5SPierre Jolivet jj = idx; 349074eeabc5SPierre Jolivet vv = v; 349174eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 349274eeabc5SPierre Jolivet idx = jj; 349374eeabc5SPierre Jolivet v = vv; 34949371c9d4SSatish Balay sum1 = 0.0; 34959371c9d4SSatish Balay sum2 = 0.0; 34969371c9d4SSatish Balay sum3 = 0.0; 349774eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 34989371c9d4SSatish Balay xb = b + 3 * (*idx++); 34999371c9d4SSatish Balay x1 = xb[0 + k * bm]; 35009371c9d4SSatish Balay x2 = xb[1 + k * bm]; 35019371c9d4SSatish Balay x3 = xb[2 + k * bm]; 350274eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 350374eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 350474eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 350574eeabc5SPierre Jolivet v += 9; 350674eeabc5SPierre Jolivet } 35079371c9d4SSatish Balay z[0 + k * cm] = sum1; 35089371c9d4SSatish Balay z[1 + k * cm] = sum2; 35099371c9d4SSatish Balay z[2 + k * cm] = sum3; 351074eeabc5SPierre Jolivet } 351174eeabc5SPierre Jolivet if (!usecprow) z += 3; 351274eeabc5SPierre Jolivet } 351374eeabc5SPierre Jolivet PetscFunctionReturn(0); 351474eeabc5SPierre Jolivet } 351574eeabc5SPierre Jolivet 35169371c9d4SSatish Balay 
/*
  MatMatMult_SeqBAIJ_4_Private - Kernel for c = A * b with block size 4.

  Input Parameters:
+ A  - the SeqBAIJ matrix
. b  - dense input array, column k starts at b + k * bm
. bm - leading dimension of b
. cm - leading dimension of c
- cn - number of columns to compute

  Output Parameter:
. c - dense output array, column k starts at c + k * cm. With compressed row
      storage only the block rows listed in compressedrow.rindex are written.

  Each 4x4 block is read column by column (entries 0-3, 4-7, 8-11, 12-15).
*/
PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
  Mat_SeqBAIJ     *baij       = (Mat_SeqBAIJ *)A->data;
  const PetscBool  compressed = baij->compressedrow.use;
  const PetscInt  *rowptr     = compressed ? baij->compressedrow.i : baij->i;
  const PetscInt  *rowmap     = compressed ? baij->compressedrow.rindex : NULL;
  const PetscInt   nrows      = compressed ? baij->compressedrow.nrows : baij->mbs;
  const PetscInt  *bcols      = baij->j; /* block column indices of the current block row */
  const MatScalar *blocks     = baij->a; /* values of the current block row */
  PetscInt         row, col, blk;

  PetscFunctionBegin;
  for (row = 0; row < nrows; row++) {
    const PetscInt nblk = rowptr[row + 1] - rowptr[row];
    PetscScalar   *out  = c + 4 * (compressed ? rowmap[row] : row);

    PetscPrefetchBlock(bcols + nblk, nblk, 0, PETSC_PREFETCH_HINT_NTA);            /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(blocks + 16 * nblk, 16 * nblk, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    for (col = 0; col < cn; col++) {
      const PetscScalar *bcolumn = b + col * bm;
      const MatScalar   *v       = blocks;
      PetscScalar        s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;

      for (blk = 0; blk < nblk; blk++, v += 16) {
        const PetscScalar *xb = bcolumn + 4 * bcols[blk];
        const PetscScalar  x0 = xb[0], x1 = xb[1], x2 = xb[2], x3 = xb[3];

        s0 += v[0] * x0 + v[4] * x1 + v[8] * x2 + v[12] * x3;
        s1 += v[1] * x0 + v[5] * x1 + v[9] * x2 + v[13] * x3;
        s2 += v[2] * x0 + v[6] * x1 + v[10] * x2 + v[14] * x3;
        s3 += v[3] * x0 + v[7] * x1 + v[11] * x2 + v[15] * x3;
      }
      out[0 + col * cm] = s0;
      out[1 + col * cm] = s1;
      out[2 + col * cm] = s2;
      out[3 + col * cm] = s3;
    }
    /* move on to the next block row */
    bcols += nblk;
    blocks += 16 * nblk;
  }
  PetscFunctionReturn(0);
}

/*
  MatMatMult_SeqBAIJ_5_Private - Kernel for c = A * b with block size 5.

  Same calling convention as MatMatMult_SeqBAIJ_4_Private: b and c are dense
  arrays with leading dimensions bm and cm, cn columns are computed.
  Each 5x5 block is read column by column (entries 0-4, 5-9, ..., 20-24).
*/
PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
  Mat_SeqBAIJ     *baij       = (Mat_SeqBAIJ *)A->data;
  const PetscBool  compressed = baij->compressedrow.use;
  const PetscInt  *rowptr     = compressed ? baij->compressedrow.i : baij->i;
  const PetscInt  *rowmap     = compressed ? baij->compressedrow.rindex : NULL;
  const PetscInt   nrows      = compressed ? baij->compressedrow.nrows : baij->mbs;
  const PetscInt  *bcols      = baij->j; /* block column indices of the current block row */
  const MatScalar *blocks     = baij->a; /* values of the current block row */
  PetscInt         row, col, blk;

  PetscFunctionBegin;
  for (row = 0; row < nrows; row++) {
    const PetscInt nblk = rowptr[row + 1] - rowptr[row];
    PetscScalar   *out  = c + 5 * (compressed ? rowmap[row] : row);

    PetscPrefetchBlock(bcols + nblk, nblk, 0, PETSC_PREFETCH_HINT_NTA);            /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(blocks + 25 * nblk, 25 * nblk, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    for (col = 0; col < cn; col++) {
      const PetscScalar *bcolumn = b + col * bm;
      const MatScalar   *v       = blocks;
      PetscScalar        s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0, s4 = 0.0;

      for (blk = 0; blk < nblk; blk++, v += 25) {
        const PetscScalar *xb = bcolumn + 5 * bcols[blk];
        const PetscScalar  x0 = xb[0], x1 = xb[1], x2 = xb[2], x3 = xb[3], x4 = xb[4];

        s0 += v[0] * x0 + v[5] * x1 + v[10] * x2 + v[15] * x3 + v[20] * x4;
        s1 += v[1] * x0 + v[6] * x1 + v[11] * x2 + v[16] * x3 + v[21] * x4;
        s2 += v[2] * x0 + v[7] * x1 + v[12] * x2 + v[17] * x3 + v[22] * x4;
        s3 += v[3] * x0 + v[8] * x1 + v[13] * x2 + v[18] * x3 + v[23] * x4;
        s4 += v[4] * x0 + v[9] * x1 + v[14] * x2 + v[19] * x3 + v[24] * x4;
      }
      out[0 + col * cm] = s0;
      out[1 + col * cm] = s1;
      out[2 + col * cm] = s2;
      out[3 + col * cm] = s3;
      out[4 + col * cm] = s4;
    }
    /* move on to the next block row */
    bcols += nblk;
    blocks += 25 * nblk;
  }
  PetscFunctionReturn(0);
}

/*
  MatMatMultNumeric_SeqBAIJ_SeqDense - Numeric phase of C = A * B with A SeqBAIJ
  and B, C SeqDense.

  Input Parameters:
+ A - the SeqBAIJ matrix
- B - the dense right-hand matrix; A->cmap->n must equal B->rmap->n

  Output Parameter:
. C - the dense product; must have A->rmap->n rows and B->cmap->n columns

  Returns immediately when C has a zero leading dimension or B has no columns.
  Block sizes 1 to 5 use the hand-unrolled kernels; any other block size
  multiplies block by block with BLAS gemm.
*/
PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) {
  Mat_SeqBAIJ  *baij   = (Mat_SeqBAIJ *)A->data;
  Mat_SeqDense *bdense = (Mat_SeqDense *)B->data;
  Mat_SeqDense *cdense = (Mat_SeqDense *)C->data;
  PetscInt      ldc = cdense->lda, ncols = B->cmap->n, ldb = bdense->lda;
  PetscInt      bs = A->rmap->bs, bs2 = baij->bs2;
  PetscScalar  *barray, *carray;

  PetscFunctionBegin;
  if (!ldc || !ncols) PetscFunctionReturn(0);
  PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
  PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
  PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
  barray = bdense->v;
  /* the kernels only write rows they visit, so clear C first when the counts differ */
  if (baij->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
  PetscCall(MatDenseGetArray(C, &carray));
  switch (bs) {
  case 1:
    PetscCall(MatMatMult_SeqBAIJ_1_Private(A, barray, ldb, carray, ldc, ncols));
    break;
  case 2:
    PetscCall(MatMatMult_SeqBAIJ_2_Private(A, barray, ldb, carray, ldc, ncols));
    break;
  case 3:
    PetscCall(MatMatMult_SeqBAIJ_3_Private(A, barray, ldb, carray, ldc, ncols));
    break;
  case 4:
    PetscCall(MatMatMult_SeqBAIJ_4_Private(A, barray, ldb, carray, ldc, ncols));
    break;
  case 5:
    PetscCall(MatMatMult_SeqBAIJ_5_Private(A, barray, ldb, carray, ldc, ncols));
    break;
  default: { /* block sizes larger than 5 by 5 are handled by BLAS */
    const PetscBool  compressed = baij->compressedrow.use;
    const PetscInt  *rowptr, *rowmap = NULL;
    const PetscInt  *bcols  = baij->j;
    const MatScalar *blocks = baij->a;
    PetscScalar      zero = 0.0, one = 1.0;
    PetscBLASInt     bbs, bcn, bbm, bcm;
    PetscInt         nrows, row, blk;

    PetscCall(PetscBLASIntCast(bs, &bbs));
    PetscCall(PetscBLASIntCast(ncols, &bcn));
    PetscCall(PetscBLASIntCast(ldb, &bbm));
    PetscCall(PetscBLASIntCast(ldc, &bcm));
    if (compressed) {
      nrows  = baij->compressedrow.nrows;
      rowptr = baij->compressedrow.i;
      rowmap = baij->compressedrow.rindex;
    } else {
      nrows  = baij->mbs;
      rowptr = baij->i;
    }
    for (row = 0; row < nrows; row++) {
      const PetscInt nblk = rowptr[row + 1] - rowptr[row];
      PetscScalar   *z    = carray + bs * (compressed ? rowmap[row] : row);

      for (blk = 0; blk < nblk; blk++) {
        /* the first block of a row overwrites z (beta = 0), the remaining ones accumulate (beta = 1) */
        PetscScalar *beta = blk ? &one : &zero;

        PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &one, blocks, &bbs, barray + bs * bcols[blk], &bbm, beta, z, &bcm));
        blocks += bs2;
      }
      bcols += nblk;
    }
  }
  }
  PetscCall(MatDenseRestoreArray(C, &carray));
  PetscCall(PetscLogFlops((2.0 * baij->nz * bs2 - bs * baij->nonzerorowcnt) * ncols));
  PetscFunctionReturn(0);
}