1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <petscblaslapack.h> 6cac129eeSSatish Balay 75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 896e086a2SDaniel Kokron #include <immintrin.h> 996e086a2SDaniel Kokron #endif 1096e086a2SDaniel Kokron 119371c9d4SSatish Balay PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) { 12a3192f15SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 135d0c19d7SBarry Smith PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 145d0c19d7SBarry Smith const PetscInt *idx; 15690b6cddSBarry Smith PetscInt start, end, *ai, *aj, bs, *nidx2; 16f1af5d2fSBarry Smith PetscBT table; 17a3192f15SSatish Balay 183a40ed3dSBarry Smith PetscFunctionBegin; 19a3192f15SSatish Balay m = a->mbs; 20a3192f15SSatish Balay ai = a->i; 21a3192f15SSatish Balay aj = a->j; 22d0f46423SBarry Smith bs = A->rmap->bs; 23a3192f15SSatish Balay 2408401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 25a3192f15SSatish Balay 269566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(m, &table)); 279566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &nidx)); 289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2)); 29a3192f15SSatish Balay 30a3192f15SSatish Balay for (i = 0; i < is_max; i++) { 31a3192f15SSatish Balay /* Initialise the two local arrays */ 32a3192f15SSatish Balay isz = 0; 339566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(m, table)); 34a3192f15SSatish Balay 35a3192f15SSatish Balay /* Extract the indices, assume there can be duplicate entries */ 
369566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 379566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 38a3192f15SSatish Balay 39a3192f15SSatish Balay /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40a3192f15SSatish Balay for (j = 0; j < n; ++j) { 41218c64b6SSatish Balay ival = idx[j] / bs; /* convert the indices into block indices */ 4208401ef6SPierre Jolivet PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 4326fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 44a3192f15SSatish Balay } 459566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 469566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 47a3192f15SSatish Balay 48a3192f15SSatish Balay k = 0; 49a3192f15SSatish Balay for (j = 0; j < ov; j++) { /* for each overlap*/ 50a3192f15SSatish Balay n = isz; 51a3192f15SSatish Balay for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52a3192f15SSatish Balay row = nidx[k]; 53a3192f15SSatish Balay start = ai[row]; 54a3192f15SSatish Balay end = ai[row + 1]; 55a3192f15SSatish Balay for (l = start; l < end; l++) { 56a3192f15SSatish Balay val = aj[l]; 5726fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 58a3192f15SSatish Balay } 59a3192f15SSatish Balay } 60a3192f15SSatish Balay } 61218c64b6SSatish Balay /* expand the Index Set */ 62218c64b6SSatish Balay for (j = 0; j < isz; j++) { 6326fbe8dcSKarl Rupp for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k; 64218c64b6SSatish Balay } 659566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i)); 66a3192f15SSatish Balay } 679566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table)); 689566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 699566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx2)); 703a40ed3dSBarry Smith 
PetscFunctionReturn(0); 71a3192f15SSatish Balay } 721c351548SSatish Balay 739371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { 74736121d4SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 75690b6cddSBarry Smith PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 76690b6cddSBarry Smith PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 775d0c19d7SBarry Smith const PetscInt *irow, *icol; 785d0c19d7SBarry Smith PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 79690b6cddSBarry Smith PetscInt *aj = a->j, *ai = a->i; 803f1db9ecSBarry Smith MatScalar *mat_a; 81736121d4SSatish Balay Mat C; 826041f1b1SToby Isaac PetscBool flag; 83736121d4SSatish Balay 843a40ed3dSBarry Smith PetscFunctionBegin; 859566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 869566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 879566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 889566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 89736121d4SSatish Balay 909566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 91736121d4SSatish Balay ssmap = smap; 929566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 93736121d4SSatish Balay for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 94736121d4SSatish Balay /* determine lens of each row */ 95736121d4SSatish Balay for (i = 0; i < nrows; i++) { 96736121d4SSatish Balay kstart = ai[irow[i]]; 97736121d4SSatish Balay kend = kstart + a->ilen[irow[i]]; 98736121d4SSatish Balay lens[i] = 0; 99736121d4SSatish Balay for (k = kstart; k < kend; k++) { 10026fbe8dcSKarl Rupp if (ssmap[aj[k]]) lens[i]++; 101736121d4SSatish Balay } 102736121d4SSatish Balay } 103736121d4SSatish Balay /* Create and fill new matrix */ 104736121d4SSatish Balay if (scall == MAT_REUSE_MATRIX) { 105736121d4SSatish Balay c = (Mat_SeqBAIJ *)((*B)->data); 106736121d4SSatish Balay 
107aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1089566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 10928b400f6SJacob Faibussowitsch PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 1109566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 111736121d4SSatish Balay C = *B; 1123a40ed3dSBarry Smith } else { 1139566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1149566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 1159566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1169566063dSJacob Faibussowitsch PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 117736121d4SSatish Balay } 118736121d4SSatish Balay c = (Mat_SeqBAIJ *)(C->data); 119736121d4SSatish Balay for (i = 0; i < nrows; i++) { 120736121d4SSatish Balay row = irow[i]; 121736121d4SSatish Balay kstart = ai[row]; 122736121d4SSatish Balay kend = kstart + a->ilen[row]; 123736121d4SSatish Balay mat_i = c->i[i]; 124d29f2997SMatthew Woehlke mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 125d29f2997SMatthew Woehlke mat_a = c->a ? 
c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 126736121d4SSatish Balay mat_ilen = c->ilen + i; 127736121d4SSatish Balay for (k = kstart; k < kend; k++) { 128736121d4SSatish Balay if ((tcol = ssmap[a->j[k]])) { 129736121d4SSatish Balay *mat_j++ = tcol - 1; 1309566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 131549d3d68SSatish Balay mat_a += bs2; 132736121d4SSatish Balay (*mat_ilen)++; 133736121d4SSatish Balay } 134736121d4SSatish Balay } 135736121d4SSatish Balay } 136cdc6f3adSToby Isaac /* sort */ 137d29f2997SMatthew Woehlke if (c->j && c->a) { 138cdc6f3adSToby Isaac MatScalar *work; 1399566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 140cdc6f3adSToby Isaac for (i = 0; i < nrows; i++) { 141cdc6f3adSToby Isaac PetscInt ilen; 142cdc6f3adSToby Isaac mat_i = c->i[i]; 143cdc6f3adSToby Isaac mat_j = c->j + mat_i; 144cdc6f3adSToby Isaac mat_a = c->a + mat_i * bs2; 145cdc6f3adSToby Isaac ilen = c->ilen[i]; 1469566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 147cdc6f3adSToby Isaac } 1489566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 149cdc6f3adSToby Isaac } 150218c64b6SSatish Balay 151736121d4SSatish Balay /* Free work space */ 1529566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1539566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1549566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1559566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1569566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 157736121d4SSatish Balay 1589566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 159736121d4SSatish Balay *B = C; 1603a40ed3dSBarry Smith PetscFunctionReturn(0); 161736121d4SSatish Balay } 162736121d4SSatish Balay 1639371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, 
Mat *B) { 164218c64b6SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 165218c64b6SSatish Balay IS is1, is2; 166afebec48SHong Zhang PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j; 1675d0c19d7SBarry Smith const PetscInt *irow, *icol; 168218c64b6SSatish Balay 1693a40ed3dSBarry Smith PetscFunctionBegin; 1709566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1719566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1729566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1739566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 174218c64b6SSatish Balay 175218c64b6SSatish Balay /* Verify if the indices corespond to each element in a block 176218c64b6SSatish Balay and form the IS with compressed IS */ 177f8ecb639SStefano Zampini maxmnbs = PetscMax(a->mbs, a->nbs); 1789566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 1799566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->mbs)); 180218c64b6SSatish Balay for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 1819371c9d4SSatish Balay for (i = 0; i < a->mbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); } 1826041f1b1SToby Isaac count = 0; 1836041f1b1SToby Isaac for (i = 0; i < nrows; i++) { 184afebec48SHong Zhang j = irow[i] / bs; 1856041f1b1SToby Isaac if ((vary[j]--) == bs) iary[count++] = j; 186218c64b6SSatish Balay } 1879566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 188218c64b6SSatish Balay 1899566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->nbs)); 190218c64b6SSatish Balay for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 1919371c9d4SSatish Balay for (i = 0; i < a->nbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); } 1926041f1b1SToby Isaac count = 0; 
1936041f1b1SToby Isaac for (i = 0; i < ncols; i++) { 194afebec48SHong Zhang j = icol[i] / bs; 1956041f1b1SToby Isaac if ((vary[j]--) == bs) iary[count++] = j; 1966041f1b1SToby Isaac } 1979566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 1989566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 1999566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 2009566063dSJacob Faibussowitsch PetscCall(PetscFree2(vary, iary)); 201218c64b6SSatish Balay 2029566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 2039566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2049566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 2053a40ed3dSBarry Smith PetscFunctionReturn(0); 206218c64b6SSatish Balay } 207218c64b6SSatish Balay 2089371c9d4SSatish Balay PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) { 20916b64355SHong Zhang Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 2105c39f6d9SHong Zhang Mat_SubSppt *submatj = c->submatis1; 21116b64355SHong Zhang 21216b64355SHong Zhang PetscFunctionBegin; 2139566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 2149566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 21516b64355SHong Zhang PetscFunctionReturn(0); 21616b64355SHong Zhang } 21716b64355SHong Zhang 21889a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 2199371c9d4SSatish Balay PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) { 22086e85357SHong Zhang PetscInt i; 22186e85357SHong Zhang Mat C; 22286e85357SHong Zhang Mat_SeqBAIJ *c; 22386e85357SHong Zhang Mat_SubSppt *submatj; 22486e85357SHong Zhang 22586e85357SHong Zhang PetscFunctionBegin; 22686e85357SHong Zhang for (i = 0; i < n; i++) { 22786e85357SHong Zhang C = (*mat)[i]; 22886e85357SHong Zhang c = (Mat_SeqBAIJ *)C->data; 22986e85357SHong Zhang submatj = c->submatis1; 23086e85357SHong Zhang if 
(submatj) { 2317daefbafSJunchao Zhang if (--((PetscObject)C)->refct <= 0) { 23226cc229bSBarry Smith PetscCall(PetscFree(C->factorprefix)); 2339566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 2349566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 2359566063dSJacob Faibussowitsch PetscCall(PetscFree(C->defaultvectype)); 2369566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->rmap)); 2379566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->cmap)); 2389566063dSJacob Faibussowitsch PetscCall(PetscHeaderDestroy(&C)); 2397daefbafSJunchao Zhang } 24086e85357SHong Zhang } else { 2419566063dSJacob Faibussowitsch PetscCall(MatDestroy(&C)); 24286e85357SHong Zhang } 24386e85357SHong Zhang } 2447daefbafSJunchao Zhang 2457daefbafSJunchao Zhang /* Destroy Dummy submatrices created for reuse */ 2469566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 2477daefbafSJunchao Zhang 2489566063dSJacob Faibussowitsch PetscCall(PetscFree(*mat)); 24986e85357SHong Zhang PetscFunctionReturn(0); 25086e85357SHong Zhang } 25186e85357SHong Zhang 2529371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) { 253690b6cddSBarry Smith PetscInt i; 254736121d4SSatish Balay 2553a40ed3dSBarry Smith PetscFunctionBegin; 256*48a46eb9SPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 257736121d4SSatish Balay 258*48a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 2593a40ed3dSBarry Smith PetscFunctionReturn(0); 260736121d4SSatish Balay } 261218c64b6SSatish Balay 2622d61bbb3SSatish Balay /* -------------------------------------------------------*/ 2632d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */ 2642d61bbb3SSatish Balay /* -------------------------------------------------------*/ 2652d61bbb3SSatish 
Balay 2669371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) { 2672d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 268d9fead3dSBarry Smith PetscScalar *z, sum; 269d9fead3dSBarry Smith const PetscScalar *x; 270d9fead3dSBarry Smith const MatScalar *v; 2717c565772SBarry Smith PetscInt mbs, i, n; 2720298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 273ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2742d61bbb3SSatish Balay 2752d61bbb3SSatish Balay PetscFunctionBegin; 2769566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2779566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &z)); 2782d61bbb3SSatish Balay 27926e093fcSHong Zhang if (usecprow) { 28026e093fcSHong Zhang mbs = a->compressedrow.nrows; 28126e093fcSHong Zhang ii = a->compressedrow.i; 2827b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 2839566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(z, a->mbs)); 28426e093fcSHong Zhang } else { 28526e093fcSHong Zhang mbs = a->mbs; 2862d61bbb3SSatish Balay ii = a->i; 28726e093fcSHong Zhang } 2882d61bbb3SSatish Balay 2892d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 290ee54c7eeSHong Zhang n = ii[1] - ii[0]; 291ee54c7eeSHong Zhang v = a->a + ii[0]; 292ee54c7eeSHong Zhang idx = a->j + ii[0]; 293ee54c7eeSHong Zhang ii++; 294444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 295444d8c10SJed Brown PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2962d61bbb3SSatish Balay sum = 0.0; 2972162cab8SBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 29826e093fcSHong Zhang if (usecprow) { 2997b2bb3b9SHong Zhang z[ridx[i]] = sum; 30026e093fcSHong Zhang } else { 3012d61bbb3SSatish Balay z[i] = sum; 3022d61bbb3SSatish Balay } 30326e093fcSHong Zhang } 3049566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3059566063dSJacob 
Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &z)); 3069566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 3072d61bbb3SSatish Balay PetscFunctionReturn(0); 3082d61bbb3SSatish Balay } 3092d61bbb3SSatish Balay 3109371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) { 3112d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 312f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, *zarray; 313d9fead3dSBarry Smith const PetscScalar *x, *xb; 31487828ca2SBarry Smith PetscScalar x1, x2; 315d9fead3dSBarry Smith const MatScalar *v; 3167c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 317ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 3182d61bbb3SSatish Balay 3192d61bbb3SSatish Balay PetscFunctionBegin; 3209566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3219566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 3222d61bbb3SSatish Balay 3232d61bbb3SSatish Balay idx = a->j; 3242d61bbb3SSatish Balay v = a->a; 32526e093fcSHong Zhang if (usecprow) { 32626e093fcSHong Zhang mbs = a->compressedrow.nrows; 32726e093fcSHong Zhang ii = a->compressedrow.i; 3287b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3299566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 33026e093fcSHong Zhang } else { 33126e093fcSHong Zhang mbs = a->mbs; 3322d61bbb3SSatish Balay ii = a->i; 33326e093fcSHong Zhang z = zarray; 33426e093fcSHong Zhang } 3352d61bbb3SSatish Balay 3362d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 3379371c9d4SSatish Balay n = ii[1] - ii[0]; 3389371c9d4SSatish Balay ii++; 3399371c9d4SSatish Balay sum1 = 0.0; 3409371c9d4SSatish Balay sum2 = 0.0; 341444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 342444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 
3432d61bbb3SSatish Balay for (j = 0; j < n; j++) { 3449371c9d4SSatish Balay xb = x + 2 * (*idx++); 3459371c9d4SSatish Balay x1 = xb[0]; 3469371c9d4SSatish Balay x2 = xb[1]; 3472d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 3482d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 3492d61bbb3SSatish Balay v += 4; 3502d61bbb3SSatish Balay } 3517b2bb3b9SHong Zhang if (usecprow) z = zarray + 2 * ridx[i]; 3529371c9d4SSatish Balay z[0] = sum1; 3539371c9d4SSatish Balay z[1] = sum2; 35426e093fcSHong Zhang if (!usecprow) z += 2; 3552d61bbb3SSatish Balay } 3569566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3579566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 3589566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 3592d61bbb3SSatish Balay PetscFunctionReturn(0); 3602d61bbb3SSatish Balay } 3612d61bbb3SSatish Balay 3629371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) { 3632d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 364f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 365d9fead3dSBarry Smith const PetscScalar *x, *xb; 366d9fead3dSBarry Smith const MatScalar *v; 3677c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 368ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 36926e093fcSHong Zhang 370b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 371fee21e36SBarry Smith #pragma disjoint(*v, *z, *xb) 372fee21e36SBarry Smith #endif 373fee21e36SBarry Smith 3742d61bbb3SSatish Balay PetscFunctionBegin; 3759566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3769566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 3772d61bbb3SSatish Balay 3782d61bbb3SSatish Balay idx = a->j; 3792d61bbb3SSatish Balay v = a->a; 38026e093fcSHong Zhang if (usecprow) { 38126e093fcSHong Zhang mbs = a->compressedrow.nrows; 38226e093fcSHong Zhang ii = 
a->compressedrow.i; 3837b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3849566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 38526e093fcSHong Zhang } else { 38626e093fcSHong Zhang mbs = a->mbs; 3872d61bbb3SSatish Balay ii = a->i; 38826e093fcSHong Zhang z = zarray; 38926e093fcSHong Zhang } 3902d61bbb3SSatish Balay 3912d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 3929371c9d4SSatish Balay n = ii[1] - ii[0]; 3939371c9d4SSatish Balay ii++; 3949371c9d4SSatish Balay sum1 = 0.0; 3959371c9d4SSatish Balay sum2 = 0.0; 3969371c9d4SSatish Balay sum3 = 0.0; 397444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 398444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3992d61bbb3SSatish Balay for (j = 0; j < n; j++) { 40026fbe8dcSKarl Rupp xb = x + 3 * (*idx++); 40126fbe8dcSKarl Rupp x1 = xb[0]; 40226fbe8dcSKarl Rupp x2 = xb[1]; 40326fbe8dcSKarl Rupp x3 = xb[2]; 40426fbe8dcSKarl Rupp 4052d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 4062d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 4072d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 4082d61bbb3SSatish Balay v += 9; 4092d61bbb3SSatish Balay } 4107b2bb3b9SHong Zhang if (usecprow) z = zarray + 3 * ridx[i]; 4119371c9d4SSatish Balay z[0] = sum1; 4129371c9d4SSatish Balay z[1] = sum2; 4139371c9d4SSatish Balay z[2] = sum3; 41426e093fcSHong Zhang if (!usecprow) z += 3; 4152d61bbb3SSatish Balay } 4169566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4179566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 4189566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 4192d61bbb3SSatish Balay PetscFunctionReturn(0); 4202d61bbb3SSatish Balay } 4212d61bbb3SSatish Balay 4229371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_4(Mat A, 
Vec xx, Vec zz) { 4232d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 424f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 425d9fead3dSBarry Smith const PetscScalar *x, *xb; 426d9fead3dSBarry Smith const MatScalar *v; 4277c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 428ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4292d61bbb3SSatish Balay 4302d61bbb3SSatish Balay PetscFunctionBegin; 4319566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4329566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 4332d61bbb3SSatish Balay 4342d61bbb3SSatish Balay idx = a->j; 4352d61bbb3SSatish Balay v = a->a; 43626e093fcSHong Zhang if (usecprow) { 43726e093fcSHong Zhang mbs = a->compressedrow.nrows; 43826e093fcSHong Zhang ii = a->compressedrow.i; 4397b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 4409566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 44126e093fcSHong Zhang } else { 44226e093fcSHong Zhang mbs = a->mbs; 4432d61bbb3SSatish Balay ii = a->i; 44426e093fcSHong Zhang z = zarray; 44526e093fcSHong Zhang } 4462d61bbb3SSatish Balay 4472d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 44826fbe8dcSKarl Rupp n = ii[1] - ii[0]; 44926fbe8dcSKarl Rupp ii++; 45026fbe8dcSKarl Rupp sum1 = 0.0; 45126fbe8dcSKarl Rupp sum2 = 0.0; 45226fbe8dcSKarl Rupp sum3 = 0.0; 45326fbe8dcSKarl Rupp sum4 = 0.0; 45426fbe8dcSKarl Rupp 455444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 456444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4572d61bbb3SSatish Balay for (j = 0; j < n; j++) { 4582d61bbb3SSatish Balay xb = x + 4 * (*idx++); 4599371c9d4SSatish Balay x1 = xb[0]; 4609371c9d4SSatish Balay x2 = xb[1]; 4619371c9d4SSatish Balay x3 = xb[2]; 4629371c9d4SSatish Balay x4 = xb[3]; 
4632d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 4642d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 4652d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 4662d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 4672d61bbb3SSatish Balay v += 16; 4682d61bbb3SSatish Balay } 4697b2bb3b9SHong Zhang if (usecprow) z = zarray + 4 * ridx[i]; 4709371c9d4SSatish Balay z[0] = sum1; 4719371c9d4SSatish Balay z[1] = sum2; 4729371c9d4SSatish Balay z[2] = sum3; 4739371c9d4SSatish Balay z[3] = sum4; 47426e093fcSHong Zhang if (!usecprow) z += 4; 4752d61bbb3SSatish Balay } 4769566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 4789566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 4792d61bbb3SSatish Balay PetscFunctionReturn(0); 4802d61bbb3SSatish Balay } 4812d61bbb3SSatish Balay 4829371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) { 4832d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 484f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 485d9fead3dSBarry Smith const PetscScalar *xb, *x; 486d9fead3dSBarry Smith const MatScalar *v; 4870298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 4887c565772SBarry Smith PetscInt mbs, i, j, n; 489ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4902d61bbb3SSatish Balay 491433994e6SBarry Smith PetscFunctionBegin; 4929566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4939566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 4942d61bbb3SSatish Balay 4952d61bbb3SSatish Balay idx = a->j; 4962d61bbb3SSatish Balay v = a->a; 49726e093fcSHong Zhang if (usecprow) { 49826e093fcSHong Zhang mbs = a->compressedrow.nrows; 49926e093fcSHong Zhang ii = 
a->compressedrow.i; 5007b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 5019566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 50226e093fcSHong Zhang } else { 50326e093fcSHong Zhang mbs = a->mbs; 5042d61bbb3SSatish Balay ii = a->i; 50526e093fcSHong Zhang z = zarray; 50626e093fcSHong Zhang } 5072d61bbb3SSatish Balay 5082d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 5099371c9d4SSatish Balay n = ii[1] - ii[0]; 5109371c9d4SSatish Balay ii++; 5119371c9d4SSatish Balay sum1 = 0.0; 5129371c9d4SSatish Balay sum2 = 0.0; 5139371c9d4SSatish Balay sum3 = 0.0; 5149371c9d4SSatish Balay sum4 = 0.0; 5159371c9d4SSatish Balay sum5 = 0.0; 516444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 517444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 5182d61bbb3SSatish Balay for (j = 0; j < n; j++) { 5192d61bbb3SSatish Balay xb = x + 5 * (*idx++); 5209371c9d4SSatish Balay x1 = xb[0]; 5219371c9d4SSatish Balay x2 = xb[1]; 5229371c9d4SSatish Balay x3 = xb[2]; 5239371c9d4SSatish Balay x4 = xb[3]; 5249371c9d4SSatish Balay x5 = xb[4]; 5252d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 5262d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 5272d61bbb3SSatish Balay sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 5282d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 5292d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 5302d61bbb3SSatish Balay v += 25; 5312d61bbb3SSatish Balay } 5327b2bb3b9SHong Zhang if (usecprow) z = zarray + 5 * ridx[i]; 5339371c9d4SSatish Balay z[0] = sum1; 5349371c9d4SSatish Balay z[1] = sum2; 5359371c9d4SSatish Balay z[2] = sum3; 5369371c9d4SSatish Balay z[3] = sum4; 5379371c9d4SSatish Balay z[4] = sum5; 
53826e093fcSHong Zhang if (!usecprow) z += 5; 5392d61bbb3SSatish Balay } 5409566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5419566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 5429566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 5432d61bbb3SSatish Balay PetscFunctionReturn(0); 5442d61bbb3SSatish Balay } 5452d61bbb3SSatish Balay 5469371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) { 54715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 548f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 549d9fead3dSBarry Smith const PetscScalar *x, *xb; 55026e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 551d9fead3dSBarry Smith const MatScalar *v; 5527c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 553ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 55415091d37SBarry Smith 555433994e6SBarry Smith PetscFunctionBegin; 5569566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5579566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 55815091d37SBarry Smith 55915091d37SBarry Smith idx = a->j; 56015091d37SBarry Smith v = a->a; 56126e093fcSHong Zhang if (usecprow) { 56226e093fcSHong Zhang mbs = a->compressedrow.nrows; 56326e093fcSHong Zhang ii = a->compressedrow.i; 5647b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 5659566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 56626e093fcSHong Zhang } else { 56726e093fcSHong Zhang mbs = a->mbs; 56815091d37SBarry Smith ii = a->i; 56926e093fcSHong Zhang z = zarray; 57026e093fcSHong Zhang } 57115091d37SBarry Smith 57215091d37SBarry Smith for (i = 0; i < mbs; i++) { 57326fbe8dcSKarl Rupp n = ii[1] - ii[0]; 57426fbe8dcSKarl Rupp ii++; 57526fbe8dcSKarl Rupp sum1 = 0.0; 57626fbe8dcSKarl Rupp sum2 = 0.0; 57726fbe8dcSKarl Rupp sum3 = 0.0; 57826fbe8dcSKarl Rupp sum4 = 0.0; 
57926fbe8dcSKarl Rupp sum5 = 0.0; 58026fbe8dcSKarl Rupp sum6 = 0.0; 58126fbe8dcSKarl Rupp 582444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 583444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 58415091d37SBarry Smith for (j = 0; j < n; j++) { 58515091d37SBarry Smith xb = x + 6 * (*idx++); 5869371c9d4SSatish Balay x1 = xb[0]; 5879371c9d4SSatish Balay x2 = xb[1]; 5889371c9d4SSatish Balay x3 = xb[2]; 5899371c9d4SSatish Balay x4 = xb[3]; 5909371c9d4SSatish Balay x5 = xb[4]; 5919371c9d4SSatish Balay x6 = xb[5]; 59215091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 59315091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 59415091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 59515091d37SBarry Smith sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 59615091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 59715091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 59815091d37SBarry Smith v += 36; 59915091d37SBarry Smith } 6007b2bb3b9SHong Zhang if (usecprow) z = zarray + 6 * ridx[i]; 6019371c9d4SSatish Balay z[0] = sum1; 6029371c9d4SSatish Balay z[1] = sum2; 6039371c9d4SSatish Balay z[2] = sum3; 6049371c9d4SSatish Balay z[3] = sum4; 6059371c9d4SSatish Balay z[4] = sum5; 6069371c9d4SSatish Balay z[5] = sum6; 60726e093fcSHong Zhang if (!usecprow) z += 6; 60815091d37SBarry Smith } 60915091d37SBarry Smith 6109566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6119566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 6129566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * 
a->nonzerorowcnt)); 61315091d37SBarry Smith PetscFunctionReturn(0); 61415091d37SBarry Smith } 6158ab949d8SShri Abhyankar 6169371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) { 6172d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 618f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 619d9fead3dSBarry Smith const PetscScalar *x, *xb; 62026e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 621d9fead3dSBarry Smith const MatScalar *v; 6227c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 623ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 6242d61bbb3SSatish Balay 625433994e6SBarry Smith PetscFunctionBegin; 6269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6279566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 6282d61bbb3SSatish Balay 6292d61bbb3SSatish Balay idx = a->j; 6302d61bbb3SSatish Balay v = a->a; 63126e093fcSHong Zhang if (usecprow) { 63226e093fcSHong Zhang mbs = a->compressedrow.nrows; 63326e093fcSHong Zhang ii = a->compressedrow.i; 6347b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 6359566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 63626e093fcSHong Zhang } else { 63726e093fcSHong Zhang mbs = a->mbs; 6382d61bbb3SSatish Balay ii = a->i; 63926e093fcSHong Zhang z = zarray; 64026e093fcSHong Zhang } 6412d61bbb3SSatish Balay 6422d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 64326fbe8dcSKarl Rupp n = ii[1] - ii[0]; 64426fbe8dcSKarl Rupp ii++; 64526fbe8dcSKarl Rupp sum1 = 0.0; 64626fbe8dcSKarl Rupp sum2 = 0.0; 64726fbe8dcSKarl Rupp sum3 = 0.0; 64826fbe8dcSKarl Rupp sum4 = 0.0; 64926fbe8dcSKarl Rupp sum5 = 0.0; 65026fbe8dcSKarl Rupp sum6 = 0.0; 65126fbe8dcSKarl Rupp sum7 = 0.0; 65226fbe8dcSKarl Rupp 653444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 654444d8c10SJed Brown 
PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 6552d61bbb3SSatish Balay for (j = 0; j < n; j++) { 6562d61bbb3SSatish Balay xb = x + 7 * (*idx++); 6579371c9d4SSatish Balay x1 = xb[0]; 6589371c9d4SSatish Balay x2 = xb[1]; 6599371c9d4SSatish Balay x3 = xb[2]; 6609371c9d4SSatish Balay x4 = xb[3]; 6619371c9d4SSatish Balay x5 = xb[4]; 6629371c9d4SSatish Balay x6 = xb[5]; 6639371c9d4SSatish Balay x7 = xb[6]; 6642d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 6652d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 6662d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 6672d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 6682d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 6692d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 6702d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 6712d61bbb3SSatish Balay v += 49; 6722d61bbb3SSatish Balay } 6737b2bb3b9SHong Zhang if (usecprow) z = zarray + 7 * ridx[i]; 6749371c9d4SSatish Balay z[0] = sum1; 6759371c9d4SSatish Balay z[1] = sum2; 6769371c9d4SSatish Balay z[2] = sum3; 6779371c9d4SSatish Balay z[3] = sum4; 6789371c9d4SSatish Balay z[4] = sum5; 6799371c9d4SSatish Balay z[5] = sum6; 6809371c9d4SSatish Balay z[6] = sum7; 68126e093fcSHong Zhang if (!usecprow) z += 7; 6822d61bbb3SSatish Balay } 6832d61bbb3SSatish Balay 6849566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 6869566063dSJacob Faibussowitsch 
PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 6872d61bbb3SSatish Balay PetscFunctionReturn(0); 6882d61bbb3SSatish Balay } 6892d61bbb3SSatish Balay 6905f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 6919371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) { 69296e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 693f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 69496e086a2SDaniel Kokron const PetscScalar *x, *xb; 69596e086a2SDaniel Kokron const MatScalar *v; 69696e086a2SDaniel Kokron PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 69796e086a2SDaniel Kokron const PetscInt *idx, *ii, *ridx = NULL; 698ce68d72fSJed Brown PetscInt k; 69996e086a2SDaniel Kokron PetscBool usecprow = a->compressedrow.use; 70096e086a2SDaniel Kokron 70196e086a2SDaniel Kokron __m256d a0, a1, a2, a3, a4, a5; 702ce68d72fSJed Brown __m256d w0, w1, w2, w3; 70396e086a2SDaniel Kokron __m256d z0, z1, z2; 70496e086a2SDaniel Kokron __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 70596e086a2SDaniel Kokron 70696e086a2SDaniel Kokron PetscFunctionBegin; 7079566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7089566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 70996e086a2SDaniel Kokron 71096e086a2SDaniel Kokron idx = a->j; 71196e086a2SDaniel Kokron v = a->a; 71296e086a2SDaniel Kokron if (usecprow) { 71396e086a2SDaniel Kokron mbs = a->compressedrow.nrows; 71496e086a2SDaniel Kokron ii = a->compressedrow.i; 71596e086a2SDaniel Kokron ridx = a->compressedrow.rindex; 7169566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 71796e086a2SDaniel Kokron } else { 71896e086a2SDaniel Kokron mbs = a->mbs; 71996e086a2SDaniel Kokron ii = a->i; 72096e086a2SDaniel Kokron z = zarray; 72196e086a2SDaniel Kokron } 
72296e086a2SDaniel Kokron 72396e086a2SDaniel Kokron if (!a->mult_work) { 72496e086a2SDaniel Kokron k = PetscMax(A->rmap->n, A->cmap->n); 7259566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 72696e086a2SDaniel Kokron } 72796e086a2SDaniel Kokron 72896e086a2SDaniel Kokron work = a->mult_work; 72996e086a2SDaniel Kokron for (i = 0; i < mbs; i++) { 7309371c9d4SSatish Balay n = ii[1] - ii[0]; 7319371c9d4SSatish Balay ii++; 73296e086a2SDaniel Kokron workt = work; 73396e086a2SDaniel Kokron for (j = 0; j < n; j++) { 73496e086a2SDaniel Kokron xb = x + bs * (*idx++); 73596e086a2SDaniel Kokron for (k = 0; k < bs; k++) workt[k] = xb[k]; 73696e086a2SDaniel Kokron workt += bs; 73796e086a2SDaniel Kokron } 73896e086a2SDaniel Kokron if (usecprow) z = zarray + bs * ridx[i]; 73996e086a2SDaniel Kokron 7409371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 7419371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 7429371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 74396e086a2SDaniel Kokron 74496e086a2SDaniel Kokron for (j = 0; j < n; j++) { 745c05b70c4SSatish Balay /* first column of a */ 74696e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9]); 7479371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81]); 7489371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 7499371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 7509371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 7519371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 7529371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 75396e086a2SDaniel Kokron 754c05b70c4SSatish Balay /* second column of a */ 75596e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 1]); 7569371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 7579371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 7589371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 7599371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 7609371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 7619371c9d4SSatish Balay z2 = 
_mm256_fmadd_pd(a2, w1, z2); 76296e086a2SDaniel Kokron 763c05b70c4SSatish Balay /* third column of a */ 76496e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 2]); 7659371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 7669371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w2, z0); 7679371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 7689371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w2, z1); 7699371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 7709371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w2, z2); 77196e086a2SDaniel Kokron 772c05b70c4SSatish Balay /* fourth column of a */ 77396e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 3]); 7749371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 7759371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w3, z0); 7769371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 7779371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w3, z1); 7789371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 7799371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w3, z2); 78096e086a2SDaniel Kokron 781c05b70c4SSatish Balay /* fifth column of a */ 78296e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 4]); 7839371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 7849371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w0, z0); 7859371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 7869371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w0, z1); 7879371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 7889371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w0, z2); 78996e086a2SDaniel Kokron 790c05b70c4SSatish Balay /* sixth column of a */ 79196e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 5]); 7929371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 7939371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 7949371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 7959371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 7969371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 
53]); 7979371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 79896e086a2SDaniel Kokron 799c05b70c4SSatish Balay /* seventh column of a */ 80096e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 6]); 8019371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 8029371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 8039371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 8049371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 8059371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 8069371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 80796e086a2SDaniel Kokron 8086aad120cSJose E. Roman /* eighth column of a */ 80996e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 7]); 8109371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 8119371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 8129371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 8139371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 8149371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 8159371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 81696e086a2SDaniel Kokron 817c05b70c4SSatish Balay /* ninth column of a */ 81896e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 8]); 8199371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 8209371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 8219371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 8229371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 8239371c9d4SSatish Balay a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 8249371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 82596e086a2SDaniel Kokron } 82696e086a2SDaniel Kokron 8279371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 8289371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 8299371c9d4SSatish Balay _mm256_maskstore_pd(&z[8], mask1, z2); 83096e086a2SDaniel Kokron 83196e086a2SDaniel Kokron v += n * bs2; 83296e086a2SDaniel Kokron if (!usecprow) z += bs; 83396e086a2SDaniel Kokron } 8349566063dSJacob 
Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8359566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 8369566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 83796e086a2SDaniel Kokron PetscFunctionReturn(0); 83896e086a2SDaniel Kokron } 83996e086a2SDaniel Kokron #endif 84096e086a2SDaniel Kokron 8419371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) { 842ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 843f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 844ebada01fSBarry Smith const PetscScalar *x, *xb; 845ebada01fSBarry Smith PetscScalar *zarray, xv; 846ebada01fSBarry Smith const MatScalar *v; 847ebada01fSBarry Smith const PetscInt *ii, *ij = a->j, *idx; 848ebada01fSBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 849ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 850ebada01fSBarry Smith 851ebada01fSBarry Smith PetscFunctionBegin; 8529566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8539566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 854ebada01fSBarry Smith 855ebada01fSBarry Smith v = a->a; 856ebada01fSBarry Smith if (usecprow) { 857ebada01fSBarry Smith mbs = a->compressedrow.nrows; 858ebada01fSBarry Smith ii = a->compressedrow.i; 859ebada01fSBarry Smith ridx = a->compressedrow.rindex; 8609566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 861ebada01fSBarry Smith } else { 862ebada01fSBarry Smith mbs = a->mbs; 863ebada01fSBarry Smith ii = a->i; 864ebada01fSBarry Smith z = zarray; 865ebada01fSBarry Smith } 866ebada01fSBarry Smith 867ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 868ebada01fSBarry Smith n = ii[i + 1] - ii[i]; 869ebada01fSBarry Smith idx = ij + ii[i]; 8709371c9d4SSatish Balay sum1 = 0.0; 8719371c9d4SSatish Balay sum2 = 0.0; 8729371c9d4SSatish Balay sum3 = 0.0; 8739371c9d4SSatish Balay 
sum4 = 0.0; 8749371c9d4SSatish Balay sum5 = 0.0; 8759371c9d4SSatish Balay sum6 = 0.0; 8769371c9d4SSatish Balay sum7 = 0.0; 8779371c9d4SSatish Balay sum8 = 0.0; 8789371c9d4SSatish Balay sum9 = 0.0; 8799371c9d4SSatish Balay sum10 = 0.0; 8809371c9d4SSatish Balay sum11 = 0.0; 881ebada01fSBarry Smith 882ebada01fSBarry Smith for (j = 0; j < n; j++) { 883ebada01fSBarry Smith xb = x + 11 * (idx[j]); 884ebada01fSBarry Smith 885ebada01fSBarry Smith for (k = 0; k < 11; k++) { 886ebada01fSBarry Smith xv = xb[k]; 887ebada01fSBarry Smith sum1 += v[0] * xv; 888ebada01fSBarry Smith sum2 += v[1] * xv; 889ebada01fSBarry Smith sum3 += v[2] * xv; 890ebada01fSBarry Smith sum4 += v[3] * xv; 891ebada01fSBarry Smith sum5 += v[4] * xv; 892ebada01fSBarry Smith sum6 += v[5] * xv; 893ebada01fSBarry Smith sum7 += v[6] * xv; 894ebada01fSBarry Smith sum8 += v[7] * xv; 895ebada01fSBarry Smith sum9 += v[8] * xv; 896ebada01fSBarry Smith sum10 += v[9] * xv; 897ebada01fSBarry Smith sum11 += v[10] * xv; 898ebada01fSBarry Smith v += 11; 899ebada01fSBarry Smith } 900ebada01fSBarry Smith } 901ebada01fSBarry Smith if (usecprow) z = zarray + 11 * ridx[i]; 9029371c9d4SSatish Balay z[0] = sum1; 9039371c9d4SSatish Balay z[1] = sum2; 9049371c9d4SSatish Balay z[2] = sum3; 9059371c9d4SSatish Balay z[3] = sum4; 9069371c9d4SSatish Balay z[4] = sum5; 9079371c9d4SSatish Balay z[5] = sum6; 9089371c9d4SSatish Balay z[6] = sum7; 9099371c9d4SSatish Balay z[7] = sum8; 9109371c9d4SSatish Balay z[8] = sum9; 9119371c9d4SSatish Balay z[9] = sum10; 9129371c9d4SSatish Balay z[10] = sum11; 913ebada01fSBarry Smith 914ebada01fSBarry Smith if (!usecprow) z += 11; 915ebada01fSBarry Smith } 916ebada01fSBarry Smith 9179566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9189566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 9199566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 920ebada01fSBarry Smith PetscFunctionReturn(0); 921ebada01fSBarry Smith 
} 922ebada01fSBarry Smith 9236679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 9249371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) { 9256679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 9266679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 9276679dcc1SBarry Smith const PetscScalar *x, *xb; 9286679dcc1SBarry Smith PetscScalar *zarray, xv; 9296679dcc1SBarry Smith const MatScalar *v; 9306679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 9316679dcc1SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 9326679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 9336679dcc1SBarry Smith 9346679dcc1SBarry Smith PetscFunctionBegin; 9359566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9369566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 9376679dcc1SBarry Smith 9386679dcc1SBarry Smith v = a->a; 9396679dcc1SBarry Smith if (usecprow) { 9406679dcc1SBarry Smith mbs = a->compressedrow.nrows; 9416679dcc1SBarry Smith ii = a->compressedrow.i; 9426679dcc1SBarry Smith ridx = a->compressedrow.rindex; 9439566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 9446679dcc1SBarry Smith } else { 9456679dcc1SBarry Smith mbs = a->mbs; 9466679dcc1SBarry Smith ii = a->i; 9476679dcc1SBarry Smith z = zarray; 9486679dcc1SBarry Smith } 9496679dcc1SBarry Smith 9506679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 9516679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 9526679dcc1SBarry Smith idx = ij + ii[i]; 9539371c9d4SSatish Balay sum1 = 0.0; 9549371c9d4SSatish Balay sum2 = 0.0; 9559371c9d4SSatish Balay sum3 = 0.0; 9569371c9d4SSatish Balay sum4 = 0.0; 9579371c9d4SSatish Balay sum5 = 0.0; 9589371c9d4SSatish Balay sum6 = 0.0; 9599371c9d4SSatish Balay sum7 = 0.0; 9609371c9d4SSatish Balay sum8 = 0.0; 9619371c9d4SSatish Balay sum9 = 0.0; 9629371c9d4SSatish Balay sum10 = 
0.0; 9639371c9d4SSatish Balay sum11 = 0.0; 9649371c9d4SSatish Balay sum12 = 0.0; 9656679dcc1SBarry Smith 9666679dcc1SBarry Smith for (j = 0; j < n; j++) { 9676679dcc1SBarry Smith xb = x + 12 * (idx[j]); 9686679dcc1SBarry Smith 9696679dcc1SBarry Smith for (k = 0; k < 12; k++) { 9706679dcc1SBarry Smith xv = xb[k]; 9716679dcc1SBarry Smith sum1 += v[0] * xv; 9726679dcc1SBarry Smith sum2 += v[1] * xv; 9736679dcc1SBarry Smith sum3 += v[2] * xv; 9746679dcc1SBarry Smith sum4 += v[3] * xv; 9756679dcc1SBarry Smith sum5 += v[4] * xv; 9766679dcc1SBarry Smith sum6 += v[5] * xv; 9776679dcc1SBarry Smith sum7 += v[6] * xv; 9786679dcc1SBarry Smith sum8 += v[7] * xv; 9796679dcc1SBarry Smith sum9 += v[8] * xv; 9806679dcc1SBarry Smith sum10 += v[9] * xv; 9816679dcc1SBarry Smith sum11 += v[10] * xv; 9826679dcc1SBarry Smith sum12 += v[11] * xv; 9836679dcc1SBarry Smith v += 12; 9846679dcc1SBarry Smith } 9856679dcc1SBarry Smith } 9866679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 9879371c9d4SSatish Balay z[0] = sum1; 9889371c9d4SSatish Balay z[1] = sum2; 9899371c9d4SSatish Balay z[2] = sum3; 9909371c9d4SSatish Balay z[3] = sum4; 9919371c9d4SSatish Balay z[4] = sum5; 9929371c9d4SSatish Balay z[5] = sum6; 9939371c9d4SSatish Balay z[6] = sum7; 9949371c9d4SSatish Balay z[7] = sum8; 9959371c9d4SSatish Balay z[8] = sum9; 9969371c9d4SSatish Balay z[9] = sum10; 9979371c9d4SSatish Balay z[10] = sum11; 9989371c9d4SSatish Balay z[11] = sum12; 9996679dcc1SBarry Smith if (!usecprow) z += 12; 10006679dcc1SBarry Smith } 10019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10029566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 10039566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 10046679dcc1SBarry Smith PetscFunctionReturn(0); 10056679dcc1SBarry Smith } 10066679dcc1SBarry Smith 10079371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) { 10086679dcc1SBarry 
Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 10096679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 10106679dcc1SBarry Smith const PetscScalar *x, *xb; 10116679dcc1SBarry Smith PetscScalar *zarray, *yarray, xv; 10126679dcc1SBarry Smith const MatScalar *v; 10136679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 10146679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; 10156679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 10166679dcc1SBarry Smith 10176679dcc1SBarry Smith PetscFunctionBegin; 10189566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10199566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 10206679dcc1SBarry Smith 10216679dcc1SBarry Smith v = a->a; 10226679dcc1SBarry Smith if (usecprow) { 1023*48a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 10246679dcc1SBarry Smith mbs = a->compressedrow.nrows; 10256679dcc1SBarry Smith ii = a->compressedrow.i; 10266679dcc1SBarry Smith ridx = a->compressedrow.rindex; 10276679dcc1SBarry Smith } else { 10286679dcc1SBarry Smith ii = a->i; 10296679dcc1SBarry Smith y = yarray; 10306679dcc1SBarry Smith z = zarray; 10316679dcc1SBarry Smith } 10326679dcc1SBarry Smith 10336679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 10346679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 10356679dcc1SBarry Smith idx = ij + ii[i]; 10366679dcc1SBarry Smith 10376679dcc1SBarry Smith if (usecprow) { 10386679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 10396679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 10406679dcc1SBarry Smith } 10419371c9d4SSatish Balay sum1 = y[0]; 10429371c9d4SSatish Balay sum2 = y[1]; 10439371c9d4SSatish Balay sum3 = y[2]; 10449371c9d4SSatish Balay sum4 = y[3]; 10459371c9d4SSatish Balay sum5 = y[4]; 10469371c9d4SSatish Balay sum6 = y[5]; 10479371c9d4SSatish Balay sum7 = y[6]; 10489371c9d4SSatish Balay sum8 = y[7]; 10499371c9d4SSatish 
Balay sum9 = y[8]; 10509371c9d4SSatish Balay sum10 = y[9]; 10519371c9d4SSatish Balay sum11 = y[10]; 10529371c9d4SSatish Balay sum12 = y[11]; 10536679dcc1SBarry Smith 10546679dcc1SBarry Smith for (j = 0; j < n; j++) { 10556679dcc1SBarry Smith xb = x + 12 * (idx[j]); 10566679dcc1SBarry Smith 10576679dcc1SBarry Smith for (k = 0; k < 12; k++) { 10586679dcc1SBarry Smith xv = xb[k]; 10596679dcc1SBarry Smith sum1 += v[0] * xv; 10606679dcc1SBarry Smith sum2 += v[1] * xv; 10616679dcc1SBarry Smith sum3 += v[2] * xv; 10626679dcc1SBarry Smith sum4 += v[3] * xv; 10636679dcc1SBarry Smith sum5 += v[4] * xv; 10646679dcc1SBarry Smith sum6 += v[5] * xv; 10656679dcc1SBarry Smith sum7 += v[6] * xv; 10666679dcc1SBarry Smith sum8 += v[7] * xv; 10676679dcc1SBarry Smith sum9 += v[8] * xv; 10686679dcc1SBarry Smith sum10 += v[9] * xv; 10696679dcc1SBarry Smith sum11 += v[10] * xv; 10706679dcc1SBarry Smith sum12 += v[11] * xv; 10716679dcc1SBarry Smith v += 12; 10726679dcc1SBarry Smith } 10736679dcc1SBarry Smith } 10746679dcc1SBarry Smith 10759371c9d4SSatish Balay z[0] = sum1; 10769371c9d4SSatish Balay z[1] = sum2; 10779371c9d4SSatish Balay z[2] = sum3; 10789371c9d4SSatish Balay z[3] = sum4; 10799371c9d4SSatish Balay z[4] = sum5; 10809371c9d4SSatish Balay z[5] = sum6; 10819371c9d4SSatish Balay z[6] = sum7; 10829371c9d4SSatish Balay z[7] = sum8; 10839371c9d4SSatish Balay z[8] = sum9; 10849371c9d4SSatish Balay z[9] = sum10; 10859371c9d4SSatish Balay z[10] = sum11; 10869371c9d4SSatish Balay z[11] = sum12; 10876679dcc1SBarry Smith if (!usecprow) { 10886679dcc1SBarry Smith y += 12; 10896679dcc1SBarry Smith z += 12; 10906679dcc1SBarry Smith } 10916679dcc1SBarry Smith } 10929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10939566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 10949566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 10956679dcc1SBarry Smith PetscFunctionReturn(0); 10966679dcc1SBarry 
Smith } 10976679dcc1SBarry Smith 10986679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 10999371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) { 11006679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 11016679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 11026679dcc1SBarry Smith const PetscScalar *x, *xb; 11036679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray; 11046679dcc1SBarry Smith const MatScalar *v; 11056679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 11066679dcc1SBarry Smith PetscInt mbs, i, j, n; 11076679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 11086679dcc1SBarry Smith 11096679dcc1SBarry Smith PetscFunctionBegin; 11109566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 11119566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 11126679dcc1SBarry Smith 11136679dcc1SBarry Smith v = a->a; 11146679dcc1SBarry Smith if (usecprow) { 11156679dcc1SBarry Smith mbs = a->compressedrow.nrows; 11166679dcc1SBarry Smith ii = a->compressedrow.i; 11176679dcc1SBarry Smith ridx = a->compressedrow.rindex; 11189566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 11196679dcc1SBarry Smith } else { 11206679dcc1SBarry Smith mbs = a->mbs; 11216679dcc1SBarry Smith ii = a->i; 11226679dcc1SBarry Smith z = zarray; 11236679dcc1SBarry Smith } 11246679dcc1SBarry Smith 11256679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 11266679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 11276679dcc1SBarry Smith idx = ij + ii[i]; 11286679dcc1SBarry Smith 11296679dcc1SBarry Smith sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 11306679dcc1SBarry Smith for (j = 0; j < n; j++) { 11316679dcc1SBarry Smith xb = x + 12 * (idx[j]); 11329371c9d4SSatish Balay x1 = xb[0]; 11339371c9d4SSatish Balay x2 = xb[1]; 
11349371c9d4SSatish Balay x3 = xb[2]; 11359371c9d4SSatish Balay x4 = xb[3]; 11366679dcc1SBarry Smith 11376679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11386679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11396679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11406679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11416679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11426679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11436679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11446679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11456679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11466679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11476679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11486679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11496679dcc1SBarry Smith v += 48; 11506679dcc1SBarry Smith 11519371c9d4SSatish Balay x1 = xb[4]; 11529371c9d4SSatish Balay x2 = xb[5]; 11539371c9d4SSatish Balay x3 = xb[6]; 11549371c9d4SSatish Balay x4 = xb[7]; 11556679dcc1SBarry Smith 11566679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11576679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11586679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11596679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11606679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11616679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11626679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11636679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * 
x4; 11646679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11656679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11666679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11676679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11686679dcc1SBarry Smith v += 48; 11696679dcc1SBarry Smith 11709371c9d4SSatish Balay x1 = xb[8]; 11719371c9d4SSatish Balay x2 = xb[9]; 11729371c9d4SSatish Balay x3 = xb[10]; 11739371c9d4SSatish Balay x4 = xb[11]; 11746679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11756679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11766679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11776679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11786679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11796679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11806679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11816679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11826679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11836679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11846679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11856679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11866679dcc1SBarry Smith v += 48; 11876679dcc1SBarry Smith } 11886679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 11899371c9d4SSatish Balay z[0] = sum1; 11909371c9d4SSatish Balay z[1] = sum2; 11919371c9d4SSatish Balay z[2] = sum3; 11929371c9d4SSatish Balay z[3] = sum4; 11939371c9d4SSatish Balay z[4] = sum5; 11949371c9d4SSatish Balay z[5] = sum6; 11959371c9d4SSatish Balay z[6] = sum7; 11969371c9d4SSatish Balay z[7] = sum8; 
11979371c9d4SSatish Balay z[8] = sum9; 11989371c9d4SSatish Balay z[9] = sum10; 11999371c9d4SSatish Balay z[10] = sum11; 12009371c9d4SSatish Balay z[11] = sum12; 12016679dcc1SBarry Smith if (!usecprow) z += 12; 12026679dcc1SBarry Smith } 12039566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12049566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 12059566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 12066679dcc1SBarry Smith PetscFunctionReturn(0); 12076679dcc1SBarry Smith } 12086679dcc1SBarry Smith 12096679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 12109371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) { 12116679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 12126679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 12136679dcc1SBarry Smith const PetscScalar *x, *xb; 12146679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray, *yarray; 12156679dcc1SBarry Smith const MatScalar *v; 12166679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 12176679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, n; 12186679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 12196679dcc1SBarry Smith 12206679dcc1SBarry Smith PetscFunctionBegin; 12219566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 12229566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 12236679dcc1SBarry Smith 12246679dcc1SBarry Smith v = a->a; 12256679dcc1SBarry Smith if (usecprow) { 1226*48a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 12276679dcc1SBarry Smith mbs = a->compressedrow.nrows; 12286679dcc1SBarry Smith ii = a->compressedrow.i; 12296679dcc1SBarry Smith ridx = a->compressedrow.rindex; 12306679dcc1SBarry Smith } else { 
12316679dcc1SBarry Smith ii = a->i; 12326679dcc1SBarry Smith y = yarray; 12336679dcc1SBarry Smith z = zarray; 12346679dcc1SBarry Smith } 12356679dcc1SBarry Smith 12366679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 12376679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 12386679dcc1SBarry Smith idx = ij + ii[i]; 12396679dcc1SBarry Smith 12406679dcc1SBarry Smith if (usecprow) { 12416679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 12426679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 12436679dcc1SBarry Smith } 12449371c9d4SSatish Balay sum1 = y[0]; 12459371c9d4SSatish Balay sum2 = y[1]; 12469371c9d4SSatish Balay sum3 = y[2]; 12479371c9d4SSatish Balay sum4 = y[3]; 12489371c9d4SSatish Balay sum5 = y[4]; 12499371c9d4SSatish Balay sum6 = y[5]; 12509371c9d4SSatish Balay sum7 = y[6]; 12519371c9d4SSatish Balay sum8 = y[7]; 12529371c9d4SSatish Balay sum9 = y[8]; 12539371c9d4SSatish Balay sum10 = y[9]; 12549371c9d4SSatish Balay sum11 = y[10]; 12559371c9d4SSatish Balay sum12 = y[11]; 12566679dcc1SBarry Smith 12576679dcc1SBarry Smith for (j = 0; j < n; j++) { 12586679dcc1SBarry Smith xb = x + 12 * (idx[j]); 12599371c9d4SSatish Balay x1 = xb[0]; 12609371c9d4SSatish Balay x2 = xb[1]; 12619371c9d4SSatish Balay x3 = xb[2]; 12629371c9d4SSatish Balay x4 = xb[3]; 12636679dcc1SBarry Smith 12646679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12656679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12666679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12676679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12686679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12696679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12706679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12716679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12726679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + 
v[32] * x3 + v[44] * x4; 12736679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12746679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12756679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12766679dcc1SBarry Smith v += 48; 12776679dcc1SBarry Smith 12789371c9d4SSatish Balay x1 = xb[4]; 12799371c9d4SSatish Balay x2 = xb[5]; 12809371c9d4SSatish Balay x3 = xb[6]; 12819371c9d4SSatish Balay x4 = xb[7]; 12826679dcc1SBarry Smith 12836679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12846679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12856679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12866679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12876679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12886679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12896679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12906679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12916679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12926679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12936679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12946679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 12956679dcc1SBarry Smith v += 48; 12966679dcc1SBarry Smith 12979371c9d4SSatish Balay x1 = xb[8]; 12989371c9d4SSatish Balay x2 = xb[9]; 12999371c9d4SSatish Balay x3 = xb[10]; 13009371c9d4SSatish Balay x4 = xb[11]; 13016679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 13026679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 13036679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 13046679dcc1SBarry 
Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 13056679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 13066679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 13076679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 13086679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 13096679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 13106679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 13116679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 13126679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 13136679dcc1SBarry Smith v += 48; 13146679dcc1SBarry Smith } 13159371c9d4SSatish Balay z[0] = sum1; 13169371c9d4SSatish Balay z[1] = sum2; 13179371c9d4SSatish Balay z[2] = sum3; 13189371c9d4SSatish Balay z[3] = sum4; 13199371c9d4SSatish Balay z[4] = sum5; 13209371c9d4SSatish Balay z[5] = sum6; 13219371c9d4SSatish Balay z[6] = sum7; 13229371c9d4SSatish Balay z[7] = sum8; 13239371c9d4SSatish Balay z[8] = sum9; 13249371c9d4SSatish Balay z[9] = sum10; 13259371c9d4SSatish Balay z[10] = sum11; 13269371c9d4SSatish Balay z[11] = sum12; 13276679dcc1SBarry Smith if (!usecprow) { 13286679dcc1SBarry Smith y += 12; 13296679dcc1SBarry Smith z += 12; 13306679dcc1SBarry Smith } 13316679dcc1SBarry Smith } 13329566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 13339566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 13349566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 13356679dcc1SBarry Smith PetscFunctionReturn(0); 13366679dcc1SBarry Smith } 13376679dcc1SBarry Smith 13386679dcc1SBarry Smith #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && 
!defined(PETSC_USE_64BIT_INDICES) 13399371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) { 13406679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 13416679dcc1SBarry Smith PetscScalar *z = NULL, *zarray; 13426679dcc1SBarry Smith const PetscScalar *x, *work; 13436679dcc1SBarry Smith const MatScalar *v = a->a; 13446679dcc1SBarry Smith PetscInt mbs, i, j, n; 13456679dcc1SBarry Smith const PetscInt *idx = a->j, *ii, *ridx = NULL; 13466679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 13476679dcc1SBarry Smith const PetscInt bs = 12, bs2 = 144; 13486679dcc1SBarry Smith 13496679dcc1SBarry Smith __m256d a0, a1, a2, a3, a4, a5; 13506679dcc1SBarry Smith __m256d w0, w1, w2, w3; 13516679dcc1SBarry Smith __m256d z0, z1, z2; 13526679dcc1SBarry Smith 13536679dcc1SBarry Smith PetscFunctionBegin; 13549566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 13559566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 13566679dcc1SBarry Smith 13576679dcc1SBarry Smith if (usecprow) { 13586679dcc1SBarry Smith mbs = a->compressedrow.nrows; 13596679dcc1SBarry Smith ii = a->compressedrow.i; 13606679dcc1SBarry Smith ridx = a->compressedrow.rindex; 13619566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 13626679dcc1SBarry Smith } else { 13636679dcc1SBarry Smith mbs = a->mbs; 13646679dcc1SBarry Smith ii = a->i; 13656679dcc1SBarry Smith z = zarray; 13666679dcc1SBarry Smith } 13676679dcc1SBarry Smith 13686679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 13699371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 13709371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 13719371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 13726679dcc1SBarry Smith 13739371c9d4SSatish Balay n = ii[1] - ii[0]; 13749371c9d4SSatish Balay ii++; 13756679dcc1SBarry Smith for (j = 0; j < n; j++) { 13766679dcc1SBarry Smith work = x + bs * (*idx++); 13776679dcc1SBarry Smith 13786679dcc1SBarry Smith /* first column of a */ 
13796679dcc1SBarry Smith w0 = _mm256_set1_pd(work[0]); 13809371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 0); 13819371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 13829371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 4); 13839371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 13849371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 8); 13859371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 13866679dcc1SBarry Smith 13876679dcc1SBarry Smith /* second column of a */ 13886679dcc1SBarry Smith w1 = _mm256_set1_pd(work[1]); 13899371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 12); 13909371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 13919371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 16); 13929371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 13939371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 20); 13949371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 13956679dcc1SBarry Smith 13966679dcc1SBarry Smith /* third column of a */ 13976679dcc1SBarry Smith w2 = _mm256_set1_pd(work[2]); 13989371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 24); 13999371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14009371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 28); 14019371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14029371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 32); 14039371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14046679dcc1SBarry Smith 14056679dcc1SBarry Smith /* fourth column of a */ 14066679dcc1SBarry Smith w3 = _mm256_set1_pd(work[3]); 14079371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 36); 14089371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14099371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 40); 14109371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14119371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 44); 14129371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14136679dcc1SBarry Smith 14146679dcc1SBarry Smith /* fifth column of a */ 14156679dcc1SBarry Smith w0 = _mm256_set1_pd(work[4]); 14169371c9d4SSatish Balay a0 = 
_mm256_loadu_pd(v + 48); 14179371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14189371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 52); 14199371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14209371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 56); 14219371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14226679dcc1SBarry Smith 14236679dcc1SBarry Smith /* sixth column of a */ 14246679dcc1SBarry Smith w1 = _mm256_set1_pd(work[5]); 14259371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 60); 14269371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14279371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 64); 14289371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14299371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 68); 14309371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14316679dcc1SBarry Smith 14326679dcc1SBarry Smith /* seventh column of a */ 14336679dcc1SBarry Smith w2 = _mm256_set1_pd(work[6]); 14349371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 72); 14359371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14369371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 76); 14379371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14389371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 80); 14399371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14406679dcc1SBarry Smith 14416aad120cSJose E. 
Roman /* eighth column of a */ 14426679dcc1SBarry Smith w3 = _mm256_set1_pd(work[7]); 14439371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 84); 14449371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14459371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 88); 14469371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14479371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 92); 14489371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14496679dcc1SBarry Smith 14506679dcc1SBarry Smith /* ninth column of a */ 14516679dcc1SBarry Smith w0 = _mm256_set1_pd(work[8]); 14529371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 96); 14539371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14549371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 100); 14559371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14569371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 104); 14579371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14586679dcc1SBarry Smith 14596679dcc1SBarry Smith /* tenth column of a */ 14606679dcc1SBarry Smith w1 = _mm256_set1_pd(work[9]); 14619371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 108); 14629371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14639371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 112); 14649371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14659371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 116); 14669371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14676679dcc1SBarry Smith 14686679dcc1SBarry Smith /* eleventh column of a */ 14696679dcc1SBarry Smith w2 = _mm256_set1_pd(work[10]); 14709371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 120); 14719371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14729371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 124); 14739371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14749371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 128); 14759371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14766679dcc1SBarry Smith 14776679dcc1SBarry Smith /* twelveth column of a */ 14786679dcc1SBarry Smith w3 = 
_mm256_set1_pd(work[11]); 14799371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 132); 14809371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14819371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 136); 14829371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14839371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 140); 14849371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14856679dcc1SBarry Smith 14866679dcc1SBarry Smith v += bs2; 14876679dcc1SBarry Smith } 14886679dcc1SBarry Smith if (usecprow) z = zarray + bs * ridx[i]; 14899371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 14909371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 14919371c9d4SSatish Balay _mm256_storeu_pd(&z[8], z2); 14926679dcc1SBarry Smith if (!usecprow) z += bs; 14936679dcc1SBarry Smith } 14949566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 14959566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 14969566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 14976679dcc1SBarry Smith PetscFunctionReturn(0); 14986679dcc1SBarry Smith } 14996679dcc1SBarry Smith #endif 15006679dcc1SBarry Smith 15018ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1502832cc040SShri Abhyankar /* Default MatMult for block size 15 */ 15039371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) { 15048ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1505f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 15068ab949d8SShri Abhyankar const PetscScalar *x, *xb; 150753ef36baSBarry Smith PetscScalar *zarray, xv; 15088ab949d8SShri Abhyankar const MatScalar *v; 15098ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 15107c565772SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 1511ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 
15128ab949d8SShri Abhyankar 15138ab949d8SShri Abhyankar PetscFunctionBegin; 15149566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 15159566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 15168ab949d8SShri Abhyankar 15178ab949d8SShri Abhyankar v = a->a; 15188ab949d8SShri Abhyankar if (usecprow) { 15198ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 15208ab949d8SShri Abhyankar ii = a->compressedrow.i; 15218ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 15229566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 15238ab949d8SShri Abhyankar } else { 15248ab949d8SShri Abhyankar mbs = a->mbs; 15258ab949d8SShri Abhyankar ii = a->i; 15268ab949d8SShri Abhyankar z = zarray; 15278ab949d8SShri Abhyankar } 15288ab949d8SShri Abhyankar 15298ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 15308ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 15318ab949d8SShri Abhyankar idx = ij + ii[i]; 15329371c9d4SSatish Balay sum1 = 0.0; 15339371c9d4SSatish Balay sum2 = 0.0; 15349371c9d4SSatish Balay sum3 = 0.0; 15359371c9d4SSatish Balay sum4 = 0.0; 15369371c9d4SSatish Balay sum5 = 0.0; 15379371c9d4SSatish Balay sum6 = 0.0; 15389371c9d4SSatish Balay sum7 = 0.0; 15399371c9d4SSatish Balay sum8 = 0.0; 15409371c9d4SSatish Balay sum9 = 0.0; 15419371c9d4SSatish Balay sum10 = 0.0; 15429371c9d4SSatish Balay sum11 = 0.0; 15439371c9d4SSatish Balay sum12 = 0.0; 15449371c9d4SSatish Balay sum13 = 0.0; 15459371c9d4SSatish Balay sum14 = 0.0; 15469371c9d4SSatish Balay sum15 = 0.0; 15478ab949d8SShri Abhyankar 15488ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 15498ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 15508ab949d8SShri Abhyankar 15518ab949d8SShri Abhyankar for (k = 0; k < 15; k++) { 155253ef36baSBarry Smith xv = xb[k]; 155353ef36baSBarry Smith sum1 += v[0] * xv; 155453ef36baSBarry Smith sum2 += v[1] * xv; 155553ef36baSBarry Smith sum3 += v[2] * xv; 155653ef36baSBarry Smith sum4 += v[3] * xv; 155753ef36baSBarry Smith sum5 += v[4] * 
xv; 155853ef36baSBarry Smith sum6 += v[5] * xv; 155953ef36baSBarry Smith sum7 += v[6] * xv; 156053ef36baSBarry Smith sum8 += v[7] * xv; 156153ef36baSBarry Smith sum9 += v[8] * xv; 156253ef36baSBarry Smith sum10 += v[9] * xv; 156353ef36baSBarry Smith sum11 += v[10] * xv; 156453ef36baSBarry Smith sum12 += v[11] * xv; 156553ef36baSBarry Smith sum13 += v[12] * xv; 156653ef36baSBarry Smith sum14 += v[13] * xv; 156753ef36baSBarry Smith sum15 += v[14] * xv; 15688ab949d8SShri Abhyankar v += 15; 15698ab949d8SShri Abhyankar } 15708ab949d8SShri Abhyankar } 15718ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 15729371c9d4SSatish Balay z[0] = sum1; 15739371c9d4SSatish Balay z[1] = sum2; 15749371c9d4SSatish Balay z[2] = sum3; 15759371c9d4SSatish Balay z[3] = sum4; 15769371c9d4SSatish Balay z[4] = sum5; 15779371c9d4SSatish Balay z[5] = sum6; 15789371c9d4SSatish Balay z[6] = sum7; 15799371c9d4SSatish Balay z[7] = sum8; 15809371c9d4SSatish Balay z[8] = sum9; 15819371c9d4SSatish Balay z[9] = sum10; 15829371c9d4SSatish Balay z[10] = sum11; 15839371c9d4SSatish Balay z[11] = sum12; 15849371c9d4SSatish Balay z[12] = sum13; 15859371c9d4SSatish Balay z[13] = sum14; 15869371c9d4SSatish Balay z[14] = sum15; 15878ab949d8SShri Abhyankar 15888ab949d8SShri Abhyankar if (!usecprow) z += 15; 15898ab949d8SShri Abhyankar } 15908ab949d8SShri Abhyankar 15919566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 15929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 15939566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 15948ab949d8SShri Abhyankar PetscFunctionReturn(0); 15958ab949d8SShri Abhyankar } 15968ab949d8SShri Abhyankar 15978ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 15989371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) { 15998ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 
1600f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 16018ab949d8SShri Abhyankar const PetscScalar *x, *xb; 16020b8f6341SShri Abhyankar PetscScalar x1, x2, x3, x4, *zarray; 16038ab949d8SShri Abhyankar const MatScalar *v; 16048ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 16057c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1606ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 16078ab949d8SShri Abhyankar 16088ab949d8SShri Abhyankar PetscFunctionBegin; 16099566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 16109566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 16118ab949d8SShri Abhyankar 16128ab949d8SShri Abhyankar v = a->a; 16138ab949d8SShri Abhyankar if (usecprow) { 16148ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 16158ab949d8SShri Abhyankar ii = a->compressedrow.i; 16168ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 16179566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 16188ab949d8SShri Abhyankar } else { 16198ab949d8SShri Abhyankar mbs = a->mbs; 16208ab949d8SShri Abhyankar ii = a->i; 16218ab949d8SShri Abhyankar z = zarray; 16228ab949d8SShri Abhyankar } 16238ab949d8SShri Abhyankar 16248ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 16258ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 16268ab949d8SShri Abhyankar idx = ij + ii[i]; 16279371c9d4SSatish Balay sum1 = 0.0; 16289371c9d4SSatish Balay sum2 = 0.0; 16299371c9d4SSatish Balay sum3 = 0.0; 16309371c9d4SSatish Balay sum4 = 0.0; 16319371c9d4SSatish Balay sum5 = 0.0; 16329371c9d4SSatish Balay sum6 = 0.0; 16339371c9d4SSatish Balay sum7 = 0.0; 16349371c9d4SSatish Balay sum8 = 0.0; 16359371c9d4SSatish Balay sum9 = 0.0; 16369371c9d4SSatish Balay sum10 = 0.0; 16379371c9d4SSatish Balay sum11 = 0.0; 16389371c9d4SSatish Balay sum12 = 0.0; 16399371c9d4SSatish Balay sum13 = 0.0; 16409371c9d4SSatish Balay sum14 = 
0.0; 16419371c9d4SSatish Balay sum15 = 0.0; 16428ab949d8SShri Abhyankar 16438ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 16448ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 16459371c9d4SSatish Balay x1 = xb[0]; 16469371c9d4SSatish Balay x2 = xb[1]; 16479371c9d4SSatish Balay x3 = xb[2]; 16489371c9d4SSatish Balay x4 = xb[3]; 16498ab949d8SShri Abhyankar 16508ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16518ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16528ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16538ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16548ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16558ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16568ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16578ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16588ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16598ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16608ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16618ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16628ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16638ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16648ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16658ab949d8SShri Abhyankar 16668ab949d8SShri Abhyankar v += 60; 16678ab949d8SShri Abhyankar 16689371c9d4SSatish Balay x1 = xb[4]; 16699371c9d4SSatish Balay x2 = xb[5]; 16709371c9d4SSatish Balay x3 = xb[6]; 16719371c9d4SSatish Balay x4 = xb[7]; 16728ab949d8SShri Abhyankar 16738ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] 
* x3 + v[45] * x4; 16748ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16758ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16768ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16778ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16788ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16798ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16808ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16818ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16828ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16838ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16848ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16858ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16868ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16878ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16888ab949d8SShri Abhyankar v += 60; 16898ab949d8SShri Abhyankar 16909371c9d4SSatish Balay x1 = xb[8]; 16919371c9d4SSatish Balay x2 = xb[9]; 16929371c9d4SSatish Balay x3 = xb[10]; 16939371c9d4SSatish Balay x4 = xb[11]; 16940b8f6341SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16950b8f6341SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16960b8f6341SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16970b8f6341SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16980b8f6341SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16990b8f6341SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 17000b8f6341SShri Abhyankar 
sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 17010b8f6341SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 17020b8f6341SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 17030b8f6341SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 17040b8f6341SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 17050b8f6341SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 17060b8f6341SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 17070b8f6341SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 17080b8f6341SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 17090b8f6341SShri Abhyankar v += 60; 17100b8f6341SShri Abhyankar 17119371c9d4SSatish Balay x1 = xb[12]; 17129371c9d4SSatish Balay x2 = xb[13]; 17139371c9d4SSatish Balay x3 = xb[14]; 17148ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 17158ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 17168ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 17178ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 17188ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 17198ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 17208ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 17218ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 17228ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 17238ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 17248ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 17258ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 17268ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 17278ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 17288ab949d8SShri Abhyankar 
sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 17298ab949d8SShri Abhyankar v += 45; 17308ab949d8SShri Abhyankar } 17318ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 17329371c9d4SSatish Balay z[0] = sum1; 17339371c9d4SSatish Balay z[1] = sum2; 17349371c9d4SSatish Balay z[2] = sum3; 17359371c9d4SSatish Balay z[3] = sum4; 17369371c9d4SSatish Balay z[4] = sum5; 17379371c9d4SSatish Balay z[5] = sum6; 17389371c9d4SSatish Balay z[6] = sum7; 17399371c9d4SSatish Balay z[7] = sum8; 17409371c9d4SSatish Balay z[8] = sum9; 17419371c9d4SSatish Balay z[9] = sum10; 17429371c9d4SSatish Balay z[10] = sum11; 17439371c9d4SSatish Balay z[11] = sum12; 17449371c9d4SSatish Balay z[12] = sum13; 17459371c9d4SSatish Balay z[13] = sum14; 17469371c9d4SSatish Balay z[14] = sum15; 17478ab949d8SShri Abhyankar 17488ab949d8SShri Abhyankar if (!usecprow) z += 15; 17498ab949d8SShri Abhyankar } 17508ab949d8SShri Abhyankar 17519566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 17529566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 17539566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 17548ab949d8SShri Abhyankar PetscFunctionReturn(0); 17558ab949d8SShri Abhyankar } 17568ab949d8SShri Abhyankar 17578ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 17589371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) { 17598ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1760f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 17618ab949d8SShri Abhyankar const PetscScalar *x, *xb; 17620b8f6341SShri Abhyankar PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 17638ab949d8SShri Abhyankar const MatScalar *v; 17648ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 17657c565772SBarry Smith PetscInt mbs, i, j, n, 
*ridx = NULL; 1766ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 17678ab949d8SShri Abhyankar 17688ab949d8SShri Abhyankar PetscFunctionBegin; 17699566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 17709566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 17718ab949d8SShri Abhyankar 17728ab949d8SShri Abhyankar v = a->a; 17738ab949d8SShri Abhyankar if (usecprow) { 17748ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 17758ab949d8SShri Abhyankar ii = a->compressedrow.i; 17768ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 17779566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 17788ab949d8SShri Abhyankar } else { 17798ab949d8SShri Abhyankar mbs = a->mbs; 17808ab949d8SShri Abhyankar ii = a->i; 17818ab949d8SShri Abhyankar z = zarray; 17828ab949d8SShri Abhyankar } 17838ab949d8SShri Abhyankar 17848ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 17858ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 17868ab949d8SShri Abhyankar idx = ij + ii[i]; 17879371c9d4SSatish Balay sum1 = 0.0; 17889371c9d4SSatish Balay sum2 = 0.0; 17899371c9d4SSatish Balay sum3 = 0.0; 17909371c9d4SSatish Balay sum4 = 0.0; 17919371c9d4SSatish Balay sum5 = 0.0; 17929371c9d4SSatish Balay sum6 = 0.0; 17939371c9d4SSatish Balay sum7 = 0.0; 17949371c9d4SSatish Balay sum8 = 0.0; 17959371c9d4SSatish Balay sum9 = 0.0; 17969371c9d4SSatish Balay sum10 = 0.0; 17979371c9d4SSatish Balay sum11 = 0.0; 17989371c9d4SSatish Balay sum12 = 0.0; 17999371c9d4SSatish Balay sum13 = 0.0; 18009371c9d4SSatish Balay sum14 = 0.0; 18019371c9d4SSatish Balay sum15 = 0.0; 18028ab949d8SShri Abhyankar 18038ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 18048ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 18059371c9d4SSatish Balay x1 = xb[0]; 18069371c9d4SSatish Balay x2 = xb[1]; 18079371c9d4SSatish Balay x3 = xb[2]; 18089371c9d4SSatish Balay x4 = xb[3]; 18099371c9d4SSatish Balay x5 = xb[4]; 18109371c9d4SSatish Balay x6 = xb[5]; 18119371c9d4SSatish Balay 
x7 = xb[6]; 18120b8f6341SShri Abhyankar x8 = xb[7]; 18138ab949d8SShri Abhyankar 18148ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 18158ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 18168ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 18178ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 18188ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 18198ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 18208ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 18218ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 18228ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 18238ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 18248ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 18258ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 18268ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 18278ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + 
v[88] * x6 + v[103] * x7 + v[118] * x8; 18288ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 18298ab949d8SShri Abhyankar v += 120; 18308ab949d8SShri Abhyankar 18319371c9d4SSatish Balay x1 = xb[8]; 18329371c9d4SSatish Balay x2 = xb[9]; 18339371c9d4SSatish Balay x3 = xb[10]; 18349371c9d4SSatish Balay x4 = xb[11]; 18359371c9d4SSatish Balay x5 = xb[12]; 18369371c9d4SSatish Balay x6 = xb[13]; 18379371c9d4SSatish Balay x7 = xb[14]; 18380b8f6341SShri Abhyankar 18398ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 18408ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 18418ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 18428ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 18438ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 18448ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 18458ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 18468ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 18478ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 18488ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; 18498ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 18508ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 
+ v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 18518ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 18528ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 18538ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 18548ab949d8SShri Abhyankar v += 105; 18558ab949d8SShri Abhyankar } 18568ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 18579371c9d4SSatish Balay z[0] = sum1; 18589371c9d4SSatish Balay z[1] = sum2; 18599371c9d4SSatish Balay z[2] = sum3; 18609371c9d4SSatish Balay z[3] = sum4; 18619371c9d4SSatish Balay z[4] = sum5; 18629371c9d4SSatish Balay z[5] = sum6; 18639371c9d4SSatish Balay z[6] = sum7; 18649371c9d4SSatish Balay z[7] = sum8; 18659371c9d4SSatish Balay z[8] = sum9; 18669371c9d4SSatish Balay z[9] = sum10; 18679371c9d4SSatish Balay z[10] = sum11; 18689371c9d4SSatish Balay z[11] = sum12; 18699371c9d4SSatish Balay z[12] = sum13; 18709371c9d4SSatish Balay z[13] = sum14; 18719371c9d4SSatish Balay z[14] = sum15; 18728ab949d8SShri Abhyankar 18738ab949d8SShri Abhyankar if (!usecprow) z += 15; 18748ab949d8SShri Abhyankar } 18758ab949d8SShri Abhyankar 18769566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 18779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 18789566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 18798ab949d8SShri Abhyankar PetscFunctionReturn(0); 18808ab949d8SShri Abhyankar } 18818ab949d8SShri Abhyankar 18828ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 18839371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) { 18848ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1885f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, 
sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 18868ab949d8SShri Abhyankar const PetscScalar *x, *xb; 18878ab949d8SShri Abhyankar PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray; 18888ab949d8SShri Abhyankar const MatScalar *v; 18898ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 18907c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1891ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 18928ab949d8SShri Abhyankar 18938ab949d8SShri Abhyankar PetscFunctionBegin; 18949566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 18959566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 18968ab949d8SShri Abhyankar 18978ab949d8SShri Abhyankar v = a->a; 18988ab949d8SShri Abhyankar if (usecprow) { 18998ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 19008ab949d8SShri Abhyankar ii = a->compressedrow.i; 19018ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 19029566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 19038ab949d8SShri Abhyankar } else { 19048ab949d8SShri Abhyankar mbs = a->mbs; 19058ab949d8SShri Abhyankar ii = a->i; 19068ab949d8SShri Abhyankar z = zarray; 19078ab949d8SShri Abhyankar } 19088ab949d8SShri Abhyankar 19098ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 19108ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 19118ab949d8SShri Abhyankar idx = ij + ii[i]; 19129371c9d4SSatish Balay sum1 = 0.0; 19139371c9d4SSatish Balay sum2 = 0.0; 19149371c9d4SSatish Balay sum3 = 0.0; 19159371c9d4SSatish Balay sum4 = 0.0; 19169371c9d4SSatish Balay sum5 = 0.0; 19179371c9d4SSatish Balay sum6 = 0.0; 19189371c9d4SSatish Balay sum7 = 0.0; 19199371c9d4SSatish Balay sum8 = 0.0; 19209371c9d4SSatish Balay sum9 = 0.0; 19219371c9d4SSatish Balay sum10 = 0.0; 19229371c9d4SSatish Balay sum11 = 0.0; 19239371c9d4SSatish Balay sum12 = 0.0; 19249371c9d4SSatish Balay sum13 = 0.0; 19259371c9d4SSatish Balay sum14 = 0.0; 
19269371c9d4SSatish Balay sum15 = 0.0; 19278ab949d8SShri Abhyankar 19288ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 19298ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 19309371c9d4SSatish Balay x1 = xb[0]; 19319371c9d4SSatish Balay x2 = xb[1]; 19329371c9d4SSatish Balay x3 = xb[2]; 19339371c9d4SSatish Balay x4 = xb[3]; 19349371c9d4SSatish Balay x5 = xb[4]; 19359371c9d4SSatish Balay x6 = xb[5]; 19369371c9d4SSatish Balay x7 = xb[6]; 19379371c9d4SSatish Balay x8 = xb[7]; 19389371c9d4SSatish Balay x9 = xb[8]; 19399371c9d4SSatish Balay x10 = xb[9]; 19409371c9d4SSatish Balay x11 = xb[10]; 19419371c9d4SSatish Balay x12 = xb[11]; 19429371c9d4SSatish Balay x13 = xb[12]; 19439371c9d4SSatish Balay x14 = xb[13]; 19449371c9d4SSatish Balay x15 = xb[14]; 19458ab949d8SShri Abhyankar 19468ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15; 19478ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15; 19488ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15; 19498ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15; 19508ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15; 
19518ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15; 19528ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15; 19538ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15; 19548ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15; 19558ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15; 19568ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15; 19578ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15; 19588ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15; 19598ab949d8SShri Abhyankar sum14 += v[13] * 
x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15; 19608ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15; 19618ab949d8SShri Abhyankar v += 225; 19628ab949d8SShri Abhyankar } 19638ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 19649371c9d4SSatish Balay z[0] = sum1; 19659371c9d4SSatish Balay z[1] = sum2; 19669371c9d4SSatish Balay z[2] = sum3; 19679371c9d4SSatish Balay z[3] = sum4; 19689371c9d4SSatish Balay z[4] = sum5; 19699371c9d4SSatish Balay z[5] = sum6; 19709371c9d4SSatish Balay z[6] = sum7; 19719371c9d4SSatish Balay z[7] = sum8; 19729371c9d4SSatish Balay z[8] = sum9; 19739371c9d4SSatish Balay z[9] = sum10; 19749371c9d4SSatish Balay z[10] = sum11; 19759371c9d4SSatish Balay z[11] = sum12; 19769371c9d4SSatish Balay z[12] = sum13; 19779371c9d4SSatish Balay z[13] = sum14; 19789371c9d4SSatish Balay z[14] = sum15; 19798ab949d8SShri Abhyankar 19808ab949d8SShri Abhyankar if (!usecprow) z += 15; 19818ab949d8SShri Abhyankar } 19828ab949d8SShri Abhyankar 19839566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 19849566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 19859566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 19868ab949d8SShri Abhyankar PetscFunctionReturn(0); 19878ab949d8SShri Abhyankar } 19888ab949d8SShri Abhyankar 19893f1db9ecSBarry Smith /* 19903f1db9ecSBarry Smith This will not work with MatScalar == float because it calls the BLAS 19913f1db9ecSBarry Smith */ 19929371c9d4SSatish Balay PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) { 19932d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 
1994f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 1995d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1996d9ca1df4SBarry Smith const MatScalar *v; 1997d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1998d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 1999d9ca1df4SBarry Smith PetscInt ncols, k; 2000ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 20012d61bbb3SSatish Balay 20022d61bbb3SSatish Balay PetscFunctionBegin; 20039566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 20049566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 20052d61bbb3SSatish Balay 20062d61bbb3SSatish Balay idx = a->j; 20072d61bbb3SSatish Balay v = a->a; 200826e093fcSHong Zhang if (usecprow) { 200926e093fcSHong Zhang mbs = a->compressedrow.nrows; 201026e093fcSHong Zhang ii = a->compressedrow.i; 20117b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 20129566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 201326e093fcSHong Zhang } else { 201426e093fcSHong Zhang mbs = a->mbs; 20152d61bbb3SSatish Balay ii = a->i; 201626e093fcSHong Zhang z = zarray; 201726e093fcSHong Zhang } 2018218c64b6SSatish Balay 20192d61bbb3SSatish Balay if (!a->mult_work) { 2020d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 20219566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 20222d61bbb3SSatish Balay } 20232d61bbb3SSatish Balay work = a->mult_work; 20242d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 20259371c9d4SSatish Balay n = ii[1] - ii[0]; 20269371c9d4SSatish Balay ii++; 20272d61bbb3SSatish Balay ncols = n * bs; 20282d61bbb3SSatish Balay workt = work; 20292d61bbb3SSatish Balay for (j = 0; j < n; j++) { 20302d61bbb3SSatish Balay xb = x + bs * (*idx++); 20312d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 20322d61bbb3SSatish Balay workt += bs; 20332d61bbb3SSatish Balay } 20347b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * 
ridx[i]; 203596b95a6bSBarry Smith PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 20362d61bbb3SSatish Balay v += n * bs2; 203726e093fcSHong Zhang if (!usecprow) z += bs; 20382d61bbb3SSatish Balay } 20399566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 20409566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 20419566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 20422d61bbb3SSatish Balay PetscFunctionReturn(0); 20432d61bbb3SSatish Balay } 20442d61bbb3SSatish Balay 20459371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) { 20462d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2047122f12eaSBarry Smith const PetscScalar *x; 2048122f12eaSBarry Smith PetscScalar *y, *z, sum; 2049122f12eaSBarry Smith const MatScalar *v; 20507c565772SBarry Smith PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2051122f12eaSBarry Smith const PetscInt *idx, *ii; 2052ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 20532d61bbb3SSatish Balay 20542d61bbb3SSatish Balay PetscFunctionBegin; 20559566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 20569566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 20572d61bbb3SSatish Balay 20582d61bbb3SSatish Balay idx = a->j; 20592d61bbb3SSatish Balay v = a->a; 206026e093fcSHong Zhang if (usecprow) { 2061*48a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 206226e093fcSHong Zhang mbs = a->compressedrow.nrows; 206326e093fcSHong Zhang ii = a->compressedrow.i; 20647b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 206526e093fcSHong Zhang } else { 20662d61bbb3SSatish Balay ii = a->i; 206726e093fcSHong Zhang } 20682d61bbb3SSatish Balay 20692d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 2070122f12eaSBarry Smith n = ii[1] - ii[0]; 2071122f12eaSBarry Smith ii++; 207226e093fcSHong Zhang if (!usecprow) { 2073122f12eaSBarry Smith sum = y[i]; 
2074122f12eaSBarry Smith } else { 2075122f12eaSBarry Smith sum = y[ridx[i]]; 2076122f12eaSBarry Smith } 2077444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2078444d8c10SJed Brown PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2079122f12eaSBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 2080122f12eaSBarry Smith v += n; 2081122f12eaSBarry Smith idx += n; 2082122f12eaSBarry Smith if (usecprow) { 2083122f12eaSBarry Smith z[ridx[i]] = sum; 2084122f12eaSBarry Smith } else { 2085122f12eaSBarry Smith z[i] = sum; 208626e093fcSHong Zhang } 20872d61bbb3SSatish Balay } 20889566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 20899566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 20909566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 20912d61bbb3SSatish Balay PetscFunctionReturn(0); 20922d61bbb3SSatish Balay } 20932d61bbb3SSatish Balay 20949371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) { 20952d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2096f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2097d9ca1df4SBarry Smith const PetscScalar *x, *xb; 209826e093fcSHong Zhang PetscScalar x1, x2, *yarray, *zarray; 2099d9ca1df4SBarry Smith const MatScalar *v; 2100d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, j; 2101d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2102ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 21032d61bbb3SSatish Balay 21042d61bbb3SSatish Balay PetscFunctionBegin; 21059566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 21069566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 21072d61bbb3SSatish Balay 21082d61bbb3SSatish Balay idx = a->j; 21092d61bbb3SSatish Balay v = a->a; 211026e093fcSHong Zhang if (usecprow) { 
2111*48a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 211226e093fcSHong Zhang mbs = a->compressedrow.nrows; 211326e093fcSHong Zhang ii = a->compressedrow.i; 21147b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 211526e093fcSHong Zhang } else { 21162d61bbb3SSatish Balay ii = a->i; 211726e093fcSHong Zhang y = yarray; 211826e093fcSHong Zhang z = zarray; 211926e093fcSHong Zhang } 21202d61bbb3SSatish Balay 21212d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 21229371c9d4SSatish Balay n = ii[1] - ii[0]; 21239371c9d4SSatish Balay ii++; 212426e093fcSHong Zhang if (usecprow) { 21257b2bb3b9SHong Zhang z = zarray + 2 * ridx[i]; 21267b2bb3b9SHong Zhang y = yarray + 2 * ridx[i]; 212726e093fcSHong Zhang } 21289371c9d4SSatish Balay sum1 = y[0]; 21299371c9d4SSatish Balay sum2 = y[1]; 2130444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2131444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 21322d61bbb3SSatish Balay for (j = 0; j < n; j++) { 213326fbe8dcSKarl Rupp xb = x + 2 * (*idx++); 213426fbe8dcSKarl Rupp x1 = xb[0]; 213526fbe8dcSKarl Rupp x2 = xb[1]; 213626fbe8dcSKarl Rupp 21372d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 21382d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 21392d61bbb3SSatish Balay v += 4; 21402d61bbb3SSatish Balay } 21419371c9d4SSatish Balay z[0] = sum1; 21429371c9d4SSatish Balay z[1] = sum2; 214326e093fcSHong Zhang if (!usecprow) { 21449371c9d4SSatish Balay z += 2; 21459371c9d4SSatish Balay y += 2; 21462d61bbb3SSatish Balay } 214726e093fcSHong Zhang } 21489566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 21499566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 21509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * a->nz)); 21512d61bbb3SSatish Balay PetscFunctionReturn(0); 
21522d61bbb3SSatish Balay } 21532d61bbb3SSatish Balay 21549371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) { 21552d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2156f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2157d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2158d9ca1df4SBarry Smith const MatScalar *v; 2159d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2160d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2161ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 21622d61bbb3SSatish Balay 21632d61bbb3SSatish Balay PetscFunctionBegin; 21649566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 21659566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 21662d61bbb3SSatish Balay 21672d61bbb3SSatish Balay idx = a->j; 21682d61bbb3SSatish Balay v = a->a; 216926e093fcSHong Zhang if (usecprow) { 2170*48a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 217126e093fcSHong Zhang mbs = a->compressedrow.nrows; 217226e093fcSHong Zhang ii = a->compressedrow.i; 21737b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 217426e093fcSHong Zhang } else { 21752d61bbb3SSatish Balay ii = a->i; 217626e093fcSHong Zhang y = yarray; 217726e093fcSHong Zhang z = zarray; 217826e093fcSHong Zhang } 21792d61bbb3SSatish Balay 21802d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 21819371c9d4SSatish Balay n = ii[1] - ii[0]; 21829371c9d4SSatish Balay ii++; 218326e093fcSHong Zhang if (usecprow) { 21847b2bb3b9SHong Zhang z = zarray + 3 * ridx[i]; 21857b2bb3b9SHong Zhang y = yarray + 3 * ridx[i]; 218626e093fcSHong Zhang } 21879371c9d4SSatish Balay sum1 = y[0]; 21889371c9d4SSatish Balay sum2 = y[1]; 21899371c9d4SSatish Balay sum3 = y[2]; 2190444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 
2191444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 21922d61bbb3SSatish Balay for (j = 0; j < n; j++) { 21939371c9d4SSatish Balay xb = x + 3 * (*idx++); 21949371c9d4SSatish Balay x1 = xb[0]; 21959371c9d4SSatish Balay x2 = xb[1]; 21969371c9d4SSatish Balay x3 = xb[2]; 21972d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 21982d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 21992d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 22002d61bbb3SSatish Balay v += 9; 22012d61bbb3SSatish Balay } 22029371c9d4SSatish Balay z[0] = sum1; 22039371c9d4SSatish Balay z[1] = sum2; 22049371c9d4SSatish Balay z[2] = sum3; 220526e093fcSHong Zhang if (!usecprow) { 22069371c9d4SSatish Balay z += 3; 22079371c9d4SSatish Balay y += 3; 22082d61bbb3SSatish Balay } 220926e093fcSHong Zhang } 22109566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 22119566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 22129566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz)); 22132d61bbb3SSatish Balay PetscFunctionReturn(0); 22142d61bbb3SSatish Balay } 22152d61bbb3SSatish Balay 22169371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) { 22172d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2218f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2219d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2220d9ca1df4SBarry Smith const MatScalar *v; 2221d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2222d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2223ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 22242d61bbb3SSatish Balay 22252d61bbb3SSatish Balay PetscFunctionBegin; 22269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 22279566063dSJacob Faibussowitsch 
PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 22282d61bbb3SSatish Balay 22292d61bbb3SSatish Balay idx = a->j; 22302d61bbb3SSatish Balay v = a->a; 223126e093fcSHong Zhang if (usecprow) { 2232*48a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 223326e093fcSHong Zhang mbs = a->compressedrow.nrows; 223426e093fcSHong Zhang ii = a->compressedrow.i; 22357b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 223626e093fcSHong Zhang } else { 22372d61bbb3SSatish Balay ii = a->i; 223826e093fcSHong Zhang y = yarray; 223926e093fcSHong Zhang z = zarray; 224026e093fcSHong Zhang } 22412d61bbb3SSatish Balay 22422d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 22439371c9d4SSatish Balay n = ii[1] - ii[0]; 22449371c9d4SSatish Balay ii++; 224526e093fcSHong Zhang if (usecprow) { 22467b2bb3b9SHong Zhang z = zarray + 4 * ridx[i]; 22477b2bb3b9SHong Zhang y = yarray + 4 * ridx[i]; 224826e093fcSHong Zhang } 22499371c9d4SSatish Balay sum1 = y[0]; 22509371c9d4SSatish Balay sum2 = y[1]; 22519371c9d4SSatish Balay sum3 = y[2]; 22529371c9d4SSatish Balay sum4 = y[3]; 2253444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2254444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 22552d61bbb3SSatish Balay for (j = 0; j < n; j++) { 22562d61bbb3SSatish Balay xb = x + 4 * (*idx++); 22579371c9d4SSatish Balay x1 = xb[0]; 22589371c9d4SSatish Balay x2 = xb[1]; 22599371c9d4SSatish Balay x3 = xb[2]; 22609371c9d4SSatish Balay x4 = xb[3]; 22612d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 22622d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 22632d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 22642d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 22652d61bbb3SSatish Balay v += 16; 22662d61bbb3SSatish 
Balay } 22679371c9d4SSatish Balay z[0] = sum1; 22689371c9d4SSatish Balay z[1] = sum2; 22699371c9d4SSatish Balay z[2] = sum3; 22709371c9d4SSatish Balay z[3] = sum4; 227126e093fcSHong Zhang if (!usecprow) { 22729371c9d4SSatish Balay z += 4; 22739371c9d4SSatish Balay y += 4; 22742d61bbb3SSatish Balay } 227526e093fcSHong Zhang } 22769566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 22779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 22789566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz)); 22792d61bbb3SSatish Balay PetscFunctionReturn(0); 22802d61bbb3SSatish Balay } 22812d61bbb3SSatish Balay 22829371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) { 22832d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2284f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2285d9ca1df4SBarry Smith const PetscScalar *x, *xb; 228626e093fcSHong Zhang PetscScalar *yarray, *zarray; 2287d9ca1df4SBarry Smith const MatScalar *v; 2288d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2289d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2290ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 22912d61bbb3SSatish Balay 22922d61bbb3SSatish Balay PetscFunctionBegin; 22939566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 22949566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 22952d61bbb3SSatish Balay 22962d61bbb3SSatish Balay idx = a->j; 22972d61bbb3SSatish Balay v = a->a; 229826e093fcSHong Zhang if (usecprow) { 2299*48a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 230026e093fcSHong Zhang mbs = a->compressedrow.nrows; 230126e093fcSHong Zhang ii = a->compressedrow.i; 23027b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 230326e093fcSHong Zhang } else { 23042d61bbb3SSatish Balay ii = a->i; 230526e093fcSHong 
Zhang y = yarray; 230626e093fcSHong Zhang z = zarray; 230726e093fcSHong Zhang } 23082d61bbb3SSatish Balay 23092d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 23109371c9d4SSatish Balay n = ii[1] - ii[0]; 23119371c9d4SSatish Balay ii++; 231226e093fcSHong Zhang if (usecprow) { 23137b2bb3b9SHong Zhang z = zarray + 5 * ridx[i]; 23147b2bb3b9SHong Zhang y = yarray + 5 * ridx[i]; 231526e093fcSHong Zhang } 23169371c9d4SSatish Balay sum1 = y[0]; 23179371c9d4SSatish Balay sum2 = y[1]; 23189371c9d4SSatish Balay sum3 = y[2]; 23199371c9d4SSatish Balay sum4 = y[3]; 23209371c9d4SSatish Balay sum5 = y[4]; 2321444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2322444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 23232d61bbb3SSatish Balay for (j = 0; j < n; j++) { 23242d61bbb3SSatish Balay xb = x + 5 * (*idx++); 23259371c9d4SSatish Balay x1 = xb[0]; 23269371c9d4SSatish Balay x2 = xb[1]; 23279371c9d4SSatish Balay x3 = xb[2]; 23289371c9d4SSatish Balay x4 = xb[3]; 23299371c9d4SSatish Balay x5 = xb[4]; 23302d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 23312d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 23322d61bbb3SSatish Balay sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 23332d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 23342d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 23352d61bbb3SSatish Balay v += 25; 23362d61bbb3SSatish Balay } 23379371c9d4SSatish Balay z[0] = sum1; 23389371c9d4SSatish Balay z[1] = sum2; 23399371c9d4SSatish Balay z[2] = sum3; 23409371c9d4SSatish Balay z[3] = sum4; 23419371c9d4SSatish Balay z[4] = sum5; 234226e093fcSHong Zhang if (!usecprow) { 23439371c9d4SSatish Balay z += 5; 23449371c9d4SSatish Balay y 
+= 5; 23452d61bbb3SSatish Balay } 234626e093fcSHong Zhang } 23479566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 23489566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 23499566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz)); 23502d61bbb3SSatish Balay PetscFunctionReturn(0); 23512d61bbb3SSatish Balay } 2352c2916339SPierre Jolivet 23539371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) { 235415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2355f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2356d9ca1df4SBarry Smith const PetscScalar *x, *xb; 235726e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2358d9ca1df4SBarry Smith const MatScalar *v; 2359d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2360d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2361ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 236215091d37SBarry Smith 236315091d37SBarry Smith PetscFunctionBegin; 23649566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 23659566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 236615091d37SBarry Smith 236715091d37SBarry Smith idx = a->j; 236815091d37SBarry Smith v = a->a; 236926e093fcSHong Zhang if (usecprow) { 2370*48a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 237126e093fcSHong Zhang mbs = a->compressedrow.nrows; 237226e093fcSHong Zhang ii = a->compressedrow.i; 23737b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 237426e093fcSHong Zhang } else { 237515091d37SBarry Smith ii = a->i; 237626e093fcSHong Zhang y = yarray; 237726e093fcSHong Zhang z = zarray; 237826e093fcSHong Zhang } 237915091d37SBarry Smith 238015091d37SBarry Smith for (i = 0; i < mbs; i++) { 23819371c9d4SSatish Balay n = ii[1] - ii[0]; 23829371c9d4SSatish Balay ii++; 238326e093fcSHong Zhang if 
(usecprow) { 23847b2bb3b9SHong Zhang z = zarray + 6 * ridx[i]; 23857b2bb3b9SHong Zhang y = yarray + 6 * ridx[i]; 238626e093fcSHong Zhang } 23879371c9d4SSatish Balay sum1 = y[0]; 23889371c9d4SSatish Balay sum2 = y[1]; 23899371c9d4SSatish Balay sum3 = y[2]; 23909371c9d4SSatish Balay sum4 = y[3]; 23919371c9d4SSatish Balay sum5 = y[4]; 23929371c9d4SSatish Balay sum6 = y[5]; 2393444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2394444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 239515091d37SBarry Smith for (j = 0; j < n; j++) { 23963b95cb0eSSatish Balay xb = x + 6 * (*idx++); 23979371c9d4SSatish Balay x1 = xb[0]; 23989371c9d4SSatish Balay x2 = xb[1]; 23999371c9d4SSatish Balay x3 = xb[2]; 24009371c9d4SSatish Balay x4 = xb[3]; 24019371c9d4SSatish Balay x5 = xb[4]; 24029371c9d4SSatish Balay x6 = xb[5]; 240315091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 240415091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 240515091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 240615091d37SBarry Smith sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 240715091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 240815091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 240915091d37SBarry Smith v += 36; 241015091d37SBarry Smith } 24119371c9d4SSatish Balay z[0] = sum1; 24129371c9d4SSatish Balay z[1] = sum2; 24139371c9d4SSatish Balay z[2] = sum3; 24149371c9d4SSatish Balay z[3] = sum4; 24159371c9d4SSatish Balay z[4] = sum5; 24169371c9d4SSatish Balay z[5] = sum6; 241726e093fcSHong Zhang if (!usecprow) { 24189371c9d4SSatish Balay z += 6; 
24199371c9d4SSatish Balay y += 6; 242015091d37SBarry Smith } 242126e093fcSHong Zhang } 24229566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 24239566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 24249566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * a->nz)); 242515091d37SBarry Smith PetscFunctionReturn(0); 242615091d37SBarry Smith } 24272d61bbb3SSatish Balay 24289371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) { 24292d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2430f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2431d9ca1df4SBarry Smith const PetscScalar *x, *xb; 243226e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2433d9ca1df4SBarry Smith const MatScalar *v; 2434d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2435d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2436ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 24372d61bbb3SSatish Balay 24382d61bbb3SSatish Balay PetscFunctionBegin; 24399566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 24409566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 24412d61bbb3SSatish Balay 24422d61bbb3SSatish Balay idx = a->j; 24432d61bbb3SSatish Balay v = a->a; 244426e093fcSHong Zhang if (usecprow) { 2445*48a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 244626e093fcSHong Zhang mbs = a->compressedrow.nrows; 244726e093fcSHong Zhang ii = a->compressedrow.i; 24487b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 244926e093fcSHong Zhang } else { 24502d61bbb3SSatish Balay ii = a->i; 245126e093fcSHong Zhang y = yarray; 245226e093fcSHong Zhang z = zarray; 245326e093fcSHong Zhang } 24542d61bbb3SSatish Balay 24552d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 24569371c9d4SSatish Balay n = ii[1] - ii[0]; 
24579371c9d4SSatish Balay ii++; 245826e093fcSHong Zhang if (usecprow) { 24597b2bb3b9SHong Zhang z = zarray + 7 * ridx[i]; 24607b2bb3b9SHong Zhang y = yarray + 7 * ridx[i]; 246126e093fcSHong Zhang } 24629371c9d4SSatish Balay sum1 = y[0]; 24639371c9d4SSatish Balay sum2 = y[1]; 24649371c9d4SSatish Balay sum3 = y[2]; 24659371c9d4SSatish Balay sum4 = y[3]; 24669371c9d4SSatish Balay sum5 = y[4]; 24679371c9d4SSatish Balay sum6 = y[5]; 24689371c9d4SSatish Balay sum7 = y[6]; 2469444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2470444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 24712d61bbb3SSatish Balay for (j = 0; j < n; j++) { 24722d61bbb3SSatish Balay xb = x + 7 * (*idx++); 24739371c9d4SSatish Balay x1 = xb[0]; 24749371c9d4SSatish Balay x2 = xb[1]; 24759371c9d4SSatish Balay x3 = xb[2]; 24769371c9d4SSatish Balay x4 = xb[3]; 24779371c9d4SSatish Balay x5 = xb[4]; 24789371c9d4SSatish Balay x6 = xb[5]; 24799371c9d4SSatish Balay x7 = xb[6]; 24802d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 24812d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 24822d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 24832d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 24842d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 24852d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 24862d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 24872d61bbb3SSatish Balay v += 49; 
24882d61bbb3SSatish Balay } 24899371c9d4SSatish Balay z[0] = sum1; 24909371c9d4SSatish Balay z[1] = sum2; 24919371c9d4SSatish Balay z[2] = sum3; 24929371c9d4SSatish Balay z[3] = sum4; 24939371c9d4SSatish Balay z[4] = sum5; 24949371c9d4SSatish Balay z[5] = sum6; 24959371c9d4SSatish Balay z[6] = sum7; 249626e093fcSHong Zhang if (!usecprow) { 24979371c9d4SSatish Balay z += 7; 24989371c9d4SSatish Balay y += 7; 24992d61bbb3SSatish Balay } 250026e093fcSHong Zhang } 25019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 25029566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 25039566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz)); 25042d61bbb3SSatish Balay PetscFunctionReturn(0); 25052d61bbb3SSatish Balay } 2506218c64b6SSatish Balay 25075f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 25089371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) { 250996e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2510f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 251196e086a2SDaniel Kokron const PetscScalar *x, *xb; 251296e086a2SDaniel Kokron const MatScalar *v; 25136679dcc1SBarry Smith PetscInt mbs, i, j, n; 2514ce68d72fSJed Brown PetscInt k; 251596e086a2SDaniel Kokron PetscBool usecprow = a->compressedrow.use; 25166679dcc1SBarry Smith const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 251796e086a2SDaniel Kokron 251896e086a2SDaniel Kokron __m256d a0, a1, a2, a3, a4, a5; 2519ce68d72fSJed Brown __m256d w0, w1, w2, w3; 252096e086a2SDaniel Kokron __m256d z0, z1, z2; 252196e086a2SDaniel Kokron __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 252296e086a2SDaniel Kokron 252396e086a2SDaniel Kokron PetscFunctionBegin; 25249566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 
25259566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 25269566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &zarray)); 252796e086a2SDaniel Kokron 252896e086a2SDaniel Kokron idx = a->j; 252996e086a2SDaniel Kokron v = a->a; 253096e086a2SDaniel Kokron if (usecprow) { 253196e086a2SDaniel Kokron mbs = a->compressedrow.nrows; 253296e086a2SDaniel Kokron ii = a->compressedrow.i; 253396e086a2SDaniel Kokron ridx = a->compressedrow.rindex; 253496e086a2SDaniel Kokron } else { 253596e086a2SDaniel Kokron mbs = a->mbs; 253696e086a2SDaniel Kokron ii = a->i; 253796e086a2SDaniel Kokron z = zarray; 253896e086a2SDaniel Kokron } 253996e086a2SDaniel Kokron 254096e086a2SDaniel Kokron if (!a->mult_work) { 254196e086a2SDaniel Kokron k = PetscMax(A->rmap->n, A->cmap->n); 25429566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 254396e086a2SDaniel Kokron } 254496e086a2SDaniel Kokron 254596e086a2SDaniel Kokron work = a->mult_work; 254696e086a2SDaniel Kokron for (i = 0; i < mbs; i++) { 25479371c9d4SSatish Balay n = ii[1] - ii[0]; 25489371c9d4SSatish Balay ii++; 254996e086a2SDaniel Kokron workt = work; 255096e086a2SDaniel Kokron for (j = 0; j < n; j++) { 255196e086a2SDaniel Kokron xb = x + bs * (*idx++); 255296e086a2SDaniel Kokron for (k = 0; k < bs; k++) workt[k] = xb[k]; 255396e086a2SDaniel Kokron workt += bs; 255496e086a2SDaniel Kokron } 255596e086a2SDaniel Kokron if (usecprow) z = zarray + bs * ridx[i]; 255696e086a2SDaniel Kokron 25579371c9d4SSatish Balay z0 = _mm256_loadu_pd(&z[0]); 25589371c9d4SSatish Balay z1 = _mm256_loadu_pd(&z[4]); 25599371c9d4SSatish Balay z2 = _mm256_set1_pd(z[8]); 256096e086a2SDaniel Kokron 256196e086a2SDaniel Kokron for (j = 0; j < n; j++) { 2562c05b70c4SSatish Balay /* first column of a */ 256396e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9]); 25649371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81]); 25659371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 25669371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j 
* 81 + 4]); 25679371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 25689371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 25699371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 257096e086a2SDaniel Kokron 2571c05b70c4SSatish Balay /* second column of a */ 257296e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 1]); 25739371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 25749371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 25759371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 25769371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 25779371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 25789371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 257996e086a2SDaniel Kokron 2580c05b70c4SSatish Balay /* third column of a */ 258196e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 2]); 25829371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 25839371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w2, z0); 25849371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 25859371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w2, z1); 25869371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 25879371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w2, z2); 258896e086a2SDaniel Kokron 2589c05b70c4SSatish Balay /* fourth column of a */ 259096e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 3]); 25919371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 25929371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w3, z0); 25939371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 25949371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w3, z1); 25959371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 25969371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w3, z2); 259796e086a2SDaniel Kokron 2598c05b70c4SSatish Balay /* fifth column of a */ 259996e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 4]); 26009371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 26019371c9d4SSatish Balay z0 = 
_mm256_fmadd_pd(a3, w0, z0); 26029371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 26039371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w0, z1); 26049371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 26059371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w0, z2); 260696e086a2SDaniel Kokron 2607c05b70c4SSatish Balay /* sixth column of a */ 260896e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 5]); 26099371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 26109371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 26119371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 26129371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 26139371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 26149371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 261596e086a2SDaniel Kokron 2616c05b70c4SSatish Balay /* seventh column of a */ 261796e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 6]); 26189371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 26199371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 26209371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 26219371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 26229371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 26239371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 262496e086a2SDaniel Kokron 26256aad120cSJose E. 
Roman /* eighth column of a */ 262696e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 7]); 26279371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 26289371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 26299371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 26309371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 26319371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 26329371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 263396e086a2SDaniel Kokron 2634c05b70c4SSatish Balay /* ninth column of a */ 263596e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 8]); 26369371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 26379371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 26389371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 26399371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 26409371c9d4SSatish Balay a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 26419371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 264296e086a2SDaniel Kokron } 264396e086a2SDaniel Kokron 26449371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 26459371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 26469371c9d4SSatish Balay _mm256_maskstore_pd(&z[8], mask1, z2); 264796e086a2SDaniel Kokron 264896e086a2SDaniel Kokron v += n * bs2; 264996e086a2SDaniel Kokron if (!usecprow) z += bs; 265096e086a2SDaniel Kokron } 26519566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 26529566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &zarray)); 26539566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(162.0 * a->nz)); 265496e086a2SDaniel Kokron PetscFunctionReturn(0); 265596e086a2SDaniel Kokron } 265696e086a2SDaniel Kokron #endif 265796e086a2SDaniel Kokron 26589371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) { 2659ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2660f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, 
sum6, sum7, sum8, sum9, sum10, sum11; 2661ebada01fSBarry Smith const PetscScalar *x, *xb; 2662ebada01fSBarry Smith PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2663ebada01fSBarry Smith const MatScalar *v; 2664ebada01fSBarry Smith PetscInt mbs = a->mbs, i, j, n; 2665ebada01fSBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2666ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 2667ebada01fSBarry Smith 2668ebada01fSBarry Smith PetscFunctionBegin; 26699566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 26709566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2671ebada01fSBarry Smith 2672ebada01fSBarry Smith idx = a->j; 2673ebada01fSBarry Smith v = a->a; 2674ebada01fSBarry Smith if (usecprow) { 2675*48a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2676ebada01fSBarry Smith mbs = a->compressedrow.nrows; 2677ebada01fSBarry Smith ii = a->compressedrow.i; 2678ebada01fSBarry Smith ridx = a->compressedrow.rindex; 2679ebada01fSBarry Smith } else { 2680ebada01fSBarry Smith ii = a->i; 2681ebada01fSBarry Smith y = yarray; 2682ebada01fSBarry Smith z = zarray; 2683ebada01fSBarry Smith } 2684ebada01fSBarry Smith 2685ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 26869371c9d4SSatish Balay n = ii[1] - ii[0]; 26879371c9d4SSatish Balay ii++; 2688ebada01fSBarry Smith if (usecprow) { 2689ebada01fSBarry Smith z = zarray + 11 * ridx[i]; 2690ebada01fSBarry Smith y = yarray + 11 * ridx[i]; 2691ebada01fSBarry Smith } 26929371c9d4SSatish Balay sum1 = y[0]; 26939371c9d4SSatish Balay sum2 = y[1]; 26949371c9d4SSatish Balay sum3 = y[2]; 26959371c9d4SSatish Balay sum4 = y[3]; 26969371c9d4SSatish Balay sum5 = y[4]; 26979371c9d4SSatish Balay sum6 = y[5]; 26989371c9d4SSatish Balay sum7 = y[6]; 26999371c9d4SSatish Balay sum8 = y[7]; 27009371c9d4SSatish Balay sum9 = y[8]; 27019371c9d4SSatish Balay sum10 = y[9]; 27029371c9d4SSatish Balay sum11 = y[10]; 2703ebada01fSBarry 
Smith PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2704ebada01fSBarry Smith PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2705ebada01fSBarry Smith for (j = 0; j < n; j++) { 2706ebada01fSBarry Smith xb = x + 11 * (*idx++); 27079371c9d4SSatish Balay x1 = xb[0]; 27089371c9d4SSatish Balay x2 = xb[1]; 27099371c9d4SSatish Balay x3 = xb[2]; 27109371c9d4SSatish Balay x4 = xb[3]; 27119371c9d4SSatish Balay x5 = xb[4]; 27129371c9d4SSatish Balay x6 = xb[5]; 27139371c9d4SSatish Balay x7 = xb[6]; 27149371c9d4SSatish Balay x8 = xb[7]; 27159371c9d4SSatish Balay x9 = xb[8]; 27169371c9d4SSatish Balay x10 = xb[9]; 27179371c9d4SSatish Balay x11 = xb[10]; 2718ebada01fSBarry Smith sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2719ebada01fSBarry Smith sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2720ebada01fSBarry Smith sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2721ebada01fSBarry Smith sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; 2722ebada01fSBarry Smith sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 
+ 10 * 11] * x11; 2723ebada01fSBarry Smith sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2724ebada01fSBarry Smith sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2725ebada01fSBarry Smith sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2726ebada01fSBarry Smith sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2727ebada01fSBarry Smith sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2728ebada01fSBarry Smith sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2729ebada01fSBarry Smith v += 121; 2730ebada01fSBarry Smith } 27319371c9d4SSatish Balay z[0] = sum1; 27329371c9d4SSatish Balay z[1] = sum2; 27339371c9d4SSatish Balay z[2] = sum3; 27349371c9d4SSatish Balay z[3] = sum4; 27359371c9d4SSatish Balay z[4] = sum5; 27369371c9d4SSatish Balay z[5] = sum6; 27379371c9d4SSatish Balay z[6] = sum7; 27389371c9d4SSatish Balay z[7] = sum8; 27399371c9d4SSatish Balay z[8] = sum9; 
27409371c9d4SSatish Balay z[9] = sum10; 27419371c9d4SSatish Balay z[10] = sum11; 2742ebada01fSBarry Smith if (!usecprow) { 27439371c9d4SSatish Balay z += 11; 27449371c9d4SSatish Balay y += 11; 2745ebada01fSBarry Smith } 2746ebada01fSBarry Smith } 27479566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 27489566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 27499566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz)); 2750ebada01fSBarry Smith PetscFunctionReturn(0); 2751ebada01fSBarry Smith } 2752ebada01fSBarry Smith 27539371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) { 27542d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2755f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 2756d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2757d9ca1df4SBarry Smith const MatScalar *v; 2758d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2759d9ca1df4SBarry Smith PetscInt ncols, k; 2760d9ca1df4SBarry Smith const PetscInt *ridx = NULL, *idx, *ii; 2761ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2762218c64b6SSatish Balay 27632d61bbb3SSatish Balay PetscFunctionBegin; 27649566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 27659566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 27669566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &zarray)); 27672d61bbb3SSatish Balay 27682d61bbb3SSatish Balay idx = a->j; 27692d61bbb3SSatish Balay v = a->a; 277026e093fcSHong Zhang if (usecprow) { 277126e093fcSHong Zhang mbs = a->compressedrow.nrows; 277226e093fcSHong Zhang ii = a->compressedrow.i; 27737b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 277426e093fcSHong Zhang } else { 277526e093fcSHong Zhang mbs = a->mbs; 27762d61bbb3SSatish Balay ii = a->i; 277726e093fcSHong Zhang z = zarray; 277826e093fcSHong Zhang } 27792d61bbb3SSatish Balay 27802d61bbb3SSatish Balay if (!a->mult_work) { 
2781d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 27829566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 27832d61bbb3SSatish Balay } 27842d61bbb3SSatish Balay work = a->mult_work; 27852d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 27869371c9d4SSatish Balay n = ii[1] - ii[0]; 27879371c9d4SSatish Balay ii++; 27882d61bbb3SSatish Balay ncols = n * bs; 27892d61bbb3SSatish Balay workt = work; 27902d61bbb3SSatish Balay for (j = 0; j < n; j++) { 27912d61bbb3SSatish Balay xb = x + bs * (*idx++); 27922d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 27932d61bbb3SSatish Balay workt += bs; 27942d61bbb3SSatish Balay } 27957b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * ridx[i]; 279696b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 27972d61bbb3SSatish Balay v += n * bs2; 279826fbe8dcSKarl Rupp if (!usecprow) z += bs; 279926e093fcSHong Zhang } 28009566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 28019566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &zarray)); 28029566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 28032d61bbb3SSatish Balay PetscFunctionReturn(0); 28042d61bbb3SSatish Balay } 28052d61bbb3SSatish Balay 28069371c9d4SSatish Balay PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) { 2807547795f9SHong Zhang PetscScalar zero = 0.0; 2808547795f9SHong Zhang 2809547795f9SHong Zhang PetscFunctionBegin; 28109566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 28119566063dSJacob Faibussowitsch PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2812547795f9SHong Zhang PetscFunctionReturn(0); 2813547795f9SHong Zhang } 2814547795f9SHong Zhang 28159371c9d4SSatish Balay PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) { 28163447b6efSHong Zhang PetscScalar zero = 0.0; 28172d61bbb3SSatish Balay 28182d61bbb3SSatish Balay PetscFunctionBegin; 28199566063dSJacob Faibussowitsch 
PetscCall(VecSet(zz, zero)); 28209566063dSJacob Faibussowitsch PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 28212d61bbb3SSatish Balay PetscFunctionReturn(0); 28222d61bbb3SSatish Balay } 28232d61bbb3SSatish Balay 28249371c9d4SSatish Balay PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) { 2825547795f9SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2826b8c08b77SHong Zhang PetscScalar *z, x1, x2, x3, x4, x5; 2827d9ca1df4SBarry Smith const PetscScalar *x, *xb = NULL; 2828d9ca1df4SBarry Smith const MatScalar *v; 2829b8c08b77SHong Zhang PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2830d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 2831547795f9SHong Zhang Mat_CompressedRow cprow = a->compressedrow; 2832ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 2833547795f9SHong Zhang 2834547795f9SHong Zhang PetscFunctionBegin; 28359566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 28369566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 28379566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 2838547795f9SHong Zhang 2839547795f9SHong Zhang idx = a->j; 2840547795f9SHong Zhang v = a->a; 2841547795f9SHong Zhang if (usecprow) { 2842547795f9SHong Zhang mbs = cprow.nrows; 2843547795f9SHong Zhang ii = cprow.i; 2844547795f9SHong Zhang ridx = cprow.rindex; 2845547795f9SHong Zhang } else { 2846547795f9SHong Zhang mbs = a->mbs; 2847547795f9SHong Zhang ii = a->i; 2848547795f9SHong Zhang xb = x; 2849547795f9SHong Zhang } 2850547795f9SHong Zhang 2851547795f9SHong Zhang switch (bs) { 2852547795f9SHong Zhang case 1: 2853547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2854547795f9SHong Zhang if (usecprow) xb = x + ridx[i]; 2855547795f9SHong Zhang x1 = xb[0]; 2856547795f9SHong Zhang ib = idx + ii[0]; 28579371c9d4SSatish Balay n = ii[1] - ii[0]; 28589371c9d4SSatish Balay ii++; 2859547795f9SHong Zhang for (j = 0; j < n; j++) { 2860547795f9SHong Zhang rval = ib[j]; 2861547795f9SHong 
Zhang z[rval] += PetscConj(*v) * x1; 2862547795f9SHong Zhang v++; 2863547795f9SHong Zhang } 2864547795f9SHong Zhang if (!usecprow) xb++; 2865547795f9SHong Zhang } 2866547795f9SHong Zhang break; 2867547795f9SHong Zhang case 2: 2868547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2869547795f9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 28709371c9d4SSatish Balay x1 = xb[0]; 28719371c9d4SSatish Balay x2 = xb[1]; 2872547795f9SHong Zhang ib = idx + ii[0]; 28739371c9d4SSatish Balay n = ii[1] - ii[0]; 28749371c9d4SSatish Balay ii++; 2875547795f9SHong Zhang for (j = 0; j < n; j++) { 2876547795f9SHong Zhang rval = ib[j] * 2; 2877547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2878547795f9SHong Zhang z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2879547795f9SHong Zhang v += 4; 2880547795f9SHong Zhang } 2881547795f9SHong Zhang if (!usecprow) xb += 2; 2882547795f9SHong Zhang } 2883547795f9SHong Zhang break; 2884547795f9SHong Zhang case 3: 2885547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2886547795f9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 28879371c9d4SSatish Balay x1 = xb[0]; 28889371c9d4SSatish Balay x2 = xb[1]; 28899371c9d4SSatish Balay x3 = xb[2]; 2890547795f9SHong Zhang ib = idx + ii[0]; 28919371c9d4SSatish Balay n = ii[1] - ii[0]; 28929371c9d4SSatish Balay ii++; 2893547795f9SHong Zhang for (j = 0; j < n; j++) { 2894547795f9SHong Zhang rval = ib[j] * 3; 2895547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2896547795f9SHong Zhang z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2897547795f9SHong Zhang z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2898547795f9SHong Zhang v += 9; 2899547795f9SHong Zhang } 2900547795f9SHong Zhang if (!usecprow) xb += 3; 2901547795f9SHong Zhang } 2902547795f9SHong Zhang break; 2903547795f9SHong Zhang case 4: 2904547795f9SHong Zhang for (i = 0; i < mbs; i++) { 
2905547795f9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 29069371c9d4SSatish Balay x1 = xb[0]; 29079371c9d4SSatish Balay x2 = xb[1]; 29089371c9d4SSatish Balay x3 = xb[2]; 29099371c9d4SSatish Balay x4 = xb[3]; 2910547795f9SHong Zhang ib = idx + ii[0]; 29119371c9d4SSatish Balay n = ii[1] - ii[0]; 29129371c9d4SSatish Balay ii++; 2913547795f9SHong Zhang for (j = 0; j < n; j++) { 2914547795f9SHong Zhang rval = ib[j] * 4; 2915547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2916547795f9SHong Zhang z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2917547795f9SHong Zhang z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2918547795f9SHong Zhang z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2919547795f9SHong Zhang v += 16; 2920547795f9SHong Zhang } 2921547795f9SHong Zhang if (!usecprow) xb += 4; 2922547795f9SHong Zhang } 2923547795f9SHong Zhang break; 2924547795f9SHong Zhang case 5: 2925547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2926547795f9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 29279371c9d4SSatish Balay x1 = xb[0]; 29289371c9d4SSatish Balay x2 = xb[1]; 29299371c9d4SSatish Balay x3 = xb[2]; 29309371c9d4SSatish Balay x4 = xb[3]; 29319371c9d4SSatish Balay x5 = xb[4]; 2932547795f9SHong Zhang ib = idx + ii[0]; 29339371c9d4SSatish Balay n = ii[1] - ii[0]; 29349371c9d4SSatish Balay ii++; 2935547795f9SHong Zhang for (j = 0; j < n; j++) { 2936547795f9SHong Zhang rval = ib[j] * 5; 2937547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2938547795f9SHong Zhang z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2939547795f9SHong Zhang z[rval++] += 
PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2940547795f9SHong Zhang z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2941547795f9SHong Zhang z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2942547795f9SHong Zhang v += 25; 2943547795f9SHong Zhang } 2944547795f9SHong Zhang if (!usecprow) xb += 5; 2945547795f9SHong Zhang } 2946547795f9SHong Zhang break; 29479371c9d4SSatish Balay default: /* block sizes larger than 5 by 5 are handled by BLAS */ SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2948968ae2c8SSatish Balay #if 0 2949968ae2c8SSatish Balay { 2950b8c08b77SHong Zhang PetscInt ncols,k,bs2=a->bs2; 2951b8c08b77SHong Zhang PetscScalar *work,*workt,zb; 2952d9ca1df4SBarry Smith const PetscScalar *xtmp; 2953547795f9SHong Zhang if (!a->mult_work) { 2954547795f9SHong Zhang k = PetscMax(A->rmap->n,A->cmap->n); 29559566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2956547795f9SHong Zhang } 2957547795f9SHong Zhang work = a->mult_work; 2958547795f9SHong Zhang xtmp = x; 2959547795f9SHong Zhang for (i=0; i<mbs; i++) { 2960547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 2961547795f9SHong Zhang ncols = n*bs; 29629566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work,ncols)); 296326fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs*ridx[i]; 296496b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2965547795f9SHong Zhang v += n*bs2; 2966547795f9SHong Zhang if (!usecprow) xtmp += bs; 2967547795f9SHong Zhang workt = work; 2968547795f9SHong Zhang for (j=0; j<n; j++) { 2969547795f9SHong Zhang zb = z + bs*(*idx++); 2970547795f9SHong Zhang for (k=0; k<bs; k++) zb[k] += workt[k] ; 2971547795f9SHong Zhang workt += bs; 2972547795f9SHong Zhang } 
2973547795f9SHong Zhang } 2974547795f9SHong Zhang } 2975968ae2c8SSatish Balay #endif 2976547795f9SHong Zhang } 29779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 29789566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 29799566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 2980547795f9SHong Zhang PetscFunctionReturn(0); 2981547795f9SHong Zhang } 2982547795f9SHong Zhang 29839371c9d4SSatish Balay PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) { 29842d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2985d9ca1df4SBarry Smith PetscScalar *zb, *z, x1, x2, x3, x4, x5; 2986f4259b30SLisandro Dalcin const PetscScalar *x, *xb = NULL; 2987d9ca1df4SBarry Smith const MatScalar *v; 2988d9ca1df4SBarry Smith PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2989d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 29903447b6efSHong Zhang Mat_CompressedRow cprow = a->compressedrow; 2991ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 29922d61bbb3SSatish Balay 29932d61bbb3SSatish Balay PetscFunctionBegin; 29949566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 29959566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 29969566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 29972d61bbb3SSatish Balay 29982d61bbb3SSatish Balay idx = a->j; 29992d61bbb3SSatish Balay v = a->a; 30003447b6efSHong Zhang if (usecprow) { 30013447b6efSHong Zhang mbs = cprow.nrows; 30023447b6efSHong Zhang ii = cprow.i; 30037b2bb3b9SHong Zhang ridx = cprow.rindex; 30043447b6efSHong Zhang } else { 30053447b6efSHong Zhang mbs = a->mbs; 30062d61bbb3SSatish Balay ii = a->i; 3007f1af5d2fSBarry Smith xb = x; 30083447b6efSHong Zhang } 30092d61bbb3SSatish Balay 30102d61bbb3SSatish Balay switch (bs) { 30112d61bbb3SSatish Balay case 1: 30122d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30137b2bb3b9SHong Zhang if (usecprow) xb = x + ridx[i]; 
3014f1af5d2fSBarry Smith x1 = xb[0]; 30153447b6efSHong Zhang ib = idx + ii[0]; 30169371c9d4SSatish Balay n = ii[1] - ii[0]; 30179371c9d4SSatish Balay ii++; 30182d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30192d61bbb3SSatish Balay rval = ib[j]; 3020f1af5d2fSBarry Smith z[rval] += *v * x1; 3021f1af5d2fSBarry Smith v++; 30222d61bbb3SSatish Balay } 30233447b6efSHong Zhang if (!usecprow) xb++; 30242d61bbb3SSatish Balay } 30252d61bbb3SSatish Balay break; 30262d61bbb3SSatish Balay case 2: 30272d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30287b2bb3b9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 30299371c9d4SSatish Balay x1 = xb[0]; 30309371c9d4SSatish Balay x2 = xb[1]; 30313447b6efSHong Zhang ib = idx + ii[0]; 30329371c9d4SSatish Balay n = ii[1] - ii[0]; 30339371c9d4SSatish Balay ii++; 30342d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30352d61bbb3SSatish Balay rval = ib[j] * 2; 30362d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2; 30372d61bbb3SSatish Balay z[rval++] += v[2] * x1 + v[3] * x2; 30382d61bbb3SSatish Balay v += 4; 30392d61bbb3SSatish Balay } 30403447b6efSHong Zhang if (!usecprow) xb += 2; 30412d61bbb3SSatish Balay } 30422d61bbb3SSatish Balay break; 30432d61bbb3SSatish Balay case 3: 30442d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30457b2bb3b9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 30469371c9d4SSatish Balay x1 = xb[0]; 30479371c9d4SSatish Balay x2 = xb[1]; 30489371c9d4SSatish Balay x3 = xb[2]; 30493447b6efSHong Zhang ib = idx + ii[0]; 30509371c9d4SSatish Balay n = ii[1] - ii[0]; 30519371c9d4SSatish Balay ii++; 30522d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30532d61bbb3SSatish Balay rval = ib[j] * 3; 30542d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 30552d61bbb3SSatish Balay z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 30562d61bbb3SSatish Balay z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 30572d61bbb3SSatish Balay v += 9; 30582d61bbb3SSatish Balay } 30593447b6efSHong Zhang if (!usecprow) xb += 3; 
30602d61bbb3SSatish Balay } 30612d61bbb3SSatish Balay break; 30622d61bbb3SSatish Balay case 4: 30632d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30647b2bb3b9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 30659371c9d4SSatish Balay x1 = xb[0]; 30669371c9d4SSatish Balay x2 = xb[1]; 30679371c9d4SSatish Balay x3 = xb[2]; 30689371c9d4SSatish Balay x4 = xb[3]; 30693447b6efSHong Zhang ib = idx + ii[0]; 30709371c9d4SSatish Balay n = ii[1] - ii[0]; 30719371c9d4SSatish Balay ii++; 30722d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30732d61bbb3SSatish Balay rval = ib[j] * 4; 30742d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 30752d61bbb3SSatish Balay z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 30762d61bbb3SSatish Balay z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 30772d61bbb3SSatish Balay z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 30782d61bbb3SSatish Balay v += 16; 30792d61bbb3SSatish Balay } 30803447b6efSHong Zhang if (!usecprow) xb += 4; 30812d61bbb3SSatish Balay } 30822d61bbb3SSatish Balay break; 30832d61bbb3SSatish Balay case 5: 30842d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30857b2bb3b9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 30869371c9d4SSatish Balay x1 = xb[0]; 30879371c9d4SSatish Balay x2 = xb[1]; 30889371c9d4SSatish Balay x3 = xb[2]; 30899371c9d4SSatish Balay x4 = xb[3]; 30909371c9d4SSatish Balay x5 = xb[4]; 30913447b6efSHong Zhang ib = idx + ii[0]; 30929371c9d4SSatish Balay n = ii[1] - ii[0]; 30939371c9d4SSatish Balay ii++; 30942d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30952d61bbb3SSatish Balay rval = ib[j] * 5; 30962d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 30972d61bbb3SSatish Balay z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 30982d61bbb3SSatish Balay z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 30992d61bbb3SSatish Balay z[rval++] += v[15] * x1 
+ v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 31002d61bbb3SSatish Balay z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 31012d61bbb3SSatish Balay v += 25; 31022d61bbb3SSatish Balay } 31033447b6efSHong Zhang if (!usecprow) xb += 5; 31042d61bbb3SSatish Balay } 31052d61bbb3SSatish Balay break; 3106f1af5d2fSBarry Smith default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3107690b6cddSBarry Smith PetscInt ncols, k; 3108d9ca1df4SBarry Smith PetscScalar *work, *workt; 3109d9ca1df4SBarry Smith const PetscScalar *xtmp; 31102d61bbb3SSatish Balay if (!a->mult_work) { 3111d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 31129566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 31132d61bbb3SSatish Balay } 31142d61bbb3SSatish Balay work = a->mult_work; 31153447b6efSHong Zhang xtmp = x; 31162d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 31179371c9d4SSatish Balay n = ii[1] - ii[0]; 31189371c9d4SSatish Balay ii++; 31192d61bbb3SSatish Balay ncols = n * bs; 31209566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work, ncols)); 312126fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs * ridx[i]; 312296b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 31232d61bbb3SSatish Balay v += n * bs2; 31243447b6efSHong Zhang if (!usecprow) xtmp += bs; 31252d61bbb3SSatish Balay workt = work; 31262d61bbb3SSatish Balay for (j = 0; j < n; j++) { 31272d61bbb3SSatish Balay zb = z + bs * (*idx++); 31282d61bbb3SSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 31292d61bbb3SSatish Balay workt += bs; 31302d61bbb3SSatish Balay } 31312d61bbb3SSatish Balay } 31322d61bbb3SSatish Balay } 31332d61bbb3SSatish Balay } 31349566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 31359566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 31369566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 31372d61bbb3SSatish Balay 
PetscFunctionReturn(0); 31382d61bbb3SSatish Balay } 31392d61bbb3SSatish Balay 31409371c9d4SSatish Balay PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) { 31412d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3142690b6cddSBarry Smith PetscInt totalnz = a->bs2 * a->nz; 3143f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 3144c5df96a5SBarry Smith PetscBLASInt one = 1, tnz; 31452d61bbb3SSatish Balay 31462d61bbb3SSatish Balay PetscFunctionBegin; 31479566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3148792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 31499566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 31502d61bbb3SSatish Balay PetscFunctionReturn(0); 31512d61bbb3SSatish Balay } 31522d61bbb3SSatish Balay 31539371c9d4SSatish Balay PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) { 31542d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 31553f1db9ecSBarry Smith MatScalar *v = a->a; 3156329f5518SBarry Smith PetscReal sum = 0.0; 3157d0f46423SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 31582d61bbb3SSatish Balay 31592d61bbb3SSatish Balay PetscFunctionBegin; 31602d61bbb3SSatish Balay if (type == NORM_FROBENIUS) { 3161570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16) 3162570b7f6dSBarry Smith PetscBLASInt one = 1, cnt = bs2 * nz; 3163792fecdfSBarry Smith PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3164570b7f6dSBarry Smith #else 31652d61bbb3SSatish Balay for (i = 0; i < bs2 * nz; i++) { 31669371c9d4SSatish Balay sum += PetscRealPart(PetscConj(*v) * (*v)); 31679371c9d4SSatish Balay v++; 31682d61bbb3SSatish Balay } 3169570b7f6dSBarry Smith #endif 31708f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum); 31719566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 31728a62d963SHong Zhang } else if (type == NORM_1) { /* maximum column sum */ 31738a62d963SHong Zhang PetscReal *tmp; 
31748a62d963SHong Zhang PetscInt *bcol = a->j; 31759566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 31768a62d963SHong Zhang for (i = 0; i < nz; i++) { 31778a62d963SHong Zhang for (j = 0; j < bs; j++) { 31788a62d963SHong Zhang k1 = bs * (*bcol) + j; /* column index */ 31798a62d963SHong Zhang for (k = 0; k < bs; k++) { 31809371c9d4SSatish Balay tmp[k1] += PetscAbsScalar(*v); 31819371c9d4SSatish Balay v++; 31828a62d963SHong Zhang } 31838a62d963SHong Zhang } 31848a62d963SHong Zhang bcol++; 31858a62d963SHong Zhang } 31868a62d963SHong Zhang *norm = 0.0; 3187d0f46423SBarry Smith for (j = 0; j < A->cmap->n; j++) { 31888a62d963SHong Zhang if (tmp[j] > *norm) *norm = tmp[j]; 31898a62d963SHong Zhang } 31909566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp)); 31919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3192596552b5SBarry Smith } else if (type == NORM_INFINITY) { /* maximum row sum */ 3193596552b5SBarry Smith *norm = 0.0; 3194596552b5SBarry Smith for (k = 0; k < bs; k++) { 319574f84c7bSSatish Balay for (j = 0; j < a->mbs; j++) { 3196596552b5SBarry Smith v = a->a + bs2 * a->i[j] + k; 3197596552b5SBarry Smith sum = 0.0; 3198596552b5SBarry Smith for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { 31990e90e235SBarry Smith for (k1 = 0; k1 < bs; k1++) { 3200596552b5SBarry Smith sum += PetscAbsScalar(*v); 3201596552b5SBarry Smith v += bs; 32022d61bbb3SSatish Balay } 32030e90e235SBarry Smith } 3204596552b5SBarry Smith if (sum > *norm) *norm = sum; 3205596552b5SBarry Smith } 3206596552b5SBarry Smith } 32079566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3208e7e72b3dSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 32092d61bbb3SSatish Balay PetscFunctionReturn(0); 32102d61bbb3SSatish Balay } 32112d61bbb3SSatish Balay 32129371c9d4SSatish Balay PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) { 32132d61bbb3SSatish Balay Mat_SeqBAIJ *a 
= (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 32142d61bbb3SSatish Balay 32152d61bbb3SSatish Balay PetscFunctionBegin; 32162d61bbb3SSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3217d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3218273d9f13SBarry Smith *flg = PETSC_FALSE; 3219273d9f13SBarry Smith PetscFunctionReturn(0); 32202d61bbb3SSatish Balay } 32212d61bbb3SSatish Balay 32222d61bbb3SSatish Balay /* if the a->i are the same */ 32239566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 322426fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 32252d61bbb3SSatish Balay 32262d61bbb3SSatish Balay /* if a->j are the same */ 32279566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 322826fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 322926fbe8dcSKarl Rupp 32302d61bbb3SSatish Balay /* if a->a are the same */ 32319566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg)); 32322d61bbb3SSatish Balay PetscFunctionReturn(0); 32332d61bbb3SSatish Balay } 32342d61bbb3SSatish Balay 32359371c9d4SSatish Balay PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) { 32362d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3237690b6cddSBarry Smith PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 323887828ca2SBarry Smith PetscScalar *x, zero = 0.0; 32393f1db9ecSBarry Smith MatScalar *aa, *aa_j; 32402d61bbb3SSatish Balay 32412d61bbb3SSatish Balay PetscFunctionBegin; 324228b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3243d0f46423SBarry Smith bs = A->rmap->bs; 32442d61bbb3SSatish Balay aa = a->a; 32452d61bbb3SSatish Balay ai = a->i; 32462d61bbb3SSatish Balay aj = a->j; 32472d61bbb3SSatish Balay ambs = a->mbs; 32482d61bbb3SSatish Balay bs2 = 
a->bs2; 32492d61bbb3SSatish Balay 32509566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 32519566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 32529566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 325308401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 32542d61bbb3SSatish Balay for (i = 0; i < ambs; i++) { 32552d61bbb3SSatish Balay for (j = ai[i]; j < ai[i + 1]; j++) { 32562d61bbb3SSatish Balay if (aj[j] == i) { 32572d61bbb3SSatish Balay row = i * bs; 32582d61bbb3SSatish Balay aa_j = aa + j * bs2; 32592d61bbb3SSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 32602d61bbb3SSatish Balay break; 32612d61bbb3SSatish Balay } 32622d61bbb3SSatish Balay } 32632d61bbb3SSatish Balay } 32649566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 32652d61bbb3SSatish Balay PetscFunctionReturn(0); 32662d61bbb3SSatish Balay } 32672d61bbb3SSatish Balay 32689371c9d4SSatish Balay PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) { 32692d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 327053ef36baSBarry Smith const PetscScalar *l, *r, *li, *ri; 327153ef36baSBarry Smith PetscScalar x; 32723f1db9ecSBarry Smith MatScalar *aa, *v; 327353ef36baSBarry Smith PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 327453ef36baSBarry Smith const PetscInt *ai, *aj; 32752d61bbb3SSatish Balay 32762d61bbb3SSatish Balay PetscFunctionBegin; 32772d61bbb3SSatish Balay ai = a->i; 32782d61bbb3SSatish Balay aj = a->j; 32792d61bbb3SSatish Balay aa = a->a; 3280d0f46423SBarry Smith m = A->rmap->n; 3281d0f46423SBarry Smith n = A->cmap->n; 3282d0f46423SBarry Smith bs = A->rmap->bs; 32832d61bbb3SSatish Balay mbs = a->mbs; 32842d61bbb3SSatish Balay bs2 = a->bs2; 32852d61bbb3SSatish Balay if (ll) { 32869566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 32879566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 
328808401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 32892d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 32902d61bbb3SSatish Balay M = ai[i + 1] - ai[i]; 32912d61bbb3SSatish Balay li = l + i * bs; 32922d61bbb3SSatish Balay v = aa + bs2 * ai[i]; 32932d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 32949371c9d4SSatish Balay for (k = 0; k < bs2; k++) { (*v++) *= li[k % bs]; } 32952d61bbb3SSatish Balay } 32962d61bbb3SSatish Balay } 32979566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 32989566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 32992d61bbb3SSatish Balay } 33002d61bbb3SSatish Balay 33012d61bbb3SSatish Balay if (rr) { 33029566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(rr, &r)); 33039566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(rr, &rn)); 330408401ef6SPierre Jolivet PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 33052d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 330653ef36baSBarry Smith iai = ai[i]; 330753ef36baSBarry Smith M = ai[i + 1] - iai; 330853ef36baSBarry Smith v = aa + bs2 * iai; 33092d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 331053ef36baSBarry Smith ri = r + bs * aj[iai + j]; 33112d61bbb3SSatish Balay for (k = 0; k < bs; k++) { 33122d61bbb3SSatish Balay x = ri[k]; 331353ef36baSBarry Smith for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 331453ef36baSBarry Smith v += bs; 33152d61bbb3SSatish Balay } 33162d61bbb3SSatish Balay } 33172d61bbb3SSatish Balay } 33189566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(rr, &r)); 33199566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33202d61bbb3SSatish Balay } 33212d61bbb3SSatish Balay PetscFunctionReturn(0); 33222d61bbb3SSatish Balay } 33232d61bbb3SSatish Balay 33249371c9d4SSatish Balay PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, 
MatInfoType flag, MatInfo *info) { 33252d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33262d61bbb3SSatish Balay 33272d61bbb3SSatish Balay PetscFunctionBegin; 33282d61bbb3SSatish Balay info->block_size = a->bs2; 3329ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; 33302d61bbb3SSatish Balay info->nz_used = a->bs2 * a->nz; 33313966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 33322d61bbb3SSatish Balay info->assemblies = A->num_ass; 33338e58a170SBarry Smith info->mallocs = A->info.mallocs; 33347adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 3335d5f3da31SBarry Smith if (A->factortype) { 33362d61bbb3SSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 33372d61bbb3SSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 33382d61bbb3SSatish Balay info->factor_mallocs = A->info.factor_mallocs; 33392d61bbb3SSatish Balay } else { 33402d61bbb3SSatish Balay info->fill_ratio_given = 0; 33412d61bbb3SSatish Balay info->fill_ratio_needed = 0; 33422d61bbb3SSatish Balay info->factor_mallocs = 0; 33432d61bbb3SSatish Balay } 33442d61bbb3SSatish Balay PetscFunctionReturn(0); 33452d61bbb3SSatish Balay } 33462d61bbb3SSatish Balay 33479371c9d4SSatish Balay PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) { 33482d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33492d61bbb3SSatish Balay 33502d61bbb3SSatish Balay PetscFunctionBegin; 33519566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 33522d61bbb3SSatish Balay PetscFunctionReturn(0); 33532d61bbb3SSatish Balay } 3354a001520aSPierre Jolivet 33559371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) { 3356a001520aSPierre Jolivet PetscFunctionBegin; 33579566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 33584222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3359a001520aSPierre Jolivet 
PetscFunctionReturn(0); 3360a001520aSPierre Jolivet } 3361a001520aSPierre Jolivet 33629371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 336374eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3364f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1; 3365bcf10a7aSPierre Jolivet const PetscScalar *xb; 336674eeabc5SPierre Jolivet PetscScalar x1; 336774eeabc5SPierre Jolivet const MatScalar *v, *vv; 336874eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 336974eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 337074eeabc5SPierre Jolivet 337174eeabc5SPierre Jolivet PetscFunctionBegin; 337274eeabc5SPierre Jolivet idx = a->j; 337374eeabc5SPierre Jolivet v = a->a; 337474eeabc5SPierre Jolivet if (usecprow) { 337574eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 337674eeabc5SPierre Jolivet ii = a->compressedrow.i; 337774eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 337874eeabc5SPierre Jolivet } else { 337974eeabc5SPierre Jolivet mbs = a->mbs; 338074eeabc5SPierre Jolivet ii = a->i; 338174eeabc5SPierre Jolivet z = c; 338274eeabc5SPierre Jolivet } 338374eeabc5SPierre Jolivet 338474eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 33859371c9d4SSatish Balay n = ii[1] - ii[0]; 33869371c9d4SSatish Balay ii++; 338774eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 338874eeabc5SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 338974eeabc5SPierre Jolivet if (usecprow) z = c + ridx[i]; 339074eeabc5SPierre Jolivet jj = idx; 339174eeabc5SPierre Jolivet vv = v; 339274eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 339374eeabc5SPierre Jolivet idx = jj; 339474eeabc5SPierre Jolivet v = vv; 339574eeabc5SPierre Jolivet sum1 = 0.0; 339674eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 
33979371c9d4SSatish Balay xb = b + (*idx++); 33989371c9d4SSatish Balay x1 = xb[0 + k * bm]; 339974eeabc5SPierre Jolivet sum1 += v[0] * x1; 340074eeabc5SPierre Jolivet v += 1; 340174eeabc5SPierre Jolivet } 3402feb237baSPierre Jolivet z[0 + k * cm] = sum1; 340374eeabc5SPierre Jolivet } 340474eeabc5SPierre Jolivet if (!usecprow) z += 1; 340574eeabc5SPierre Jolivet } 340674eeabc5SPierre Jolivet PetscFunctionReturn(0); 340774eeabc5SPierre Jolivet } 340874eeabc5SPierre Jolivet 34099371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 34104b7054f4SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3411f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2; 3412bcf10a7aSPierre Jolivet const PetscScalar *xb; 34134b7054f4SPierre Jolivet PetscScalar x1, x2; 34144b7054f4SPierre Jolivet const MatScalar *v, *vv; 34154b7054f4SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 34164b7054f4SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 34174b7054f4SPierre Jolivet 34184b7054f4SPierre Jolivet PetscFunctionBegin; 34194b7054f4SPierre Jolivet idx = a->j; 34204b7054f4SPierre Jolivet v = a->a; 34214b7054f4SPierre Jolivet if (usecprow) { 34224b7054f4SPierre Jolivet mbs = a->compressedrow.nrows; 34234b7054f4SPierre Jolivet ii = a->compressedrow.i; 34244b7054f4SPierre Jolivet ridx = a->compressedrow.rindex; 34254b7054f4SPierre Jolivet } else { 34264b7054f4SPierre Jolivet mbs = a->mbs; 34274b7054f4SPierre Jolivet ii = a->i; 34284b7054f4SPierre Jolivet z = c; 34294b7054f4SPierre Jolivet } 34304b7054f4SPierre Jolivet 34314b7054f4SPierre Jolivet for (i = 0; i < mbs; i++) { 34329371c9d4SSatish Balay n = ii[1] - ii[0]; 34339371c9d4SSatish Balay ii++; 34344b7054f4SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 34354b7054f4SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, 
PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 34364b7054f4SPierre Jolivet if (usecprow) z = c + 2 * ridx[i]; 34374b7054f4SPierre Jolivet jj = idx; 34384b7054f4SPierre Jolivet vv = v; 34394b7054f4SPierre Jolivet for (k = 0; k < cn; k++) { 34404b7054f4SPierre Jolivet idx = jj; 34414b7054f4SPierre Jolivet v = vv; 34429371c9d4SSatish Balay sum1 = 0.0; 34439371c9d4SSatish Balay sum2 = 0.0; 34444b7054f4SPierre Jolivet for (j = 0; j < n; j++) { 34459371c9d4SSatish Balay xb = b + 2 * (*idx++); 34469371c9d4SSatish Balay x1 = xb[0 + k * bm]; 34479371c9d4SSatish Balay x2 = xb[1 + k * bm]; 34484b7054f4SPierre Jolivet sum1 += v[0] * x1 + v[2] * x2; 34494b7054f4SPierre Jolivet sum2 += v[1] * x1 + v[3] * x2; 34504b7054f4SPierre Jolivet v += 4; 34514b7054f4SPierre Jolivet } 34529371c9d4SSatish Balay z[0 + k * cm] = sum1; 34539371c9d4SSatish Balay z[1 + k * cm] = sum2; 34544b7054f4SPierre Jolivet } 34554b7054f4SPierre Jolivet if (!usecprow) z += 2; 34564b7054f4SPierre Jolivet } 34574b7054f4SPierre Jolivet PetscFunctionReturn(0); 34584b7054f4SPierre Jolivet } 34594b7054f4SPierre Jolivet 34609371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 346174eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3462f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3; 3463bcf10a7aSPierre Jolivet const PetscScalar *xb; 346474eeabc5SPierre Jolivet PetscScalar x1, x2, x3; 346574eeabc5SPierre Jolivet const MatScalar *v, *vv; 346674eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 346774eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 346874eeabc5SPierre Jolivet 346974eeabc5SPierre Jolivet PetscFunctionBegin; 347074eeabc5SPierre Jolivet idx = a->j; 347174eeabc5SPierre Jolivet v = a->a; 347274eeabc5SPierre Jolivet if (usecprow) { 347374eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 347474eeabc5SPierre Jolivet ii = 
a->compressedrow.i; 347574eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 347674eeabc5SPierre Jolivet } else { 347774eeabc5SPierre Jolivet mbs = a->mbs; 347874eeabc5SPierre Jolivet ii = a->i; 347974eeabc5SPierre Jolivet z = c; 348074eeabc5SPierre Jolivet } 348174eeabc5SPierre Jolivet 348274eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 34839371c9d4SSatish Balay n = ii[1] - ii[0]; 34849371c9d4SSatish Balay ii++; 348574eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 348674eeabc5SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 348774eeabc5SPierre Jolivet if (usecprow) z = c + 3 * ridx[i]; 348874eeabc5SPierre Jolivet jj = idx; 348974eeabc5SPierre Jolivet vv = v; 349074eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 349174eeabc5SPierre Jolivet idx = jj; 349274eeabc5SPierre Jolivet v = vv; 34939371c9d4SSatish Balay sum1 = 0.0; 34949371c9d4SSatish Balay sum2 = 0.0; 34959371c9d4SSatish Balay sum3 = 0.0; 349674eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 34979371c9d4SSatish Balay xb = b + 3 * (*idx++); 34989371c9d4SSatish Balay x1 = xb[0 + k * bm]; 34999371c9d4SSatish Balay x2 = xb[1 + k * bm]; 35009371c9d4SSatish Balay x3 = xb[2 + k * bm]; 350174eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 350274eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 350374eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 350474eeabc5SPierre Jolivet v += 9; 350574eeabc5SPierre Jolivet } 35069371c9d4SSatish Balay z[0 + k * cm] = sum1; 35079371c9d4SSatish Balay z[1 + k * cm] = sum2; 35089371c9d4SSatish Balay z[2 + k * cm] = sum3; 350974eeabc5SPierre Jolivet } 351074eeabc5SPierre Jolivet if (!usecprow) z += 3; 351174eeabc5SPierre Jolivet } 351274eeabc5SPierre Jolivet PetscFunctionReturn(0); 351374eeabc5SPierre Jolivet } 351474eeabc5SPierre Jolivet 35159371c9d4SSatish Balay 
/*
   MatMatMult_SeqBAIJ_4_Private - Kernel for C = A * B with a SeqBAIJ matrix A of block size 4.

   Input Parameters:
+  A  - the SeqBAIJ matrix
.  b  - values of B; entry (r, k) is read from b[r + k * bm]
.  bm - stride between consecutive columns of b
.  cm - stride between consecutive columns of c
-  cn - number of columns of B and C to process

   Output Parameter:
.  c - values of C; entry (r, k) is written to c[r + k * cm]

   Notes:
   Entry (r, s) of a 4 x 4 block is read from offset r + 4 * s. With compressed-row storage only the block
   rows listed in compressedrow.rindex are written.
*/
PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
  Mat_SeqBAIJ     *baij       = (Mat_SeqBAIJ *)A->data;
  const PetscBool  compressed = baij->compressedrow.use;
  const PetscInt  *rowptr     = compressed ? baij->compressedrow.i : baij->i;
  const PetscInt  *rowmap     = compressed ? baij->compressedrow.rindex : NULL;
  const PetscInt   nrows      = compressed ? baij->compressedrow.nrows : baij->mbs;
  const PetscInt  *cols       = baij->j; /* cursor over block-column indices */
  const MatScalar *vals       = baij->a; /* cursor over block values */
  PetscInt         row, col, p;

  PetscFunctionBegin;
  for (row = 0; row < nrows; row++) {
    const PetscInt   nblocks = rowptr[row + 1] - rowptr[row];
    const PetscInt  *rowcols = cols;
    const MatScalar *rowvals = vals;
    PetscScalar     *out     = c + 4 * (compressed ? rowmap[row] : row);

    PetscPrefetchBlock(cols + nblocks, nblocks, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(vals + 16 * nblocks, 16 * nblocks, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    for (col = 0; col < cn; col++) {
      const PetscScalar *bcol = b + col * bm;
      PetscScalar        acc0 = 0.0, acc1 = 0.0, acc2 = 0.0, acc3 = 0.0;

      /* every column of B sweeps the same row of blocks, so rewind both cursors */
      cols = rowcols;
      vals = rowvals;
      for (p = 0; p < nblocks; p++, vals += 16) {
        const PetscScalar *xb = bcol + 4 * (*cols++);
        const PetscScalar  x0 = xb[0], x1 = xb[1], x2 = xb[2], x3 = xb[3];

        acc0 += vals[0] * x0 + vals[4] * x1 + vals[8] * x2 + vals[12] * x3;
        acc1 += vals[1] * x0 + vals[5] * x1 + vals[9] * x2 + vals[13] * x3;
        acc2 += vals[2] * x0 + vals[6] * x1 + vals[10] * x2 + vals[14] * x3;
        acc3 += vals[3] * x0 + vals[7] * x1 + vals[11] * x2 + vals[15] * x3;
      }
      out[col * cm + 0] = acc0;
      out[col * cm + 1] = acc1;
      out[col * cm + 2] = acc2;
      out[col * cm + 3] = acc3;
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatMatMult_SeqBAIJ_5_Private - Kernel for C = A * B with a SeqBAIJ matrix A of block size 5.

   Input Parameters:
+  A  - the SeqBAIJ matrix
.  b  - values of B; entry (r, k) is read from b[r + k * bm]
.  bm - stride between consecutive columns of b
.  cm - stride between consecutive columns of c
-  cn - number of columns of B and C to process

   Output Parameter:
.  c - values of C; entry (r, k) is written to c[r + k * cm]

   Notes:
   Entry (r, s) of a 5 x 5 block is read from offset r + 5 * s. With compressed-row storage only the block
   rows listed in compressedrow.rindex are written.
*/
PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
  Mat_SeqBAIJ     *baij       = (Mat_SeqBAIJ *)A->data;
  const PetscBool  compressed = baij->compressedrow.use;
  const PetscInt  *rowptr     = compressed ? baij->compressedrow.i : baij->i;
  const PetscInt  *rowmap     = compressed ? baij->compressedrow.rindex : NULL;
  const PetscInt   nrows      = compressed ? baij->compressedrow.nrows : baij->mbs;
  const PetscInt  *cols       = baij->j; /* cursor over block-column indices */
  const MatScalar *vals       = baij->a; /* cursor over block values */
  PetscInt         row, col, p;

  PetscFunctionBegin;
  for (row = 0; row < nrows; row++) {
    const PetscInt   nblocks = rowptr[row + 1] - rowptr[row];
    const PetscInt  *rowcols = cols;
    const MatScalar *rowvals = vals;
    PetscScalar     *out     = c + 5 * (compressed ? rowmap[row] : row);

    PetscPrefetchBlock(cols + nblocks, nblocks, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(vals + 25 * nblocks, 25 * nblocks, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    for (col = 0; col < cn; col++) {
      const PetscScalar *bcol = b + col * bm;
      PetscScalar        acc0 = 0.0, acc1 = 0.0, acc2 = 0.0, acc3 = 0.0, acc4 = 0.0;

      /* every column of B sweeps the same row of blocks, so rewind both cursors */
      cols = rowcols;
      vals = rowvals;
      for (p = 0; p < nblocks; p++, vals += 25) {
        const PetscScalar *xb = bcol + 5 * (*cols++);
        const PetscScalar  x0 = xb[0], x1 = xb[1], x2 = xb[2], x3 = xb[3], x4 = xb[4];

        acc0 += vals[0] * x0 + vals[5] * x1 + vals[10] * x2 + vals[15] * x3 + vals[20] * x4;
        acc1 += vals[1] * x0 + vals[6] * x1 + vals[11] * x2 + vals[16] * x3 + vals[21] * x4;
        acc2 += vals[2] * x0 + vals[7] * x1 + vals[12] * x2 + vals[17] * x3 + vals[22] * x4;
        acc3 += vals[3] * x0 + vals[8] * x1 + vals[13] * x2 + vals[18] * x3 + vals[23] * x4;
        acc4 += vals[4] * x0 + vals[9] * x1 + vals[14] * x2 + vals[19] * x3 + vals[24] * x4;
      }
      out[col * cm + 0] = acc0;
      out[col * cm + 1] = acc1;
      out[col * cm + 2] = acc2;
      out[col * cm + 3] = acc3;
      out[col * cm + 4] = acc4;
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatMatMultNumeric_SeqBAIJ_SeqDense - Numeric phase of C = A * B for SeqBAIJ A and SeqDense B.

   Input Parameters:
+  A - the SeqBAIJ matrix
-  B - the dense right-hand side

   Output Parameter:
.  C - the dense product

   Notes:
   Returns immediately when C has a zero leading dimension or B has no columns. Block sizes 1 through 5
   dispatch to the unrolled kernels; larger block sizes issue one BLAS gemm per stored block.

   C is zeroed first whenever a->nonzerorowcnt differs from the number of rows of A, since the kernels
   only write the rows they visit.
*/
PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) {
  Mat_SeqBAIJ   *baij   = (Mat_SeqBAIJ *)A->data;
  Mat_SeqDense  *bdense = (Mat_SeqDense *)B->data;
  Mat_SeqDense  *cdense = (Mat_SeqDense *)C->data;
  const PetscInt bs = A->rmap->bs, bs2 = baij->bs2;
  const PetscInt bm = bdense->lda, cm = cdense->lda, cn = B->cmap->n;
  PetscScalar   *b, *c;

  PetscFunctionBegin;
  if (!cm || !cn) PetscFunctionReturn(0);
  PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
  PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
  PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
  b = bdense->v;
  if (baij->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
  PetscCall(MatDenseGetArray(C, &c));
  switch (bs) {
  case 1:
    PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
    break;
  case 2:
    PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
    break;
  case 3:
    PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
    break;
  case 4:
    PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
    break;
  case 5:
    PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
    break;
  default: { /* block sizes larger than 5 by 5 are handled by BLAS */
    const PetscBool  compressed = baij->compressedrow.use;
    const PetscInt  *rowptr, *rowmap = NULL, *cols;
    const MatScalar *vals;
    PetscScalar      zero = 0.0, one = 1.0;
    PetscBLASInt     bbs, bcn, bbm, bcm;
    PetscInt         nrows, row, p;

    PetscCall(PetscBLASIntCast(bs, &bbs));
    PetscCall(PetscBLASIntCast(cn, &bcn));
    PetscCall(PetscBLASIntCast(bm, &bbm));
    PetscCall(PetscBLASIntCast(cm, &bcm));
    cols = baij->j;
    vals = baij->a;
    if (compressed) {
      nrows  = baij->compressedrow.nrows;
      rowptr = baij->compressedrow.i;
      rowmap = baij->compressedrow.rindex;
    } else {
      nrows  = baij->mbs;
      rowptr = baij->i;
    }
    for (row = 0; row < nrows; row++) {
      const PetscInt nblocks = rowptr[row + 1] - rowptr[row];
      PetscScalar   *out     = c + bs * (compressed ? rowmap[row] : row);

      for (p = 0; p < nblocks; p++, vals += bs2) {
        /* the first block of a row overwrites the output (beta = 0), the remaining ones accumulate (beta = 1) */
        PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &one, vals, &bbs, b + bs * (*cols++), &bbm, p ? &one : &zero, out, &bcm));
      }
    }
  }
  }
  PetscCall(MatDenseRestoreArray(C, &c));
  PetscCall(PetscLogFlops((2.0 * baij->nz * bs2 - bs * baij->nonzerorowcnt) * cn));
  PetscFunctionReturn(0);
}