1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 2a001520aSPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <petscblaslapack.h> 6cac129eeSSatish Balay 75f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 896e086a2SDaniel Kokron #include <immintrin.h> 996e086a2SDaniel Kokron #endif 1096e086a2SDaniel Kokron 11d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 12d71ae5a4SJacob Faibussowitsch { 13a3192f15SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 145d0c19d7SBarry Smith PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; 155d0c19d7SBarry Smith const PetscInt *idx; 167bede89fSBarry Smith PetscInt start, end, *ai, *aj, bs; 17f1af5d2fSBarry Smith PetscBT table; 18a3192f15SSatish Balay 193a40ed3dSBarry Smith PetscFunctionBegin; 20a3192f15SSatish Balay m = a->mbs; 21a3192f15SSatish Balay ai = a->i; 22a3192f15SSatish Balay aj = a->j; 23d0f46423SBarry Smith bs = A->rmap->bs; 24a3192f15SSatish Balay 2508401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 26a3192f15SSatish Balay 279566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(m, &table)); 289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &nidx)); 29a3192f15SSatish Balay 30a3192f15SSatish Balay for (i = 0; i < is_max; i++) { 31a3192f15SSatish Balay /* Initialise the two local arrays */ 32a3192f15SSatish Balay isz = 0; 339566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(m, table)); 34a3192f15SSatish Balay 35a3192f15SSatish Balay /* Extract the indices, assume there can be duplicate entries */ 369566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], 
&idx)); 379566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 38a3192f15SSatish Balay 39a3192f15SSatish Balay /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40a3192f15SSatish Balay for (j = 0; j < n; ++j) { 41218c64b6SSatish Balay ival = idx[j] / bs; /* convert the indices into block indices */ 4208401ef6SPierre Jolivet PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 4326fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; 44a3192f15SSatish Balay } 459566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 469566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 47a3192f15SSatish Balay 48a3192f15SSatish Balay k = 0; 49a3192f15SSatish Balay for (j = 0; j < ov; j++) { /* for each overlap*/ 50a3192f15SSatish Balay n = isz; 51a3192f15SSatish Balay for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52a3192f15SSatish Balay row = nidx[k]; 53a3192f15SSatish Balay start = ai[row]; 54a3192f15SSatish Balay end = ai[row + 1]; 55a3192f15SSatish Balay for (l = start; l < end; l++) { 56a3192f15SSatish Balay val = aj[l]; 5726fbe8dcSKarl Rupp if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; 58a3192f15SSatish Balay } 59a3192f15SSatish Balay } 60a3192f15SSatish Balay } 617bede89fSBarry Smith PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 62a3192f15SSatish Balay } 639566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table)); 649566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 65*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 66a3192f15SSatish Balay } 671c351548SSatish Balay 68d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 69d71ae5a4SJacob Faibussowitsch { 70736121d4SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; 71690b6cddSBarry Smith PetscInt *smap, 
i, k, kstart, kend, oldcols = a->nbs, *lens; 72690b6cddSBarry Smith PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 735d0c19d7SBarry Smith const PetscInt *irow, *icol; 745d0c19d7SBarry Smith PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 75690b6cddSBarry Smith PetscInt *aj = a->j, *ai = a->i; 763f1db9ecSBarry Smith MatScalar *mat_a; 77736121d4SSatish Balay Mat C; 786041f1b1SToby Isaac PetscBool flag; 79736121d4SSatish Balay 803a40ed3dSBarry Smith PetscFunctionBegin; 819566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 829566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 839566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 849566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 85736121d4SSatish Balay 869566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 87736121d4SSatish Balay ssmap = smap; 889566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 89736121d4SSatish Balay for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 90736121d4SSatish Balay /* determine lens of each row */ 91736121d4SSatish Balay for (i = 0; i < nrows; i++) { 92736121d4SSatish Balay kstart = ai[irow[i]]; 93736121d4SSatish Balay kend = kstart + a->ilen[irow[i]]; 94736121d4SSatish Balay lens[i] = 0; 95736121d4SSatish Balay for (k = kstart; k < kend; k++) { 9626fbe8dcSKarl Rupp if (ssmap[aj[k]]) lens[i]++; 97736121d4SSatish Balay } 98736121d4SSatish Balay } 99736121d4SSatish Balay /* Create and fill new matrix */ 100736121d4SSatish Balay if (scall == MAT_REUSE_MATRIX) { 101736121d4SSatish Balay c = (Mat_SeqBAIJ *)((*B)->data); 102736121d4SSatish Balay 103aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1049566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 10528b400f6SJacob Faibussowitsch PetscCheck(flag, PETSC_COMM_SELF, 
PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 1069566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 107736121d4SSatish Balay C = *B; 1083a40ed3dSBarry Smith } else { 1099566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1109566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 1119566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1129566063dSJacob Faibussowitsch PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 113736121d4SSatish Balay } 114736121d4SSatish Balay c = (Mat_SeqBAIJ *)(C->data); 115736121d4SSatish Balay for (i = 0; i < nrows; i++) { 116736121d4SSatish Balay row = irow[i]; 117736121d4SSatish Balay kstart = ai[row]; 118736121d4SSatish Balay kend = kstart + a->ilen[row]; 119736121d4SSatish Balay mat_i = c->i[i]; 120d29f2997SMatthew Woehlke mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 121d29f2997SMatthew Woehlke mat_a = c->a ? 
c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */ 122736121d4SSatish Balay mat_ilen = c->ilen + i; 123736121d4SSatish Balay for (k = kstart; k < kend; k++) { 124736121d4SSatish Balay if ((tcol = ssmap[a->j[k]])) { 125736121d4SSatish Balay *mat_j++ = tcol - 1; 1269566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 127549d3d68SSatish Balay mat_a += bs2; 128736121d4SSatish Balay (*mat_ilen)++; 129736121d4SSatish Balay } 130736121d4SSatish Balay } 131736121d4SSatish Balay } 132cdc6f3adSToby Isaac /* sort */ 133d29f2997SMatthew Woehlke if (c->j && c->a) { 134cdc6f3adSToby Isaac MatScalar *work; 1359566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 136cdc6f3adSToby Isaac for (i = 0; i < nrows; i++) { 137cdc6f3adSToby Isaac PetscInt ilen; 138cdc6f3adSToby Isaac mat_i = c->i[i]; 139cdc6f3adSToby Isaac mat_j = c->j + mat_i; 140cdc6f3adSToby Isaac mat_a = c->a + mat_i * bs2; 141cdc6f3adSToby Isaac ilen = c->ilen[i]; 1429566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 143cdc6f3adSToby Isaac } 1449566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 145cdc6f3adSToby Isaac } 146218c64b6SSatish Balay 147736121d4SSatish Balay /* Free work space */ 1489566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1499566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1509566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1519566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1529566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 153736121d4SSatish Balay 1549566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 155736121d4SSatish Balay *B = C; 156*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 157736121d4SSatish Balay } 158736121d4SSatish Balay 159d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS 
isrow, IS iscol, MatReuse scall, Mat *B) 160d71ae5a4SJacob Faibussowitsch { 161218c64b6SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 162218c64b6SSatish Balay IS is1, is2; 163afebec48SHong Zhang PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j; 1645d0c19d7SBarry Smith const PetscInt *irow, *icol; 165218c64b6SSatish Balay 1663a40ed3dSBarry Smith PetscFunctionBegin; 1679566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1689566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1699566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1709566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 171218c64b6SSatish Balay 172218c64b6SSatish Balay /* Verify if the indices corespond to each element in a block 173218c64b6SSatish Balay and form the IS with compressed IS */ 174f8ecb639SStefano Zampini maxmnbs = PetscMax(a->mbs, a->nbs); 1759566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 1769566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->mbs)); 177218c64b6SSatish Balay for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 178ad540459SPierre Jolivet for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 1796041f1b1SToby Isaac count = 0; 1806041f1b1SToby Isaac for (i = 0; i < nrows; i++) { 181afebec48SHong Zhang j = irow[i] / bs; 1826041f1b1SToby Isaac if ((vary[j]--) == bs) iary[count++] = j; 183218c64b6SSatish Balay } 1849566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 185218c64b6SSatish Balay 1869566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->nbs)); 187218c64b6SSatish Balay for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 188ad540459SPierre Jolivet for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal 
error in PETSc"); 1896041f1b1SToby Isaac count = 0; 1906041f1b1SToby Isaac for (i = 0; i < ncols; i++) { 191afebec48SHong Zhang j = icol[i] / bs; 1926041f1b1SToby Isaac if ((vary[j]--) == bs) iary[count++] = j; 1936041f1b1SToby Isaac } 1949566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 1959566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 1969566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1979566063dSJacob Faibussowitsch PetscCall(PetscFree2(vary, iary)); 198218c64b6SSatish Balay 1999566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); 2009566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2019566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 202*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 203218c64b6SSatish Balay } 204218c64b6SSatish Balay 205d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 206d71ae5a4SJacob Faibussowitsch { 20716b64355SHong Zhang Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; 2085c39f6d9SHong Zhang Mat_SubSppt *submatj = c->submatis1; 20916b64355SHong Zhang 21016b64355SHong Zhang PetscFunctionBegin; 2119566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 2129566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 213*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21416b64355SHong Zhang } 21516b64355SHong Zhang 21689a1a59bSHong Zhang /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 217d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) 218d71ae5a4SJacob Faibussowitsch { 21986e85357SHong Zhang PetscInt i; 22086e85357SHong Zhang Mat C; 22186e85357SHong Zhang Mat_SeqBAIJ *c; 22286e85357SHong Zhang Mat_SubSppt *submatj; 22386e85357SHong Zhang 22486e85357SHong Zhang PetscFunctionBegin; 22586e85357SHong Zhang for (i = 
0; i < n; i++) { 22686e85357SHong Zhang C = (*mat)[i]; 22786e85357SHong Zhang c = (Mat_SeqBAIJ *)C->data; 22886e85357SHong Zhang submatj = c->submatis1; 22986e85357SHong Zhang if (submatj) { 2307daefbafSJunchao Zhang if (--((PetscObject)C)->refct <= 0) { 23126cc229bSBarry Smith PetscCall(PetscFree(C->factorprefix)); 2329566063dSJacob Faibussowitsch PetscCall((*submatj->destroy)(C)); 2339566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrix_Private(submatj)); 2349566063dSJacob Faibussowitsch PetscCall(PetscFree(C->defaultvectype)); 2353faff063SStefano Zampini PetscCall(PetscFree(C->defaultrandtype)); 2369566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->rmap)); 2379566063dSJacob Faibussowitsch PetscCall(PetscLayoutDestroy(&C->cmap)); 2389566063dSJacob Faibussowitsch PetscCall(PetscHeaderDestroy(&C)); 2397daefbafSJunchao Zhang } 24086e85357SHong Zhang } else { 2419566063dSJacob Faibussowitsch PetscCall(MatDestroy(&C)); 24286e85357SHong Zhang } 24386e85357SHong Zhang } 2447daefbafSJunchao Zhang 2457daefbafSJunchao Zhang /* Destroy Dummy submatrices created for reuse */ 2469566063dSJacob Faibussowitsch PetscCall(MatDestroySubMatrices_Dummy(n, mat)); 2477daefbafSJunchao Zhang 2489566063dSJacob Faibussowitsch PetscCall(PetscFree(*mat)); 249*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 25086e85357SHong Zhang } 25186e85357SHong Zhang 252d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 253d71ae5a4SJacob Faibussowitsch { 254690b6cddSBarry Smith PetscInt i; 255736121d4SSatish Balay 2563a40ed3dSBarry Smith PetscFunctionBegin; 25748a46eb9SPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 258736121d4SSatish Balay 25948a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 260*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
261736121d4SSatish Balay } 262218c64b6SSatish Balay 2632d61bbb3SSatish Balay /* -------------------------------------------------------*/ 2642d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */ 2652d61bbb3SSatish Balay /* -------------------------------------------------------*/ 2662d61bbb3SSatish Balay 267d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) 268d71ae5a4SJacob Faibussowitsch { 2692d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 270d9fead3dSBarry Smith PetscScalar *z, sum; 271d9fead3dSBarry Smith const PetscScalar *x; 272d9fead3dSBarry Smith const MatScalar *v; 2737c565772SBarry Smith PetscInt mbs, i, n; 2740298fd71SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 275ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2762d61bbb3SSatish Balay 2772d61bbb3SSatish Balay PetscFunctionBegin; 2789566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2799566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &z)); 2802d61bbb3SSatish Balay 28126e093fcSHong Zhang if (usecprow) { 28226e093fcSHong Zhang mbs = a->compressedrow.nrows; 28326e093fcSHong Zhang ii = a->compressedrow.i; 2847b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 2859566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(z, a->mbs)); 28626e093fcSHong Zhang } else { 28726e093fcSHong Zhang mbs = a->mbs; 2882d61bbb3SSatish Balay ii = a->i; 28926e093fcSHong Zhang } 2902d61bbb3SSatish Balay 2912d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 292ee54c7eeSHong Zhang n = ii[1] - ii[0]; 293ee54c7eeSHong Zhang v = a->a + ii[0]; 294ee54c7eeSHong Zhang idx = a->j + ii[0]; 295ee54c7eeSHong Zhang ii++; 296444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 297444d8c10SJed Brown PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2982d61bbb3SSatish Balay sum = 0.0; 
2992162cab8SBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 30026e093fcSHong Zhang if (usecprow) { 3017b2bb3b9SHong Zhang z[ridx[i]] = sum; 30226e093fcSHong Zhang } else { 3032d61bbb3SSatish Balay z[i] = sum; 3042d61bbb3SSatish Balay } 30526e093fcSHong Zhang } 3069566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3079566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &z)); 3089566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); 309*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3102d61bbb3SSatish Balay } 3112d61bbb3SSatish Balay 312d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) 313d71ae5a4SJacob Faibussowitsch { 3142d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 315f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, *zarray; 316d9fead3dSBarry Smith const PetscScalar *x, *xb; 31787828ca2SBarry Smith PetscScalar x1, x2; 318d9fead3dSBarry Smith const MatScalar *v; 3197c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 320ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 3212d61bbb3SSatish Balay 3222d61bbb3SSatish Balay PetscFunctionBegin; 3239566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3249566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 3252d61bbb3SSatish Balay 3262d61bbb3SSatish Balay idx = a->j; 3272d61bbb3SSatish Balay v = a->a; 32826e093fcSHong Zhang if (usecprow) { 32926e093fcSHong Zhang mbs = a->compressedrow.nrows; 33026e093fcSHong Zhang ii = a->compressedrow.i; 3317b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3329566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); 33326e093fcSHong Zhang } else { 33426e093fcSHong Zhang mbs = a->mbs; 3352d61bbb3SSatish Balay ii = a->i; 33626e093fcSHong Zhang z = zarray; 33726e093fcSHong Zhang } 3382d61bbb3SSatish Balay 3392d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 
3409371c9d4SSatish Balay n = ii[1] - ii[0]; 3419371c9d4SSatish Balay ii++; 3429371c9d4SSatish Balay sum1 = 0.0; 3439371c9d4SSatish Balay sum2 = 0.0; 344444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 345444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3462d61bbb3SSatish Balay for (j = 0; j < n; j++) { 3479371c9d4SSatish Balay xb = x + 2 * (*idx++); 3489371c9d4SSatish Balay x1 = xb[0]; 3499371c9d4SSatish Balay x2 = xb[1]; 3502d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 3512d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 3522d61bbb3SSatish Balay v += 4; 3532d61bbb3SSatish Balay } 3547b2bb3b9SHong Zhang if (usecprow) z = zarray + 2 * ridx[i]; 3559371c9d4SSatish Balay z[0] = sum1; 3569371c9d4SSatish Balay z[1] = sum2; 35726e093fcSHong Zhang if (!usecprow) z += 2; 3582d61bbb3SSatish Balay } 3599566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3609566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 3619566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); 362*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3632d61bbb3SSatish Balay } 3642d61bbb3SSatish Balay 365d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) 366d71ae5a4SJacob Faibussowitsch { 3672d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 368f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; 369d9fead3dSBarry Smith const PetscScalar *x, *xb; 370d9fead3dSBarry Smith const MatScalar *v; 3717c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 372ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 37326e093fcSHong Zhang 374b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 375fee21e36SBarry Smith #pragma disjoint(*v, *z, *xb) 
376fee21e36SBarry Smith #endif 377fee21e36SBarry Smith 3782d61bbb3SSatish Balay PetscFunctionBegin; 3799566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3809566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 3812d61bbb3SSatish Balay 3822d61bbb3SSatish Balay idx = a->j; 3832d61bbb3SSatish Balay v = a->a; 38426e093fcSHong Zhang if (usecprow) { 38526e093fcSHong Zhang mbs = a->compressedrow.nrows; 38626e093fcSHong Zhang ii = a->compressedrow.i; 3877b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 3889566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); 38926e093fcSHong Zhang } else { 39026e093fcSHong Zhang mbs = a->mbs; 3912d61bbb3SSatish Balay ii = a->i; 39226e093fcSHong Zhang z = zarray; 39326e093fcSHong Zhang } 3942d61bbb3SSatish Balay 3952d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 3969371c9d4SSatish Balay n = ii[1] - ii[0]; 3979371c9d4SSatish Balay ii++; 3989371c9d4SSatish Balay sum1 = 0.0; 3999371c9d4SSatish Balay sum2 = 0.0; 4009371c9d4SSatish Balay sum3 = 0.0; 401444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 402444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4032d61bbb3SSatish Balay for (j = 0; j < n; j++) { 40426fbe8dcSKarl Rupp xb = x + 3 * (*idx++); 40526fbe8dcSKarl Rupp x1 = xb[0]; 40626fbe8dcSKarl Rupp x2 = xb[1]; 40726fbe8dcSKarl Rupp x3 = xb[2]; 40826fbe8dcSKarl Rupp 4092d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 4102d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 4112d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 4122d61bbb3SSatish Balay v += 9; 4132d61bbb3SSatish Balay } 4147b2bb3b9SHong Zhang if (usecprow) z = zarray + 3 * ridx[i]; 4159371c9d4SSatish Balay z[0] = sum1; 4169371c9d4SSatish Balay z[1] = sum2; 4179371c9d4SSatish Balay z[2] = sum3; 41826e093fcSHong Zhang if 
(!usecprow) z += 3; 4192d61bbb3SSatish Balay } 4209566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4219566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 4229566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); 423*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4242d61bbb3SSatish Balay } 4252d61bbb3SSatish Balay 426d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) 427d71ae5a4SJacob Faibussowitsch { 4282d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 429f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; 430d9fead3dSBarry Smith const PetscScalar *x, *xb; 431d9fead3dSBarry Smith const MatScalar *v; 4327c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 433ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4342d61bbb3SSatish Balay 4352d61bbb3SSatish Balay PetscFunctionBegin; 4369566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4379566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 4382d61bbb3SSatish Balay 4392d61bbb3SSatish Balay idx = a->j; 4402d61bbb3SSatish Balay v = a->a; 44126e093fcSHong Zhang if (usecprow) { 44226e093fcSHong Zhang mbs = a->compressedrow.nrows; 44326e093fcSHong Zhang ii = a->compressedrow.i; 4447b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 4459566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); 44626e093fcSHong Zhang } else { 44726e093fcSHong Zhang mbs = a->mbs; 4482d61bbb3SSatish Balay ii = a->i; 44926e093fcSHong Zhang z = zarray; 45026e093fcSHong Zhang } 4512d61bbb3SSatish Balay 4522d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 45326fbe8dcSKarl Rupp n = ii[1] - ii[0]; 45426fbe8dcSKarl Rupp ii++; 45526fbe8dcSKarl Rupp sum1 = 0.0; 45626fbe8dcSKarl Rupp sum2 = 0.0; 45726fbe8dcSKarl Rupp sum3 = 0.0; 45826fbe8dcSKarl Rupp sum4 = 0.0; 45926fbe8dcSKarl Rupp 
460444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 461444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4622d61bbb3SSatish Balay for (j = 0; j < n; j++) { 4632d61bbb3SSatish Balay xb = x + 4 * (*idx++); 4649371c9d4SSatish Balay x1 = xb[0]; 4659371c9d4SSatish Balay x2 = xb[1]; 4669371c9d4SSatish Balay x3 = xb[2]; 4679371c9d4SSatish Balay x4 = xb[3]; 4682d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 4692d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 4702d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 4712d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 4722d61bbb3SSatish Balay v += 16; 4732d61bbb3SSatish Balay } 4747b2bb3b9SHong Zhang if (usecprow) z = zarray + 4 * ridx[i]; 4759371c9d4SSatish Balay z[0] = sum1; 4769371c9d4SSatish Balay z[1] = sum2; 4779371c9d4SSatish Balay z[2] = sum3; 4789371c9d4SSatish Balay z[3] = sum4; 47926e093fcSHong Zhang if (!usecprow) z += 4; 4802d61bbb3SSatish Balay } 4819566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4829566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 4839566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); 484*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4852d61bbb3SSatish Balay } 4862d61bbb3SSatish Balay 487d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) 488d71ae5a4SJacob Faibussowitsch { 4892d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 490f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; 491d9fead3dSBarry Smith const PetscScalar *xb, *x; 492d9fead3dSBarry Smith const MatScalar *v; 4930298fd71SBarry Smith const PetscInt *idx, 
*ii, *ridx = NULL; 4947c565772SBarry Smith PetscInt mbs, i, j, n; 495ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 4962d61bbb3SSatish Balay 497433994e6SBarry Smith PetscFunctionBegin; 4989566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4999566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 5002d61bbb3SSatish Balay 5012d61bbb3SSatish Balay idx = a->j; 5022d61bbb3SSatish Balay v = a->a; 50326e093fcSHong Zhang if (usecprow) { 50426e093fcSHong Zhang mbs = a->compressedrow.nrows; 50526e093fcSHong Zhang ii = a->compressedrow.i; 5067b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 5079566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); 50826e093fcSHong Zhang } else { 50926e093fcSHong Zhang mbs = a->mbs; 5102d61bbb3SSatish Balay ii = a->i; 51126e093fcSHong Zhang z = zarray; 51226e093fcSHong Zhang } 5132d61bbb3SSatish Balay 5142d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 5159371c9d4SSatish Balay n = ii[1] - ii[0]; 5169371c9d4SSatish Balay ii++; 5179371c9d4SSatish Balay sum1 = 0.0; 5189371c9d4SSatish Balay sum2 = 0.0; 5199371c9d4SSatish Balay sum3 = 0.0; 5209371c9d4SSatish Balay sum4 = 0.0; 5219371c9d4SSatish Balay sum5 = 0.0; 522444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 523444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 5242d61bbb3SSatish Balay for (j = 0; j < n; j++) { 5252d61bbb3SSatish Balay xb = x + 5 * (*idx++); 5269371c9d4SSatish Balay x1 = xb[0]; 5279371c9d4SSatish Balay x2 = xb[1]; 5289371c9d4SSatish Balay x3 = xb[2]; 5299371c9d4SSatish Balay x4 = xb[3]; 5309371c9d4SSatish Balay x5 = xb[4]; 5312d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 5322d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 5332d61bbb3SSatish Balay sum3 += v[2] 
* x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 5342d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 5352d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 5362d61bbb3SSatish Balay v += 25; 5372d61bbb3SSatish Balay } 5387b2bb3b9SHong Zhang if (usecprow) z = zarray + 5 * ridx[i]; 5399371c9d4SSatish Balay z[0] = sum1; 5409371c9d4SSatish Balay z[1] = sum2; 5419371c9d4SSatish Balay z[2] = sum3; 5429371c9d4SSatish Balay z[3] = sum4; 5439371c9d4SSatish Balay z[4] = sum5; 54426e093fcSHong Zhang if (!usecprow) z += 5; 5452d61bbb3SSatish Balay } 5469566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5479566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 5489566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); 549*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5502d61bbb3SSatish Balay } 5512d61bbb3SSatish Balay 552d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) 553d71ae5a4SJacob Faibussowitsch { 55415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 555f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 556d9fead3dSBarry Smith const PetscScalar *x, *xb; 55726e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *zarray; 558d9fead3dSBarry Smith const MatScalar *v; 5597c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 560ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 56115091d37SBarry Smith 562433994e6SBarry Smith PetscFunctionBegin; 5639566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5649566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 56515091d37SBarry Smith 56615091d37SBarry Smith idx = a->j; 56715091d37SBarry Smith v = a->a; 56826e093fcSHong Zhang if (usecprow) { 56926e093fcSHong Zhang mbs = a->compressedrow.nrows; 57026e093fcSHong 
Zhang ii = a->compressedrow.i; 5717b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 5729566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); 57326e093fcSHong Zhang } else { 57426e093fcSHong Zhang mbs = a->mbs; 57515091d37SBarry Smith ii = a->i; 57626e093fcSHong Zhang z = zarray; 57726e093fcSHong Zhang } 57815091d37SBarry Smith 57915091d37SBarry Smith for (i = 0; i < mbs; i++) { 58026fbe8dcSKarl Rupp n = ii[1] - ii[0]; 58126fbe8dcSKarl Rupp ii++; 58226fbe8dcSKarl Rupp sum1 = 0.0; 58326fbe8dcSKarl Rupp sum2 = 0.0; 58426fbe8dcSKarl Rupp sum3 = 0.0; 58526fbe8dcSKarl Rupp sum4 = 0.0; 58626fbe8dcSKarl Rupp sum5 = 0.0; 58726fbe8dcSKarl Rupp sum6 = 0.0; 58826fbe8dcSKarl Rupp 589444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 590444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 59115091d37SBarry Smith for (j = 0; j < n; j++) { 59215091d37SBarry Smith xb = x + 6 * (*idx++); 5939371c9d4SSatish Balay x1 = xb[0]; 5949371c9d4SSatish Balay x2 = xb[1]; 5959371c9d4SSatish Balay x3 = xb[2]; 5969371c9d4SSatish Balay x4 = xb[3]; 5979371c9d4SSatish Balay x5 = xb[4]; 5989371c9d4SSatish Balay x6 = xb[5]; 59915091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 60015091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 60115091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 60215091d37SBarry Smith sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 60315091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 60415091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 60515091d37SBarry Smith v += 36; 60615091d37SBarry Smith } 
6077b2bb3b9SHong Zhang if (usecprow) z = zarray + 6 * ridx[i]; 6089371c9d4SSatish Balay z[0] = sum1; 6099371c9d4SSatish Balay z[1] = sum2; 6109371c9d4SSatish Balay z[2] = sum3; 6119371c9d4SSatish Balay z[3] = sum4; 6129371c9d4SSatish Balay z[4] = sum5; 6139371c9d4SSatish Balay z[5] = sum6; 61426e093fcSHong Zhang if (!usecprow) z += 6; 61515091d37SBarry Smith } 61615091d37SBarry Smith 6179566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6189566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 6199566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); 620*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 62115091d37SBarry Smith } 6228ab949d8SShri Abhyankar 623d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) 624d71ae5a4SJacob Faibussowitsch { 6252d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 626f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 627d9fead3dSBarry Smith const PetscScalar *x, *xb; 62826e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; 629d9fead3dSBarry Smith const MatScalar *v; 6307c565772SBarry Smith PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; 631ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 6322d61bbb3SSatish Balay 633433994e6SBarry Smith PetscFunctionBegin; 6349566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6359566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 6362d61bbb3SSatish Balay 6372d61bbb3SSatish Balay idx = a->j; 6382d61bbb3SSatish Balay v = a->a; 63926e093fcSHong Zhang if (usecprow) { 64026e093fcSHong Zhang mbs = a->compressedrow.nrows; 64126e093fcSHong Zhang ii = a->compressedrow.i; 6427b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 6439566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); 64426e093fcSHong Zhang } else { 64526e093fcSHong Zhang mbs = 
a->mbs; 6462d61bbb3SSatish Balay ii = a->i; 64726e093fcSHong Zhang z = zarray; 64826e093fcSHong Zhang } 6492d61bbb3SSatish Balay 6502d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 65126fbe8dcSKarl Rupp n = ii[1] - ii[0]; 65226fbe8dcSKarl Rupp ii++; 65326fbe8dcSKarl Rupp sum1 = 0.0; 65426fbe8dcSKarl Rupp sum2 = 0.0; 65526fbe8dcSKarl Rupp sum3 = 0.0; 65626fbe8dcSKarl Rupp sum4 = 0.0; 65726fbe8dcSKarl Rupp sum5 = 0.0; 65826fbe8dcSKarl Rupp sum6 = 0.0; 65926fbe8dcSKarl Rupp sum7 = 0.0; 66026fbe8dcSKarl Rupp 661444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 662444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 6632d61bbb3SSatish Balay for (j = 0; j < n; j++) { 6642d61bbb3SSatish Balay xb = x + 7 * (*idx++); 6659371c9d4SSatish Balay x1 = xb[0]; 6669371c9d4SSatish Balay x2 = xb[1]; 6679371c9d4SSatish Balay x3 = xb[2]; 6689371c9d4SSatish Balay x4 = xb[3]; 6699371c9d4SSatish Balay x5 = xb[4]; 6709371c9d4SSatish Balay x6 = xb[5]; 6719371c9d4SSatish Balay x7 = xb[6]; 6722d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 6732d61bbb3SSatish Balay sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 6742d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 6752d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 6762d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 6772d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 6782d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 
6792d61bbb3SSatish Balay v += 49; 6802d61bbb3SSatish Balay } 6817b2bb3b9SHong Zhang if (usecprow) z = zarray + 7 * ridx[i]; 6829371c9d4SSatish Balay z[0] = sum1; 6839371c9d4SSatish Balay z[1] = sum2; 6849371c9d4SSatish Balay z[2] = sum3; 6859371c9d4SSatish Balay z[3] = sum4; 6869371c9d4SSatish Balay z[4] = sum5; 6879371c9d4SSatish Balay z[5] = sum6; 6889371c9d4SSatish Balay z[6] = sum7; 68926e093fcSHong Zhang if (!usecprow) z += 7; 6902d61bbb3SSatish Balay } 6912d61bbb3SSatish Balay 6929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6939566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 6949566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); 695*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6962d61bbb3SSatish Balay } 6972d61bbb3SSatish Balay 6985f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 699d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) 700d71ae5a4SJacob Faibussowitsch { 70196e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 702f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 70396e086a2SDaniel Kokron const PetscScalar *x, *xb; 70496e086a2SDaniel Kokron const MatScalar *v; 70596e086a2SDaniel Kokron PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 70696e086a2SDaniel Kokron const PetscInt *idx, *ii, *ridx = NULL; 707ce68d72fSJed Brown PetscInt k; 70896e086a2SDaniel Kokron PetscBool usecprow = a->compressedrow.use; 70996e086a2SDaniel Kokron 71096e086a2SDaniel Kokron __m256d a0, a1, a2, a3, a4, a5; 711ce68d72fSJed Brown __m256d w0, w1, w2, w3; 71296e086a2SDaniel Kokron __m256d z0, z1, z2; 71396e086a2SDaniel Kokron __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 71496e086a2SDaniel Kokron 71596e086a2SDaniel Kokron 
PetscFunctionBegin; 7169566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7179566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 71896e086a2SDaniel Kokron 71996e086a2SDaniel Kokron idx = a->j; 72096e086a2SDaniel Kokron v = a->a; 72196e086a2SDaniel Kokron if (usecprow) { 72296e086a2SDaniel Kokron mbs = a->compressedrow.nrows; 72396e086a2SDaniel Kokron ii = a->compressedrow.i; 72496e086a2SDaniel Kokron ridx = a->compressedrow.rindex; 7259566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 72696e086a2SDaniel Kokron } else { 72796e086a2SDaniel Kokron mbs = a->mbs; 72896e086a2SDaniel Kokron ii = a->i; 72996e086a2SDaniel Kokron z = zarray; 73096e086a2SDaniel Kokron } 73196e086a2SDaniel Kokron 73296e086a2SDaniel Kokron if (!a->mult_work) { 73396e086a2SDaniel Kokron k = PetscMax(A->rmap->n, A->cmap->n); 7349566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 73596e086a2SDaniel Kokron } 73696e086a2SDaniel Kokron 73796e086a2SDaniel Kokron work = a->mult_work; 73896e086a2SDaniel Kokron for (i = 0; i < mbs; i++) { 7399371c9d4SSatish Balay n = ii[1] - ii[0]; 7409371c9d4SSatish Balay ii++; 74196e086a2SDaniel Kokron workt = work; 74296e086a2SDaniel Kokron for (j = 0; j < n; j++) { 74396e086a2SDaniel Kokron xb = x + bs * (*idx++); 74496e086a2SDaniel Kokron for (k = 0; k < bs; k++) workt[k] = xb[k]; 74596e086a2SDaniel Kokron workt += bs; 74696e086a2SDaniel Kokron } 74796e086a2SDaniel Kokron if (usecprow) z = zarray + bs * ridx[i]; 74896e086a2SDaniel Kokron 7499371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 7509371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 7519371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 75296e086a2SDaniel Kokron 75396e086a2SDaniel Kokron for (j = 0; j < n; j++) { 754c05b70c4SSatish Balay /* first column of a */ 75596e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9]); 7569371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81]); 7579371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, 
w0, z0); 7589371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 7599371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 7609371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 7619371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 76296e086a2SDaniel Kokron 763c05b70c4SSatish Balay /* second column of a */ 76496e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 1]); 7659371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 7669371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 7679371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 7689371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 7699371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 7709371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 77196e086a2SDaniel Kokron 772c05b70c4SSatish Balay /* third column of a */ 77396e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 2]); 7749371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 7759371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w2, z0); 7769371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 7779371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w2, z1); 7789371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 7799371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w2, z2); 78096e086a2SDaniel Kokron 781c05b70c4SSatish Balay /* fourth column of a */ 78296e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 3]); 7839371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 7849371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w3, z0); 7859371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 7869371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w3, z1); 7879371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 7889371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w3, z2); 78996e086a2SDaniel Kokron 790c05b70c4SSatish Balay /* fifth column of a */ 79196e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 4]); 7929371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 7939371c9d4SSatish 
Balay z0 = _mm256_fmadd_pd(a3, w0, z0); 7949371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 7959371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w0, z1); 7969371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 7979371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w0, z2); 79896e086a2SDaniel Kokron 799c05b70c4SSatish Balay /* sixth column of a */ 80096e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 5]); 8019371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 8029371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 8039371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 8049371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 8059371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 8069371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 80796e086a2SDaniel Kokron 808c05b70c4SSatish Balay /* seventh column of a */ 80996e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 6]); 8109371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 8119371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 8129371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 8139371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 8149371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 8159371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 81696e086a2SDaniel Kokron 8176aad120cSJose E. 
Roman /* eighth column of a */ 81896e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 7]); 8199371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 8209371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 8219371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 8229371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 8239371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 8249371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 82596e086a2SDaniel Kokron 826c05b70c4SSatish Balay /* ninth column of a */ 82796e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 8]); 8289371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 8299371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 8309371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 8319371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 8329371c9d4SSatish Balay a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 8339371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 83496e086a2SDaniel Kokron } 83596e086a2SDaniel Kokron 8369371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 8379371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 8389371c9d4SSatish Balay _mm256_maskstore_pd(&z[8], mask1, z2); 83996e086a2SDaniel Kokron 84096e086a2SDaniel Kokron v += n * bs2; 84196e086a2SDaniel Kokron if (!usecprow) z += bs; 84296e086a2SDaniel Kokron } 8439566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8449566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 8459566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 846*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 84796e086a2SDaniel Kokron } 84896e086a2SDaniel Kokron #endif 84996e086a2SDaniel Kokron 850d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) 851d71ae5a4SJacob Faibussowitsch { 852ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 853f4259b30SLisandro Dalcin PetscScalar *z = 
NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 854ebada01fSBarry Smith const PetscScalar *x, *xb; 855ebada01fSBarry Smith PetscScalar *zarray, xv; 856ebada01fSBarry Smith const MatScalar *v; 857ebada01fSBarry Smith const PetscInt *ii, *ij = a->j, *idx; 858ebada01fSBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 859ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 860ebada01fSBarry Smith 861ebada01fSBarry Smith PetscFunctionBegin; 8629566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8639566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 864ebada01fSBarry Smith 865ebada01fSBarry Smith v = a->a; 866ebada01fSBarry Smith if (usecprow) { 867ebada01fSBarry Smith mbs = a->compressedrow.nrows; 868ebada01fSBarry Smith ii = a->compressedrow.i; 869ebada01fSBarry Smith ridx = a->compressedrow.rindex; 8709566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); 871ebada01fSBarry Smith } else { 872ebada01fSBarry Smith mbs = a->mbs; 873ebada01fSBarry Smith ii = a->i; 874ebada01fSBarry Smith z = zarray; 875ebada01fSBarry Smith } 876ebada01fSBarry Smith 877ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 878ebada01fSBarry Smith n = ii[i + 1] - ii[i]; 879ebada01fSBarry Smith idx = ij + ii[i]; 8809371c9d4SSatish Balay sum1 = 0.0; 8819371c9d4SSatish Balay sum2 = 0.0; 8829371c9d4SSatish Balay sum3 = 0.0; 8839371c9d4SSatish Balay sum4 = 0.0; 8849371c9d4SSatish Balay sum5 = 0.0; 8859371c9d4SSatish Balay sum6 = 0.0; 8869371c9d4SSatish Balay sum7 = 0.0; 8879371c9d4SSatish Balay sum8 = 0.0; 8889371c9d4SSatish Balay sum9 = 0.0; 8899371c9d4SSatish Balay sum10 = 0.0; 8909371c9d4SSatish Balay sum11 = 0.0; 891ebada01fSBarry Smith 892ebada01fSBarry Smith for (j = 0; j < n; j++) { 893ebada01fSBarry Smith xb = x + 11 * (idx[j]); 894ebada01fSBarry Smith 895ebada01fSBarry Smith for (k = 0; k < 11; k++) { 896ebada01fSBarry Smith xv = xb[k]; 897ebada01fSBarry Smith sum1 += v[0] * xv; 898ebada01fSBarry 
Smith sum2 += v[1] * xv; 899ebada01fSBarry Smith sum3 += v[2] * xv; 900ebada01fSBarry Smith sum4 += v[3] * xv; 901ebada01fSBarry Smith sum5 += v[4] * xv; 902ebada01fSBarry Smith sum6 += v[5] * xv; 903ebada01fSBarry Smith sum7 += v[6] * xv; 904ebada01fSBarry Smith sum8 += v[7] * xv; 905ebada01fSBarry Smith sum9 += v[8] * xv; 906ebada01fSBarry Smith sum10 += v[9] * xv; 907ebada01fSBarry Smith sum11 += v[10] * xv; 908ebada01fSBarry Smith v += 11; 909ebada01fSBarry Smith } 910ebada01fSBarry Smith } 911ebada01fSBarry Smith if (usecprow) z = zarray + 11 * ridx[i]; 9129371c9d4SSatish Balay z[0] = sum1; 9139371c9d4SSatish Balay z[1] = sum2; 9149371c9d4SSatish Balay z[2] = sum3; 9159371c9d4SSatish Balay z[3] = sum4; 9169371c9d4SSatish Balay z[4] = sum5; 9179371c9d4SSatish Balay z[5] = sum6; 9189371c9d4SSatish Balay z[6] = sum7; 9199371c9d4SSatish Balay z[7] = sum8; 9209371c9d4SSatish Balay z[8] = sum9; 9219371c9d4SSatish Balay z[9] = sum10; 9229371c9d4SSatish Balay z[10] = sum11; 923ebada01fSBarry Smith 924ebada01fSBarry Smith if (!usecprow) z += 11; 925ebada01fSBarry Smith } 926ebada01fSBarry Smith 9279566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9289566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 9299566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); 930*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 931ebada01fSBarry Smith } 932ebada01fSBarry Smith 9336679dcc1SBarry Smith /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 934d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) 935d71ae5a4SJacob Faibussowitsch { 9366679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 9376679dcc1SBarry Smith PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 9386679dcc1SBarry Smith const PetscScalar *x, *xb; 9396679dcc1SBarry Smith PetscScalar 
*zarray, xv; 9406679dcc1SBarry Smith const MatScalar *v; 9416679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 9426679dcc1SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 9436679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 9446679dcc1SBarry Smith 9456679dcc1SBarry Smith PetscFunctionBegin; 9469566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9479566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 9486679dcc1SBarry Smith 9496679dcc1SBarry Smith v = a->a; 9506679dcc1SBarry Smith if (usecprow) { 9516679dcc1SBarry Smith mbs = a->compressedrow.nrows; 9526679dcc1SBarry Smith ii = a->compressedrow.i; 9536679dcc1SBarry Smith ridx = a->compressedrow.rindex; 9549566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 9556679dcc1SBarry Smith } else { 9566679dcc1SBarry Smith mbs = a->mbs; 9576679dcc1SBarry Smith ii = a->i; 9586679dcc1SBarry Smith z = zarray; 9596679dcc1SBarry Smith } 9606679dcc1SBarry Smith 9616679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 9626679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 9636679dcc1SBarry Smith idx = ij + ii[i]; 9649371c9d4SSatish Balay sum1 = 0.0; 9659371c9d4SSatish Balay sum2 = 0.0; 9669371c9d4SSatish Balay sum3 = 0.0; 9679371c9d4SSatish Balay sum4 = 0.0; 9689371c9d4SSatish Balay sum5 = 0.0; 9699371c9d4SSatish Balay sum6 = 0.0; 9709371c9d4SSatish Balay sum7 = 0.0; 9719371c9d4SSatish Balay sum8 = 0.0; 9729371c9d4SSatish Balay sum9 = 0.0; 9739371c9d4SSatish Balay sum10 = 0.0; 9749371c9d4SSatish Balay sum11 = 0.0; 9759371c9d4SSatish Balay sum12 = 0.0; 9766679dcc1SBarry Smith 9776679dcc1SBarry Smith for (j = 0; j < n; j++) { 9786679dcc1SBarry Smith xb = x + 12 * (idx[j]); 9796679dcc1SBarry Smith 9806679dcc1SBarry Smith for (k = 0; k < 12; k++) { 9816679dcc1SBarry Smith xv = xb[k]; 9826679dcc1SBarry Smith sum1 += v[0] * xv; 9836679dcc1SBarry Smith sum2 += v[1] * xv; 9846679dcc1SBarry Smith sum3 += v[2] * xv; 9856679dcc1SBarry Smith sum4 += v[3] * xv; 
9866679dcc1SBarry Smith sum5 += v[4] * xv; 9876679dcc1SBarry Smith sum6 += v[5] * xv; 9886679dcc1SBarry Smith sum7 += v[6] * xv; 9896679dcc1SBarry Smith sum8 += v[7] * xv; 9906679dcc1SBarry Smith sum9 += v[8] * xv; 9916679dcc1SBarry Smith sum10 += v[9] * xv; 9926679dcc1SBarry Smith sum11 += v[10] * xv; 9936679dcc1SBarry Smith sum12 += v[11] * xv; 9946679dcc1SBarry Smith v += 12; 9956679dcc1SBarry Smith } 9966679dcc1SBarry Smith } 9976679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 9989371c9d4SSatish Balay z[0] = sum1; 9999371c9d4SSatish Balay z[1] = sum2; 10009371c9d4SSatish Balay z[2] = sum3; 10019371c9d4SSatish Balay z[3] = sum4; 10029371c9d4SSatish Balay z[4] = sum5; 10039371c9d4SSatish Balay z[5] = sum6; 10049371c9d4SSatish Balay z[6] = sum7; 10059371c9d4SSatish Balay z[7] = sum8; 10069371c9d4SSatish Balay z[8] = sum9; 10079371c9d4SSatish Balay z[9] = sum10; 10089371c9d4SSatish Balay z[10] = sum11; 10099371c9d4SSatish Balay z[11] = sum12; 10106679dcc1SBarry Smith if (!usecprow) z += 12; 10116679dcc1SBarry Smith } 10129566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10139566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 10149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1015*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10166679dcc1SBarry Smith } 10176679dcc1SBarry Smith 1018d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) 1019d71ae5a4SJacob Faibussowitsch { 10206679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 10216679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 10226679dcc1SBarry Smith const PetscScalar *x, *xb; 10236679dcc1SBarry Smith PetscScalar *zarray, *yarray, xv; 10246679dcc1SBarry Smith const MatScalar *v; 10256679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx; 
10266679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; 10276679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 10286679dcc1SBarry Smith 10296679dcc1SBarry Smith PetscFunctionBegin; 10309566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10319566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 10326679dcc1SBarry Smith 10336679dcc1SBarry Smith v = a->a; 10346679dcc1SBarry Smith if (usecprow) { 103548a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 10366679dcc1SBarry Smith mbs = a->compressedrow.nrows; 10376679dcc1SBarry Smith ii = a->compressedrow.i; 10386679dcc1SBarry Smith ridx = a->compressedrow.rindex; 10396679dcc1SBarry Smith } else { 10406679dcc1SBarry Smith ii = a->i; 10416679dcc1SBarry Smith y = yarray; 10426679dcc1SBarry Smith z = zarray; 10436679dcc1SBarry Smith } 10446679dcc1SBarry Smith 10456679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 10466679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 10476679dcc1SBarry Smith idx = ij + ii[i]; 10486679dcc1SBarry Smith 10496679dcc1SBarry Smith if (usecprow) { 10506679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 10516679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 10526679dcc1SBarry Smith } 10539371c9d4SSatish Balay sum1 = y[0]; 10549371c9d4SSatish Balay sum2 = y[1]; 10559371c9d4SSatish Balay sum3 = y[2]; 10569371c9d4SSatish Balay sum4 = y[3]; 10579371c9d4SSatish Balay sum5 = y[4]; 10589371c9d4SSatish Balay sum6 = y[5]; 10599371c9d4SSatish Balay sum7 = y[6]; 10609371c9d4SSatish Balay sum8 = y[7]; 10619371c9d4SSatish Balay sum9 = y[8]; 10629371c9d4SSatish Balay sum10 = y[9]; 10639371c9d4SSatish Balay sum11 = y[10]; 10649371c9d4SSatish Balay sum12 = y[11]; 10656679dcc1SBarry Smith 10666679dcc1SBarry Smith for (j = 0; j < n; j++) { 10676679dcc1SBarry Smith xb = x + 12 * (idx[j]); 10686679dcc1SBarry Smith 10696679dcc1SBarry Smith for (k = 0; k < 12; k++) { 10706679dcc1SBarry Smith xv = xb[k]; 10716679dcc1SBarry 
Smith sum1 += v[0] * xv; 10726679dcc1SBarry Smith sum2 += v[1] * xv; 10736679dcc1SBarry Smith sum3 += v[2] * xv; 10746679dcc1SBarry Smith sum4 += v[3] * xv; 10756679dcc1SBarry Smith sum5 += v[4] * xv; 10766679dcc1SBarry Smith sum6 += v[5] * xv; 10776679dcc1SBarry Smith sum7 += v[6] * xv; 10786679dcc1SBarry Smith sum8 += v[7] * xv; 10796679dcc1SBarry Smith sum9 += v[8] * xv; 10806679dcc1SBarry Smith sum10 += v[9] * xv; 10816679dcc1SBarry Smith sum11 += v[10] * xv; 10826679dcc1SBarry Smith sum12 += v[11] * xv; 10836679dcc1SBarry Smith v += 12; 10846679dcc1SBarry Smith } 10856679dcc1SBarry Smith } 10866679dcc1SBarry Smith 10879371c9d4SSatish Balay z[0] = sum1; 10889371c9d4SSatish Balay z[1] = sum2; 10899371c9d4SSatish Balay z[2] = sum3; 10909371c9d4SSatish Balay z[3] = sum4; 10919371c9d4SSatish Balay z[4] = sum5; 10929371c9d4SSatish Balay z[5] = sum6; 10939371c9d4SSatish Balay z[6] = sum7; 10949371c9d4SSatish Balay z[7] = sum8; 10959371c9d4SSatish Balay z[8] = sum9; 10969371c9d4SSatish Balay z[9] = sum10; 10979371c9d4SSatish Balay z[10] = sum11; 10989371c9d4SSatish Balay z[11] = sum12; 10996679dcc1SBarry Smith if (!usecprow) { 11006679dcc1SBarry Smith y += 12; 11016679dcc1SBarry Smith z += 12; 11026679dcc1SBarry Smith } 11036679dcc1SBarry Smith } 11049566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11059566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 11069566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1107*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 11086679dcc1SBarry Smith } 11096679dcc1SBarry Smith 11106679dcc1SBarry Smith /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1111d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) 1112d71ae5a4SJacob Faibussowitsch { 11136679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 11146679dcc1SBarry Smith PetscScalar *z 
= NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 11156679dcc1SBarry Smith const PetscScalar *x, *xb; 11166679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray; 11176679dcc1SBarry Smith const MatScalar *v; 11186679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 11196679dcc1SBarry Smith PetscInt mbs, i, j, n; 11206679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 11216679dcc1SBarry Smith 11226679dcc1SBarry Smith PetscFunctionBegin; 11239566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 11249566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 11256679dcc1SBarry Smith 11266679dcc1SBarry Smith v = a->a; 11276679dcc1SBarry Smith if (usecprow) { 11286679dcc1SBarry Smith mbs = a->compressedrow.nrows; 11296679dcc1SBarry Smith ii = a->compressedrow.i; 11306679dcc1SBarry Smith ridx = a->compressedrow.rindex; 11319566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); 11326679dcc1SBarry Smith } else { 11336679dcc1SBarry Smith mbs = a->mbs; 11346679dcc1SBarry Smith ii = a->i; 11356679dcc1SBarry Smith z = zarray; 11366679dcc1SBarry Smith } 11376679dcc1SBarry Smith 11386679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 11396679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 11406679dcc1SBarry Smith idx = ij + ii[i]; 11416679dcc1SBarry Smith 11426679dcc1SBarry Smith sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 11436679dcc1SBarry Smith for (j = 0; j < n; j++) { 11446679dcc1SBarry Smith xb = x + 12 * (idx[j]); 11459371c9d4SSatish Balay x1 = xb[0]; 11469371c9d4SSatish Balay x2 = xb[1]; 11479371c9d4SSatish Balay x3 = xb[2]; 11489371c9d4SSatish Balay x4 = xb[3]; 11496679dcc1SBarry Smith 11506679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11516679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11526679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * 
x4; 11536679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11546679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11556679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11566679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11576679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11586679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11596679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11606679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11616679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11626679dcc1SBarry Smith v += 48; 11636679dcc1SBarry Smith 11649371c9d4SSatish Balay x1 = xb[4]; 11659371c9d4SSatish Balay x2 = xb[5]; 11669371c9d4SSatish Balay x3 = xb[6]; 11679371c9d4SSatish Balay x4 = xb[7]; 11686679dcc1SBarry Smith 11696679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11706679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11716679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11726679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11736679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11746679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11756679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11766679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11776679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11786679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11796679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11806679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 
11816679dcc1SBarry Smith v += 48; 11826679dcc1SBarry Smith 11839371c9d4SSatish Balay x1 = xb[8]; 11849371c9d4SSatish Balay x2 = xb[9]; 11859371c9d4SSatish Balay x3 = xb[10]; 11869371c9d4SSatish Balay x4 = xb[11]; 11876679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 11886679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 11896679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 11906679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 11916679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 11926679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 11936679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 11946679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 11956679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 11966679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 11976679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 11986679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 11996679dcc1SBarry Smith v += 48; 12006679dcc1SBarry Smith } 12016679dcc1SBarry Smith if (usecprow) z = zarray + 12 * ridx[i]; 12029371c9d4SSatish Balay z[0] = sum1; 12039371c9d4SSatish Balay z[1] = sum2; 12049371c9d4SSatish Balay z[2] = sum3; 12059371c9d4SSatish Balay z[3] = sum4; 12069371c9d4SSatish Balay z[4] = sum5; 12079371c9d4SSatish Balay z[5] = sum6; 12089371c9d4SSatish Balay z[6] = sum7; 12099371c9d4SSatish Balay z[7] = sum8; 12109371c9d4SSatish Balay z[8] = sum9; 12119371c9d4SSatish Balay z[9] = sum10; 12129371c9d4SSatish Balay z[10] = sum11; 12139371c9d4SSatish Balay z[11] = sum12; 12146679dcc1SBarry Smith if (!usecprow) z += 12; 12156679dcc1SBarry Smith } 12169566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12179566063dSJacob Faibussowitsch 
PetscCall(VecRestoreArrayWrite(zz, &zarray)); 12189566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1219*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 12206679dcc1SBarry Smith } 12216679dcc1SBarry Smith 12226679dcc1SBarry Smith /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1223d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) 1224d71ae5a4SJacob Faibussowitsch { 12256679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 12266679dcc1SBarry Smith PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; 12276679dcc1SBarry Smith const PetscScalar *x, *xb; 12286679dcc1SBarry Smith PetscScalar x1, x2, x3, x4, *zarray, *yarray; 12296679dcc1SBarry Smith const MatScalar *v; 12306679dcc1SBarry Smith const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; 12316679dcc1SBarry Smith PetscInt mbs = a->mbs, i, j, n; 12326679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 12336679dcc1SBarry Smith 12346679dcc1SBarry Smith PetscFunctionBegin; 12359566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 12369566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 12376679dcc1SBarry Smith 12386679dcc1SBarry Smith v = a->a; 12396679dcc1SBarry Smith if (usecprow) { 124048a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); 12416679dcc1SBarry Smith mbs = a->compressedrow.nrows; 12426679dcc1SBarry Smith ii = a->compressedrow.i; 12436679dcc1SBarry Smith ridx = a->compressedrow.rindex; 12446679dcc1SBarry Smith } else { 12456679dcc1SBarry Smith ii = a->i; 12466679dcc1SBarry Smith y = yarray; 12476679dcc1SBarry Smith z = zarray; 12486679dcc1SBarry Smith } 12496679dcc1SBarry Smith 12506679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 12516679dcc1SBarry Smith n = ii[i + 1] - ii[i]; 12526679dcc1SBarry 
Smith idx = ij + ii[i]; 12536679dcc1SBarry Smith 12546679dcc1SBarry Smith if (usecprow) { 12556679dcc1SBarry Smith y = yarray + 12 * ridx[i]; 12566679dcc1SBarry Smith z = zarray + 12 * ridx[i]; 12576679dcc1SBarry Smith } 12589371c9d4SSatish Balay sum1 = y[0]; 12599371c9d4SSatish Balay sum2 = y[1]; 12609371c9d4SSatish Balay sum3 = y[2]; 12619371c9d4SSatish Balay sum4 = y[3]; 12629371c9d4SSatish Balay sum5 = y[4]; 12639371c9d4SSatish Balay sum6 = y[5]; 12649371c9d4SSatish Balay sum7 = y[6]; 12659371c9d4SSatish Balay sum8 = y[7]; 12669371c9d4SSatish Balay sum9 = y[8]; 12679371c9d4SSatish Balay sum10 = y[9]; 12689371c9d4SSatish Balay sum11 = y[10]; 12699371c9d4SSatish Balay sum12 = y[11]; 12706679dcc1SBarry Smith 12716679dcc1SBarry Smith for (j = 0; j < n; j++) { 12726679dcc1SBarry Smith xb = x + 12 * (idx[j]); 12739371c9d4SSatish Balay x1 = xb[0]; 12749371c9d4SSatish Balay x2 = xb[1]; 12759371c9d4SSatish Balay x3 = xb[2]; 12769371c9d4SSatish Balay x4 = xb[3]; 12776679dcc1SBarry Smith 12786679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12796679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12806679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 12816679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 12826679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 12836679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 12846679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 12856679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 12866679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 12876679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 12886679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 12896679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 
12906679dcc1SBarry Smith v += 48; 12916679dcc1SBarry Smith 12929371c9d4SSatish Balay x1 = xb[4]; 12939371c9d4SSatish Balay x2 = xb[5]; 12949371c9d4SSatish Balay x3 = xb[6]; 12959371c9d4SSatish Balay x4 = xb[7]; 12966679dcc1SBarry Smith 12976679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 12986679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 12996679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 13006679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 13016679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 13026679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 13036679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; 13046679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 13056679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 13066679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 13076679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 13086679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 13096679dcc1SBarry Smith v += 48; 13106679dcc1SBarry Smith 13119371c9d4SSatish Balay x1 = xb[8]; 13129371c9d4SSatish Balay x2 = xb[9]; 13139371c9d4SSatish Balay x3 = xb[10]; 13149371c9d4SSatish Balay x4 = xb[11]; 13156679dcc1SBarry Smith sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; 13166679dcc1SBarry Smith sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; 13176679dcc1SBarry Smith sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; 13186679dcc1SBarry Smith sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; 13196679dcc1SBarry Smith sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; 13206679dcc1SBarry Smith sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; 13216679dcc1SBarry Smith sum7 += v[6] * x1 + v[18] 
* x2 + v[30] * x3 + v[42] * x4; 13226679dcc1SBarry Smith sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; 13236679dcc1SBarry Smith sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; 13246679dcc1SBarry Smith sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; 13256679dcc1SBarry Smith sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; 13266679dcc1SBarry Smith sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; 13276679dcc1SBarry Smith v += 48; 13286679dcc1SBarry Smith } 13299371c9d4SSatish Balay z[0] = sum1; 13309371c9d4SSatish Balay z[1] = sum2; 13319371c9d4SSatish Balay z[2] = sum3; 13329371c9d4SSatish Balay z[3] = sum4; 13339371c9d4SSatish Balay z[4] = sum5; 13349371c9d4SSatish Balay z[5] = sum6; 13359371c9d4SSatish Balay z[6] = sum7; 13369371c9d4SSatish Balay z[7] = sum8; 13379371c9d4SSatish Balay z[8] = sum9; 13389371c9d4SSatish Balay z[9] = sum10; 13399371c9d4SSatish Balay z[10] = sum11; 13409371c9d4SSatish Balay z[11] = sum12; 13416679dcc1SBarry Smith if (!usecprow) { 13426679dcc1SBarry Smith y += 12; 13436679dcc1SBarry Smith z += 12; 13446679dcc1SBarry Smith } 13456679dcc1SBarry Smith } 13469566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 13479566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 13489566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); 1349*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13506679dcc1SBarry Smith } 13516679dcc1SBarry Smith 13526679dcc1SBarry Smith #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1353d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) 1354d71ae5a4SJacob Faibussowitsch { 13556679dcc1SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 13566679dcc1SBarry Smith PetscScalar *z = NULL, *zarray; 
13576679dcc1SBarry Smith const PetscScalar *x, *work; 13586679dcc1SBarry Smith const MatScalar *v = a->a; 13596679dcc1SBarry Smith PetscInt mbs, i, j, n; 13606679dcc1SBarry Smith const PetscInt *idx = a->j, *ii, *ridx = NULL; 13616679dcc1SBarry Smith PetscBool usecprow = a->compressedrow.use; 13626679dcc1SBarry Smith const PetscInt bs = 12, bs2 = 144; 13636679dcc1SBarry Smith 13646679dcc1SBarry Smith __m256d a0, a1, a2, a3, a4, a5; 13656679dcc1SBarry Smith __m256d w0, w1, w2, w3; 13666679dcc1SBarry Smith __m256d z0, z1, z2; 13676679dcc1SBarry Smith 13686679dcc1SBarry Smith PetscFunctionBegin; 13699566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 13709566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 13716679dcc1SBarry Smith 13726679dcc1SBarry Smith if (usecprow) { 13736679dcc1SBarry Smith mbs = a->compressedrow.nrows; 13746679dcc1SBarry Smith ii = a->compressedrow.i; 13756679dcc1SBarry Smith ridx = a->compressedrow.rindex; 13769566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 13776679dcc1SBarry Smith } else { 13786679dcc1SBarry Smith mbs = a->mbs; 13796679dcc1SBarry Smith ii = a->i; 13806679dcc1SBarry Smith z = zarray; 13816679dcc1SBarry Smith } 13826679dcc1SBarry Smith 13836679dcc1SBarry Smith for (i = 0; i < mbs; i++) { 13849371c9d4SSatish Balay z0 = _mm256_setzero_pd(); 13859371c9d4SSatish Balay z1 = _mm256_setzero_pd(); 13869371c9d4SSatish Balay z2 = _mm256_setzero_pd(); 13876679dcc1SBarry Smith 13889371c9d4SSatish Balay n = ii[1] - ii[0]; 13899371c9d4SSatish Balay ii++; 13906679dcc1SBarry Smith for (j = 0; j < n; j++) { 13916679dcc1SBarry Smith work = x + bs * (*idx++); 13926679dcc1SBarry Smith 13936679dcc1SBarry Smith /* first column of a */ 13946679dcc1SBarry Smith w0 = _mm256_set1_pd(work[0]); 13959371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 0); 13969371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 13979371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 4); 13989371c9d4SSatish Balay z1 = 
_mm256_fmadd_pd(a1, w0, z1); 13999371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 8); 14009371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14016679dcc1SBarry Smith 14026679dcc1SBarry Smith /* second column of a */ 14036679dcc1SBarry Smith w1 = _mm256_set1_pd(work[1]); 14049371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 12); 14059371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14069371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 16); 14079371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14089371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 20); 14099371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14106679dcc1SBarry Smith 14116679dcc1SBarry Smith /* third column of a */ 14126679dcc1SBarry Smith w2 = _mm256_set1_pd(work[2]); 14139371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 24); 14149371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14159371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 28); 14169371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14179371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 32); 14189371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14196679dcc1SBarry Smith 14206679dcc1SBarry Smith /* fourth column of a */ 14216679dcc1SBarry Smith w3 = _mm256_set1_pd(work[3]); 14229371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 36); 14239371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14249371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 40); 14259371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14269371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 44); 14279371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14286679dcc1SBarry Smith 14296679dcc1SBarry Smith /* fifth column of a */ 14306679dcc1SBarry Smith w0 = _mm256_set1_pd(work[4]); 14319371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 48); 14329371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14339371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 52); 14349371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14359371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 56); 
14369371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14376679dcc1SBarry Smith 14386679dcc1SBarry Smith /* sixth column of a */ 14396679dcc1SBarry Smith w1 = _mm256_set1_pd(work[5]); 14409371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 60); 14419371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14429371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 64); 14439371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14449371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 68); 14459371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14466679dcc1SBarry Smith 14476679dcc1SBarry Smith /* seventh column of a */ 14486679dcc1SBarry Smith w2 = _mm256_set1_pd(work[6]); 14499371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 72); 14509371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14519371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 76); 14529371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14539371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 80); 14549371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14556679dcc1SBarry Smith 14566aad120cSJose E. 
Roman /* eighth column of a */ 14576679dcc1SBarry Smith w3 = _mm256_set1_pd(work[7]); 14589371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 84); 14599371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14609371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 88); 14619371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14629371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 92); 14639371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 14646679dcc1SBarry Smith 14656679dcc1SBarry Smith /* ninth column of a */ 14666679dcc1SBarry Smith w0 = _mm256_set1_pd(work[8]); 14679371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 96); 14689371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 14699371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 100); 14709371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 14719371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 104); 14729371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 14736679dcc1SBarry Smith 14746679dcc1SBarry Smith /* tenth column of a */ 14756679dcc1SBarry Smith w1 = _mm256_set1_pd(work[9]); 14769371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 108); 14779371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w1, z0); 14789371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 112); 14799371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w1, z1); 14809371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 116); 14819371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w1, z2); 14826679dcc1SBarry Smith 14836679dcc1SBarry Smith /* eleventh column of a */ 14846679dcc1SBarry Smith w2 = _mm256_set1_pd(work[10]); 14859371c9d4SSatish Balay a0 = _mm256_loadu_pd(v + 120); 14869371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 14879371c9d4SSatish Balay a1 = _mm256_loadu_pd(v + 124); 14889371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w2, z1); 14899371c9d4SSatish Balay a2 = _mm256_loadu_pd(v + 128); 14909371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 14916679dcc1SBarry Smith 14926679dcc1SBarry Smith /* twelveth column of a */ 14936679dcc1SBarry Smith w3 = 
_mm256_set1_pd(work[11]); 14949371c9d4SSatish Balay a3 = _mm256_loadu_pd(v + 132); 14959371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 14969371c9d4SSatish Balay a4 = _mm256_loadu_pd(v + 136); 14979371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 14989371c9d4SSatish Balay a5 = _mm256_loadu_pd(v + 140); 14999371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 15006679dcc1SBarry Smith 15016679dcc1SBarry Smith v += bs2; 15026679dcc1SBarry Smith } 15036679dcc1SBarry Smith if (usecprow) z = zarray + bs * ridx[i]; 15049371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 15059371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 15069371c9d4SSatish Balay _mm256_storeu_pd(&z[8], z2); 15076679dcc1SBarry Smith if (!usecprow) z += bs; 15086679dcc1SBarry Smith } 15099566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 15109566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 15119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 1512*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 15136679dcc1SBarry Smith } 15146679dcc1SBarry Smith #endif 15156679dcc1SBarry Smith 15168ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1517832cc040SShri Abhyankar /* Default MatMult for block size 15 */ 1518d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) 1519d71ae5a4SJacob Faibussowitsch { 15208ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1521f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 15228ab949d8SShri Abhyankar const PetscScalar *x, *xb; 152353ef36baSBarry Smith PetscScalar *zarray, xv; 15248ab949d8SShri Abhyankar const MatScalar *v; 15258ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 15267c565772SBarry Smith PetscInt mbs, i, j, k, n, *ridx = NULL; 
1527ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 15288ab949d8SShri Abhyankar 15298ab949d8SShri Abhyankar PetscFunctionBegin; 15309566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 15319566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 15328ab949d8SShri Abhyankar 15338ab949d8SShri Abhyankar v = a->a; 15348ab949d8SShri Abhyankar if (usecprow) { 15358ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 15368ab949d8SShri Abhyankar ii = a->compressedrow.i; 15378ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 15389566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 15398ab949d8SShri Abhyankar } else { 15408ab949d8SShri Abhyankar mbs = a->mbs; 15418ab949d8SShri Abhyankar ii = a->i; 15428ab949d8SShri Abhyankar z = zarray; 15438ab949d8SShri Abhyankar } 15448ab949d8SShri Abhyankar 15458ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 15468ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 15478ab949d8SShri Abhyankar idx = ij + ii[i]; 15489371c9d4SSatish Balay sum1 = 0.0; 15499371c9d4SSatish Balay sum2 = 0.0; 15509371c9d4SSatish Balay sum3 = 0.0; 15519371c9d4SSatish Balay sum4 = 0.0; 15529371c9d4SSatish Balay sum5 = 0.0; 15539371c9d4SSatish Balay sum6 = 0.0; 15549371c9d4SSatish Balay sum7 = 0.0; 15559371c9d4SSatish Balay sum8 = 0.0; 15569371c9d4SSatish Balay sum9 = 0.0; 15579371c9d4SSatish Balay sum10 = 0.0; 15589371c9d4SSatish Balay sum11 = 0.0; 15599371c9d4SSatish Balay sum12 = 0.0; 15609371c9d4SSatish Balay sum13 = 0.0; 15619371c9d4SSatish Balay sum14 = 0.0; 15629371c9d4SSatish Balay sum15 = 0.0; 15638ab949d8SShri Abhyankar 15648ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 15658ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 15668ab949d8SShri Abhyankar 15678ab949d8SShri Abhyankar for (k = 0; k < 15; k++) { 156853ef36baSBarry Smith xv = xb[k]; 156953ef36baSBarry Smith sum1 += v[0] * xv; 157053ef36baSBarry Smith sum2 += v[1] * xv; 157153ef36baSBarry Smith sum3 += v[2] * xv; 
157253ef36baSBarry Smith sum4 += v[3] * xv; 157353ef36baSBarry Smith sum5 += v[4] * xv; 157453ef36baSBarry Smith sum6 += v[5] * xv; 157553ef36baSBarry Smith sum7 += v[6] * xv; 157653ef36baSBarry Smith sum8 += v[7] * xv; 157753ef36baSBarry Smith sum9 += v[8] * xv; 157853ef36baSBarry Smith sum10 += v[9] * xv; 157953ef36baSBarry Smith sum11 += v[10] * xv; 158053ef36baSBarry Smith sum12 += v[11] * xv; 158153ef36baSBarry Smith sum13 += v[12] * xv; 158253ef36baSBarry Smith sum14 += v[13] * xv; 158353ef36baSBarry Smith sum15 += v[14] * xv; 15848ab949d8SShri Abhyankar v += 15; 15858ab949d8SShri Abhyankar } 15868ab949d8SShri Abhyankar } 15878ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 15889371c9d4SSatish Balay z[0] = sum1; 15899371c9d4SSatish Balay z[1] = sum2; 15909371c9d4SSatish Balay z[2] = sum3; 15919371c9d4SSatish Balay z[3] = sum4; 15929371c9d4SSatish Balay z[4] = sum5; 15939371c9d4SSatish Balay z[5] = sum6; 15949371c9d4SSatish Balay z[6] = sum7; 15959371c9d4SSatish Balay z[7] = sum8; 15969371c9d4SSatish Balay z[8] = sum9; 15979371c9d4SSatish Balay z[9] = sum10; 15989371c9d4SSatish Balay z[10] = sum11; 15999371c9d4SSatish Balay z[11] = sum12; 16009371c9d4SSatish Balay z[12] = sum13; 16019371c9d4SSatish Balay z[13] = sum14; 16029371c9d4SSatish Balay z[14] = sum15; 16038ab949d8SShri Abhyankar 16048ab949d8SShri Abhyankar if (!usecprow) z += 15; 16058ab949d8SShri Abhyankar } 16068ab949d8SShri Abhyankar 16079566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 16089566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 16099566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1610*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16118ab949d8SShri Abhyankar } 16128ab949d8SShri Abhyankar 16138ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1614d71ae5a4SJacob Faibussowitsch PetscErrorCode 
MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) 1615d71ae5a4SJacob Faibussowitsch { 16168ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1617f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 16188ab949d8SShri Abhyankar const PetscScalar *x, *xb; 16190b8f6341SShri Abhyankar PetscScalar x1, x2, x3, x4, *zarray; 16208ab949d8SShri Abhyankar const MatScalar *v; 16218ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 16227c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1623ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 16248ab949d8SShri Abhyankar 16258ab949d8SShri Abhyankar PetscFunctionBegin; 16269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 16279566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 16288ab949d8SShri Abhyankar 16298ab949d8SShri Abhyankar v = a->a; 16308ab949d8SShri Abhyankar if (usecprow) { 16318ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 16328ab949d8SShri Abhyankar ii = a->compressedrow.i; 16338ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 16349566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 16358ab949d8SShri Abhyankar } else { 16368ab949d8SShri Abhyankar mbs = a->mbs; 16378ab949d8SShri Abhyankar ii = a->i; 16388ab949d8SShri Abhyankar z = zarray; 16398ab949d8SShri Abhyankar } 16408ab949d8SShri Abhyankar 16418ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 16428ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 16438ab949d8SShri Abhyankar idx = ij + ii[i]; 16449371c9d4SSatish Balay sum1 = 0.0; 16459371c9d4SSatish Balay sum2 = 0.0; 16469371c9d4SSatish Balay sum3 = 0.0; 16479371c9d4SSatish Balay sum4 = 0.0; 16489371c9d4SSatish Balay sum5 = 0.0; 16499371c9d4SSatish Balay sum6 = 0.0; 16509371c9d4SSatish Balay sum7 = 0.0; 16519371c9d4SSatish Balay sum8 = 0.0; 16529371c9d4SSatish Balay sum9 = 0.0; 16539371c9d4SSatish Balay sum10 = 0.0; 
16549371c9d4SSatish Balay sum11 = 0.0; 16559371c9d4SSatish Balay sum12 = 0.0; 16569371c9d4SSatish Balay sum13 = 0.0; 16579371c9d4SSatish Balay sum14 = 0.0; 16589371c9d4SSatish Balay sum15 = 0.0; 16598ab949d8SShri Abhyankar 16608ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 16618ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 16629371c9d4SSatish Balay x1 = xb[0]; 16639371c9d4SSatish Balay x2 = xb[1]; 16649371c9d4SSatish Balay x3 = xb[2]; 16659371c9d4SSatish Balay x4 = xb[3]; 16668ab949d8SShri Abhyankar 16678ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16688ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16698ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16708ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16718ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16728ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16738ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16748ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16758ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16768ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 16778ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 16788ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 16798ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 16808ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 16818ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 16828ab949d8SShri Abhyankar 16838ab949d8SShri Abhyankar v += 60; 16848ab949d8SShri Abhyankar 16859371c9d4SSatish Balay x1 = xb[4]; 16869371c9d4SSatish Balay x2 = xb[5]; 16879371c9d4SSatish 
Balay x3 = xb[6]; 16889371c9d4SSatish Balay x4 = xb[7]; 16898ab949d8SShri Abhyankar 16908ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 16918ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 16928ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 16938ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 16948ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; 16958ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 16968ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 16978ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 16988ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 16998ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 17008ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 17018ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 17028ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 17038ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 17048ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 17058ab949d8SShri Abhyankar v += 60; 17068ab949d8SShri Abhyankar 17079371c9d4SSatish Balay x1 = xb[8]; 17089371c9d4SSatish Balay x2 = xb[9]; 17099371c9d4SSatish Balay x3 = xb[10]; 17109371c9d4SSatish Balay x4 = xb[11]; 17110b8f6341SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; 17120b8f6341SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; 17130b8f6341SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; 17140b8f6341SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; 17150b8f6341SShri Abhyankar sum5 += v[4] * x1 + 
v[19] * x2 + v[34] * x3 + v[49] * x4; 17160b8f6341SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; 17170b8f6341SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; 17180b8f6341SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; 17190b8f6341SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; 17200b8f6341SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; 17210b8f6341SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; 17220b8f6341SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; 17230b8f6341SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; 17240b8f6341SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; 17250b8f6341SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; 17260b8f6341SShri Abhyankar v += 60; 17270b8f6341SShri Abhyankar 17289371c9d4SSatish Balay x1 = xb[12]; 17299371c9d4SSatish Balay x2 = xb[13]; 17309371c9d4SSatish Balay x3 = xb[14]; 17318ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; 17328ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; 17338ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; 17348ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; 17358ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; 17368ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; 17378ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; 17388ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; 17398ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; 17408ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; 17418ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; 17428ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; 17438ab949d8SShri Abhyankar 
sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; 17448ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; 17458ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; 17468ab949d8SShri Abhyankar v += 45; 17478ab949d8SShri Abhyankar } 17488ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 17499371c9d4SSatish Balay z[0] = sum1; 17509371c9d4SSatish Balay z[1] = sum2; 17519371c9d4SSatish Balay z[2] = sum3; 17529371c9d4SSatish Balay z[3] = sum4; 17539371c9d4SSatish Balay z[4] = sum5; 17549371c9d4SSatish Balay z[5] = sum6; 17559371c9d4SSatish Balay z[6] = sum7; 17569371c9d4SSatish Balay z[7] = sum8; 17579371c9d4SSatish Balay z[8] = sum9; 17589371c9d4SSatish Balay z[9] = sum10; 17599371c9d4SSatish Balay z[10] = sum11; 17609371c9d4SSatish Balay z[11] = sum12; 17619371c9d4SSatish Balay z[12] = sum13; 17629371c9d4SSatish Balay z[13] = sum14; 17639371c9d4SSatish Balay z[14] = sum15; 17648ab949d8SShri Abhyankar 17658ab949d8SShri Abhyankar if (!usecprow) z += 15; 17668ab949d8SShri Abhyankar } 17678ab949d8SShri Abhyankar 17689566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 17699566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 17709566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1771*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17728ab949d8SShri Abhyankar } 17738ab949d8SShri Abhyankar 17748ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1775d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) 1776d71ae5a4SJacob Faibussowitsch { 17778ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1778f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; 17798ab949d8SShri Abhyankar const PetscScalar *x, *xb; 17800b8f6341SShri Abhyankar 
PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; 17818ab949d8SShri Abhyankar const MatScalar *v; 17828ab949d8SShri Abhyankar const PetscInt *ii, *ij = a->j, *idx; 17837c565772SBarry Smith PetscInt mbs, i, j, n, *ridx = NULL; 1784ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 17858ab949d8SShri Abhyankar 17868ab949d8SShri Abhyankar PetscFunctionBegin; 17879566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 17889566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 17898ab949d8SShri Abhyankar 17908ab949d8SShri Abhyankar v = a->a; 17918ab949d8SShri Abhyankar if (usecprow) { 17928ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 17938ab949d8SShri Abhyankar ii = a->compressedrow.i; 17948ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 17959566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); 17968ab949d8SShri Abhyankar } else { 17978ab949d8SShri Abhyankar mbs = a->mbs; 17988ab949d8SShri Abhyankar ii = a->i; 17998ab949d8SShri Abhyankar z = zarray; 18008ab949d8SShri Abhyankar } 18018ab949d8SShri Abhyankar 18028ab949d8SShri Abhyankar for (i = 0; i < mbs; i++) { 18038ab949d8SShri Abhyankar n = ii[i + 1] - ii[i]; 18048ab949d8SShri Abhyankar idx = ij + ii[i]; 18059371c9d4SSatish Balay sum1 = 0.0; 18069371c9d4SSatish Balay sum2 = 0.0; 18079371c9d4SSatish Balay sum3 = 0.0; 18089371c9d4SSatish Balay sum4 = 0.0; 18099371c9d4SSatish Balay sum5 = 0.0; 18109371c9d4SSatish Balay sum6 = 0.0; 18119371c9d4SSatish Balay sum7 = 0.0; 18129371c9d4SSatish Balay sum8 = 0.0; 18139371c9d4SSatish Balay sum9 = 0.0; 18149371c9d4SSatish Balay sum10 = 0.0; 18159371c9d4SSatish Balay sum11 = 0.0; 18169371c9d4SSatish Balay sum12 = 0.0; 18179371c9d4SSatish Balay sum13 = 0.0; 18189371c9d4SSatish Balay sum14 = 0.0; 18199371c9d4SSatish Balay sum15 = 0.0; 18208ab949d8SShri Abhyankar 18218ab949d8SShri Abhyankar for (j = 0; j < n; j++) { 18228ab949d8SShri Abhyankar xb = x + 15 * (idx[j]); 18239371c9d4SSatish Balay x1 = xb[0]; 
18249371c9d4SSatish Balay x2 = xb[1]; 18259371c9d4SSatish Balay x3 = xb[2]; 18269371c9d4SSatish Balay x4 = xb[3]; 18279371c9d4SSatish Balay x5 = xb[4]; 18289371c9d4SSatish Balay x6 = xb[5]; 18299371c9d4SSatish Balay x7 = xb[6]; 18300b8f6341SShri Abhyankar x8 = xb[7]; 18318ab949d8SShri Abhyankar 18328ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; 18338ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; 18348ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; 18358ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; 18368ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; 18378ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; 18388ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; 18398ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; 18408ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; 18418ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; 18428ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; 18438ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; 18448ab949d8SShri Abhyankar 
sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; 18458ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; 18468ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; 18478ab949d8SShri Abhyankar v += 120; 18488ab949d8SShri Abhyankar 18499371c9d4SSatish Balay x1 = xb[8]; 18509371c9d4SSatish Balay x2 = xb[9]; 18519371c9d4SSatish Balay x3 = xb[10]; 18529371c9d4SSatish Balay x4 = xb[11]; 18539371c9d4SSatish Balay x5 = xb[12]; 18549371c9d4SSatish Balay x6 = xb[13]; 18559371c9d4SSatish Balay x7 = xb[14]; 18560b8f6341SShri Abhyankar 18578ab949d8SShri Abhyankar sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; 18588ab949d8SShri Abhyankar sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; 18598ab949d8SShri Abhyankar sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; 18608ab949d8SShri Abhyankar sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; 18618ab949d8SShri Abhyankar sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; 18628ab949d8SShri Abhyankar sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; 18638ab949d8SShri Abhyankar sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; 18648ab949d8SShri Abhyankar sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; 18658ab949d8SShri Abhyankar sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; 18668ab949d8SShri Abhyankar sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 
+ v[99] * x7; 18678ab949d8SShri Abhyankar sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; 18688ab949d8SShri Abhyankar sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; 18698ab949d8SShri Abhyankar sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; 18708ab949d8SShri Abhyankar sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; 18718ab949d8SShri Abhyankar sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; 18728ab949d8SShri Abhyankar v += 105; 18738ab949d8SShri Abhyankar } 18748ab949d8SShri Abhyankar if (usecprow) z = zarray + 15 * ridx[i]; 18759371c9d4SSatish Balay z[0] = sum1; 18769371c9d4SSatish Balay z[1] = sum2; 18779371c9d4SSatish Balay z[2] = sum3; 18789371c9d4SSatish Balay z[3] = sum4; 18799371c9d4SSatish Balay z[4] = sum5; 18809371c9d4SSatish Balay z[5] = sum6; 18819371c9d4SSatish Balay z[6] = sum7; 18829371c9d4SSatish Balay z[7] = sum8; 18839371c9d4SSatish Balay z[8] = sum9; 18849371c9d4SSatish Balay z[9] = sum10; 18859371c9d4SSatish Balay z[10] = sum11; 18869371c9d4SSatish Balay z[11] = sum12; 18879371c9d4SSatish Balay z[12] = sum13; 18889371c9d4SSatish Balay z[13] = sum14; 18899371c9d4SSatish Balay z[14] = sum15; 18908ab949d8SShri Abhyankar 18918ab949d8SShri Abhyankar if (!usecprow) z += 15; 18928ab949d8SShri Abhyankar } 18938ab949d8SShri Abhyankar 18949566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 18959566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 18969566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); 1897*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18988ab949d8SShri Abhyankar } 18998ab949d8SShri Abhyankar 19008ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are 
accessed at once */
/*
  MatMult_SeqBAIJ_15_ver4 - z = A*x for a SeqBAIJ matrix, written for 15x15 blocks.

  Input:
    A  - the matrix; A->data is read as a Mat_SeqBAIJ
    xx - vector multiplied by A (accessed read-only)
  Output:
    zz - receives the product (accessed write-only)

  Every stored block is read as 225 consecutive MatScalar values in which the
  entry for block-local row r and column c is v[r + 15*c].  The 15 entries of x
  that belong to the block column are loaded once and all 15 row sums are
  updated in one fully unrolled step; the unrolling is kept on purpose.

  With compressed-row storage only the block rows listed in
  a->compressedrow.rindex are visited, so the whole output array is zeroed
  first and the rows that are not visited keep that zero.
*/
PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
{
  Mat_SeqBAIJ       *a        = (Mat_SeqBAIJ *)A->data;
  PetscBool          usecprow = a->compressedrow.use;
  const PetscInt    *bcol     = a->j; /* block-column index of every stored block */
  const MatScalar   *v        = a->a; /* start of the block currently being applied */
  const PetscInt    *ii, *ridx = NULL;
  const PetscScalar *x, *xb;
  PetscScalar       *zarray, *z;
  PetscScalar        sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
  PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
  PetscInt           mbs, i, j;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayWrite(zz, &zarray));

  if (usecprow) {
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
    PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
  } else {
    mbs = a->mbs;
    ii  = a->i;
  }

  for (i = 0; i < mbs; i++) {
    sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = sum13 = sum14 = sum15 = 0.0;

    /* blocks ii[i] .. ii[i+1]-1 belong to this block row; v walks them in storage order */
    for (j = ii[i]; j < ii[i + 1]; j++) {
      xb  = x + 15 * bcol[j];
      x1  = xb[0];
      x2  = xb[1];
      x3  = xb[2];
      x4  = xb[3];
      x5  = xb[4];
      x6  = xb[5];
      x7  = xb[6];
      x8  = xb[7];
      x9  = xb[8];
      x10 = xb[9];
      x11 = xb[10];
      x12 = xb[11];
      x13 = xb[12];
      x14 = xb[13];
      x15 = xb[14];

      sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
      sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
      sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
      sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
      sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
      sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
      sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
      sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
      sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
      sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
      sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
      sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
      sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
      sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
      sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
      v += 225;
    }

    /* compressed rows scatter through ridx; otherwise block row i is written in place */
    z     = zarray + 15 * (usecprow ? ridx[i] : i);
    z[0]  = sum1;
    z[1]  = sum2;
    z[2]  = sum3;
    z[3]  = sum4;
    z[4]  = sum5;
    z[5]  = sum6;
    z[6]  = sum7;
    z[7]  = sum8;
    z[8]  = sum9;
    z[9]  = sum10;
    z[10] = sum11;
    z[11] = sum12;
    z[12] = sum13;
    z[13] = sum14;
    z[14] = sum15;
  }

  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayWrite(zz, &zarray));
  /* 225 multiply-add pairs per block, minus the first add of each of the 15 sums in every nonzero row */
  PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* 20093f1db9ecSBarry Smith This will not work with MatScalar == float because it calls the BLAS 20103f1db9ecSBarry Smith */ 2011d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) 2012d71ae5a4SJacob Faibussowitsch { 20132d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2014f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 2015d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2016d9ca1df4SBarry Smith const MatScalar *v; 2017d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2018d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2019d9ca1df4SBarry Smith PetscInt ncols, k; 2020ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 20212d61bbb3SSatish Balay 20222d61bbb3SSatish Balay PetscFunctionBegin; 20239566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 20249566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(zz, &zarray)); 20252d61bbb3SSatish Balay 20262d61bbb3SSatish Balay idx = a->j; 20272d61bbb3SSatish Balay v = a->a; 202826e093fcSHong Zhang if (usecprow) { 202926e093fcSHong Zhang mbs = a->compressedrow.nrows; 203026e093fcSHong Zhang ii = a->compressedrow.i; 20317b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 20329566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(zarray, bs * a->mbs)); 203326e093fcSHong Zhang } else { 203426e093fcSHong Zhang mbs = a->mbs; 20352d61bbb3SSatish Balay ii = a->i; 203626e093fcSHong Zhang z = zarray; 203726e093fcSHong Zhang } 2038218c64b6SSatish Balay 20392d61bbb3SSatish Balay if (!a->mult_work) { 2040d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 20419566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 20422d61bbb3SSatish Balay } 20432d61bbb3SSatish Balay work = a->mult_work; 20442d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 20459371c9d4SSatish Balay n = ii[1] - ii[0]; 20469371c9d4SSatish Balay ii++; 20472d61bbb3SSatish Balay ncols = n * bs; 
20482d61bbb3SSatish Balay workt = work; 20492d61bbb3SSatish Balay for (j = 0; j < n; j++) { 20502d61bbb3SSatish Balay xb = x + bs * (*idx++); 20512d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 20522d61bbb3SSatish Balay workt += bs; 20532d61bbb3SSatish Balay } 20547b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * ridx[i]; 205596b95a6bSBarry Smith PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); 20562d61bbb3SSatish Balay v += n * bs2; 205726e093fcSHong Zhang if (!usecprow) z += bs; 20582d61bbb3SSatish Balay } 20599566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 20609566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(zz, &zarray)); 20619566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); 2062*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20632d61bbb3SSatish Balay } 20642d61bbb3SSatish Balay 2065d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 2066d71ae5a4SJacob Faibussowitsch { 20672d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2068122f12eaSBarry Smith const PetscScalar *x; 2069122f12eaSBarry Smith PetscScalar *y, *z, sum; 2070122f12eaSBarry Smith const MatScalar *v; 20717c565772SBarry Smith PetscInt mbs = a->mbs, i, n, *ridx = NULL; 2072122f12eaSBarry Smith const PetscInt *idx, *ii; 2073ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 20742d61bbb3SSatish Balay 20752d61bbb3SSatish Balay PetscFunctionBegin; 20769566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 20779566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 20782d61bbb3SSatish Balay 20792d61bbb3SSatish Balay idx = a->j; 20802d61bbb3SSatish Balay v = a->a; 208126e093fcSHong Zhang if (usecprow) { 208248a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); 208326e093fcSHong Zhang mbs = a->compressedrow.nrows; 208426e093fcSHong Zhang ii = a->compressedrow.i; 
20857b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 208626e093fcSHong Zhang } else { 20872d61bbb3SSatish Balay ii = a->i; 208826e093fcSHong Zhang } 20892d61bbb3SSatish Balay 20902d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 2091122f12eaSBarry Smith n = ii[1] - ii[0]; 2092122f12eaSBarry Smith ii++; 209326e093fcSHong Zhang if (!usecprow) { 2094122f12eaSBarry Smith sum = y[i]; 2095122f12eaSBarry Smith } else { 2096122f12eaSBarry Smith sum = y[ridx[i]]; 2097122f12eaSBarry Smith } 2098444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2099444d8c10SJed Brown PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2100122f12eaSBarry Smith PetscSparseDensePlusDot(sum, x, v, idx, n); 2101122f12eaSBarry Smith v += n; 2102122f12eaSBarry Smith idx += n; 2103122f12eaSBarry Smith if (usecprow) { 2104122f12eaSBarry Smith z[ridx[i]] = sum; 2105122f12eaSBarry Smith } else { 2106122f12eaSBarry Smith z[i] = sum; 210726e093fcSHong Zhang } 21082d61bbb3SSatish Balay } 21099566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 21109566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 21119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 2112*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21132d61bbb3SSatish Balay } 21142d61bbb3SSatish Balay 2115d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 2116d71ae5a4SJacob Faibussowitsch { 21172d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2118f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2; 2119d9ca1df4SBarry Smith const PetscScalar *x, *xb; 212026e093fcSHong Zhang PetscScalar x1, x2, *yarray, *zarray; 2121d9ca1df4SBarry Smith const MatScalar *v; 2122d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, j; 2123d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx 
= NULL; 2124ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 21252d61bbb3SSatish Balay 21262d61bbb3SSatish Balay PetscFunctionBegin; 21279566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 21289566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 21292d61bbb3SSatish Balay 21302d61bbb3SSatish Balay idx = a->j; 21312d61bbb3SSatish Balay v = a->a; 213226e093fcSHong Zhang if (usecprow) { 213348a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); 213426e093fcSHong Zhang mbs = a->compressedrow.nrows; 213526e093fcSHong Zhang ii = a->compressedrow.i; 21367b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 213726e093fcSHong Zhang } else { 21382d61bbb3SSatish Balay ii = a->i; 213926e093fcSHong Zhang y = yarray; 214026e093fcSHong Zhang z = zarray; 214126e093fcSHong Zhang } 21422d61bbb3SSatish Balay 21432d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 21449371c9d4SSatish Balay n = ii[1] - ii[0]; 21459371c9d4SSatish Balay ii++; 214626e093fcSHong Zhang if (usecprow) { 21477b2bb3b9SHong Zhang z = zarray + 2 * ridx[i]; 21487b2bb3b9SHong Zhang y = yarray + 2 * ridx[i]; 214926e093fcSHong Zhang } 21509371c9d4SSatish Balay sum1 = y[0]; 21519371c9d4SSatish Balay sum2 = y[1]; 2152444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2153444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 21542d61bbb3SSatish Balay for (j = 0; j < n; j++) { 215526fbe8dcSKarl Rupp xb = x + 2 * (*idx++); 215626fbe8dcSKarl Rupp x1 = xb[0]; 215726fbe8dcSKarl Rupp x2 = xb[1]; 215826fbe8dcSKarl Rupp 21592d61bbb3SSatish Balay sum1 += v[0] * x1 + v[2] * x2; 21602d61bbb3SSatish Balay sum2 += v[1] * x1 + v[3] * x2; 21612d61bbb3SSatish Balay v += 4; 21622d61bbb3SSatish Balay } 21639371c9d4SSatish Balay z[0] = sum1; 21649371c9d4SSatish Balay z[1] = sum2; 216526e093fcSHong 
Zhang if (!usecprow) { 21669371c9d4SSatish Balay z += 2; 21679371c9d4SSatish Balay y += 2; 21682d61bbb3SSatish Balay } 216926e093fcSHong Zhang } 21709566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 21719566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 21729566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * a->nz)); 2173*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21742d61bbb3SSatish Balay } 21752d61bbb3SSatish Balay 2176d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 2177d71ae5a4SJacob Faibussowitsch { 21782d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2179f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; 2180d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2181d9ca1df4SBarry Smith const MatScalar *v; 2182d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2183d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2184ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 21852d61bbb3SSatish Balay 21862d61bbb3SSatish Balay PetscFunctionBegin; 21879566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 21889566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 21892d61bbb3SSatish Balay 21902d61bbb3SSatish Balay idx = a->j; 21912d61bbb3SSatish Balay v = a->a; 219226e093fcSHong Zhang if (usecprow) { 219348a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); 219426e093fcSHong Zhang mbs = a->compressedrow.nrows; 219526e093fcSHong Zhang ii = a->compressedrow.i; 21967b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 219726e093fcSHong Zhang } else { 21982d61bbb3SSatish Balay ii = a->i; 219926e093fcSHong Zhang y = yarray; 220026e093fcSHong Zhang z = zarray; 220126e093fcSHong Zhang } 22022d61bbb3SSatish Balay 22032d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 
22049371c9d4SSatish Balay n = ii[1] - ii[0]; 22059371c9d4SSatish Balay ii++; 220626e093fcSHong Zhang if (usecprow) { 22077b2bb3b9SHong Zhang z = zarray + 3 * ridx[i]; 22087b2bb3b9SHong Zhang y = yarray + 3 * ridx[i]; 220926e093fcSHong Zhang } 22109371c9d4SSatish Balay sum1 = y[0]; 22119371c9d4SSatish Balay sum2 = y[1]; 22129371c9d4SSatish Balay sum3 = y[2]; 2213444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2214444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 22152d61bbb3SSatish Balay for (j = 0; j < n; j++) { 22169371c9d4SSatish Balay xb = x + 3 * (*idx++); 22179371c9d4SSatish Balay x1 = xb[0]; 22189371c9d4SSatish Balay x2 = xb[1]; 22199371c9d4SSatish Balay x3 = xb[2]; 22202d61bbb3SSatish Balay sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 22212d61bbb3SSatish Balay sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 22222d61bbb3SSatish Balay sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 22232d61bbb3SSatish Balay v += 9; 22242d61bbb3SSatish Balay } 22259371c9d4SSatish Balay z[0] = sum1; 22269371c9d4SSatish Balay z[1] = sum2; 22279371c9d4SSatish Balay z[2] = sum3; 222826e093fcSHong Zhang if (!usecprow) { 22299371c9d4SSatish Balay z += 3; 22309371c9d4SSatish Balay y += 3; 22312d61bbb3SSatish Balay } 223226e093fcSHong Zhang } 22339566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 22349566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 22359566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * a->nz)); 2236*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 22372d61bbb3SSatish Balay } 22382d61bbb3SSatish Balay 2239d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 2240d71ae5a4SJacob Faibussowitsch { 22412d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2242f4259b30SLisandro Dalcin PetscScalar 
*y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; 2243d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2244d9ca1df4SBarry Smith const MatScalar *v; 2245d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2246d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2247ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 22482d61bbb3SSatish Balay 22492d61bbb3SSatish Balay PetscFunctionBegin; 22509566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 22519566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 22522d61bbb3SSatish Balay 22532d61bbb3SSatish Balay idx = a->j; 22542d61bbb3SSatish Balay v = a->a; 225526e093fcSHong Zhang if (usecprow) { 225648a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); 225726e093fcSHong Zhang mbs = a->compressedrow.nrows; 225826e093fcSHong Zhang ii = a->compressedrow.i; 22597b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 226026e093fcSHong Zhang } else { 22612d61bbb3SSatish Balay ii = a->i; 226226e093fcSHong Zhang y = yarray; 226326e093fcSHong Zhang z = zarray; 226426e093fcSHong Zhang } 22652d61bbb3SSatish Balay 22662d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 22679371c9d4SSatish Balay n = ii[1] - ii[0]; 22689371c9d4SSatish Balay ii++; 226926e093fcSHong Zhang if (usecprow) { 22707b2bb3b9SHong Zhang z = zarray + 4 * ridx[i]; 22717b2bb3b9SHong Zhang y = yarray + 4 * ridx[i]; 227226e093fcSHong Zhang } 22739371c9d4SSatish Balay sum1 = y[0]; 22749371c9d4SSatish Balay sum2 = y[1]; 22759371c9d4SSatish Balay sum3 = y[2]; 22769371c9d4SSatish Balay sum4 = y[3]; 2277444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2278444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 22792d61bbb3SSatish Balay for (j = 0; j < n; j++) { 22802d61bbb3SSatish Balay xb = x + 4 * 
(*idx++); 22819371c9d4SSatish Balay x1 = xb[0]; 22829371c9d4SSatish Balay x2 = xb[1]; 22839371c9d4SSatish Balay x3 = xb[2]; 22849371c9d4SSatish Balay x4 = xb[3]; 22852d61bbb3SSatish Balay sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 22862d61bbb3SSatish Balay sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 22872d61bbb3SSatish Balay sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 22882d61bbb3SSatish Balay sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 22892d61bbb3SSatish Balay v += 16; 22902d61bbb3SSatish Balay } 22919371c9d4SSatish Balay z[0] = sum1; 22929371c9d4SSatish Balay z[1] = sum2; 22939371c9d4SSatish Balay z[2] = sum3; 22949371c9d4SSatish Balay z[3] = sum4; 229526e093fcSHong Zhang if (!usecprow) { 22969371c9d4SSatish Balay z += 4; 22979371c9d4SSatish Balay y += 4; 22982d61bbb3SSatish Balay } 229926e093fcSHong Zhang } 23009566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 23019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 23029566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * a->nz)); 2303*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 23042d61bbb3SSatish Balay } 23052d61bbb3SSatish Balay 2306d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 2307d71ae5a4SJacob Faibussowitsch { 23082d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2309f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; 2310d9ca1df4SBarry Smith const PetscScalar *x, *xb; 231126e093fcSHong Zhang PetscScalar *yarray, *zarray; 2312d9ca1df4SBarry Smith const MatScalar *v; 2313d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2314d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2315ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 23162d61bbb3SSatish Balay 23172d61bbb3SSatish Balay PetscFunctionBegin; 23189566063dSJacob 
Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 23199566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 23202d61bbb3SSatish Balay 23212d61bbb3SSatish Balay idx = a->j; 23222d61bbb3SSatish Balay v = a->a; 232326e093fcSHong Zhang if (usecprow) { 232448a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); 232526e093fcSHong Zhang mbs = a->compressedrow.nrows; 232626e093fcSHong Zhang ii = a->compressedrow.i; 23277b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 232826e093fcSHong Zhang } else { 23292d61bbb3SSatish Balay ii = a->i; 233026e093fcSHong Zhang y = yarray; 233126e093fcSHong Zhang z = zarray; 233226e093fcSHong Zhang } 23332d61bbb3SSatish Balay 23342d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 23359371c9d4SSatish Balay n = ii[1] - ii[0]; 23369371c9d4SSatish Balay ii++; 233726e093fcSHong Zhang if (usecprow) { 23387b2bb3b9SHong Zhang z = zarray + 5 * ridx[i]; 23397b2bb3b9SHong Zhang y = yarray + 5 * ridx[i]; 234026e093fcSHong Zhang } 23419371c9d4SSatish Balay sum1 = y[0]; 23429371c9d4SSatish Balay sum2 = y[1]; 23439371c9d4SSatish Balay sum3 = y[2]; 23449371c9d4SSatish Balay sum4 = y[3]; 23459371c9d4SSatish Balay sum5 = y[4]; 2346444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2347444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 23482d61bbb3SSatish Balay for (j = 0; j < n; j++) { 23492d61bbb3SSatish Balay xb = x + 5 * (*idx++); 23509371c9d4SSatish Balay x1 = xb[0]; 23519371c9d4SSatish Balay x2 = xb[1]; 23529371c9d4SSatish Balay x3 = xb[2]; 23539371c9d4SSatish Balay x4 = xb[3]; 23549371c9d4SSatish Balay x5 = xb[4]; 23552d61bbb3SSatish Balay sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 23562d61bbb3SSatish Balay sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 23572d61bbb3SSatish Balay sum3 
+= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 23582d61bbb3SSatish Balay sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 23592d61bbb3SSatish Balay sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 23602d61bbb3SSatish Balay v += 25; 23612d61bbb3SSatish Balay } 23629371c9d4SSatish Balay z[0] = sum1; 23639371c9d4SSatish Balay z[1] = sum2; 23649371c9d4SSatish Balay z[2] = sum3; 23659371c9d4SSatish Balay z[3] = sum4; 23669371c9d4SSatish Balay z[4] = sum5; 236726e093fcSHong Zhang if (!usecprow) { 23689371c9d4SSatish Balay z += 5; 23699371c9d4SSatish Balay y += 5; 23702d61bbb3SSatish Balay } 237126e093fcSHong Zhang } 23729566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 23739566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 23749566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * a->nz)); 2375*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 23762d61bbb3SSatish Balay } 2377c2916339SPierre Jolivet 2378d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 2379d71ae5a4SJacob Faibussowitsch { 238015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2381f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; 2382d9ca1df4SBarry Smith const PetscScalar *x, *xb; 238326e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; 2384d9ca1df4SBarry Smith const MatScalar *v; 2385d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2386d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2387ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 238815091d37SBarry Smith 238915091d37SBarry Smith PetscFunctionBegin; 23909566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 23919566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 239215091d37SBarry Smith 239315091d37SBarry Smith idx = 
a->j; 239415091d37SBarry Smith v = a->a; 239526e093fcSHong Zhang if (usecprow) { 239648a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); 239726e093fcSHong Zhang mbs = a->compressedrow.nrows; 239826e093fcSHong Zhang ii = a->compressedrow.i; 23997b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 240026e093fcSHong Zhang } else { 240115091d37SBarry Smith ii = a->i; 240226e093fcSHong Zhang y = yarray; 240326e093fcSHong Zhang z = zarray; 240426e093fcSHong Zhang } 240515091d37SBarry Smith 240615091d37SBarry Smith for (i = 0; i < mbs; i++) { 24079371c9d4SSatish Balay n = ii[1] - ii[0]; 24089371c9d4SSatish Balay ii++; 240926e093fcSHong Zhang if (usecprow) { 24107b2bb3b9SHong Zhang z = zarray + 6 * ridx[i]; 24117b2bb3b9SHong Zhang y = yarray + 6 * ridx[i]; 241226e093fcSHong Zhang } 24139371c9d4SSatish Balay sum1 = y[0]; 24149371c9d4SSatish Balay sum2 = y[1]; 24159371c9d4SSatish Balay sum3 = y[2]; 24169371c9d4SSatish Balay sum4 = y[3]; 24179371c9d4SSatish Balay sum5 = y[4]; 24189371c9d4SSatish Balay sum6 = y[5]; 2419444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2420444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 242115091d37SBarry Smith for (j = 0; j < n; j++) { 24223b95cb0eSSatish Balay xb = x + 6 * (*idx++); 24239371c9d4SSatish Balay x1 = xb[0]; 24249371c9d4SSatish Balay x2 = xb[1]; 24259371c9d4SSatish Balay x3 = xb[2]; 24269371c9d4SSatish Balay x4 = xb[3]; 24279371c9d4SSatish Balay x5 = xb[4]; 24289371c9d4SSatish Balay x6 = xb[5]; 242915091d37SBarry Smith sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 243015091d37SBarry Smith sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 243115091d37SBarry Smith sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 243215091d37SBarry Smith 
sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 243315091d37SBarry Smith sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 243415091d37SBarry Smith sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 243515091d37SBarry Smith v += 36; 243615091d37SBarry Smith } 24379371c9d4SSatish Balay z[0] = sum1; 24389371c9d4SSatish Balay z[1] = sum2; 24399371c9d4SSatish Balay z[2] = sum3; 24409371c9d4SSatish Balay z[3] = sum4; 24419371c9d4SSatish Balay z[4] = sum5; 24429371c9d4SSatish Balay z[5] = sum6; 244326e093fcSHong Zhang if (!usecprow) { 24449371c9d4SSatish Balay z += 6; 24459371c9d4SSatish Balay y += 6; 244615091d37SBarry Smith } 244726e093fcSHong Zhang } 24489566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 24499566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 24509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * a->nz)); 2451*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 245215091d37SBarry Smith } 24532d61bbb3SSatish Balay 2454d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 2455d71ae5a4SJacob Faibussowitsch { 24562d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2457f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 2458d9ca1df4SBarry Smith const PetscScalar *x, *xb; 245926e093fcSHong Zhang PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; 2460d9ca1df4SBarry Smith const MatScalar *v; 2461d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, j, n; 2462d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2463ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 24642d61bbb3SSatish Balay 24652d61bbb3SSatish Balay PetscFunctionBegin; 24669566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 24679566063dSJacob Faibussowitsch 
PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 24682d61bbb3SSatish Balay 24692d61bbb3SSatish Balay idx = a->j; 24702d61bbb3SSatish Balay v = a->a; 247126e093fcSHong Zhang if (usecprow) { 247248a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 247326e093fcSHong Zhang mbs = a->compressedrow.nrows; 247426e093fcSHong Zhang ii = a->compressedrow.i; 24757b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 247626e093fcSHong Zhang } else { 24772d61bbb3SSatish Balay ii = a->i; 247826e093fcSHong Zhang y = yarray; 247926e093fcSHong Zhang z = zarray; 248026e093fcSHong Zhang } 24812d61bbb3SSatish Balay 24822d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 24839371c9d4SSatish Balay n = ii[1] - ii[0]; 24849371c9d4SSatish Balay ii++; 248526e093fcSHong Zhang if (usecprow) { 24867b2bb3b9SHong Zhang z = zarray + 7 * ridx[i]; 24877b2bb3b9SHong Zhang y = yarray + 7 * ridx[i]; 248826e093fcSHong Zhang } 24899371c9d4SSatish Balay sum1 = y[0]; 24909371c9d4SSatish Balay sum2 = y[1]; 24919371c9d4SSatish Balay sum3 = y[2]; 24929371c9d4SSatish Balay sum4 = y[3]; 24939371c9d4SSatish Balay sum5 = y[4]; 24949371c9d4SSatish Balay sum6 = y[5]; 24959371c9d4SSatish Balay sum7 = y[6]; 2496444d8c10SJed Brown PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2497444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 24982d61bbb3SSatish Balay for (j = 0; j < n; j++) { 24992d61bbb3SSatish Balay xb = x + 7 * (*idx++); 25009371c9d4SSatish Balay x1 = xb[0]; 25019371c9d4SSatish Balay x2 = xb[1]; 25029371c9d4SSatish Balay x3 = xb[2]; 25039371c9d4SSatish Balay x4 = xb[3]; 25049371c9d4SSatish Balay x5 = xb[4]; 25059371c9d4SSatish Balay x6 = xb[5]; 25069371c9d4SSatish Balay x7 = xb[6]; 25072d61bbb3SSatish Balay sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 25082d61bbb3SSatish Balay sum2 
+= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 25092d61bbb3SSatish Balay sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 25102d61bbb3SSatish Balay sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 25112d61bbb3SSatish Balay sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 25122d61bbb3SSatish Balay sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 25132d61bbb3SSatish Balay sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 25142d61bbb3SSatish Balay v += 49; 25152d61bbb3SSatish Balay } 25169371c9d4SSatish Balay z[0] = sum1; 25179371c9d4SSatish Balay z[1] = sum2; 25189371c9d4SSatish Balay z[2] = sum3; 25199371c9d4SSatish Balay z[3] = sum4; 25209371c9d4SSatish Balay z[4] = sum5; 25219371c9d4SSatish Balay z[5] = sum6; 25229371c9d4SSatish Balay z[6] = sum7; 252326e093fcSHong Zhang if (!usecprow) { 25249371c9d4SSatish Balay z += 7; 25259371c9d4SSatish Balay y += 7; 25262d61bbb3SSatish Balay } 252726e093fcSHong Zhang } 25289566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 25299566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 25309566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * a->nz)); 2531*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 25322d61bbb3SSatish Balay } 2533218c64b6SSatish Balay 25345f70456aSHong Zhang #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2535d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) 2536d71ae5a4SJacob Faibussowitsch { 253796e086a2SDaniel Kokron Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 
2538f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 253996e086a2SDaniel Kokron const PetscScalar *x, *xb; 254096e086a2SDaniel Kokron const MatScalar *v; 25416679dcc1SBarry Smith PetscInt mbs, i, j, n; 2542ce68d72fSJed Brown PetscInt k; 254396e086a2SDaniel Kokron PetscBool usecprow = a->compressedrow.use; 25446679dcc1SBarry Smith const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; 254596e086a2SDaniel Kokron 254696e086a2SDaniel Kokron __m256d a0, a1, a2, a3, a4, a5; 2547ce68d72fSJed Brown __m256d w0, w1, w2, w3; 254896e086a2SDaniel Kokron __m256d z0, z1, z2; 254996e086a2SDaniel Kokron __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); 255096e086a2SDaniel Kokron 255196e086a2SDaniel Kokron PetscFunctionBegin; 25529566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 25539566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 25549566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &zarray)); 255596e086a2SDaniel Kokron 255696e086a2SDaniel Kokron idx = a->j; 255796e086a2SDaniel Kokron v = a->a; 255896e086a2SDaniel Kokron if (usecprow) { 255996e086a2SDaniel Kokron mbs = a->compressedrow.nrows; 256096e086a2SDaniel Kokron ii = a->compressedrow.i; 256196e086a2SDaniel Kokron ridx = a->compressedrow.rindex; 256296e086a2SDaniel Kokron } else { 256396e086a2SDaniel Kokron mbs = a->mbs; 256496e086a2SDaniel Kokron ii = a->i; 256596e086a2SDaniel Kokron z = zarray; 256696e086a2SDaniel Kokron } 256796e086a2SDaniel Kokron 256896e086a2SDaniel Kokron if (!a->mult_work) { 256996e086a2SDaniel Kokron k = PetscMax(A->rmap->n, A->cmap->n); 25709566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 257196e086a2SDaniel Kokron } 257296e086a2SDaniel Kokron 257396e086a2SDaniel Kokron work = a->mult_work; 257496e086a2SDaniel Kokron for (i = 0; i < mbs; i++) { 25759371c9d4SSatish Balay n = ii[1] - ii[0]; 25769371c9d4SSatish Balay ii++; 257796e086a2SDaniel Kokron workt = work; 257896e086a2SDaniel Kokron for (j = 0; j < n; 
j++) { 257996e086a2SDaniel Kokron xb = x + bs * (*idx++); 258096e086a2SDaniel Kokron for (k = 0; k < bs; k++) workt[k] = xb[k]; 258196e086a2SDaniel Kokron workt += bs; 258296e086a2SDaniel Kokron } 258396e086a2SDaniel Kokron if (usecprow) z = zarray + bs * ridx[i]; 258496e086a2SDaniel Kokron 25859371c9d4SSatish Balay z0 = _mm256_loadu_pd(&z[0]); 25869371c9d4SSatish Balay z1 = _mm256_loadu_pd(&z[4]); 25879371c9d4SSatish Balay z2 = _mm256_set1_pd(z[8]); 258896e086a2SDaniel Kokron 258996e086a2SDaniel Kokron for (j = 0; j < n; j++) { 2590c05b70c4SSatish Balay /* first column of a */ 259196e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9]); 25929371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81]); 25939371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 25949371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 4]); 25959371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 25969371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 8]); 25979371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 259896e086a2SDaniel Kokron 2599c05b70c4SSatish Balay /* second column of a */ 260096e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 1]); 26019371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 9]); 26029371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 26039371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 13]); 26049371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 26059371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 17]); 26069371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 260796e086a2SDaniel Kokron 2608c05b70c4SSatish Balay /* third column of a */ 260996e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 2]); 26109371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 18]); 26119371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w2, z0); 26129371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 22]); 26139371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w2, z1); 26149371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 26]); 
26159371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w2, z2); 261696e086a2SDaniel Kokron 2617c05b70c4SSatish Balay /* fourth column of a */ 261896e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 3]); 26199371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 27]); 26209371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w3, z0); 26219371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 31]); 26229371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w3, z1); 26239371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 35]); 26249371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w3, z2); 262596e086a2SDaniel Kokron 2626c05b70c4SSatish Balay /* fifth column of a */ 262796e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 4]); 26289371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 36]); 26299371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w0, z0); 26309371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 40]); 26319371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w0, z1); 26329371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 44]); 26339371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w0, z2); 263496e086a2SDaniel Kokron 2635c05b70c4SSatish Balay /* sixth column of a */ 263696e086a2SDaniel Kokron w1 = _mm256_set1_pd(work[j * 9 + 5]); 26379371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 45]); 26389371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w1, z0); 26399371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 49]); 26409371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w1, z1); 26419371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 53]); 26429371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w1, z2); 264396e086a2SDaniel Kokron 2644c05b70c4SSatish Balay /* seventh column of a */ 264596e086a2SDaniel Kokron w2 = _mm256_set1_pd(work[j * 9 + 6]); 26469371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 54]); 26479371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w2, z0); 26489371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 58]); 26499371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, 
w2, z1); 26509371c9d4SSatish Balay a2 = _mm256_loadu_pd(&v[j * 81 + 62]); 26519371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w2, z2); 265296e086a2SDaniel Kokron 26536aad120cSJose E. Roman /* eighth column of a */ 265496e086a2SDaniel Kokron w3 = _mm256_set1_pd(work[j * 9 + 7]); 26559371c9d4SSatish Balay a3 = _mm256_loadu_pd(&v[j * 81 + 63]); 26569371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a3, w3, z0); 26579371c9d4SSatish Balay a4 = _mm256_loadu_pd(&v[j * 81 + 67]); 26589371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a4, w3, z1); 26599371c9d4SSatish Balay a5 = _mm256_loadu_pd(&v[j * 81 + 71]); 26609371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a5, w3, z2); 266196e086a2SDaniel Kokron 2662c05b70c4SSatish Balay /* ninth column of a */ 266396e086a2SDaniel Kokron w0 = _mm256_set1_pd(work[j * 9 + 8]); 26649371c9d4SSatish Balay a0 = _mm256_loadu_pd(&v[j * 81 + 72]); 26659371c9d4SSatish Balay z0 = _mm256_fmadd_pd(a0, w0, z0); 26669371c9d4SSatish Balay a1 = _mm256_loadu_pd(&v[j * 81 + 76]); 26679371c9d4SSatish Balay z1 = _mm256_fmadd_pd(a1, w0, z1); 26689371c9d4SSatish Balay a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); 26699371c9d4SSatish Balay z2 = _mm256_fmadd_pd(a2, w0, z2); 267096e086a2SDaniel Kokron } 267196e086a2SDaniel Kokron 26729371c9d4SSatish Balay _mm256_storeu_pd(&z[0], z0); 26739371c9d4SSatish Balay _mm256_storeu_pd(&z[4], z1); 26749371c9d4SSatish Balay _mm256_maskstore_pd(&z[8], mask1, z2); 267596e086a2SDaniel Kokron 267696e086a2SDaniel Kokron v += n * bs2; 267796e086a2SDaniel Kokron if (!usecprow) z += bs; 267896e086a2SDaniel Kokron } 26799566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 26809566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &zarray)); 26819566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(162.0 * a->nz)); 2682*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 268396e086a2SDaniel Kokron } 268496e086a2SDaniel Kokron #endif 268596e086a2SDaniel Kokron 2686d71ae5a4SJacob Faibussowitsch PetscErrorCode 
MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) 2687d71ae5a4SJacob Faibussowitsch { 2688ebada01fSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2689f4259b30SLisandro Dalcin PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; 2690ebada01fSBarry Smith const PetscScalar *x, *xb; 2691ebada01fSBarry Smith PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; 2692ebada01fSBarry Smith const MatScalar *v; 2693ebada01fSBarry Smith PetscInt mbs = a->mbs, i, j, n; 2694ebada01fSBarry Smith const PetscInt *idx, *ii, *ridx = NULL; 2695ebada01fSBarry Smith PetscBool usecprow = a->compressedrow.use; 2696ebada01fSBarry Smith 2697ebada01fSBarry Smith PetscFunctionBegin; 26989566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 26999566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); 2700ebada01fSBarry Smith 2701ebada01fSBarry Smith idx = a->j; 2702ebada01fSBarry Smith v = a->a; 2703ebada01fSBarry Smith if (usecprow) { 270448a46eb9SPierre Jolivet if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); 2705ebada01fSBarry Smith mbs = a->compressedrow.nrows; 2706ebada01fSBarry Smith ii = a->compressedrow.i; 2707ebada01fSBarry Smith ridx = a->compressedrow.rindex; 2708ebada01fSBarry Smith } else { 2709ebada01fSBarry Smith ii = a->i; 2710ebada01fSBarry Smith y = yarray; 2711ebada01fSBarry Smith z = zarray; 2712ebada01fSBarry Smith } 2713ebada01fSBarry Smith 2714ebada01fSBarry Smith for (i = 0; i < mbs; i++) { 27159371c9d4SSatish Balay n = ii[1] - ii[0]; 27169371c9d4SSatish Balay ii++; 2717ebada01fSBarry Smith if (usecprow) { 2718ebada01fSBarry Smith z = zarray + 11 * ridx[i]; 2719ebada01fSBarry Smith y = yarray + 11 * ridx[i]; 2720ebada01fSBarry Smith } 27219371c9d4SSatish Balay sum1 = y[0]; 27229371c9d4SSatish Balay sum2 = y[1]; 27239371c9d4SSatish Balay sum3 = y[2]; 27249371c9d4SSatish Balay sum4 = y[3]; 27259371c9d4SSatish Balay sum5 = y[4]; 
27269371c9d4SSatish Balay sum6 = y[5]; 27279371c9d4SSatish Balay sum7 = y[6]; 27289371c9d4SSatish Balay sum8 = y[7]; 27299371c9d4SSatish Balay sum9 = y[8]; 27309371c9d4SSatish Balay sum10 = y[9]; 27319371c9d4SSatish Balay sum11 = y[10]; 2732ebada01fSBarry Smith PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2733ebada01fSBarry Smith PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2734ebada01fSBarry Smith for (j = 0; j < n; j++) { 2735ebada01fSBarry Smith xb = x + 11 * (*idx++); 27369371c9d4SSatish Balay x1 = xb[0]; 27379371c9d4SSatish Balay x2 = xb[1]; 27389371c9d4SSatish Balay x3 = xb[2]; 27399371c9d4SSatish Balay x4 = xb[3]; 27409371c9d4SSatish Balay x5 = xb[4]; 27419371c9d4SSatish Balay x6 = xb[5]; 27429371c9d4SSatish Balay x7 = xb[6]; 27439371c9d4SSatish Balay x8 = xb[7]; 27449371c9d4SSatish Balay x9 = xb[8]; 27459371c9d4SSatish Balay x10 = xb[9]; 27469371c9d4SSatish Balay x11 = xb[10]; 2747ebada01fSBarry Smith sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; 2748ebada01fSBarry Smith sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; 2749ebada01fSBarry Smith sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; 2750ebada01fSBarry Smith sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 
10 * 11] * x11; 2751ebada01fSBarry Smith sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; 2752ebada01fSBarry Smith sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; 2753ebada01fSBarry Smith sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; 2754ebada01fSBarry Smith sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; 2755ebada01fSBarry Smith sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; 2756ebada01fSBarry Smith sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; 2757ebada01fSBarry Smith sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; 2758ebada01fSBarry Smith v += 121; 2759ebada01fSBarry Smith } 27609371c9d4SSatish Balay z[0] = sum1; 27619371c9d4SSatish Balay z[1] = sum2; 27629371c9d4SSatish Balay z[2] 
= sum3; 27639371c9d4SSatish Balay z[3] = sum4; 27649371c9d4SSatish Balay z[4] = sum5; 27659371c9d4SSatish Balay z[5] = sum6; 27669371c9d4SSatish Balay z[6] = sum7; 27679371c9d4SSatish Balay z[7] = sum8; 27689371c9d4SSatish Balay z[8] = sum9; 27699371c9d4SSatish Balay z[9] = sum10; 27709371c9d4SSatish Balay z[10] = sum11; 2771ebada01fSBarry Smith if (!usecprow) { 27729371c9d4SSatish Balay z += 11; 27739371c9d4SSatish Balay y += 11; 2774ebada01fSBarry Smith } 2775ebada01fSBarry Smith } 27769566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 27779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); 27789566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(242.0 * a->nz)); 2779*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2780ebada01fSBarry Smith } 2781ebada01fSBarry Smith 2782d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 2783d71ae5a4SJacob Faibussowitsch { 27842d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2785f4259b30SLisandro Dalcin PetscScalar *z = NULL, *work, *workt, *zarray; 2786d9ca1df4SBarry Smith const PetscScalar *x, *xb; 2787d9ca1df4SBarry Smith const MatScalar *v; 2788d9ca1df4SBarry Smith PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 2789d9ca1df4SBarry Smith PetscInt ncols, k; 2790d9ca1df4SBarry Smith const PetscInt *ridx = NULL, *idx, *ii; 2791ace3abfcSBarry Smith PetscBool usecprow = a->compressedrow.use; 2792218c64b6SSatish Balay 27932d61bbb3SSatish Balay PetscFunctionBegin; 27949566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 27959566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 27969566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &zarray)); 27972d61bbb3SSatish Balay 27982d61bbb3SSatish Balay idx = a->j; 27992d61bbb3SSatish Balay v = a->a; 280026e093fcSHong Zhang if (usecprow) { 280126e093fcSHong Zhang mbs = a->compressedrow.nrows; 280226e093fcSHong Zhang ii = 
a->compressedrow.i; 28037b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 280426e093fcSHong Zhang } else { 280526e093fcSHong Zhang mbs = a->mbs; 28062d61bbb3SSatish Balay ii = a->i; 280726e093fcSHong Zhang z = zarray; 280826e093fcSHong Zhang } 28092d61bbb3SSatish Balay 28102d61bbb3SSatish Balay if (!a->mult_work) { 2811d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 28129566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 28132d61bbb3SSatish Balay } 28142d61bbb3SSatish Balay work = a->mult_work; 28152d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 28169371c9d4SSatish Balay n = ii[1] - ii[0]; 28179371c9d4SSatish Balay ii++; 28182d61bbb3SSatish Balay ncols = n * bs; 28192d61bbb3SSatish Balay workt = work; 28202d61bbb3SSatish Balay for (j = 0; j < n; j++) { 28212d61bbb3SSatish Balay xb = x + bs * (*idx++); 28222d61bbb3SSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 28232d61bbb3SSatish Balay workt += bs; 28242d61bbb3SSatish Balay } 28257b2bb3b9SHong Zhang if (usecprow) z = zarray + bs * ridx[i]; 282696b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 28272d61bbb3SSatish Balay v += n * bs2; 282826fbe8dcSKarl Rupp if (!usecprow) z += bs; 282926e093fcSHong Zhang } 28309566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 28319566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &zarray)); 28329566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); 2833*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28342d61bbb3SSatish Balay } 28352d61bbb3SSatish Balay 2836d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2837d71ae5a4SJacob Faibussowitsch { 2838547795f9SHong Zhang PetscScalar zero = 0.0; 2839547795f9SHong Zhang 2840547795f9SHong Zhang PetscFunctionBegin; 28419566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 28429566063dSJacob Faibussowitsch 
PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2843*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2844547795f9SHong Zhang } 2845547795f9SHong Zhang 2846d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) 2847d71ae5a4SJacob Faibussowitsch { 28483447b6efSHong Zhang PetscScalar zero = 0.0; 28492d61bbb3SSatish Balay 28502d61bbb3SSatish Balay PetscFunctionBegin; 28519566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 28529566063dSJacob Faibussowitsch PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); 2853*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28542d61bbb3SSatish Balay } 28552d61bbb3SSatish Balay 2856d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 2857d71ae5a4SJacob Faibussowitsch { 2858547795f9SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2859b8c08b77SHong Zhang PetscScalar *z, x1, x2, x3, x4, x5; 2860d9ca1df4SBarry Smith const PetscScalar *x, *xb = NULL; 2861d9ca1df4SBarry Smith const MatScalar *v; 2862b8c08b77SHong Zhang PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; 2863d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 2864547795f9SHong Zhang Mat_CompressedRow cprow = a->compressedrow; 2865ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 2866547795f9SHong Zhang 2867547795f9SHong Zhang PetscFunctionBegin; 28689566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 28699566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 28709566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 2871547795f9SHong Zhang 2872547795f9SHong Zhang idx = a->j; 2873547795f9SHong Zhang v = a->a; 2874547795f9SHong Zhang if (usecprow) { 2875547795f9SHong Zhang mbs = cprow.nrows; 2876547795f9SHong Zhang ii = cprow.i; 2877547795f9SHong Zhang ridx = cprow.rindex; 2878547795f9SHong Zhang } else { 2879547795f9SHong Zhang mbs = a->mbs; 2880547795f9SHong Zhang 
ii = a->i; 2881547795f9SHong Zhang xb = x; 2882547795f9SHong Zhang } 2883547795f9SHong Zhang 2884547795f9SHong Zhang switch (bs) { 2885547795f9SHong Zhang case 1: 2886547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2887547795f9SHong Zhang if (usecprow) xb = x + ridx[i]; 2888547795f9SHong Zhang x1 = xb[0]; 2889547795f9SHong Zhang ib = idx + ii[0]; 28909371c9d4SSatish Balay n = ii[1] - ii[0]; 28919371c9d4SSatish Balay ii++; 2892547795f9SHong Zhang for (j = 0; j < n; j++) { 2893547795f9SHong Zhang rval = ib[j]; 2894547795f9SHong Zhang z[rval] += PetscConj(*v) * x1; 2895547795f9SHong Zhang v++; 2896547795f9SHong Zhang } 2897547795f9SHong Zhang if (!usecprow) xb++; 2898547795f9SHong Zhang } 2899547795f9SHong Zhang break; 2900547795f9SHong Zhang case 2: 2901547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2902547795f9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 29039371c9d4SSatish Balay x1 = xb[0]; 29049371c9d4SSatish Balay x2 = xb[1]; 2905547795f9SHong Zhang ib = idx + ii[0]; 29069371c9d4SSatish Balay n = ii[1] - ii[0]; 29079371c9d4SSatish Balay ii++; 2908547795f9SHong Zhang for (j = 0; j < n; j++) { 2909547795f9SHong Zhang rval = ib[j] * 2; 2910547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; 2911547795f9SHong Zhang z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; 2912547795f9SHong Zhang v += 4; 2913547795f9SHong Zhang } 2914547795f9SHong Zhang if (!usecprow) xb += 2; 2915547795f9SHong Zhang } 2916547795f9SHong Zhang break; 2917547795f9SHong Zhang case 3: 2918547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2919547795f9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 29209371c9d4SSatish Balay x1 = xb[0]; 29219371c9d4SSatish Balay x2 = xb[1]; 29229371c9d4SSatish Balay x3 = xb[2]; 2923547795f9SHong Zhang ib = idx + ii[0]; 29249371c9d4SSatish Balay n = ii[1] - ii[0]; 29259371c9d4SSatish Balay ii++; 2926547795f9SHong Zhang for (j = 0; j < n; j++) { 2927547795f9SHong Zhang rval = ib[j] * 3; 2928547795f9SHong Zhang z[rval++] += 
PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; 2929547795f9SHong Zhang z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; 2930547795f9SHong Zhang z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; 2931547795f9SHong Zhang v += 9; 2932547795f9SHong Zhang } 2933547795f9SHong Zhang if (!usecprow) xb += 3; 2934547795f9SHong Zhang } 2935547795f9SHong Zhang break; 2936547795f9SHong Zhang case 4: 2937547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2938547795f9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 29399371c9d4SSatish Balay x1 = xb[0]; 29409371c9d4SSatish Balay x2 = xb[1]; 29419371c9d4SSatish Balay x3 = xb[2]; 29429371c9d4SSatish Balay x4 = xb[3]; 2943547795f9SHong Zhang ib = idx + ii[0]; 29449371c9d4SSatish Balay n = ii[1] - ii[0]; 29459371c9d4SSatish Balay ii++; 2946547795f9SHong Zhang for (j = 0; j < n; j++) { 2947547795f9SHong Zhang rval = ib[j] * 4; 2948547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; 2949547795f9SHong Zhang z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; 2950547795f9SHong Zhang z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; 2951547795f9SHong Zhang z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; 2952547795f9SHong Zhang v += 16; 2953547795f9SHong Zhang } 2954547795f9SHong Zhang if (!usecprow) xb += 4; 2955547795f9SHong Zhang } 2956547795f9SHong Zhang break; 2957547795f9SHong Zhang case 5: 2958547795f9SHong Zhang for (i = 0; i < mbs; i++) { 2959547795f9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 29609371c9d4SSatish Balay x1 = xb[0]; 29619371c9d4SSatish Balay x2 = xb[1]; 29629371c9d4SSatish Balay x3 = xb[2]; 29639371c9d4SSatish Balay x4 = xb[3]; 29649371c9d4SSatish Balay x5 = xb[4]; 2965547795f9SHong Zhang ib = idx + 
ii[0]; 29669371c9d4SSatish Balay n = ii[1] - ii[0]; 29679371c9d4SSatish Balay ii++; 2968547795f9SHong Zhang for (j = 0; j < n; j++) { 2969547795f9SHong Zhang rval = ib[j] * 5; 2970547795f9SHong Zhang z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; 2971547795f9SHong Zhang z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; 2972547795f9SHong Zhang z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; 2973547795f9SHong Zhang z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; 2974547795f9SHong Zhang z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; 2975547795f9SHong Zhang v += 25; 2976547795f9SHong Zhang } 2977547795f9SHong Zhang if (!usecprow) xb += 5; 2978547795f9SHong Zhang } 2979547795f9SHong Zhang break; 2980d71ae5a4SJacob Faibussowitsch default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2981d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); 2982968ae2c8SSatish Balay #if 0 2983968ae2c8SSatish Balay { 2984b8c08b77SHong Zhang PetscInt ncols,k,bs2=a->bs2; 2985b8c08b77SHong Zhang PetscScalar *work,*workt,zb; 2986d9ca1df4SBarry Smith const PetscScalar *xtmp; 2987547795f9SHong Zhang if (!a->mult_work) { 2988547795f9SHong Zhang k = PetscMax(A->rmap->n,A->cmap->n); 29899566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2990547795f9SHong Zhang } 2991547795f9SHong Zhang work = a->mult_work; 2992547795f9SHong Zhang xtmp = x; 2993547795f9SHong Zhang for (i=0; i<mbs; i++) { 2994547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 2995547795f9SHong Zhang ncols = n*bs; 29969566063dSJacob 
Faibussowitsch PetscCall(PetscArrayzero(work,ncols)); 299726fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs*ridx[i]; 299896b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2999547795f9SHong Zhang v += n*bs2; 3000547795f9SHong Zhang if (!usecprow) xtmp += bs; 3001547795f9SHong Zhang workt = work; 3002547795f9SHong Zhang for (j=0; j<n; j++) { 3003547795f9SHong Zhang zb = z + bs*(*idx++); 3004547795f9SHong Zhang for (k=0; k<bs; k++) zb[k] += workt[k] ; 3005547795f9SHong Zhang workt += bs; 3006547795f9SHong Zhang } 3007547795f9SHong Zhang } 3008547795f9SHong Zhang } 3009968ae2c8SSatish Balay #endif 3010547795f9SHong Zhang } 30119566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 30129566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 30139566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3014*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3015547795f9SHong Zhang } 3016547795f9SHong Zhang 3017d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) 3018d71ae5a4SJacob Faibussowitsch { 30192d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3020d9ca1df4SBarry Smith PetscScalar *zb, *z, x1, x2, x3, x4, x5; 3021f4259b30SLisandro Dalcin const PetscScalar *x, *xb = NULL; 3022d9ca1df4SBarry Smith const MatScalar *v; 3023d9ca1df4SBarry Smith PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3024d9ca1df4SBarry Smith const PetscInt *idx, *ii, *ib, *ridx = NULL; 30253447b6efSHong Zhang Mat_CompressedRow cprow = a->compressedrow; 3026ace3abfcSBarry Smith PetscBool usecprow = cprow.use; 30272d61bbb3SSatish Balay 30282d61bbb3SSatish Balay PetscFunctionBegin; 30299566063dSJacob Faibussowitsch if (yy != zz) PetscCall(VecCopy(yy, zz)); 30309566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 30319566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 30322d61bbb3SSatish Balay 
30332d61bbb3SSatish Balay idx = a->j; 30342d61bbb3SSatish Balay v = a->a; 30353447b6efSHong Zhang if (usecprow) { 30363447b6efSHong Zhang mbs = cprow.nrows; 30373447b6efSHong Zhang ii = cprow.i; 30387b2bb3b9SHong Zhang ridx = cprow.rindex; 30393447b6efSHong Zhang } else { 30403447b6efSHong Zhang mbs = a->mbs; 30412d61bbb3SSatish Balay ii = a->i; 3042f1af5d2fSBarry Smith xb = x; 30433447b6efSHong Zhang } 30442d61bbb3SSatish Balay 30452d61bbb3SSatish Balay switch (bs) { 30462d61bbb3SSatish Balay case 1: 30472d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30487b2bb3b9SHong Zhang if (usecprow) xb = x + ridx[i]; 3049f1af5d2fSBarry Smith x1 = xb[0]; 30503447b6efSHong Zhang ib = idx + ii[0]; 30519371c9d4SSatish Balay n = ii[1] - ii[0]; 30529371c9d4SSatish Balay ii++; 30532d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30542d61bbb3SSatish Balay rval = ib[j]; 3055f1af5d2fSBarry Smith z[rval] += *v * x1; 3056f1af5d2fSBarry Smith v++; 30572d61bbb3SSatish Balay } 30583447b6efSHong Zhang if (!usecprow) xb++; 30592d61bbb3SSatish Balay } 30602d61bbb3SSatish Balay break; 30612d61bbb3SSatish Balay case 2: 30622d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30637b2bb3b9SHong Zhang if (usecprow) xb = x + 2 * ridx[i]; 30649371c9d4SSatish Balay x1 = xb[0]; 30659371c9d4SSatish Balay x2 = xb[1]; 30663447b6efSHong Zhang ib = idx + ii[0]; 30679371c9d4SSatish Balay n = ii[1] - ii[0]; 30689371c9d4SSatish Balay ii++; 30692d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30702d61bbb3SSatish Balay rval = ib[j] * 2; 30712d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2; 30722d61bbb3SSatish Balay z[rval++] += v[2] * x1 + v[3] * x2; 30732d61bbb3SSatish Balay v += 4; 30742d61bbb3SSatish Balay } 30753447b6efSHong Zhang if (!usecprow) xb += 2; 30762d61bbb3SSatish Balay } 30772d61bbb3SSatish Balay break; 30782d61bbb3SSatish Balay case 3: 30792d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30807b2bb3b9SHong Zhang if (usecprow) xb = x + 3 * ridx[i]; 30819371c9d4SSatish Balay x1 = xb[0]; 
30829371c9d4SSatish Balay x2 = xb[1]; 30839371c9d4SSatish Balay x3 = xb[2]; 30843447b6efSHong Zhang ib = idx + ii[0]; 30859371c9d4SSatish Balay n = ii[1] - ii[0]; 30869371c9d4SSatish Balay ii++; 30872d61bbb3SSatish Balay for (j = 0; j < n; j++) { 30882d61bbb3SSatish Balay rval = ib[j] * 3; 30892d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; 30902d61bbb3SSatish Balay z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; 30912d61bbb3SSatish Balay z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; 30922d61bbb3SSatish Balay v += 9; 30932d61bbb3SSatish Balay } 30943447b6efSHong Zhang if (!usecprow) xb += 3; 30952d61bbb3SSatish Balay } 30962d61bbb3SSatish Balay break; 30972d61bbb3SSatish Balay case 4: 30982d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 30997b2bb3b9SHong Zhang if (usecprow) xb = x + 4 * ridx[i]; 31009371c9d4SSatish Balay x1 = xb[0]; 31019371c9d4SSatish Balay x2 = xb[1]; 31029371c9d4SSatish Balay x3 = xb[2]; 31039371c9d4SSatish Balay x4 = xb[3]; 31043447b6efSHong Zhang ib = idx + ii[0]; 31059371c9d4SSatish Balay n = ii[1] - ii[0]; 31069371c9d4SSatish Balay ii++; 31072d61bbb3SSatish Balay for (j = 0; j < n; j++) { 31082d61bbb3SSatish Balay rval = ib[j] * 4; 31092d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 31102d61bbb3SSatish Balay z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 31112d61bbb3SSatish Balay z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 31122d61bbb3SSatish Balay z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 31132d61bbb3SSatish Balay v += 16; 31142d61bbb3SSatish Balay } 31153447b6efSHong Zhang if (!usecprow) xb += 4; 31162d61bbb3SSatish Balay } 31172d61bbb3SSatish Balay break; 31182d61bbb3SSatish Balay case 5: 31192d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 31207b2bb3b9SHong Zhang if (usecprow) xb = x + 5 * ridx[i]; 31219371c9d4SSatish Balay x1 = xb[0]; 31229371c9d4SSatish Balay x2 = xb[1]; 31239371c9d4SSatish Balay x3 = xb[2]; 
31249371c9d4SSatish Balay x4 = xb[3]; 31259371c9d4SSatish Balay x5 = xb[4]; 31263447b6efSHong Zhang ib = idx + ii[0]; 31279371c9d4SSatish Balay n = ii[1] - ii[0]; 31289371c9d4SSatish Balay ii++; 31292d61bbb3SSatish Balay for (j = 0; j < n; j++) { 31302d61bbb3SSatish Balay rval = ib[j] * 5; 31312d61bbb3SSatish Balay z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 31322d61bbb3SSatish Balay z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 31332d61bbb3SSatish Balay z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 31342d61bbb3SSatish Balay z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 31352d61bbb3SSatish Balay z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 31362d61bbb3SSatish Balay v += 25; 31372d61bbb3SSatish Balay } 31383447b6efSHong Zhang if (!usecprow) xb += 5; 31392d61bbb3SSatish Balay } 31402d61bbb3SSatish Balay break; 3141f1af5d2fSBarry Smith default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 3142690b6cddSBarry Smith PetscInt ncols, k; 3143d9ca1df4SBarry Smith PetscScalar *work, *workt; 3144d9ca1df4SBarry Smith const PetscScalar *xtmp; 31452d61bbb3SSatish Balay if (!a->mult_work) { 3146d0f46423SBarry Smith k = PetscMax(A->rmap->n, A->cmap->n); 31479566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(k + 1, &a->mult_work)); 31482d61bbb3SSatish Balay } 31492d61bbb3SSatish Balay work = a->mult_work; 31503447b6efSHong Zhang xtmp = x; 31512d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { 31529371c9d4SSatish Balay n = ii[1] - ii[0]; 31539371c9d4SSatish Balay ii++; 31542d61bbb3SSatish Balay ncols = n * bs; 31559566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(work, ncols)); 315626fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs * ridx[i]; 315796b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); 31582d61bbb3SSatish Balay v += n * bs2; 31593447b6efSHong Zhang if (!usecprow) xtmp += 
bs; 31602d61bbb3SSatish Balay workt = work; 31612d61bbb3SSatish Balay for (j = 0; j < n; j++) { 31622d61bbb3SSatish Balay zb = z + bs * (*idx++); 31632d61bbb3SSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 31642d61bbb3SSatish Balay workt += bs; 31652d61bbb3SSatish Balay } 31662d61bbb3SSatish Balay } 31672d61bbb3SSatish Balay } 31682d61bbb3SSatish Balay } 31699566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 31709566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 31719566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); 3172*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 31732d61bbb3SSatish Balay } 31742d61bbb3SSatish Balay 3175d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) 3176d71ae5a4SJacob Faibussowitsch { 31772d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3178690b6cddSBarry Smith PetscInt totalnz = a->bs2 * a->nz; 3179f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 3180c5df96a5SBarry Smith PetscBLASInt one = 1, tnz; 31812d61bbb3SSatish Balay 31822d61bbb3SSatish Balay PetscFunctionBegin; 31839566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(totalnz, &tnz)); 3184792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); 31859566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 3186*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 31872d61bbb3SSatish Balay } 31882d61bbb3SSatish Balay 3189d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) 3190d71ae5a4SJacob Faibussowitsch { 31912d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 31923f1db9ecSBarry Smith MatScalar *v = a->a; 3193329f5518SBarry Smith PetscReal sum = 0.0; 3194d0f46423SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; 31952d61bbb3SSatish Balay 31962d61bbb3SSatish Balay PetscFunctionBegin; 
31972d61bbb3SSatish Balay if (type == NORM_FROBENIUS) { 3198570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16) 3199570b7f6dSBarry Smith PetscBLASInt one = 1, cnt = bs2 * nz; 3200792fecdfSBarry Smith PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); 3201570b7f6dSBarry Smith #else 32022d61bbb3SSatish Balay for (i = 0; i < bs2 * nz; i++) { 32039371c9d4SSatish Balay sum += PetscRealPart(PetscConj(*v) * (*v)); 32049371c9d4SSatish Balay v++; 32052d61bbb3SSatish Balay } 3206570b7f6dSBarry Smith #endif 32078f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum); 32089566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * nz)); 32098a62d963SHong Zhang } else if (type == NORM_1) { /* maximum column sum */ 32108a62d963SHong Zhang PetscReal *tmp; 32118a62d963SHong Zhang PetscInt *bcol = a->j; 32129566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); 32138a62d963SHong Zhang for (i = 0; i < nz; i++) { 32148a62d963SHong Zhang for (j = 0; j < bs; j++) { 32158a62d963SHong Zhang k1 = bs * (*bcol) + j; /* column index */ 32168a62d963SHong Zhang for (k = 0; k < bs; k++) { 32179371c9d4SSatish Balay tmp[k1] += PetscAbsScalar(*v); 32189371c9d4SSatish Balay v++; 32198a62d963SHong Zhang } 32208a62d963SHong Zhang } 32218a62d963SHong Zhang bcol++; 32228a62d963SHong Zhang } 32238a62d963SHong Zhang *norm = 0.0; 3224d0f46423SBarry Smith for (j = 0; j < A->cmap->n; j++) { 32258a62d963SHong Zhang if (tmp[j] > *norm) *norm = tmp[j]; 32268a62d963SHong Zhang } 32279566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp)); 32289566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3229596552b5SBarry Smith } else if (type == NORM_INFINITY) { /* maximum row sum */ 3230596552b5SBarry Smith *norm = 0.0; 3231596552b5SBarry Smith for (k = 0; k < bs; k++) { 323274f84c7bSSatish Balay for (j = 0; j < a->mbs; j++) { 3233596552b5SBarry Smith v = a->a + bs2 * a->i[j] + k; 3234596552b5SBarry Smith sum = 0.0; 3235596552b5SBarry Smith for (i = 0; i 
< a->i[j + 1] - a->i[j]; i++) { 32360e90e235SBarry Smith for (k1 = 0; k1 < bs; k1++) { 3237596552b5SBarry Smith sum += PetscAbsScalar(*v); 3238596552b5SBarry Smith v += bs; 32392d61bbb3SSatish Balay } 32400e90e235SBarry Smith } 3241596552b5SBarry Smith if (sum > *norm) *norm = sum; 3242596552b5SBarry Smith } 3243596552b5SBarry Smith } 32449566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); 3245e7e72b3dSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 3246*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 32472d61bbb3SSatish Balay } 32482d61bbb3SSatish Balay 3249d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) 3250d71ae5a4SJacob Faibussowitsch { 32512d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 32522d61bbb3SSatish Balay 32532d61bbb3SSatish Balay PetscFunctionBegin; 32542d61bbb3SSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 3255d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 3256273d9f13SBarry Smith *flg = PETSC_FALSE; 3257*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 32582d61bbb3SSatish Balay } 32592d61bbb3SSatish Balay 32602d61bbb3SSatish Balay /* if the a->i are the same */ 32619566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 3262*3ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 32632d61bbb3SSatish Balay 32642d61bbb3SSatish Balay /* if a->j are the same */ 32659566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 3266*3ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 326726fbe8dcSKarl Rupp 32682d61bbb3SSatish Balay /* if a->a are the same */ 32699566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * 
(A->rmap->bs) * (B->rmap->bs), flg)); 3270*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 32712d61bbb3SSatish Balay } 32722d61bbb3SSatish Balay 3273d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) 3274d71ae5a4SJacob Faibussowitsch { 32752d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3276690b6cddSBarry Smith PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2; 327787828ca2SBarry Smith PetscScalar *x, zero = 0.0; 32783f1db9ecSBarry Smith MatScalar *aa, *aa_j; 32792d61bbb3SSatish Balay 32802d61bbb3SSatish Balay PetscFunctionBegin; 328128b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 3282d0f46423SBarry Smith bs = A->rmap->bs; 32832d61bbb3SSatish Balay aa = a->a; 32842d61bbb3SSatish Balay ai = a->i; 32852d61bbb3SSatish Balay aj = a->j; 32862d61bbb3SSatish Balay ambs = a->mbs; 32872d61bbb3SSatish Balay bs2 = a->bs2; 32882d61bbb3SSatish Balay 32899566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 32909566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 32919566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 329208401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 32932d61bbb3SSatish Balay for (i = 0; i < ambs; i++) { 32942d61bbb3SSatish Balay for (j = ai[i]; j < ai[i + 1]; j++) { 32952d61bbb3SSatish Balay if (aj[j] == i) { 32962d61bbb3SSatish Balay row = i * bs; 32972d61bbb3SSatish Balay aa_j = aa + j * bs2; 32982d61bbb3SSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 32992d61bbb3SSatish Balay break; 33002d61bbb3SSatish Balay } 33012d61bbb3SSatish Balay } 33022d61bbb3SSatish Balay } 33039566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 3304*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33052d61bbb3SSatish Balay } 33062d61bbb3SSatish Balay 3307d71ae5a4SJacob Faibussowitsch 
PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) 3308d71ae5a4SJacob Faibussowitsch { 33092d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 331053ef36baSBarry Smith const PetscScalar *l, *r, *li, *ri; 331153ef36baSBarry Smith PetscScalar x; 33123f1db9ecSBarry Smith MatScalar *aa, *v; 331353ef36baSBarry Smith PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; 331453ef36baSBarry Smith const PetscInt *ai, *aj; 33152d61bbb3SSatish Balay 33162d61bbb3SSatish Balay PetscFunctionBegin; 33172d61bbb3SSatish Balay ai = a->i; 33182d61bbb3SSatish Balay aj = a->j; 33192d61bbb3SSatish Balay aa = a->a; 3320d0f46423SBarry Smith m = A->rmap->n; 3321d0f46423SBarry Smith n = A->cmap->n; 3322d0f46423SBarry Smith bs = A->rmap->bs; 33232d61bbb3SSatish Balay mbs = a->mbs; 33242d61bbb3SSatish Balay bs2 = a->bs2; 33252d61bbb3SSatish Balay if (ll) { 33269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 33279566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 332808401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 33292d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 33302d61bbb3SSatish Balay M = ai[i + 1] - ai[i]; 33312d61bbb3SSatish Balay li = l + i * bs; 33322d61bbb3SSatish Balay v = aa + bs2 * ai[i]; 33332d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 3334ad540459SPierre Jolivet for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; 33352d61bbb3SSatish Balay } 33362d61bbb3SSatish Balay } 33379566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 33389566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33392d61bbb3SSatish Balay } 33402d61bbb3SSatish Balay 33412d61bbb3SSatish Balay if (rr) { 33429566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(rr, &r)); 33439566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(rr, &rn)); 334408401ef6SPierre Jolivet PetscCheck(rn == n, PETSC_COMM_SELF, 
PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 33452d61bbb3SSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 334653ef36baSBarry Smith iai = ai[i]; 334753ef36baSBarry Smith M = ai[i + 1] - iai; 334853ef36baSBarry Smith v = aa + bs2 * iai; 33492d61bbb3SSatish Balay for (j = 0; j < M; j++) { /* for each block */ 335053ef36baSBarry Smith ri = r + bs * aj[iai + j]; 33512d61bbb3SSatish Balay for (k = 0; k < bs; k++) { 33522d61bbb3SSatish Balay x = ri[k]; 335353ef36baSBarry Smith for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; 335453ef36baSBarry Smith v += bs; 33552d61bbb3SSatish Balay } 33562d61bbb3SSatish Balay } 33572d61bbb3SSatish Balay } 33589566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(rr, &r)); 33599566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 33602d61bbb3SSatish Balay } 3361*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33622d61bbb3SSatish Balay } 33632d61bbb3SSatish Balay 3364d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) 3365d71ae5a4SJacob Faibussowitsch { 33662d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33672d61bbb3SSatish Balay 33682d61bbb3SSatish Balay PetscFunctionBegin; 33692d61bbb3SSatish Balay info->block_size = a->bs2; 3370ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; 33712d61bbb3SSatish Balay info->nz_used = a->bs2 * a->nz; 33723966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 33732d61bbb3SSatish Balay info->assemblies = A->num_ass; 33748e58a170SBarry Smith info->mallocs = A->info.mallocs; 33754dfa11a4SJacob Faibussowitsch info->memory = 0; /* REVIEW ME */ 3376d5f3da31SBarry Smith if (A->factortype) { 33772d61bbb3SSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 33782d61bbb3SSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 33792d61bbb3SSatish Balay info->factor_mallocs = A->info.factor_mallocs; 33802d61bbb3SSatish Balay } else { 
33812d61bbb3SSatish Balay info->fill_ratio_given = 0; 33822d61bbb3SSatish Balay info->fill_ratio_needed = 0; 33832d61bbb3SSatish Balay info->factor_mallocs = 0; 33842d61bbb3SSatish Balay } 3385*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33862d61bbb3SSatish Balay } 33872d61bbb3SSatish Balay 3388d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 3389d71ae5a4SJacob Faibussowitsch { 33902d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33912d61bbb3SSatish Balay 33922d61bbb3SSatish Balay PetscFunctionBegin; 33939566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 3394*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33952d61bbb3SSatish Balay } 3396a001520aSPierre Jolivet 3397d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 3398d71ae5a4SJacob Faibussowitsch { 3399a001520aSPierre Jolivet PetscFunctionBegin; 34009566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 34014222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 3402*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3403a001520aSPierre Jolivet } 3404a001520aSPierre Jolivet 3405d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3406d71ae5a4SJacob Faibussowitsch { 340774eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3408f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1; 3409bcf10a7aSPierre Jolivet const PetscScalar *xb; 341074eeabc5SPierre Jolivet PetscScalar x1; 341174eeabc5SPierre Jolivet const MatScalar *v, *vv; 341274eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 341374eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 341474eeabc5SPierre Jolivet 341574eeabc5SPierre Jolivet PetscFunctionBegin; 
341674eeabc5SPierre Jolivet idx = a->j; 341774eeabc5SPierre Jolivet v = a->a; 341874eeabc5SPierre Jolivet if (usecprow) { 341974eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 342074eeabc5SPierre Jolivet ii = a->compressedrow.i; 342174eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 342274eeabc5SPierre Jolivet } else { 342374eeabc5SPierre Jolivet mbs = a->mbs; 342474eeabc5SPierre Jolivet ii = a->i; 342574eeabc5SPierre Jolivet z = c; 342674eeabc5SPierre Jolivet } 342774eeabc5SPierre Jolivet 342874eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 34299371c9d4SSatish Balay n = ii[1] - ii[0]; 34309371c9d4SSatish Balay ii++; 343174eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 343274eeabc5SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 343374eeabc5SPierre Jolivet if (usecprow) z = c + ridx[i]; 343474eeabc5SPierre Jolivet jj = idx; 343574eeabc5SPierre Jolivet vv = v; 343674eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 343774eeabc5SPierre Jolivet idx = jj; 343874eeabc5SPierre Jolivet v = vv; 343974eeabc5SPierre Jolivet sum1 = 0.0; 344074eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 34419371c9d4SSatish Balay xb = b + (*idx++); 34429371c9d4SSatish Balay x1 = xb[0 + k * bm]; 344374eeabc5SPierre Jolivet sum1 += v[0] * x1; 344474eeabc5SPierre Jolivet v += 1; 344574eeabc5SPierre Jolivet } 3446feb237baSPierre Jolivet z[0 + k * cm] = sum1; 344774eeabc5SPierre Jolivet } 344874eeabc5SPierre Jolivet if (!usecprow) z += 1; 344974eeabc5SPierre Jolivet } 3450*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 345174eeabc5SPierre Jolivet } 345274eeabc5SPierre Jolivet 3453d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3454d71ae5a4SJacob Faibussowitsch { 34554b7054f4SPierre Jolivet Mat_SeqBAIJ *a = 
(Mat_SeqBAIJ *)A->data; 3456f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2; 3457bcf10a7aSPierre Jolivet const PetscScalar *xb; 34584b7054f4SPierre Jolivet PetscScalar x1, x2; 34594b7054f4SPierre Jolivet const MatScalar *v, *vv; 34604b7054f4SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 34614b7054f4SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 34624b7054f4SPierre Jolivet 34634b7054f4SPierre Jolivet PetscFunctionBegin; 34644b7054f4SPierre Jolivet idx = a->j; 34654b7054f4SPierre Jolivet v = a->a; 34664b7054f4SPierre Jolivet if (usecprow) { 34674b7054f4SPierre Jolivet mbs = a->compressedrow.nrows; 34684b7054f4SPierre Jolivet ii = a->compressedrow.i; 34694b7054f4SPierre Jolivet ridx = a->compressedrow.rindex; 34704b7054f4SPierre Jolivet } else { 34714b7054f4SPierre Jolivet mbs = a->mbs; 34724b7054f4SPierre Jolivet ii = a->i; 34734b7054f4SPierre Jolivet z = c; 34744b7054f4SPierre Jolivet } 34754b7054f4SPierre Jolivet 34764b7054f4SPierre Jolivet for (i = 0; i < mbs; i++) { 34779371c9d4SSatish Balay n = ii[1] - ii[0]; 34789371c9d4SSatish Balay ii++; 34794b7054f4SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 34804b7054f4SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 34814b7054f4SPierre Jolivet if (usecprow) z = c + 2 * ridx[i]; 34824b7054f4SPierre Jolivet jj = idx; 34834b7054f4SPierre Jolivet vv = v; 34844b7054f4SPierre Jolivet for (k = 0; k < cn; k++) { 34854b7054f4SPierre Jolivet idx = jj; 34864b7054f4SPierre Jolivet v = vv; 34879371c9d4SSatish Balay sum1 = 0.0; 34889371c9d4SSatish Balay sum2 = 0.0; 34894b7054f4SPierre Jolivet for (j = 0; j < n; j++) { 34909371c9d4SSatish Balay xb = b + 2 * (*idx++); 34919371c9d4SSatish Balay x1 = xb[0 + k * bm]; 34929371c9d4SSatish Balay x2 = xb[1 + k * bm]; 34934b7054f4SPierre Jolivet sum1 += v[0] * x1 + v[2] * x2; 
34944b7054f4SPierre Jolivet sum2 += v[1] * x1 + v[3] * x2; 34954b7054f4SPierre Jolivet v += 4; 34964b7054f4SPierre Jolivet } 34979371c9d4SSatish Balay z[0 + k * cm] = sum1; 34989371c9d4SSatish Balay z[1 + k * cm] = sum2; 34994b7054f4SPierre Jolivet } 35004b7054f4SPierre Jolivet if (!usecprow) z += 2; 35014b7054f4SPierre Jolivet } 3502*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 35034b7054f4SPierre Jolivet } 35044b7054f4SPierre Jolivet 3505d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3506d71ae5a4SJacob Faibussowitsch { 350774eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3508f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3; 3509bcf10a7aSPierre Jolivet const PetscScalar *xb; 351074eeabc5SPierre Jolivet PetscScalar x1, x2, x3; 351174eeabc5SPierre Jolivet const MatScalar *v, *vv; 351274eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 351374eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 351474eeabc5SPierre Jolivet 351574eeabc5SPierre Jolivet PetscFunctionBegin; 351674eeabc5SPierre Jolivet idx = a->j; 351774eeabc5SPierre Jolivet v = a->a; 351874eeabc5SPierre Jolivet if (usecprow) { 351974eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 352074eeabc5SPierre Jolivet ii = a->compressedrow.i; 352174eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 352274eeabc5SPierre Jolivet } else { 352374eeabc5SPierre Jolivet mbs = a->mbs; 352474eeabc5SPierre Jolivet ii = a->i; 352574eeabc5SPierre Jolivet z = c; 352674eeabc5SPierre Jolivet } 352774eeabc5SPierre Jolivet 352874eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 35299371c9d4SSatish Balay n = ii[1] - ii[0]; 35309371c9d4SSatish Balay ii++; 353174eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 353274eeabc5SPierre Jolivet 
PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 353374eeabc5SPierre Jolivet if (usecprow) z = c + 3 * ridx[i]; 353474eeabc5SPierre Jolivet jj = idx; 353574eeabc5SPierre Jolivet vv = v; 353674eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 353774eeabc5SPierre Jolivet idx = jj; 353874eeabc5SPierre Jolivet v = vv; 35399371c9d4SSatish Balay sum1 = 0.0; 35409371c9d4SSatish Balay sum2 = 0.0; 35419371c9d4SSatish Balay sum3 = 0.0; 354274eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 35439371c9d4SSatish Balay xb = b + 3 * (*idx++); 35449371c9d4SSatish Balay x1 = xb[0 + k * bm]; 35459371c9d4SSatish Balay x2 = xb[1 + k * bm]; 35469371c9d4SSatish Balay x3 = xb[2 + k * bm]; 354774eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; 354874eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; 354974eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; 355074eeabc5SPierre Jolivet v += 9; 355174eeabc5SPierre Jolivet } 35529371c9d4SSatish Balay z[0 + k * cm] = sum1; 35539371c9d4SSatish Balay z[1 + k * cm] = sum2; 35549371c9d4SSatish Balay z[2 + k * cm] = sum3; 355574eeabc5SPierre Jolivet } 355674eeabc5SPierre Jolivet if (!usecprow) z += 3; 355774eeabc5SPierre Jolivet } 3558*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 355974eeabc5SPierre Jolivet } 356074eeabc5SPierre Jolivet 3561d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3562d71ae5a4SJacob Faibussowitsch { 356374eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3564f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4; 3565bcf10a7aSPierre Jolivet const PetscScalar *xb; 356674eeabc5SPierre Jolivet PetscScalar x1, x2, x3, x4; 356774eeabc5SPierre Jolivet const MatScalar *v, *vv; 356874eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 356974eeabc5SPierre Jolivet 
PetscBool usecprow = a->compressedrow.use; 357074eeabc5SPierre Jolivet 357174eeabc5SPierre Jolivet PetscFunctionBegin; 357274eeabc5SPierre Jolivet idx = a->j; 357374eeabc5SPierre Jolivet v = a->a; 357474eeabc5SPierre Jolivet if (usecprow) { 357574eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 357674eeabc5SPierre Jolivet ii = a->compressedrow.i; 357774eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 357874eeabc5SPierre Jolivet } else { 357974eeabc5SPierre Jolivet mbs = a->mbs; 358074eeabc5SPierre Jolivet ii = a->i; 358174eeabc5SPierre Jolivet z = c; 358274eeabc5SPierre Jolivet } 358374eeabc5SPierre Jolivet 358474eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 35859371c9d4SSatish Balay n = ii[1] - ii[0]; 35869371c9d4SSatish Balay ii++; 358774eeabc5SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 358874eeabc5SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 358974eeabc5SPierre Jolivet if (usecprow) z = c + 4 * ridx[i]; 359074eeabc5SPierre Jolivet jj = idx; 359174eeabc5SPierre Jolivet vv = v; 359274eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 359374eeabc5SPierre Jolivet idx = jj; 359474eeabc5SPierre Jolivet v = vv; 35959371c9d4SSatish Balay sum1 = 0.0; 35969371c9d4SSatish Balay sum2 = 0.0; 35979371c9d4SSatish Balay sum3 = 0.0; 35989371c9d4SSatish Balay sum4 = 0.0; 359974eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 36009371c9d4SSatish Balay xb = b + 4 * (*idx++); 36019371c9d4SSatish Balay x1 = xb[0 + k * bm]; 36029371c9d4SSatish Balay x2 = xb[1 + k * bm]; 36039371c9d4SSatish Balay x3 = xb[2 + k * bm]; 36049371c9d4SSatish Balay x4 = xb[3 + k * bm]; 360574eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 360674eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 360774eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 
360874eeabc5SPierre Jolivet sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 360974eeabc5SPierre Jolivet v += 16; 361074eeabc5SPierre Jolivet } 36119371c9d4SSatish Balay z[0 + k * cm] = sum1; 36129371c9d4SSatish Balay z[1 + k * cm] = sum2; 36139371c9d4SSatish Balay z[2 + k * cm] = sum3; 36149371c9d4SSatish Balay z[3 + k * cm] = sum4; 361574eeabc5SPierre Jolivet } 361674eeabc5SPierre Jolivet if (!usecprow) z += 4; 361774eeabc5SPierre Jolivet } 3618*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 361974eeabc5SPierre Jolivet } 362074eeabc5SPierre Jolivet 3621d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 3622d71ae5a4SJacob Faibussowitsch { 362374eeabc5SPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3624f4259b30SLisandro Dalcin PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; 3625bcf10a7aSPierre Jolivet const PetscScalar *xb; 362674eeabc5SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 362774eeabc5SPierre Jolivet const MatScalar *v, *vv; 362874eeabc5SPierre Jolivet PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; 362974eeabc5SPierre Jolivet PetscBool usecprow = a->compressedrow.use; 363074eeabc5SPierre Jolivet 363174eeabc5SPierre Jolivet PetscFunctionBegin; 363274eeabc5SPierre Jolivet idx = a->j; 363374eeabc5SPierre Jolivet v = a->a; 363474eeabc5SPierre Jolivet if (usecprow) { 363574eeabc5SPierre Jolivet mbs = a->compressedrow.nrows; 363674eeabc5SPierre Jolivet ii = a->compressedrow.i; 363774eeabc5SPierre Jolivet ridx = a->compressedrow.rindex; 363874eeabc5SPierre Jolivet } else { 363974eeabc5SPierre Jolivet mbs = a->mbs; 364074eeabc5SPierre Jolivet ii = a->i; 364174eeabc5SPierre Jolivet z = c; 364274eeabc5SPierre Jolivet } 364374eeabc5SPierre Jolivet 364474eeabc5SPierre Jolivet for (i = 0; i < mbs; i++) { 36459371c9d4SSatish Balay n = ii[1] - ii[0]; 36469371c9d4SSatish Balay ii++; 364774eeabc5SPierre Jolivet 
PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 364874eeabc5SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 364974eeabc5SPierre Jolivet if (usecprow) z = c + 5 * ridx[i]; 365074eeabc5SPierre Jolivet jj = idx; 365174eeabc5SPierre Jolivet vv = v; 365274eeabc5SPierre Jolivet for (k = 0; k < cn; k++) { 365374eeabc5SPierre Jolivet idx = jj; 365474eeabc5SPierre Jolivet v = vv; 36559371c9d4SSatish Balay sum1 = 0.0; 36569371c9d4SSatish Balay sum2 = 0.0; 36579371c9d4SSatish Balay sum3 = 0.0; 36589371c9d4SSatish Balay sum4 = 0.0; 36599371c9d4SSatish Balay sum5 = 0.0; 366074eeabc5SPierre Jolivet for (j = 0; j < n; j++) { 36619371c9d4SSatish Balay xb = b + 5 * (*idx++); 36629371c9d4SSatish Balay x1 = xb[0 + k * bm]; 36639371c9d4SSatish Balay x2 = xb[1 + k * bm]; 36649371c9d4SSatish Balay x3 = xb[2 + k * bm]; 36659371c9d4SSatish Balay x4 = xb[3 + k * bm]; 36669371c9d4SSatish Balay x5 = xb[4 + k * bm]; 366774eeabc5SPierre Jolivet sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 366874eeabc5SPierre Jolivet sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 366974eeabc5SPierre Jolivet sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 367074eeabc5SPierre Jolivet sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 367174eeabc5SPierre Jolivet sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 367274eeabc5SPierre Jolivet v += 25; 367374eeabc5SPierre Jolivet } 36749371c9d4SSatish Balay z[0 + k * cm] = sum1; 36759371c9d4SSatish Balay z[1 + k * cm] = sum2; 36769371c9d4SSatish Balay z[2 + k * cm] = sum3; 36779371c9d4SSatish Balay z[3 + k * cm] = sum4; 36789371c9d4SSatish Balay z[4 + k * cm] = sum5; 367974eeabc5SPierre Jolivet } 368074eeabc5SPierre Jolivet if (!usecprow) z += 5; 368174eeabc5SPierre Jolivet } 3682*3ba16761SJacob Faibussowitsch 
PetscFunctionReturn(PETSC_SUCCESS); 368374eeabc5SPierre Jolivet } 368474eeabc5SPierre Jolivet 3685d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) 3686d71ae5a4SJacob Faibussowitsch { 3687a001520aSPierre Jolivet Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3688a001520aSPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 3689910cf402Sprj- Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 3690bcf10a7aSPierre Jolivet PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 3691a001520aSPierre Jolivet PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 3692a001520aSPierre Jolivet PetscBLASInt bbs, bcn, bbm, bcm; 3693f4259b30SLisandro Dalcin PetscScalar *z = NULL; 3694a001520aSPierre Jolivet PetscScalar *c, *b; 3695a001520aSPierre Jolivet const MatScalar *v; 3696a001520aSPierre Jolivet const PetscInt *idx, *ii, *ridx = NULL; 36974b7054f4SPierre Jolivet PetscScalar _DZero = 0.0, _DOne = 1.0; 3698a001520aSPierre Jolivet PetscBool usecprow = a->compressedrow.use; 3699a001520aSPierre Jolivet 3700a001520aSPierre Jolivet PetscFunctionBegin; 3701*3ba16761SJacob Faibussowitsch if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 370208401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 370308401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 370408401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 3705a001520aSPierre Jolivet b = bd->v; 370648a46eb9SPierre Jolivet if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C)); 37079566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 
370874eeabc5SPierre Jolivet switch (bs) { 3709d71ae5a4SJacob Faibussowitsch case 1: 3710d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); 3711d71ae5a4SJacob Faibussowitsch break; 3712d71ae5a4SJacob Faibussowitsch case 2: 3713d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); 3714d71ae5a4SJacob Faibussowitsch break; 3715d71ae5a4SJacob Faibussowitsch case 3: 3716d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); 3717d71ae5a4SJacob Faibussowitsch break; 3718d71ae5a4SJacob Faibussowitsch case 4: 3719d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); 3720d71ae5a4SJacob Faibussowitsch break; 3721d71ae5a4SJacob Faibussowitsch case 5: 3722d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); 3723d71ae5a4SJacob Faibussowitsch break; 372474eeabc5SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 37259566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bs, &bbs)); 37269566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cn, &bcn)); 37279566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bm, &bbm)); 37289566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cm, &bcm)); 3729a001520aSPierre Jolivet idx = a->j; 3730a001520aSPierre Jolivet v = a->a; 3731a001520aSPierre Jolivet if (usecprow) { 3732a001520aSPierre Jolivet mbs = a->compressedrow.nrows; 3733a001520aSPierre Jolivet ii = a->compressedrow.i; 3734a001520aSPierre Jolivet ridx = a->compressedrow.rindex; 3735a001520aSPierre Jolivet } else { 3736a001520aSPierre Jolivet mbs = a->mbs; 3737a001520aSPierre Jolivet ii = a->i; 3738a001520aSPierre Jolivet z = c; 3739a001520aSPierre Jolivet } 3740a001520aSPierre Jolivet for (i = 0; i < mbs; i++) { 37419371c9d4SSatish Balay n = ii[1] - ii[0]; 37429371c9d4SSatish Balay ii++; 3743a001520aSPierre Jolivet if (usecprow) z = c + bs * ridx[i]; 
37444b7054f4SPierre Jolivet if (n) { 3745792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm)); 37464b7054f4SPierre Jolivet v += bs2; 37474b7054f4SPierre Jolivet } 37484b7054f4SPierre Jolivet for (j = 1; j < n; j++) { 3749792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 3750a001520aSPierre Jolivet v += bs2; 3751a001520aSPierre Jolivet } 3752a001520aSPierre Jolivet if (!usecprow) z += bs; 3753a001520aSPierre Jolivet } 37544b7054f4SPierre Jolivet } 37559566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 37569566063dSJacob Faibussowitsch PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); 3757*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3758a001520aSPierre Jolivet } 3759