149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h> 5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 6c6db04a5SJed Brown #include <petscbt.h> 7c6db04a5SJed Brown #include <petscblaslapack.h> 849b5e25fSSatish Balay 9*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 10*d71ae5a4SJacob Faibussowitsch { 115eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 125d0c19d7SBarry Smith PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs, *nidx2; 135d0c19d7SBarry Smith const PetscInt *idx; 14db41cccfSHong Zhang PetscBT table_out, table_in; 15d94109b8SHong Zhang 16d94109b8SHong Zhang PetscFunctionBegin; 1708401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 18d94109b8SHong Zhang mbs = a->mbs; 19d94109b8SHong Zhang ai = a->i; 20d94109b8SHong Zhang aj = a->j; 21d0f46423SBarry Smith bs = A->rmap->bs; 229566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_out)); 239566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &nidx)); 249566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2)); 259566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_in)); 26d94109b8SHong Zhang 27d94109b8SHong Zhang for (i = 0; i < is_max; i++) { /* for each is */ 28d94109b8SHong Zhang isz = 0; 299566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_out)); 30d94109b8SHong Zhang 31d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 329566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 339566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 34d94109b8SHong Zhang 35db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 36dbe03f88SHong Zhang bcol_max = 0; 37d94109b8SHong Zhang for (j = 0; j < n; ++j) { 38d94109b8SHong Zhang brow = idx[j] / bs; /* convert the indices into block indices */ 3908401ef6SPierre Jolivet PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 40db41cccfSHong Zhang if (!PetscBTLookupSet(table_out, brow)) { 41dbe03f88SHong Zhang nidx[isz++] = brow; 42dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 43dbe03f88SHong Zhang } 44d94109b8SHong Zhang } 459566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 469566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 47d94109b8SHong Zhang 48d94109b8SHong Zhang k = 0; 49d94109b8SHong Zhang for (j = 0; j < ov; j++) { /* for each overlap */ 50db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 519566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_in)); 529566063dSJacob Faibussowitsch for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 53d94109b8SHong Zhang 54d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 55d94109b8SHong Zhang for (brow = 0; brow < mbs; brow++) { 569371c9d4SSatish Balay start = ai[brow]; 579371c9d4SSatish Balay end = ai[brow + 1]; 58db41cccfSHong Zhang if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 59d94109b8SHong Zhang for (l = start; l < end; l++) { 60d94109b8SHong Zhang bcol = aj[l]; 61d7b97159SHong Zhang if (!PetscBTLookupSet(table_out, bcol)) { 62d7b97159SHong Zhang nidx[isz++] = bcol; 63d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 64d7b97159SHong Zhang } 65d94109b8SHong Zhang } 66d94109b8SHong Zhang k++; 67d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 68d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 69d94109b8SHong Zhang for (l = start; l < end; l++) { 70d94109b8SHong Zhang bcol = aj[l]; 71dbe03f88SHong Zhang if (bcol > bcol_max) break; 72db41cccfSHong Zhang if (PetscBTLookup(table_in, bcol)) { 7326fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 74d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } 79d94109b8SHong Zhang } /* for each overlap */ 80d94109b8SHong Zhang 81d94109b8SHong Zhang /* expand the Index Set */ 82d94109b8SHong Zhang for (j = 0; j < isz; j++) { 8326fbe8dcSKarl Rupp for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k; 84d94109b8SHong Zhang } 859566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i)); 86d94109b8SHong Zhang } 879566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_out)); 889566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 899566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx2)); 909566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_in)); 915eee224dSHong Zhang PetscFunctionReturn(0); 9249b5e25fSSatish Balay } 9349b5e25fSSatish Balay 94847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 95847daeccSHong Zhang Zero some ops' to avoid invalid usse */ 96*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 97*d71ae5a4SJacob Faibussowitsch { 9849b5e25fSSatish Balay PetscFunctionBegin; 999566063dSJacob Faibussowitsch PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 100f4259b30SLisandro Dalcin Bseq->ops->mult = NULL; 101f4259b30SLisandro Dalcin Bseq->ops->multadd = NULL; 102f4259b30SLisandro Dalcin Bseq->ops->multtranspose = NULL; 103f4259b30SLisandro Dalcin Bseq->ops->multtransposeadd = NULL; 104f4259b30SLisandro Dalcin Bseq->ops->lufactor = NULL; 105f4259b30SLisandro Dalcin Bseq->ops->choleskyfactor = NULL; 106f4259b30SLisandro Dalcin Bseq->ops->lufactorsymbolic = NULL; 107f4259b30SLisandro Dalcin Bseq->ops->choleskyfactorsymbolic = NULL; 108f4259b30SLisandro Dalcin Bseq->ops->getinertia = NULL; 10949b5e25fSSatish Balay PetscFunctionReturn(0); 11049b5e25fSSatish Balay } 11149b5e25fSSatish Balay 1127dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 113*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 114*d71ae5a4SJacob Faibussowitsch { 115e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c; 116e50a8114SHong Zhang PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 117e50a8114SHong Zhang PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 118e50a8114SHong Zhang const PetscInt *irow, *icol; 119e50a8114SHong Zhang PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 120e50a8114SHong Zhang PetscInt *aj = a->j, *ai = a->i; 121e50a8114SHong Zhang MatScalar *mat_a; 122e50a8114SHong Zhang Mat C; 123e50a8114SHong Zhang PetscBool flag; 124e50a8114SHong Zhang 125e50a8114SHong Zhang PetscFunctionBegin; 126e50a8114SHong Zhang 1279566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1289566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1299566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1309566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 131e50a8114SHong Zhang 1329566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 133e50a8114SHong Zhang ssmap = smap; 1349566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 135e50a8114SHong Zhang for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 136e50a8114SHong Zhang /* determine lens of each row */ 137e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 138e50a8114SHong Zhang kstart = ai[irow[i]]; 139e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 140e50a8114SHong Zhang lens[i] = 0; 141e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 142e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 143e50a8114SHong Zhang } 144e50a8114SHong Zhang } 145e50a8114SHong Zhang /* Create and fill new matrix */ 146e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 147e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)((*B)->data); 148e50a8114SHong Zhang 149aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1509566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 15128b400f6SJacob Faibussowitsch PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 1529566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 153e50a8114SHong Zhang C = *B; 154e50a8114SHong Zhang } else { 1559566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1569566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 1579566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1589566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 159e50a8114SHong Zhang } 160e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)(C->data); 161e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 162e50a8114SHong Zhang row = irow[i]; 163e50a8114SHong Zhang kstart = ai[row]; 164e50a8114SHong Zhang kend = kstart + a->ilen[row]; 165e50a8114SHong Zhang mat_i = c->i[i]; 166e50a8114SHong Zhang mat_j = c->j + mat_i; 167e50a8114SHong Zhang mat_a = c->a + mat_i * bs2; 168e50a8114SHong Zhang mat_ilen = c->ilen + i; 169e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 170e50a8114SHong Zhang if ((tcol = ssmap[a->j[k]])) { 171e50a8114SHong Zhang *mat_j++ = tcol - 1; 1729566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 173e50a8114SHong Zhang mat_a += bs2; 174e50a8114SHong Zhang (*mat_ilen)++; 175e50a8114SHong Zhang } 176e50a8114SHong Zhang } 177e50a8114SHong Zhang } 178e50a8114SHong Zhang /* sort */ 179e50a8114SHong Zhang { 180e50a8114SHong Zhang MatScalar *work; 181e50a8114SHong Zhang 1829566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 183e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 184e50a8114SHong Zhang PetscInt ilen; 185e50a8114SHong Zhang mat_i = c->i[i]; 186e50a8114SHong Zhang mat_j = c->j + mat_i; 187e50a8114SHong Zhang mat_a = c->a + mat_i * bs2; 188e50a8114SHong Zhang ilen = c->ilen[i]; 1899566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 190e50a8114SHong Zhang } 1919566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 192e50a8114SHong Zhang } 193e50a8114SHong Zhang 194e50a8114SHong Zhang /* Free work space */ 1959566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1969566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1979566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1989566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1999566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 200e50a8114SHong Zhang 2019566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 202e50a8114SHong Zhang *B = C; 203e50a8114SHong Zhang PetscFunctionReturn(0); 204e50a8114SHong Zhang } 205e50a8114SHong Zhang 206*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 207*d71ae5a4SJacob Faibussowitsch { 208e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 209e50a8114SHong Zhang IS is1, is2; 210e50a8114SHong Zhang PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs; 211e50a8114SHong Zhang const PetscInt *irow, *icol; 21249b5e25fSSatish Balay 21349b5e25fSSatish Balay PetscFunctionBegin; 2149566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 2159566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 2169566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 2179566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 218e50a8114SHong Zhang 219e50a8114SHong Zhang /* Verify if the indices corespond to each element in a block 220e50a8114SHong Zhang and form the IS with compressed IS */ 221e50a8114SHong Zhang maxmnbs = PetscMax(a->mbs, a->nbs); 2229566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 2239566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->mbs)); 224e50a8114SHong Zhang for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 225ad540459SPierre Jolivet for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 226e50a8114SHong Zhang count = 0; 227e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 228e50a8114SHong Zhang PetscInt j = irow[i] / bs; 229e50a8114SHong Zhang if ((vary[j]--) == bs) iary[count++] = j; 230e50a8114SHong Zhang } 2319566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 232e50a8114SHong Zhang 2339566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->nbs)); 234e50a8114SHong Zhang for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 235ad540459SPierre Jolivet for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 236e50a8114SHong Zhang count = 0; 237e50a8114SHong Zhang for (i = 0; i < ncols; i++) { 238e50a8114SHong Zhang PetscInt j = icol[i] / bs; 239e50a8114SHong Zhang if ((vary[j]--) == bs) iary[count++] = j; 240e50a8114SHong Zhang } 2419566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 2429566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 2439566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 2449566063dSJacob Faibussowitsch PetscCall(PetscFree2(vary, iary)); 245e50a8114SHong Zhang 2469566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B)); 2479566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2489566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 249847daeccSHong Zhang 2508f46ffcaSHong Zhang if (isrow != iscol) { 2518f46ffcaSHong Zhang PetscBool isequal; 2529566063dSJacob Faibussowitsch PetscCall(ISEqual(isrow, iscol, &isequal)); 25348a46eb9SPierre Jolivet if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 25449b5e25fSSatish Balay } 25549b5e25fSSatish Balay PetscFunctionReturn(0); 25649b5e25fSSatish Balay } 25749b5e25fSSatish Balay 258*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 259*d71ae5a4SJacob Faibussowitsch { 26013f74950SBarry Smith PetscInt i; 26149b5e25fSSatish Balay 26249b5e25fSSatish Balay PetscFunctionBegin; 26348a46eb9SPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 264e50a8114SHong Zhang 26548a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 26649b5e25fSSatish Balay PetscFunctionReturn(0); 26749b5e25fSSatish Balay } 26849b5e25fSSatish Balay 26949b5e25fSSatish Balay /* -------------------------------------------------------*/ 27049b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 27149b5e25fSSatish Balay /* -------------------------------------------------------*/ 27249b5e25fSSatish Balay 273*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 274*d71ae5a4SJacob Faibussowitsch { 27549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 276d9ca1df4SBarry Smith PetscScalar *z, x1, x2, zero = 0.0; 277d9ca1df4SBarry Smith const PetscScalar *x, *xb; 278d9ca1df4SBarry Smith const MatScalar *v; 279d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 280d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 28198c9bda7SSatish Balay PetscInt nonzerorow = 0; 28249b5e25fSSatish Balay 28349b5e25fSSatish Balay PetscFunctionBegin; 2849566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 285c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 2869566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2879566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 28849b5e25fSSatish Balay 28949b5e25fSSatish Balay v = a->a; 29049b5e25fSSatish Balay xb = x; 29149b5e25fSSatish Balay 29249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 29349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 2949371c9d4SSatish Balay x1 = xb[0]; 2959371c9d4SSatish Balay x2 = xb[1]; 29649b5e25fSSatish Balay ib = aj + *ai; 297831a3094SHong Zhang jmin = 0; 29898c9bda7SSatish Balay nonzerorow += (n > 0); 2997fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 30049b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 30149b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 3029371c9d4SSatish Balay v += 4; 3039371c9d4SSatish Balay jmin++; 3047fbae186SHong Zhang } 305444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 306444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 307831a3094SHong Zhang for (j = jmin; j < n; j++) { 30849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 30949b5e25fSSatish Balay cval = ib[j] * 2; 31049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 31149b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 31249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 31349b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 31449b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 31549b5e25fSSatish Balay v += 4; 31649b5e25fSSatish Balay } 3179371c9d4SSatish Balay xb += 2; 3189371c9d4SSatish Balay ai++; 31949b5e25fSSatish Balay } 32049b5e25fSSatish Balay 3219566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3229566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3239566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 32449b5e25fSSatish Balay PetscFunctionReturn(0); 32549b5e25fSSatish Balay } 32649b5e25fSSatish Balay 327*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 328*d71ae5a4SJacob Faibussowitsch { 32949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 330d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, zero = 0.0; 331d9ca1df4SBarry Smith const PetscScalar *x, *xb; 332d9ca1df4SBarry Smith const MatScalar *v; 333d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 334d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 33598c9bda7SSatish Balay PetscInt nonzerorow = 0; 33649b5e25fSSatish Balay 33749b5e25fSSatish Balay PetscFunctionBegin; 3389566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 339c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 3409566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3419566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 34249b5e25fSSatish Balay 34349b5e25fSSatish Balay v = a->a; 34449b5e25fSSatish Balay xb = x; 34549b5e25fSSatish Balay 34649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 34749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3489371c9d4SSatish Balay x1 = xb[0]; 3499371c9d4SSatish Balay x2 = xb[1]; 3509371c9d4SSatish Balay x3 = xb[2]; 35149b5e25fSSatish Balay ib = aj + *ai; 352831a3094SHong Zhang jmin = 0; 35398c9bda7SSatish Balay nonzerorow += (n > 0); 3547fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 35549b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 35649b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 35749b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3589371c9d4SSatish Balay v += 9; 3599371c9d4SSatish Balay jmin++; 3607fbae186SHong Zhang } 361444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 362444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 363831a3094SHong Zhang for (j = jmin; j < n; j++) { 36449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 36549b5e25fSSatish Balay cval = ib[j] * 3; 36649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 36749b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 36849b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 36949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 37049b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 37149b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 37249b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 37349b5e25fSSatish Balay v += 9; 37449b5e25fSSatish Balay } 3759371c9d4SSatish Balay xb += 3; 3769371c9d4SSatish Balay ai++; 37749b5e25fSSatish Balay } 37849b5e25fSSatish Balay 3799566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3809566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3819566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 38249b5e25fSSatish Balay PetscFunctionReturn(0); 38349b5e25fSSatish Balay } 38449b5e25fSSatish Balay 385*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 386*d71ae5a4SJacob Faibussowitsch { 38749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 388d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 389d9ca1df4SBarry Smith const PetscScalar *x, *xb; 390d9ca1df4SBarry Smith const MatScalar *v; 391d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 392d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 39398c9bda7SSatish Balay PetscInt nonzerorow = 0; 39449b5e25fSSatish Balay 39549b5e25fSSatish Balay PetscFunctionBegin; 3969566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 397c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 3989566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3999566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 40049b5e25fSSatish Balay 40149b5e25fSSatish Balay v = a->a; 40249b5e25fSSatish Balay xb = x; 40349b5e25fSSatish Balay 40449b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 40549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4069371c9d4SSatish Balay x1 = xb[0]; 4079371c9d4SSatish Balay x2 = xb[1]; 4089371c9d4SSatish Balay x3 = xb[2]; 4099371c9d4SSatish Balay x4 = xb[3]; 41049b5e25fSSatish Balay ib = aj + *ai; 411831a3094SHong Zhang jmin = 0; 41298c9bda7SSatish Balay nonzerorow += (n > 0); 4137fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 41449b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 41549b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 41649b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 41749b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 4189371c9d4SSatish Balay v += 16; 4199371c9d4SSatish Balay jmin++; 4207fbae186SHong Zhang } 421444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 422444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 423831a3094SHong Zhang for (j = jmin; j < n; j++) { 42449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 42549b5e25fSSatish Balay cval = ib[j] * 4; 42649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 42749b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 42849b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 42949b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 43049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 43149b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 43249b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 43349b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 43449b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 43549b5e25fSSatish Balay v += 16; 43649b5e25fSSatish Balay } 4379371c9d4SSatish Balay xb += 4; 4389371c9d4SSatish Balay ai++; 43949b5e25fSSatish Balay } 44049b5e25fSSatish Balay 4419566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4429566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 4439566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 44449b5e25fSSatish Balay PetscFunctionReturn(0); 44549b5e25fSSatish Balay } 44649b5e25fSSatish Balay 447*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 448*d71ae5a4SJacob Faibussowitsch { 44949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 450d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 451d9ca1df4SBarry Smith const PetscScalar *x, *xb; 452d9ca1df4SBarry Smith const MatScalar *v; 453d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 454d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 45598c9bda7SSatish Balay PetscInt nonzerorow = 0; 45649b5e25fSSatish Balay 45749b5e25fSSatish Balay PetscFunctionBegin; 4589566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 459c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 4609566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4619566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 46249b5e25fSSatish Balay 46349b5e25fSSatish Balay v = a->a; 46449b5e25fSSatish Balay xb = x; 46549b5e25fSSatish Balay 46649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 46749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4689371c9d4SSatish Balay x1 = xb[0]; 4699371c9d4SSatish Balay x2 = xb[1]; 4709371c9d4SSatish Balay x3 = xb[2]; 4719371c9d4SSatish Balay x4 = xb[3]; 4729371c9d4SSatish Balay x5 = xb[4]; 47349b5e25fSSatish Balay ib = aj + *ai; 474831a3094SHong Zhang jmin = 0; 47598c9bda7SSatish Balay nonzerorow += (n > 0); 4767fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 47749b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 47849b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 47949b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 48049b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 48149b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 4829371c9d4SSatish Balay v += 25; 4839371c9d4SSatish Balay jmin++; 4847fbae186SHong Zhang } 485444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 486444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 487831a3094SHong Zhang for (j = jmin; j < n; j++) { 48849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 48949b5e25fSSatish Balay cval = ib[j] * 5; 49049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 49149b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 49249b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 49349b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 49449b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 49549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 49649b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 49749b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 49849b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 49949b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 50049b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 50149b5e25fSSatish Balay v += 25; 50249b5e25fSSatish Balay } 5039371c9d4SSatish Balay xb += 5; 5049371c9d4SSatish Balay ai++; 50549b5e25fSSatish Balay } 50649b5e25fSSatish Balay 5079566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5089566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5099566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 51049b5e25fSSatish Balay PetscFunctionReturn(0); 51149b5e25fSSatish Balay } 51249b5e25fSSatish Balay 513*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 514*d71ae5a4SJacob Faibussowitsch { 51549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 516d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 517d9ca1df4SBarry Smith const PetscScalar *x, *xb; 518d9ca1df4SBarry Smith const MatScalar *v; 519d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 520d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 52198c9bda7SSatish Balay PetscInt nonzerorow = 0; 52249b5e25fSSatish Balay 52349b5e25fSSatish Balay PetscFunctionBegin; 5249566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 525c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 5269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5279566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 52849b5e25fSSatish Balay 52949b5e25fSSatish Balay v = a->a; 53049b5e25fSSatish Balay xb = x; 53149b5e25fSSatish Balay 53249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 53349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 5349371c9d4SSatish Balay x1 = xb[0]; 5359371c9d4SSatish Balay x2 = xb[1]; 5369371c9d4SSatish Balay x3 = xb[2]; 5379371c9d4SSatish Balay x4 = xb[3]; 5389371c9d4SSatish Balay x5 = xb[4]; 5399371c9d4SSatish Balay x6 = xb[5]; 54049b5e25fSSatish Balay ib = aj + *ai; 541831a3094SHong Zhang jmin = 0; 54298c9bda7SSatish Balay nonzerorow += (n > 0); 5437fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 54449b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 54549b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 54649b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 54749b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 54849b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 54949b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 5509371c9d4SSatish Balay v += 36; 5519371c9d4SSatish Balay jmin++; 5527fbae186SHong Zhang } 553444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 554444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 555831a3094SHong Zhang for (j = jmin; j < n; j++) { 55649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 55749b5e25fSSatish Balay cval = ib[j] * 6; 55849b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 55949b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 56049b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 56149b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 56249b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 56349b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 56449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 56549b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 56649b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 56749b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 56849b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 56949b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 57049b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 57149b5e25fSSatish Balay v += 36; 57249b5e25fSSatish Balay } 5739371c9d4SSatish Balay xb += 6; 5749371c9d4SSatish Balay ai++; 57549b5e25fSSatish Balay } 57649b5e25fSSatish Balay 5779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5789566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5799566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 58049b5e25fSSatish Balay PetscFunctionReturn(0); 58149b5e25fSSatish Balay } 582c2916339SPierre Jolivet 583*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 584*d71ae5a4SJacob Faibussowitsch { 58549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 586d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 587d9ca1df4SBarry Smith const PetscScalar *x, *xb; 588d9ca1df4SBarry Smith const MatScalar *v; 589d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 590d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 59198c9bda7SSatish Balay PetscInt nonzerorow = 0; 59249b5e25fSSatish Balay 59349b5e25fSSatish Balay PetscFunctionBegin; 5949566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 595c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 5969566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5979566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 59849b5e25fSSatish Balay 59949b5e25fSSatish Balay v = a->a; 60049b5e25fSSatish Balay xb = x; 60149b5e25fSSatish Balay 60249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 60349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 6049371c9d4SSatish Balay x1 = xb[0]; 6059371c9d4SSatish Balay x2 = xb[1]; 6069371c9d4SSatish Balay x3 = xb[2]; 6079371c9d4SSatish Balay x4 = xb[3]; 6089371c9d4SSatish Balay x5 = xb[4]; 6099371c9d4SSatish Balay x6 = xb[5]; 6109371c9d4SSatish Balay x7 = xb[6]; 61149b5e25fSSatish Balay ib = aj + *ai; 612831a3094SHong Zhang jmin = 0; 61398c9bda7SSatish Balay nonzerorow += (n > 0); 6147fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 61549b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 61649b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 61749b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 61849b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 61949b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 62049b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 62149b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 6229371c9d4SSatish Balay v += 49; 6239371c9d4SSatish Balay jmin++; 6247fbae186SHong Zhang } 625444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 626444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 627831a3094SHong Zhang for (j = jmin; j < n; j++) { 62849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 62949b5e25fSSatish Balay cval = ib[j] * 7; 63049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 63149b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 63249b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 63349b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 63449b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 63549b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 63649b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 63749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 63849b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 63949b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 64049b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 64149b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 64249b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 64349b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 64449b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 64549b5e25fSSatish Balay v += 49; 64649b5e25fSSatish Balay } 6479371c9d4SSatish Balay xb += 7; 6489371c9d4SSatish Balay ai++; 64949b5e25fSSatish Balay } 6509566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6519566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 6529566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 65349b5e25fSSatish Balay PetscFunctionReturn(0); 65449b5e25fSSatish Balay } 65549b5e25fSSatish Balay 65649b5e25fSSatish Balay /* 65749b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 65849b5e25fSSatish Balay */ 659*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 660*d71ae5a4SJacob Faibussowitsch { 66149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 662d9ca1df4SBarry Smith PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 663d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 664d9ca1df4SBarry Smith const MatScalar *v; 665d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 666d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 66798c9bda7SSatish Balay PetscInt nonzerorow = 0; 66849b5e25fSSatish Balay 66949b5e25fSSatish Balay PetscFunctionBegin; 6709566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 671c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 6729566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6739566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 67459689ffbSStefano Zampini 67559689ffbSStefano Zampini x_ptr = x; 67659689ffbSStefano Zampini z_ptr = z; 67749b5e25fSSatish Balay 67849b5e25fSSatish Balay aj = a->j; 67949b5e25fSSatish Balay v = a->a; 68049b5e25fSSatish Balay ii = a->i; 68149b5e25fSSatish Balay 68248a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 68349b5e25fSSatish Balay work = a->mult_work; 68449b5e25fSSatish Balay 68549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 6869371c9d4SSatish Balay n = ii[1] - ii[0]; 6879371c9d4SSatish Balay ncols = n * bs; 6889371c9d4SSatish Balay workt = work; 6899371c9d4SSatish Balay idx = aj + ii[0]; 69098c9bda7SSatish Balay nonzerorow += (n > 0); 69149b5e25fSSatish Balay 69249b5e25fSSatish Balay /* upper triangular part */ 69349b5e25fSSatish Balay for (j = 0; j < n; j++) { 69449b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 69549b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 69649b5e25fSSatish Balay workt += bs; 69749b5e25fSSatish Balay } 69849b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 69996b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 70049b5e25fSSatish Balay 70149b5e25fSSatish Balay /* strict lower triangular part */ 702831a3094SHong Zhang idx = aj + ii[0]; 70359689ffbSStefano Zampini if (n && *idx == i) { 7049371c9d4SSatish Balay ncols -= bs; 7059371c9d4SSatish Balay v += bs2; 7069371c9d4SSatish Balay idx++; 7079371c9d4SSatish Balay n--; 708831a3094SHong Zhang } 70996b9376eSHong Zhang 71049b5e25fSSatish Balay if (ncols > 0) { 71149b5e25fSSatish Balay workt = work; 7129566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 71396b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 714831a3094SHong Zhang for (j = 0; j < n; j++) { 715831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 71649b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 71749b5e25fSSatish Balay workt += bs; 71849b5e25fSSatish Balay } 71949b5e25fSSatish Balay } 7209371c9d4SSatish Balay x += bs; 7219371c9d4SSatish Balay v += n * bs2; 7229371c9d4SSatish Balay z += bs; 7239371c9d4SSatish Balay ii++; 72449b5e25fSSatish Balay } 72549b5e25fSSatish Balay 7269566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7279566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 7289566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 72949b5e25fSSatish Balay PetscFunctionReturn(0); 73049b5e25fSSatish Balay } 73149b5e25fSSatish Balay 732*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 733*d71ae5a4SJacob Faibussowitsch { 73449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 735d9ca1df4SBarry Smith PetscScalar *z, x1; 736d9ca1df4SBarry Smith const PetscScalar *x, *xb; 737d9ca1df4SBarry Smith const MatScalar *v; 738d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 739d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 74098c9bda7SSatish Balay PetscInt nonzerorow = 0; 741eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 742b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 743eb1ec7c1SStefano Zampini #else 744eb1ec7c1SStefano Zampini const int aconj = 0; 745eb1ec7c1SStefano Zampini #endif 74649b5e25fSSatish Balay 74749b5e25fSSatish Balay PetscFunctionBegin; 7489566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 749c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 7509566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7519566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 75249b5e25fSSatish Balay v = a->a; 75349b5e25fSSatish Balay xb = x; 75449b5e25fSSatish Balay 75549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 75649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 75749b5e25fSSatish Balay x1 = xb[0]; 75849b5e25fSSatish Balay ib = aj + *ai; 759831a3094SHong Zhang jmin = 0; 76098c9bda7SSatish Balay nonzerorow += (n > 0); 7613d9ade75SStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 7629371c9d4SSatish Balay z[i] += *v++ * x[*ib++]; 7639371c9d4SSatish Balay jmin++; 764831a3094SHong Zhang } 765eb1ec7c1SStefano Zampini if (aconj) { 766eb1ec7c1SStefano Zampini for (j = jmin; j < n; j++) { 767eb1ec7c1SStefano Zampini cval = *ib; 768eb1ec7c1SStefano Zampini z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 769eb1ec7c1SStefano Zampini z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 770eb1ec7c1SStefano Zampini } 771eb1ec7c1SStefano Zampini } else { 772831a3094SHong Zhang for (j = jmin; j < n; j++) { 77349b5e25fSSatish Balay cval = *ib; 77449b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 77549b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 77649b5e25fSSatish Balay } 777eb1ec7c1SStefano Zampini } 7789371c9d4SSatish Balay xb++; 7799371c9d4SSatish Balay ai++; 78049b5e25fSSatish Balay } 78149b5e25fSSatish Balay 7829566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7839566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 78449b5e25fSSatish Balay 7859566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 78649b5e25fSSatish Balay PetscFunctionReturn(0); 78749b5e25fSSatish Balay } 78849b5e25fSSatish Balay 789*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 790*d71ae5a4SJacob Faibussowitsch { 79149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 792d9ca1df4SBarry Smith PetscScalar *z, x1, x2; 793d9ca1df4SBarry Smith const PetscScalar *x, *xb; 794d9ca1df4SBarry Smith const MatScalar *v; 795d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 796d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 79798c9bda7SSatish Balay PetscInt nonzerorow = 0; 79849b5e25fSSatish Balay 79949b5e25fSSatish Balay PetscFunctionBegin; 8009566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 801c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 8029566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8039566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 80449b5e25fSSatish Balay 80549b5e25fSSatish Balay v = a->a; 80649b5e25fSSatish Balay xb = x; 80749b5e25fSSatish Balay 80849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 80949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8109371c9d4SSatish Balay x1 = xb[0]; 8119371c9d4SSatish Balay x2 = xb[1]; 81249b5e25fSSatish Balay ib = aj + *ai; 813831a3094SHong Zhang jmin = 0; 81498c9bda7SSatish Balay nonzerorow += (n > 0); 81559689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 81649b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 81749b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 8189371c9d4SSatish Balay v += 4; 8199371c9d4SSatish Balay jmin++; 8207fbae186SHong Zhang } 821444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 822444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 823831a3094SHong Zhang for (j = jmin; j < n; j++) { 82449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 82549b5e25fSSatish Balay cval = ib[j] * 2; 82649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 82749b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 82849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 82949b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 83049b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 83149b5e25fSSatish Balay v += 4; 83249b5e25fSSatish Balay } 8339371c9d4SSatish Balay xb += 2; 8349371c9d4SSatish Balay ai++; 83549b5e25fSSatish Balay } 8369566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8379566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 83849b5e25fSSatish Balay 8399566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 84049b5e25fSSatish Balay PetscFunctionReturn(0); 84149b5e25fSSatish Balay } 84249b5e25fSSatish Balay 843*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 844*d71ae5a4SJacob Faibussowitsch { 84549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 846d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3; 847d9ca1df4SBarry Smith const PetscScalar *x, *xb; 848d9ca1df4SBarry Smith const MatScalar *v; 849d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 850d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 85198c9bda7SSatish Balay PetscInt nonzerorow = 0; 85249b5e25fSSatish Balay 85349b5e25fSSatish Balay PetscFunctionBegin; 8549566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 855c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 8569566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8579566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 85849b5e25fSSatish Balay 85949b5e25fSSatish Balay v = a->a; 86049b5e25fSSatish Balay xb = x; 86149b5e25fSSatish Balay 86249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 86349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8649371c9d4SSatish Balay x1 = xb[0]; 8659371c9d4SSatish Balay x2 = xb[1]; 8669371c9d4SSatish Balay x3 = xb[2]; 86749b5e25fSSatish Balay ib = aj + *ai; 868831a3094SHong Zhang jmin = 0; 86998c9bda7SSatish Balay nonzerorow += (n > 0); 87059689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 87149b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 87249b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 87349b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 8749371c9d4SSatish Balay v += 9; 8759371c9d4SSatish Balay jmin++; 8767fbae186SHong Zhang } 877444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 878444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 879831a3094SHong Zhang for (j = jmin; j < n; j++) { 88049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 88149b5e25fSSatish Balay cval = ib[j] * 3; 88249b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 88349b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 88449b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 88549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 88649b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 88749b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 88849b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 88949b5e25fSSatish Balay v += 9; 89049b5e25fSSatish Balay } 8919371c9d4SSatish Balay xb += 3; 8929371c9d4SSatish Balay ai++; 89349b5e25fSSatish Balay } 89449b5e25fSSatish Balay 8959566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8969566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 89749b5e25fSSatish Balay 8989566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 89949b5e25fSSatish Balay PetscFunctionReturn(0); 90049b5e25fSSatish Balay } 90149b5e25fSSatish Balay 902*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 903*d71ae5a4SJacob Faibussowitsch { 90449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 905d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4; 906d9ca1df4SBarry Smith const PetscScalar *x, *xb; 907d9ca1df4SBarry Smith const MatScalar *v; 908d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 909d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 91098c9bda7SSatish Balay PetscInt nonzerorow = 0; 91149b5e25fSSatish Balay 91249b5e25fSSatish Balay PetscFunctionBegin; 9139566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 914c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 9159566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9169566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 91749b5e25fSSatish Balay 91849b5e25fSSatish Balay v = a->a; 91949b5e25fSSatish Balay xb = x; 92049b5e25fSSatish Balay 92149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 92249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 9239371c9d4SSatish Balay x1 = xb[0]; 9249371c9d4SSatish Balay x2 = xb[1]; 9259371c9d4SSatish Balay x3 = xb[2]; 9269371c9d4SSatish Balay x4 = xb[3]; 92749b5e25fSSatish Balay ib = aj + *ai; 928831a3094SHong Zhang jmin = 0; 92998c9bda7SSatish Balay nonzerorow += (n > 0); 93059689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 93149b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 93249b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 93349b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 93449b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 9359371c9d4SSatish Balay v += 16; 9369371c9d4SSatish Balay jmin++; 9377fbae186SHong Zhang } 938444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 939444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 940831a3094SHong Zhang for (j = jmin; j < n; j++) { 94149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 94249b5e25fSSatish Balay cval = ib[j] * 4; 94349b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 94449b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 94549b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 94649b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 94749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 94849b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 94949b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 95049b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 95149b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 95249b5e25fSSatish Balay v += 16; 95349b5e25fSSatish Balay } 9549371c9d4SSatish Balay xb += 4; 9559371c9d4SSatish Balay ai++; 95649b5e25fSSatish Balay } 95749b5e25fSSatish Balay 9589566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9599566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 96049b5e25fSSatish Balay 9619566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 96249b5e25fSSatish Balay PetscFunctionReturn(0); 96349b5e25fSSatish Balay } 96449b5e25fSSatish Balay 965*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 966*d71ae5a4SJacob Faibussowitsch { 96749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 968d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5; 969d9ca1df4SBarry Smith const PetscScalar *x, *xb; 970d9ca1df4SBarry Smith const MatScalar *v; 971d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 972d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 97398c9bda7SSatish Balay PetscInt nonzerorow = 0; 97449b5e25fSSatish Balay 97549b5e25fSSatish Balay PetscFunctionBegin; 9769566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 977c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 9789566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9799566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 98049b5e25fSSatish Balay 98149b5e25fSSatish Balay v = a->a; 98249b5e25fSSatish Balay xb = x; 98349b5e25fSSatish Balay 98449b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 98549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 9869371c9d4SSatish Balay x1 = xb[0]; 9879371c9d4SSatish Balay x2 = xb[1]; 9889371c9d4SSatish Balay x3 = xb[2]; 9899371c9d4SSatish Balay x4 = xb[3]; 9909371c9d4SSatish Balay x5 = xb[4]; 99149b5e25fSSatish Balay ib = aj + *ai; 992831a3094SHong Zhang jmin = 0; 99398c9bda7SSatish Balay nonzerorow += (n > 0); 99459689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 99549b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 99649b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 99749b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 99849b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 99949b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 10009371c9d4SSatish Balay v += 25; 10019371c9d4SSatish Balay jmin++; 10027fbae186SHong Zhang } 1003444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1004444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1005831a3094SHong Zhang for (j = jmin; j < n; j++) { 100649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 100749b5e25fSSatish Balay cval = ib[j] * 5; 100849b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 100949b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 101049b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 101149b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 101249b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 101349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 101449b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 101549b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 101649b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 101749b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 101849b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 101949b5e25fSSatish Balay v += 25; 102049b5e25fSSatish Balay } 10219371c9d4SSatish Balay xb += 5; 10229371c9d4SSatish Balay ai++; 102349b5e25fSSatish Balay } 102449b5e25fSSatish Balay 10259566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10269566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 102749b5e25fSSatish Balay 10289566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 102949b5e25fSSatish Balay PetscFunctionReturn(0); 103049b5e25fSSatish Balay } 1031c2916339SPierre Jolivet 1032*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1033*d71ae5a4SJacob Faibussowitsch { 103449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1035d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6; 1036d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1037d9ca1df4SBarry Smith const MatScalar *v; 1038d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1039d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 104098c9bda7SSatish Balay PetscInt nonzerorow = 0; 104149b5e25fSSatish Balay 104249b5e25fSSatish Balay PetscFunctionBegin; 10439566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 1044c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 10459566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10469566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 104749b5e25fSSatish Balay 104849b5e25fSSatish Balay v = a->a; 104949b5e25fSSatish Balay xb = x; 105049b5e25fSSatish Balay 105149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 105249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10539371c9d4SSatish Balay x1 = xb[0]; 10549371c9d4SSatish Balay x2 = xb[1]; 10559371c9d4SSatish Balay x3 = xb[2]; 10569371c9d4SSatish Balay x4 = xb[3]; 10579371c9d4SSatish Balay x5 = xb[4]; 10589371c9d4SSatish Balay x6 = xb[5]; 105949b5e25fSSatish Balay ib = aj + *ai; 1060831a3094SHong Zhang jmin = 0; 106198c9bda7SSatish Balay nonzerorow += (n > 0); 106259689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 106349b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 106449b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 106549b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 106649b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 106749b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 106849b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 10699371c9d4SSatish Balay v += 36; 10709371c9d4SSatish Balay jmin++; 10717fbae186SHong Zhang } 1072444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1073444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1074831a3094SHong Zhang for (j = jmin; j < n; j++) { 107549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 107649b5e25fSSatish Balay cval = ib[j] * 6; 107749b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 107849b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 107949b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 108049b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 108149b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 108249b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 108349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 108449b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 108549b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 108649b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 108749b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 108849b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 108949b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 109049b5e25fSSatish Balay v += 36; 109149b5e25fSSatish Balay } 10929371c9d4SSatish Balay xb += 6; 10939371c9d4SSatish Balay ai++; 109449b5e25fSSatish Balay } 109549b5e25fSSatish Balay 10969566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10979566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 109849b5e25fSSatish Balay 10999566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 110049b5e25fSSatish Balay PetscFunctionReturn(0); 110149b5e25fSSatish Balay } 110249b5e25fSSatish Balay 1103*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1104*d71ae5a4SJacob Faibussowitsch { 110549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1106d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1107d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1108d9ca1df4SBarry Smith const MatScalar *v; 1109d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1110d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 111198c9bda7SSatish Balay PetscInt nonzerorow = 0; 111249b5e25fSSatish Balay 111349b5e25fSSatish Balay PetscFunctionBegin; 11149566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 1115c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 11169566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 11179566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 111849b5e25fSSatish Balay 111949b5e25fSSatish Balay v = a->a; 112049b5e25fSSatish Balay xb = x; 112149b5e25fSSatish Balay 112249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 112349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 11249371c9d4SSatish Balay x1 = xb[0]; 11259371c9d4SSatish Balay x2 = xb[1]; 11269371c9d4SSatish Balay x3 = xb[2]; 11279371c9d4SSatish Balay x4 = xb[3]; 11289371c9d4SSatish Balay x5 = xb[4]; 11299371c9d4SSatish Balay x6 = xb[5]; 11309371c9d4SSatish Balay x7 = xb[6]; 113149b5e25fSSatish Balay ib = aj + *ai; 1132831a3094SHong Zhang jmin = 0; 113398c9bda7SSatish Balay nonzerorow += (n > 0); 113459689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 113549b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 113649b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 113749b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 113849b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 113949b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 114049b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 114149b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 11429371c9d4SSatish Balay v += 49; 11439371c9d4SSatish Balay jmin++; 11447fbae186SHong Zhang } 1145444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1146444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1147831a3094SHong Zhang for (j = jmin; j < n; j++) { 114849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 114949b5e25fSSatish Balay cval = ib[j] * 7; 115049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 115149b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 115249b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 115349b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 115449b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 115549b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 115649b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 115749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 115849b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 115949b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 116049b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 116149b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 116249b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 116349b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 116449b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 116549b5e25fSSatish Balay v += 49; 116649b5e25fSSatish Balay } 11679371c9d4SSatish Balay xb += 7; 11689371c9d4SSatish Balay ai++; 116949b5e25fSSatish Balay } 117049b5e25fSSatish Balay 11719566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11729566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 117349b5e25fSSatish Balay 11749566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 117549b5e25fSSatish Balay PetscFunctionReturn(0); 117649b5e25fSSatish Balay } 117749b5e25fSSatish Balay 1178*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1179*d71ae5a4SJacob Faibussowitsch { 118049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1181f4259b30SLisandro Dalcin PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1182d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 1183d9ca1df4SBarry Smith const MatScalar *v; 1184d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1185d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 118698c9bda7SSatish Balay PetscInt nonzerorow = 0; 118749b5e25fSSatish Balay 118849b5e25fSSatish Balay PetscFunctionBegin; 11899566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 1190c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 11919371c9d4SSatish Balay PetscCall(VecGetArrayRead(xx, &x)); 11929371c9d4SSatish Balay x_ptr = x; 11939371c9d4SSatish Balay PetscCall(VecGetArray(zz, &z)); 11949371c9d4SSatish Balay z_ptr = z; 119549b5e25fSSatish Balay 119649b5e25fSSatish Balay aj = a->j; 119749b5e25fSSatish Balay v = a->a; 119849b5e25fSSatish Balay ii = a->i; 119949b5e25fSSatish Balay 120048a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 120149b5e25fSSatish Balay work = a->mult_work; 120249b5e25fSSatish Balay 120349b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 12049371c9d4SSatish Balay n = ii[1] - ii[0]; 12059371c9d4SSatish Balay ncols = n * bs; 12069371c9d4SSatish Balay workt = work; 12079371c9d4SSatish Balay idx = aj + ii[0]; 120898c9bda7SSatish Balay nonzerorow += (n > 0); 120949b5e25fSSatish Balay 121049b5e25fSSatish Balay /* upper triangular part */ 121149b5e25fSSatish Balay for (j = 0; j < n; j++) { 121249b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 121349b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 121449b5e25fSSatish Balay workt += bs; 121549b5e25fSSatish Balay } 121649b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 121796b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 121849b5e25fSSatish Balay 121949b5e25fSSatish Balay /* strict lower triangular part */ 1220831a3094SHong Zhang idx = aj + ii[0]; 122159689ffbSStefano Zampini if (n && *idx == i) { 12229371c9d4SSatish Balay ncols -= bs; 12239371c9d4SSatish Balay v += bs2; 12249371c9d4SSatish Balay idx++; 12259371c9d4SSatish Balay n--; 1226831a3094SHong Zhang } 122749b5e25fSSatish Balay if (ncols > 0) { 122849b5e25fSSatish Balay workt = work; 12299566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 123096b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1231831a3094SHong Zhang for (j = 0; j < n; j++) { 1232831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 123349b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 123449b5e25fSSatish Balay workt += bs; 123549b5e25fSSatish Balay } 123649b5e25fSSatish Balay } 123749b5e25fSSatish Balay 12389371c9d4SSatish Balay x += bs; 12399371c9d4SSatish Balay v += n * bs2; 12409371c9d4SSatish Balay z += bs; 12419371c9d4SSatish Balay ii++; 124249b5e25fSSatish Balay } 124349b5e25fSSatish Balay 12449566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12459566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 124649b5e25fSSatish Balay 12479566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 124849b5e25fSSatish Balay PetscFunctionReturn(0); 124949b5e25fSSatish Balay } 125049b5e25fSSatish Balay 1251*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1252*d71ae5a4SJacob Faibussowitsch { 125349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1254f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1255c5df96a5SBarry Smith PetscBLASInt one = 1, totalnz; 125649b5e25fSSatish Balay 125749b5e25fSSatish Balay PetscFunctionBegin; 12589566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1259792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 12609566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 126149b5e25fSSatish Balay PetscFunctionReturn(0); 126249b5e25fSSatish Balay } 126349b5e25fSSatish Balay 1264*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1265*d71ae5a4SJacob Faibussowitsch { 126649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1267d9ca1df4SBarry Smith const MatScalar *v = a->a; 126849b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1269d9ca1df4SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1270d9ca1df4SBarry Smith const PetscInt *aj = a->j, *col; 127149b5e25fSSatish Balay 127249b5e25fSSatish Balay PetscFunctionBegin; 1273c40ae873SPierre Jolivet if (!a->nz) { 1274c40ae873SPierre Jolivet *norm = 0.0; 1275c40ae873SPierre Jolivet PetscFunctionReturn(0); 1276c40ae873SPierre Jolivet } 127749b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 127849b5e25fSSatish Balay for (k = 0; k < mbs; k++) { 127959689ffbSStefano Zampini jmin = a->i[k]; 128059689ffbSStefano Zampini jmax = a->i[k + 1]; 1281831a3094SHong Zhang col = aj + jmin; 128259689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 128349b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 12849371c9d4SSatish Balay sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 12859371c9d4SSatish Balay v++; 128649b5e25fSSatish Balay } 1287831a3094SHong Zhang jmin++; 1288831a3094SHong Zhang } 1289831a3094SHong Zhang for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 129049b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 12919371c9d4SSatish Balay sum_off += PetscRealPart(PetscConj(*v) * (*v)); 12929371c9d4SSatish Balay v++; 129349b5e25fSSatish Balay } 129449b5e25fSSatish Balay } 129549b5e25fSSatish Balay } 12968f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 12979566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 12980b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 12999566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 13000b8dc8d2SHong Zhang for (i = 0; i < mbs; i++) jl[i] = mbs; 13010b8dc8d2SHong Zhang il[0] = 0; 130249b5e25fSSatish Balay 130349b5e25fSSatish Balay *norm = 0.0; 130449b5e25fSSatish Balay for (k = 0; k < mbs; k++) { /* k_th block row */ 130549b5e25fSSatish Balay for (j = 0; j < bs; j++) sum[j] = 0.0; 130649b5e25fSSatish Balay /*-- col sum --*/ 130749b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1308831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1309831a3094SHong Zhang at step k */ 131049b5e25fSSatish Balay while (i < mbs) { 131149b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 131249b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 131349b5e25fSSatish Balay for (j = 0; j < bs; j++) { 131449b5e25fSSatish Balay v = a->a + ik * bs2 + j * bs; 131549b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13169371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13179371c9d4SSatish Balay v++; 131849b5e25fSSatish Balay } 131949b5e25fSSatish Balay } 132049b5e25fSSatish Balay /* update il, jl */ 1321831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1322831a3094SHong Zhang jmax = a->i[i + 1]; 132349b5e25fSSatish Balay if (jmin < jmax) { 132449b5e25fSSatish Balay il[i] = jmin; 132549b5e25fSSatish Balay j = a->j[jmin]; 13269371c9d4SSatish Balay jl[i] = jl[j]; 13279371c9d4SSatish Balay jl[j] = i; 132849b5e25fSSatish Balay } 132949b5e25fSSatish Balay i = nexti; 133049b5e25fSSatish Balay } 133149b5e25fSSatish Balay /*-- row sum --*/ 133259689ffbSStefano Zampini jmin = a->i[k]; 133359689ffbSStefano Zampini jmax = a->i[k + 1]; 133449b5e25fSSatish Balay for (i = jmin; i < jmax; i++) { 133549b5e25fSSatish Balay for (j = 0; j < bs; j++) { 133649b5e25fSSatish Balay v = a->a + i * bs2 + j; 133749b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13389371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13399371c9d4SSatish Balay v += bs; 134049b5e25fSSatish Balay } 134149b5e25fSSatish Balay } 134249b5e25fSSatish Balay } 134349b5e25fSSatish Balay /* add k_th block row to il, jl */ 1344831a3094SHong Zhang col = aj + jmin; 134559689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) jmin++; 134649b5e25fSSatish Balay if (jmin < jmax) { 134749b5e25fSSatish Balay il[k] = jmin; 13489371c9d4SSatish Balay j = a->j[jmin]; 13499371c9d4SSatish Balay jl[k] = jl[j]; 13509371c9d4SSatish Balay jl[j] = k; 135149b5e25fSSatish Balay } 135249b5e25fSSatish Balay for (j = 0; j < bs; j++) { 135349b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 135449b5e25fSSatish Balay } 135549b5e25fSSatish Balay } 13569566063dSJacob Faibussowitsch PetscCall(PetscFree3(sum, il, jl)); 13579566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1358f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 135949b5e25fSSatish Balay PetscFunctionReturn(0); 136049b5e25fSSatish Balay } 136149b5e25fSSatish Balay 1362*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1363*d71ae5a4SJacob Faibussowitsch { 136449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 136549b5e25fSSatish Balay 136649b5e25fSSatish Balay PetscFunctionBegin; 136749b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1368d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1369ef511fbeSHong Zhang *flg = PETSC_FALSE; 1370ef511fbeSHong Zhang PetscFunctionReturn(0); 137149b5e25fSSatish Balay } 137249b5e25fSSatish Balay 137349b5e25fSSatish Balay /* if the a->i are the same */ 13749566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 137526fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 137649b5e25fSSatish Balay 137749b5e25fSSatish Balay /* if a->j are the same */ 13789566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 137926fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 138026fbe8dcSKarl Rupp 138149b5e25fSSatish Balay /* if a->a are the same */ 13829566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 1383935af2e7SHong Zhang PetscFunctionReturn(0); 138449b5e25fSSatish Balay } 138549b5e25fSSatish Balay 1386*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1387*d71ae5a4SJacob Faibussowitsch { 138849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1389d9ca1df4SBarry Smith PetscInt i, j, k, row, bs, ambs, bs2; 1390d9ca1df4SBarry Smith const PetscInt *ai, *aj; 139187828ca2SBarry Smith PetscScalar *x, zero = 0.0; 1392d9ca1df4SBarry Smith const MatScalar *aa, *aa_j; 139349b5e25fSSatish Balay 139449b5e25fSSatish Balay PetscFunctionBegin; 1395d0f46423SBarry Smith bs = A->rmap->bs; 1396aed4548fSBarry Smith PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 139782799104SHong Zhang 139849b5e25fSSatish Balay aa = a->a; 13998a0c6efdSHong Zhang ambs = a->mbs; 14008a0c6efdSHong Zhang 14018a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 14028a0c6efdSHong Zhang PetscInt *diag = a->diag; 14038a0c6efdSHong Zhang aa = a->a; 14048a0c6efdSHong Zhang ambs = a->mbs; 14059566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 14068a0c6efdSHong Zhang for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 14079566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 14088a0c6efdSHong Zhang PetscFunctionReturn(0); 14098a0c6efdSHong Zhang } 14108a0c6efdSHong Zhang 141149b5e25fSSatish Balay ai = a->i; 141249b5e25fSSatish Balay aj = a->j; 141349b5e25fSSatish Balay bs2 = a->bs2; 14149566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 1415c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 14169566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 141749b5e25fSSatish Balay for (i = 0; i < ambs; i++) { 141849b5e25fSSatish Balay j = ai[i]; 141949b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 142049b5e25fSSatish Balay row = i * bs; 142149b5e25fSSatish Balay aa_j = aa + j * bs2; 142249b5e25fSSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 142349b5e25fSSatish Balay } 142449b5e25fSSatish Balay } 14259566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 142649b5e25fSSatish Balay PetscFunctionReturn(0); 142749b5e25fSSatish Balay } 142849b5e25fSSatish Balay 1429*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1430*d71ae5a4SJacob Faibussowitsch { 143149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1432d9ca1df4SBarry Smith PetscScalar x; 1433d9ca1df4SBarry Smith const PetscScalar *l, *li, *ri; 143449b5e25fSSatish Balay MatScalar *aa, *v; 1435fff8e43fSBarry Smith PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1436fff8e43fSBarry Smith const PetscInt *ai, *aj; 1437ace3abfcSBarry Smith PetscBool flg; 143849b5e25fSSatish Balay 143949b5e25fSSatish Balay PetscFunctionBegin; 1440b3bf805bSHong Zhang if (ll != rr) { 14419566063dSJacob Faibussowitsch PetscCall(VecEqual(ll, rr, &flg)); 144228b400f6SJacob Faibussowitsch PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1443b3bf805bSHong Zhang } 1444b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 144549b5e25fSSatish Balay ai = a->i; 144649b5e25fSSatish Balay aj = a->j; 144749b5e25fSSatish Balay aa = a->a; 1448d0f46423SBarry Smith m = A->rmap->N; 1449d0f46423SBarry Smith bs = A->rmap->bs; 145049b5e25fSSatish Balay mbs = a->mbs; 145149b5e25fSSatish Balay bs2 = a->bs2; 145249b5e25fSSatish Balay 14539566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 14549566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 145508401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 145649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 145749b5e25fSSatish Balay M = ai[i + 1] - ai[i]; 145849b5e25fSSatish Balay li = l + i * bs; 145949b5e25fSSatish Balay v = aa + bs2 * ai[i]; 146049b5e25fSSatish Balay for (j = 0; j < M; j++) { /* for each block */ 146149b5e25fSSatish Balay ri = l + bs * aj[ai[i] + j]; 14625e90f9d9SHong Zhang for (k = 0; k < bs; k++) { 146349b5e25fSSatish Balay x = ri[k]; 146449b5e25fSSatish Balay for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 146549b5e25fSSatish Balay } 146649b5e25fSSatish Balay } 146749b5e25fSSatish Balay } 14689566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 14699566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 147049b5e25fSSatish Balay PetscFunctionReturn(0); 147149b5e25fSSatish Balay } 147249b5e25fSSatish Balay 1473*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1474*d71ae5a4SJacob Faibussowitsch { 147549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 147649b5e25fSSatish Balay 147749b5e25fSSatish Balay PetscFunctionBegin; 147849b5e25fSSatish Balay info->block_size = a->bs2; 1479ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 14806c6c5352SBarry Smith info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 14813966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 148249b5e25fSSatish Balay info->assemblies = A->num_ass; 14838e58a170SBarry Smith info->mallocs = A->info.mallocs; 14844dfa11a4SJacob Faibussowitsch info->memory = 0; /* REVIEW ME */ 1485d5f3da31SBarry Smith if (A->factortype) { 148649b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 148749b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 148849b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 148949b5e25fSSatish Balay } else { 149049b5e25fSSatish Balay info->fill_ratio_given = 0; 149149b5e25fSSatish Balay info->fill_ratio_needed = 0; 149249b5e25fSSatish Balay info->factor_mallocs = 0; 149349b5e25fSSatish Balay } 149449b5e25fSSatish Balay PetscFunctionReturn(0); 149549b5e25fSSatish Balay } 149649b5e25fSSatish Balay 1497*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1498*d71ae5a4SJacob Faibussowitsch { 149949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 150049b5e25fSSatish Balay 150149b5e25fSSatish Balay PetscFunctionBegin; 15029566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 150349b5e25fSSatish Balay PetscFunctionReturn(0); 150449b5e25fSSatish Balay } 1505dc354874SHong Zhang 1506*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1507*d71ae5a4SJacob Faibussowitsch { 1508dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1509d9ca1df4SBarry Smith PetscInt i, j, n, row, col, bs, mbs; 1510d9ca1df4SBarry Smith const PetscInt *ai, *aj; 1511c3fca9a7SHong Zhang PetscReal atmp; 1512d9ca1df4SBarry Smith const MatScalar *aa; 1513985db425SBarry Smith PetscScalar *x; 151413f74950SBarry Smith PetscInt ncols, brow, bcol, krow, kcol; 1515dc354874SHong Zhang 1516dc354874SHong Zhang PetscFunctionBegin; 151728b400f6SJacob Faibussowitsch PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 151828b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1519d0f46423SBarry Smith bs = A->rmap->bs; 1520dc354874SHong Zhang aa = a->a; 1521dc354874SHong Zhang ai = a->i; 1522dc354874SHong Zhang aj = a->j; 152344117c81SHong Zhang mbs = a->mbs; 1524dc354874SHong Zhang 15259566063dSJacob Faibussowitsch PetscCall(VecSet(v, 0.0)); 15269566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 15279566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 152808401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 152944117c81SHong Zhang for (i = 0; i < mbs; i++) { 15309371c9d4SSatish Balay ncols = ai[1] - ai[0]; 15319371c9d4SSatish Balay ai++; 1532d0f6400bSHong Zhang brow = bs * i; 153344117c81SHong Zhang for (j = 0; j < ncols; j++) { 1534d0f6400bSHong Zhang bcol = bs * (*aj); 153544117c81SHong Zhang for (kcol = 0; kcol < bs; kcol++) { 1536d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 153744117c81SHong Zhang for (krow = 0; krow < bs; krow++) { 15389371c9d4SSatish Balay atmp = PetscAbsScalar(*aa); 15399371c9d4SSatish Balay aa++; 1540d0f6400bSHong Zhang row = brow + krow; /* row index */ 1541c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1542c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 154344117c81SHong Zhang } 154444117c81SHong Zhang } 1545d0f6400bSHong Zhang aj++; 1546dc354874SHong Zhang } 1547dc354874SHong Zhang } 15489566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 1549dc354874SHong Zhang PetscFunctionReturn(0); 1550dc354874SHong Zhang } 1551c2916339SPierre Jolivet 1552*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1553*d71ae5a4SJacob Faibussowitsch { 1554c2916339SPierre Jolivet PetscFunctionBegin; 15559566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 15564222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1557c2916339SPierre Jolivet PetscFunctionReturn(0); 1558c2916339SPierre Jolivet } 1559c2916339SPierre Jolivet 1560*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1561*d71ae5a4SJacob Faibussowitsch { 1562c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1563c2916339SPierre Jolivet PetscScalar *z = c; 1564c2916339SPierre Jolivet const PetscScalar *xb; 1565c2916339SPierre Jolivet PetscScalar x1; 1566c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1567c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 15683c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 1569b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 15703c2e41e1SStefano Zampini #else 15713c2e41e1SStefano Zampini const int aconj = 0; 15723c2e41e1SStefano Zampini #endif 1573c2916339SPierre Jolivet 1574c2916339SPierre Jolivet PetscFunctionBegin; 1575c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 15769371c9d4SSatish Balay n = ii[1] - ii[0]; 15779371c9d4SSatish Balay ii++; 1578c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1579c2916339SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1580c2916339SPierre Jolivet jj = idx; 1581c2916339SPierre Jolivet vv = v; 1582c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1583c2916339SPierre Jolivet idx = jj; 1584c2916339SPierre Jolivet v = vv; 1585c2916339SPierre Jolivet for (j = 0; j < n; j++) { 15869371c9d4SSatish Balay xb = b + (*idx); 15879371c9d4SSatish Balay x1 = xb[0 + k * bm]; 1588c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1; 15893c2e41e1SStefano Zampini if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1590c2916339SPierre Jolivet v += 1; 1591c2916339SPierre Jolivet ++idx; 1592c2916339SPierre Jolivet } 1593c2916339SPierre Jolivet } 1594c2916339SPierre Jolivet z += 1; 1595c2916339SPierre Jolivet } 1596c2916339SPierre Jolivet PetscFunctionReturn(0); 1597c2916339SPierre Jolivet } 1598c2916339SPierre Jolivet 1599*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1600*d71ae5a4SJacob Faibussowitsch { 1601c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1602c2916339SPierre Jolivet PetscScalar *z = c; 1603c2916339SPierre Jolivet const PetscScalar *xb; 1604c2916339SPierre Jolivet PetscScalar x1, x2; 1605c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1606c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1607c2916339SPierre Jolivet 1608c2916339SPierre Jolivet PetscFunctionBegin; 1609c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16109371c9d4SSatish Balay n = ii[1] - ii[0]; 16119371c9d4SSatish Balay ii++; 1612c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1613c2916339SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1614c2916339SPierre Jolivet jj = idx; 1615c2916339SPierre Jolivet vv = v; 1616c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1617c2916339SPierre Jolivet idx = jj; 1618c2916339SPierre Jolivet v = vv; 1619c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16209371c9d4SSatish Balay xb = b + 2 * (*idx); 16219371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16229371c9d4SSatish Balay x2 = xb[1 + k * bm]; 1623c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1624c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1625c2916339SPierre Jolivet if (*idx != i) { 1626c2916339SPierre Jolivet c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1627c2916339SPierre Jolivet c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1628c2916339SPierre Jolivet } 1629c2916339SPierre Jolivet v += 4; 1630c2916339SPierre Jolivet ++idx; 1631c2916339SPierre Jolivet } 1632c2916339SPierre Jolivet } 1633c2916339SPierre Jolivet z += 2; 1634c2916339SPierre Jolivet } 1635c2916339SPierre Jolivet PetscFunctionReturn(0); 1636c2916339SPierre Jolivet } 1637c2916339SPierre Jolivet 1638*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1639*d71ae5a4SJacob Faibussowitsch { 1640c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1641c2916339SPierre Jolivet PetscScalar *z = c; 1642c2916339SPierre Jolivet const PetscScalar *xb; 1643c2916339SPierre Jolivet PetscScalar x1, x2, x3; 1644c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1645c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1646c2916339SPierre Jolivet 1647c2916339SPierre Jolivet PetscFunctionBegin; 1648c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16499371c9d4SSatish Balay n = ii[1] - ii[0]; 16509371c9d4SSatish Balay ii++; 1651c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1652c2916339SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1653c2916339SPierre Jolivet jj = idx; 1654c2916339SPierre Jolivet vv = v; 1655c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1656c2916339SPierre Jolivet idx = jj; 1657c2916339SPierre Jolivet v = vv; 1658c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16599371c9d4SSatish Balay xb = b + 3 * (*idx); 16609371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16619371c9d4SSatish Balay x2 = xb[1 + k * bm]; 16629371c9d4SSatish Balay x3 = xb[2 + k * bm]; 1663c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1664c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1665c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1666c2916339SPierre Jolivet if (*idx != i) { 1667c2916339SPierre Jolivet c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1668c2916339SPierre Jolivet c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1669c2916339SPierre Jolivet c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1670c2916339SPierre Jolivet } 1671c2916339SPierre Jolivet v += 9; 1672c2916339SPierre Jolivet ++idx; 1673c2916339SPierre Jolivet } 1674c2916339SPierre Jolivet } 1675c2916339SPierre Jolivet z += 3; 1676c2916339SPierre Jolivet } 1677c2916339SPierre Jolivet PetscFunctionReturn(0); 1678c2916339SPierre Jolivet } 1679c2916339SPierre Jolivet 1680*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1681*d71ae5a4SJacob Faibussowitsch { 1682c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1683c2916339SPierre Jolivet PetscScalar *z = c; 1684c2916339SPierre Jolivet const PetscScalar *xb; 1685c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4; 1686c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1687c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1688c2916339SPierre Jolivet 1689c2916339SPierre Jolivet PetscFunctionBegin; 1690c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16919371c9d4SSatish Balay n = ii[1] - ii[0]; 16929371c9d4SSatish Balay ii++; 1693c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1694c2916339SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1695c2916339SPierre Jolivet jj = idx; 1696c2916339SPierre Jolivet vv = v; 1697c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1698c2916339SPierre Jolivet idx = jj; 1699c2916339SPierre Jolivet v = vv; 1700c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17019371c9d4SSatish Balay xb = b + 4 * (*idx); 17029371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17039371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17049371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17059371c9d4SSatish Balay x4 = xb[3 + k * bm]; 1706c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1707c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1708c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1709c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1710c2916339SPierre Jolivet if (*idx != i) { 1711c2916339SPierre Jolivet c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1712c2916339SPierre Jolivet c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1713c2916339SPierre Jolivet c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1714c2916339SPierre Jolivet c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1715c2916339SPierre Jolivet } 1716c2916339SPierre Jolivet v += 16; 1717c2916339SPierre Jolivet ++idx; 1718c2916339SPierre Jolivet } 1719c2916339SPierre Jolivet } 1720c2916339SPierre Jolivet z += 4; 1721c2916339SPierre Jolivet } 1722c2916339SPierre Jolivet PetscFunctionReturn(0); 1723c2916339SPierre Jolivet } 1724c2916339SPierre Jolivet 1725*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1726*d71ae5a4SJacob Faibussowitsch { 1727c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1728c2916339SPierre Jolivet PetscScalar *z = c; 1729c2916339SPierre Jolivet const PetscScalar *xb; 1730c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 1731c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1732c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1733c2916339SPierre Jolivet 1734c2916339SPierre Jolivet PetscFunctionBegin; 1735c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17369371c9d4SSatish Balay n = ii[1] - ii[0]; 17379371c9d4SSatish Balay ii++; 1738c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1739c2916339SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1740c2916339SPierre Jolivet jj = idx; 1741c2916339SPierre Jolivet vv = v; 1742c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1743c2916339SPierre Jolivet idx = jj; 1744c2916339SPierre Jolivet v = vv; 1745c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17469371c9d4SSatish Balay xb = b + 5 * (*idx); 17479371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17489371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17499371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17509371c9d4SSatish Balay x4 = xb[3 + k * bm]; 17519371c9d4SSatish Balay x5 = xb[4 + k * cm]; 1752c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1753c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1754c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1755c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1756c2916339SPierre Jolivet z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1757c2916339SPierre Jolivet if (*idx != i) { 1758c2916339SPierre Jolivet c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1759c2916339SPierre Jolivet c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1760c2916339SPierre Jolivet c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1761c2916339SPierre Jolivet c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1762c2916339SPierre Jolivet c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1763c2916339SPierre Jolivet } 1764c2916339SPierre Jolivet v += 25; 1765c2916339SPierre Jolivet ++idx; 1766c2916339SPierre Jolivet } 1767c2916339SPierre Jolivet } 1768c2916339SPierre Jolivet z += 5; 1769c2916339SPierre Jolivet } 1770c2916339SPierre Jolivet PetscFunctionReturn(0); 1771c2916339SPierre Jolivet } 1772c2916339SPierre Jolivet 1773*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1774*d71ae5a4SJacob Faibussowitsch { 1775c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1776c2916339SPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1777281439baSStefano Zampini Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1778c2916339SPierre Jolivet PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1779c2916339SPierre Jolivet PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1780c2916339SPierre Jolivet PetscBLASInt bbs, bcn, bbm, bcm; 1781f4259b30SLisandro Dalcin PetscScalar *z = NULL; 1782c2916339SPierre Jolivet PetscScalar *c, *b; 1783c2916339SPierre Jolivet const MatScalar *v; 1784c2916339SPierre Jolivet const PetscInt *idx, *ii; 1785c2916339SPierre Jolivet PetscScalar _DOne = 1.0; 1786c2916339SPierre Jolivet 1787c2916339SPierre Jolivet PetscFunctionBegin; 1788c2916339SPierre Jolivet if (!cm || !cn) PetscFunctionReturn(0); 178908401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 179008401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 179108401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1792c2916339SPierre Jolivet b = bd->v; 17939566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(C)); 17949566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 1795c2916339SPierre Jolivet switch (bs) { 1796*d71ae5a4SJacob Faibussowitsch case 1: 1797*d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1798*d71ae5a4SJacob Faibussowitsch break; 1799*d71ae5a4SJacob Faibussowitsch case 2: 1800*d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1801*d71ae5a4SJacob Faibussowitsch break; 1802*d71ae5a4SJacob Faibussowitsch case 3: 1803*d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1804*d71ae5a4SJacob Faibussowitsch break; 1805*d71ae5a4SJacob Faibussowitsch case 4: 1806*d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1807*d71ae5a4SJacob Faibussowitsch break; 1808*d71ae5a4SJacob Faibussowitsch case 5: 1809*d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1810*d71ae5a4SJacob Faibussowitsch break; 1811c2916339SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 18129566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bs, &bbs)); 18139566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cn, &bcn)); 18149566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bm, &bbm)); 18159566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cm, &bcm)); 1816c2916339SPierre Jolivet idx = a->j; 1817c2916339SPierre Jolivet v = a->a; 1818c2916339SPierre Jolivet mbs = a->mbs; 1819c2916339SPierre Jolivet ii = a->i; 1820c2916339SPierre Jolivet z = c; 1821c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 18229371c9d4SSatish Balay n = ii[1] - ii[0]; 18239371c9d4SSatish Balay ii++; 1824c2916339SPierre Jolivet for (j = 0; j < n; j++) { 1825792fecdfSBarry Smith if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1826792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1827c2916339SPierre Jolivet v += bs2; 1828c2916339SPierre Jolivet } 1829c2916339SPierre Jolivet z += bs; 1830c2916339SPierre Jolivet } 1831c2916339SPierre Jolivet } 18329566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 18339566063dSJacob Faibussowitsch PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1834c2916339SPierre Jolivet PetscFunctionReturn(0); 1835c2916339SPierre Jolivet } 1836