149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h> 5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 6c6db04a5SJed Brown #include <petscbt.h> 7c6db04a5SJed Brown #include <petscblaslapack.h> 849b5e25fSSatish Balay 9d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 10d71ae5a4SJacob Faibussowitsch { 115eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 127bede89fSBarry Smith PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 135d0c19d7SBarry Smith const PetscInt *idx; 14db41cccfSHong Zhang PetscBT table_out, table_in; 15d94109b8SHong Zhang 16d94109b8SHong Zhang PetscFunctionBegin; 1708401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 18d94109b8SHong Zhang mbs = a->mbs; 19d94109b8SHong Zhang ai = a->i; 20d94109b8SHong Zhang aj = a->j; 21d0f46423SBarry Smith bs = A->rmap->bs; 229566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_out)); 239566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &nidx)); 249566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_in)); 25d94109b8SHong Zhang 26d94109b8SHong Zhang for (i = 0; i < is_max; i++) { /* for each is */ 27d94109b8SHong Zhang isz = 0; 289566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_out)); 29d94109b8SHong Zhang 30d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 319566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 329566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 33d94109b8SHong Zhang 34db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 35dbe03f88SHong Zhang bcol_max = 0; 36d94109b8SHong Zhang for (j = 0; j < n; ++j) { 37d94109b8SHong Zhang brow = idx[j] / bs; /* convert the indices into block indices */ 3808401ef6SPierre Jolivet PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 39db41cccfSHong Zhang if (!PetscBTLookupSet(table_out, brow)) { 40dbe03f88SHong Zhang nidx[isz++] = brow; 41dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 42dbe03f88SHong Zhang } 43d94109b8SHong Zhang } 449566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 459566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 46d94109b8SHong Zhang 47d94109b8SHong Zhang k = 0; 48d94109b8SHong Zhang for (j = 0; j < ov; j++) { /* for each overlap */ 49db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 509566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_in)); 519566063dSJacob Faibussowitsch for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 52d94109b8SHong Zhang 53d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 54d94109b8SHong Zhang for (brow = 0; brow < mbs; brow++) { 559371c9d4SSatish Balay start = ai[brow]; 569371c9d4SSatish Balay end = ai[brow + 1]; 57db41cccfSHong Zhang if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58d94109b8SHong Zhang for (l = start; l < end; l++) { 59d94109b8SHong Zhang bcol = aj[l]; 60d7b97159SHong Zhang if (!PetscBTLookupSet(table_out, bcol)) { 61d7b97159SHong Zhang nidx[isz++] = bcol; 62d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 63d7b97159SHong Zhang } 64d94109b8SHong Zhang } 65d94109b8SHong Zhang k++; 66d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67da81f932SPierre Jolivet } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */ 68d94109b8SHong Zhang for (l = start; l < end; l++) { 69d94109b8SHong Zhang bcol = aj[l]; 70dbe03f88SHong Zhang if (bcol > bcol_max) break; 71db41cccfSHong Zhang if (PetscBTLookup(table_in, bcol)) { 7226fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 73d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 74d94109b8SHong Zhang } 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } /* for each overlap */ 797bede89fSBarry Smith PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 807bede89fSBarry Smith } /* for each is */ 819566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_out)); 829566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 839566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_in)); 843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 8549b5e25fSSatish Balay } 8649b5e25fSSatish Balay 87847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 887cf5f706SPierre Jolivet Zero some ops' to avoid invalid use */ 89d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 90d71ae5a4SJacob Faibussowitsch { 9149b5e25fSSatish Balay PetscFunctionBegin; 929566063dSJacob Faibussowitsch PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 93f4259b30SLisandro Dalcin Bseq->ops->mult = NULL; 94f4259b30SLisandro Dalcin Bseq->ops->multadd = NULL; 95f4259b30SLisandro Dalcin Bseq->ops->multtranspose = NULL; 96f4259b30SLisandro Dalcin Bseq->ops->multtransposeadd = NULL; 97f4259b30SLisandro Dalcin Bseq->ops->lufactor = NULL; 98f4259b30SLisandro Dalcin Bseq->ops->choleskyfactor = NULL; 99f4259b30SLisandro Dalcin Bseq->ops->lufactorsymbolic = NULL; 100f4259b30SLisandro Dalcin Bseq->ops->choleskyfactorsymbolic = NULL; 101f4259b30SLisandro Dalcin Bseq->ops->getinertia = NULL; 1023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10349b5e25fSSatish Balay } 10449b5e25fSSatish Balay 1057dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 106d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 107d71ae5a4SJacob Faibussowitsch { 108e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c; 109e50a8114SHong Zhang PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 110e50a8114SHong Zhang PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 111e50a8114SHong Zhang const PetscInt *irow, *icol; 112e50a8114SHong Zhang PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 113e50a8114SHong Zhang PetscInt *aj = a->j, *ai = a->i; 114e50a8114SHong Zhang MatScalar *mat_a; 115e50a8114SHong Zhang Mat C; 116e50a8114SHong Zhang PetscBool flag; 117e50a8114SHong Zhang 118e50a8114SHong Zhang PetscFunctionBegin; 119e50a8114SHong Zhang 1209566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1219566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1229566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1239566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 124e50a8114SHong Zhang 1259566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 126e50a8114SHong Zhang ssmap = smap; 1279566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 128e50a8114SHong Zhang for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 129e50a8114SHong Zhang /* determine lens of each row */ 130e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 131e50a8114SHong Zhang kstart = ai[irow[i]]; 132e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 133e50a8114SHong Zhang lens[i] = 0; 134e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 135e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 136e50a8114SHong Zhang } 137e50a8114SHong Zhang } 138e50a8114SHong Zhang /* Create and fill new matrix */ 139e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 140e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)((*B)->data); 141e50a8114SHong Zhang 142aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1439566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 14428b400f6SJacob Faibussowitsch PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 1459566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 146e50a8114SHong Zhang C = *B; 147e50a8114SHong Zhang } else { 1489566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1499566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 1509566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1519566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 152e50a8114SHong Zhang } 153e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)(C->data); 154e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 155e50a8114SHong Zhang row = irow[i]; 156e50a8114SHong Zhang kstart = ai[row]; 157e50a8114SHong Zhang kend = kstart + a->ilen[row]; 158e50a8114SHong Zhang mat_i = c->i[i]; 159e50a8114SHong Zhang mat_j = c->j + mat_i; 160e50a8114SHong Zhang mat_a = c->a + mat_i * bs2; 161e50a8114SHong Zhang mat_ilen = c->ilen + i; 162e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 163e50a8114SHong Zhang if ((tcol = ssmap[a->j[k]])) { 164e50a8114SHong Zhang *mat_j++ = tcol - 1; 1659566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 166e50a8114SHong Zhang mat_a += bs2; 167e50a8114SHong Zhang (*mat_ilen)++; 168e50a8114SHong Zhang } 169e50a8114SHong Zhang } 170e50a8114SHong Zhang } 171e50a8114SHong Zhang /* sort */ 172e50a8114SHong Zhang { 173e50a8114SHong Zhang MatScalar *work; 174e50a8114SHong Zhang 1759566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 176e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 177e50a8114SHong Zhang PetscInt ilen; 178e50a8114SHong Zhang mat_i = c->i[i]; 179e50a8114SHong Zhang mat_j = c->j + mat_i; 180e50a8114SHong Zhang mat_a = c->a + mat_i * bs2; 181e50a8114SHong Zhang ilen = c->ilen[i]; 1829566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 183e50a8114SHong Zhang } 1849566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 185e50a8114SHong Zhang } 186e50a8114SHong Zhang 187e50a8114SHong Zhang /* Free work space */ 1889566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1899566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1909566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1919566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1929566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 193e50a8114SHong Zhang 1949566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 195e50a8114SHong Zhang *B = C; 1963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 197e50a8114SHong Zhang } 198e50a8114SHong Zhang 199d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 200d71ae5a4SJacob Faibussowitsch { 201e50a8114SHong Zhang IS is1, is2; 20249b5e25fSSatish Balay 20349b5e25fSSatish Balay PetscFunctionBegin; 204f9a48b90SPierre Jolivet PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 205f9a48b90SPierre Jolivet if (isrow == iscol) { 206f9a48b90SPierre Jolivet is2 = is1; 207f9a48b90SPierre Jolivet PetscCall(PetscObjectReference((PetscObject)is2)); 208f9a48b90SPierre Jolivet } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 2099566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B)); 2109566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2119566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 212847daeccSHong Zhang 2138f46ffcaSHong Zhang if (isrow != iscol) { 2148f46ffcaSHong Zhang PetscBool isequal; 2159566063dSJacob Faibussowitsch PetscCall(ISEqual(isrow, iscol, &isequal)); 21648a46eb9SPierre Jolivet if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 21749b5e25fSSatish Balay } 2183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21949b5e25fSSatish Balay } 22049b5e25fSSatish Balay 221d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 222d71ae5a4SJacob Faibussowitsch { 22313f74950SBarry Smith PetscInt i; 22449b5e25fSSatish Balay 22549b5e25fSSatish Balay PetscFunctionBegin; 226*314f503fSPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B)); 227e50a8114SHong Zhang 22848a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 2293ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 23049b5e25fSSatish Balay } 23149b5e25fSSatish Balay 23249b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 233d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 234d71ae5a4SJacob Faibussowitsch { 23549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 236d9ca1df4SBarry Smith PetscScalar *z, x1, x2, zero = 0.0; 237d9ca1df4SBarry Smith const PetscScalar *x, *xb; 238d9ca1df4SBarry Smith const MatScalar *v; 239d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 240d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 24198c9bda7SSatish Balay PetscInt nonzerorow = 0; 24249b5e25fSSatish Balay 24349b5e25fSSatish Balay PetscFunctionBegin; 2449566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 2453ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 2469566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2479566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 24849b5e25fSSatish Balay 24949b5e25fSSatish Balay v = a->a; 25049b5e25fSSatish Balay xb = x; 25149b5e25fSSatish Balay 25249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 25349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 2549371c9d4SSatish Balay x1 = xb[0]; 2559371c9d4SSatish Balay x2 = xb[1]; 25649b5e25fSSatish Balay ib = aj + *ai; 257831a3094SHong Zhang jmin = 0; 25898c9bda7SSatish Balay nonzerorow += (n > 0); 2597fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 26049b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 26149b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 2629371c9d4SSatish Balay v += 4; 2639371c9d4SSatish Balay jmin++; 2647fbae186SHong Zhang } 265444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 266444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 267831a3094SHong Zhang for (j = jmin; j < n; j++) { 26849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 26949b5e25fSSatish Balay cval = ib[j] * 2; 27049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 27149b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 27249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 27349b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 27449b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 27549b5e25fSSatish Balay v += 4; 27649b5e25fSSatish Balay } 2779371c9d4SSatish Balay xb += 2; 2789371c9d4SSatish Balay ai++; 27949b5e25fSSatish Balay } 28049b5e25fSSatish Balay 2819566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 2829566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 2839566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 2843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28549b5e25fSSatish Balay } 28649b5e25fSSatish Balay 287d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 288d71ae5a4SJacob Faibussowitsch { 28949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 290d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, zero = 0.0; 291d9ca1df4SBarry Smith const PetscScalar *x, *xb; 292d9ca1df4SBarry Smith const MatScalar *v; 293d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 294d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 29598c9bda7SSatish Balay PetscInt nonzerorow = 0; 29649b5e25fSSatish Balay 29749b5e25fSSatish Balay PetscFunctionBegin; 2989566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 2993ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 3009566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3019566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 30249b5e25fSSatish Balay 30349b5e25fSSatish Balay v = a->a; 30449b5e25fSSatish Balay xb = x; 30549b5e25fSSatish Balay 30649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 30749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3089371c9d4SSatish Balay x1 = xb[0]; 3099371c9d4SSatish Balay x2 = xb[1]; 3109371c9d4SSatish Balay x3 = xb[2]; 31149b5e25fSSatish Balay ib = aj + *ai; 312831a3094SHong Zhang jmin = 0; 31398c9bda7SSatish Balay nonzerorow += (n > 0); 3147fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 31549b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 31649b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 31749b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3189371c9d4SSatish Balay v += 9; 3199371c9d4SSatish Balay jmin++; 3207fbae186SHong Zhang } 321444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 322444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 323831a3094SHong Zhang for (j = jmin; j < n; j++) { 32449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 32549b5e25fSSatish Balay cval = ib[j] * 3; 32649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 32749b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 32849b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 32949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 33049b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 33149b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 33249b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 33349b5e25fSSatish Balay v += 9; 33449b5e25fSSatish Balay } 3359371c9d4SSatish Balay xb += 3; 3369371c9d4SSatish Balay ai++; 33749b5e25fSSatish Balay } 33849b5e25fSSatish Balay 3399566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3409566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3419566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 3423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 34349b5e25fSSatish Balay } 34449b5e25fSSatish Balay 345d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 346d71ae5a4SJacob Faibussowitsch { 34749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 348d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 349d9ca1df4SBarry Smith const PetscScalar *x, *xb; 350d9ca1df4SBarry Smith const MatScalar *v; 351d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 352d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 35398c9bda7SSatish Balay PetscInt nonzerorow = 0; 35449b5e25fSSatish Balay 35549b5e25fSSatish Balay PetscFunctionBegin; 3569566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 3573ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 3589566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3599566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 36049b5e25fSSatish Balay 36149b5e25fSSatish Balay v = a->a; 36249b5e25fSSatish Balay xb = x; 36349b5e25fSSatish Balay 36449b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 36549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3669371c9d4SSatish Balay x1 = xb[0]; 3679371c9d4SSatish Balay x2 = xb[1]; 3689371c9d4SSatish Balay x3 = xb[2]; 3699371c9d4SSatish Balay x4 = xb[3]; 37049b5e25fSSatish Balay ib = aj + *ai; 371831a3094SHong Zhang jmin = 0; 37298c9bda7SSatish Balay nonzerorow += (n > 0); 3737fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 37449b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 37549b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 37649b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 37749b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 3789371c9d4SSatish Balay v += 16; 3799371c9d4SSatish Balay jmin++; 3807fbae186SHong Zhang } 381444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 382444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 383831a3094SHong Zhang for (j = jmin; j < n; j++) { 38449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 38549b5e25fSSatish Balay cval = ib[j] * 4; 38649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 38749b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 38849b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 38949b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 39049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 39149b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 39249b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 39349b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 39449b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 39549b5e25fSSatish Balay v += 16; 39649b5e25fSSatish Balay } 3979371c9d4SSatish Balay xb += 4; 3989371c9d4SSatish Balay ai++; 39949b5e25fSSatish Balay } 40049b5e25fSSatish Balay 4019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4029566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 4039566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 4043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 40549b5e25fSSatish Balay } 40649b5e25fSSatish Balay 407d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 408d71ae5a4SJacob Faibussowitsch { 40949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 410d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 411d9ca1df4SBarry Smith const PetscScalar *x, *xb; 412d9ca1df4SBarry Smith const MatScalar *v; 413d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 414d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 41598c9bda7SSatish Balay PetscInt nonzerorow = 0; 41649b5e25fSSatish Balay 41749b5e25fSSatish Balay PetscFunctionBegin; 4189566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 4193ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 4209566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4219566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 42249b5e25fSSatish Balay 42349b5e25fSSatish Balay v = a->a; 42449b5e25fSSatish Balay xb = x; 42549b5e25fSSatish Balay 42649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 42749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4289371c9d4SSatish Balay x1 = xb[0]; 4299371c9d4SSatish Balay x2 = xb[1]; 4309371c9d4SSatish Balay x3 = xb[2]; 4319371c9d4SSatish Balay x4 = xb[3]; 4329371c9d4SSatish Balay x5 = xb[4]; 43349b5e25fSSatish Balay ib = aj + *ai; 434831a3094SHong Zhang jmin = 0; 43598c9bda7SSatish Balay nonzerorow += (n > 0); 4367fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 43749b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 43849b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 43949b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 44049b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 44149b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 4429371c9d4SSatish Balay v += 25; 4439371c9d4SSatish Balay jmin++; 4447fbae186SHong Zhang } 445444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 446444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 447831a3094SHong Zhang for (j = jmin; j < n; j++) { 44849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 44949b5e25fSSatish Balay cval = ib[j] * 5; 45049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 45149b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 45249b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 45349b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 45449b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 45549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 45649b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 45749b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 45849b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 45949b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 46049b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 46149b5e25fSSatish Balay v += 25; 46249b5e25fSSatish Balay } 4639371c9d4SSatish Balay xb += 5; 4649371c9d4SSatish Balay ai++; 46549b5e25fSSatish Balay } 46649b5e25fSSatish Balay 4679566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4689566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 4699566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 4703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 47149b5e25fSSatish Balay } 47249b5e25fSSatish Balay 473d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 474d71ae5a4SJacob Faibussowitsch { 47549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 476d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 477d9ca1df4SBarry Smith const PetscScalar *x, *xb; 478d9ca1df4SBarry Smith const MatScalar *v; 479d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 480d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 48198c9bda7SSatish Balay PetscInt nonzerorow = 0; 48249b5e25fSSatish Balay 48349b5e25fSSatish Balay PetscFunctionBegin; 4849566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 4853ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 4869566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4879566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 48849b5e25fSSatish Balay 48949b5e25fSSatish Balay v = a->a; 49049b5e25fSSatish Balay xb = x; 49149b5e25fSSatish Balay 49249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 49349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4949371c9d4SSatish Balay x1 = xb[0]; 4959371c9d4SSatish Balay x2 = xb[1]; 4969371c9d4SSatish Balay x3 = xb[2]; 4979371c9d4SSatish Balay x4 = xb[3]; 4989371c9d4SSatish Balay x5 = xb[4]; 4999371c9d4SSatish Balay x6 = xb[5]; 50049b5e25fSSatish Balay ib = aj + *ai; 501831a3094SHong Zhang jmin = 0; 50298c9bda7SSatish Balay nonzerorow += (n > 0); 5037fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 50449b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 50549b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 50649b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 50749b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 50849b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 50949b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 5109371c9d4SSatish Balay v += 36; 5119371c9d4SSatish Balay jmin++; 5127fbae186SHong Zhang } 513444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 514444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 515831a3094SHong Zhang for (j = jmin; j < n; j++) { 51649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 51749b5e25fSSatish Balay cval = ib[j] * 6; 51849b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 51949b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 52049b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 52149b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 52249b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 52349b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 52449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 52549b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 52649b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 52749b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 52849b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 52949b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 53049b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 53149b5e25fSSatish Balay v += 36; 53249b5e25fSSatish Balay } 5339371c9d4SSatish Balay xb += 6; 5349371c9d4SSatish Balay ai++; 53549b5e25fSSatish Balay } 53649b5e25fSSatish Balay 5379566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5389566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5399566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 5403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 54149b5e25fSSatish Balay } 542c2916339SPierre Jolivet 543d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 544d71ae5a4SJacob Faibussowitsch { 54549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 546d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 547d9ca1df4SBarry Smith const PetscScalar *x, *xb; 548d9ca1df4SBarry Smith const MatScalar *v; 549d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 550d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 55198c9bda7SSatish Balay PetscInt nonzerorow = 0; 55249b5e25fSSatish Balay 55349b5e25fSSatish Balay PetscFunctionBegin; 5549566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 5553ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 5569566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5579566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 55849b5e25fSSatish Balay 55949b5e25fSSatish Balay v = a->a; 56049b5e25fSSatish Balay xb = x; 56149b5e25fSSatish Balay 56249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 56349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 5649371c9d4SSatish Balay x1 = xb[0]; 5659371c9d4SSatish Balay x2 = xb[1]; 5669371c9d4SSatish Balay x3 = xb[2]; 5679371c9d4SSatish Balay x4 = xb[3]; 5689371c9d4SSatish Balay x5 = xb[4]; 5699371c9d4SSatish Balay x6 = xb[5]; 5709371c9d4SSatish Balay x7 = xb[6]; 57149b5e25fSSatish Balay ib = aj + *ai; 572831a3094SHong Zhang jmin = 0; 57398c9bda7SSatish Balay nonzerorow += (n > 0); 5747fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 57549b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 57649b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 57749b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 57849b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 57949b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 58049b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 58149b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 5829371c9d4SSatish Balay v += 49; 5839371c9d4SSatish Balay jmin++; 5847fbae186SHong Zhang } 585444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 586444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 587831a3094SHong Zhang for (j = jmin; j < n; j++) { 58849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 58949b5e25fSSatish Balay cval = ib[j] * 7; 59049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 59149b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 59249b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 59349b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 59449b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 59549b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 59649b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 59749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 59849b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 59949b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 60049b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 60149b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 60249b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 60349b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 60449b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 60549b5e25fSSatish Balay v += 49; 60649b5e25fSSatish Balay } 6079371c9d4SSatish Balay xb += 7; 6089371c9d4SSatish Balay ai++; 60949b5e25fSSatish Balay } 6109566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6119566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 6129566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 6133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 61449b5e25fSSatish Balay } 61549b5e25fSSatish Balay 61649b5e25fSSatish Balay /* 61749b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 61849b5e25fSSatish Balay */ 619d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 620d71ae5a4SJacob Faibussowitsch { 62149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 622d9ca1df4SBarry Smith PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 623d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 624d9ca1df4SBarry Smith const MatScalar *v; 625d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 626d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 62798c9bda7SSatish Balay PetscInt nonzerorow = 0; 62849b5e25fSSatish Balay 62949b5e25fSSatish Balay PetscFunctionBegin; 6309566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 6313ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 6329566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6339566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 63459689ffbSStefano Zampini 63559689ffbSStefano Zampini x_ptr = x; 63659689ffbSStefano Zampini z_ptr = z; 63749b5e25fSSatish Balay 63849b5e25fSSatish Balay aj = a->j; 63949b5e25fSSatish Balay v = a->a; 64049b5e25fSSatish Balay ii = a->i; 64149b5e25fSSatish Balay 64248a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 64349b5e25fSSatish Balay work = a->mult_work; 64449b5e25fSSatish Balay 64549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 6469371c9d4SSatish Balay n = ii[1] - ii[0]; 6479371c9d4SSatish Balay ncols = n * bs; 6489371c9d4SSatish Balay workt = work; 6499371c9d4SSatish Balay idx = aj + ii[0]; 65098c9bda7SSatish Balay nonzerorow += (n > 0); 65149b5e25fSSatish Balay 65249b5e25fSSatish Balay /* upper triangular part */ 65349b5e25fSSatish Balay for (j = 0; j < n; j++) { 65449b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 65549b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 65649b5e25fSSatish Balay workt += bs; 65749b5e25fSSatish Balay } 65849b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 65996b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 66049b5e25fSSatish Balay 66149b5e25fSSatish Balay /* strict lower triangular part */ 662831a3094SHong Zhang idx = aj + ii[0]; 66359689ffbSStefano Zampini if (n && *idx == i) { 6649371c9d4SSatish Balay ncols -= bs; 6659371c9d4SSatish Balay v += bs2; 6669371c9d4SSatish Balay idx++; 6679371c9d4SSatish Balay n--; 668831a3094SHong Zhang } 66996b9376eSHong Zhang 67049b5e25fSSatish Balay if (ncols > 0) { 67149b5e25fSSatish Balay workt = work; 6729566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 67396b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 674831a3094SHong Zhang for (j = 0; j < n; j++) { 675831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 67649b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 67749b5e25fSSatish Balay workt += bs; 67849b5e25fSSatish Balay } 67949b5e25fSSatish Balay } 6809371c9d4SSatish Balay x += bs; 6819371c9d4SSatish Balay v += n * bs2; 6829371c9d4SSatish Balay z += bs; 6839371c9d4SSatish Balay ii++; 68449b5e25fSSatish Balay } 68549b5e25fSSatish Balay 6869566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6879566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 6889566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 6893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 69049b5e25fSSatish Balay } 69149b5e25fSSatish Balay 692d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 693d71ae5a4SJacob Faibussowitsch { 69449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 695d9ca1df4SBarry Smith PetscScalar *z, x1; 696d9ca1df4SBarry Smith const PetscScalar *x, *xb; 697d9ca1df4SBarry Smith const MatScalar *v; 698d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 699d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 70098c9bda7SSatish Balay PetscInt nonzerorow = 0; 701eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 702b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 703eb1ec7c1SStefano Zampini #else 704eb1ec7c1SStefano Zampini const int aconj = 0; 705eb1ec7c1SStefano Zampini #endif 70649b5e25fSSatish Balay 70749b5e25fSSatish Balay PetscFunctionBegin; 7089566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 7093ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 7109566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7119566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 71249b5e25fSSatish Balay v = a->a; 71349b5e25fSSatish Balay xb = x; 71449b5e25fSSatish Balay 71549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 71649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 71749b5e25fSSatish Balay x1 = xb[0]; 71849b5e25fSSatish Balay ib = aj + *ai; 719831a3094SHong Zhang jmin = 0; 72098c9bda7SSatish Balay nonzerorow += (n > 0); 7213d9ade75SStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 7229371c9d4SSatish Balay z[i] += *v++ * x[*ib++]; 7239371c9d4SSatish Balay jmin++; 724831a3094SHong Zhang } 725eb1ec7c1SStefano Zampini if (aconj) { 726eb1ec7c1SStefano Zampini for (j = jmin; j < n; j++) { 727eb1ec7c1SStefano Zampini cval = *ib; 728eb1ec7c1SStefano Zampini z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 729eb1ec7c1SStefano Zampini z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 730eb1ec7c1SStefano Zampini } 731eb1ec7c1SStefano Zampini } else { 732831a3094SHong Zhang for (j = jmin; j < n; j++) { 73349b5e25fSSatish Balay cval = *ib; 73449b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 73549b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 73649b5e25fSSatish Balay } 737eb1ec7c1SStefano Zampini } 7389371c9d4SSatish Balay xb++; 7399371c9d4SSatish Balay ai++; 74049b5e25fSSatish Balay } 74149b5e25fSSatish Balay 7429566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7439566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 74449b5e25fSSatish Balay 7459566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 7463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 74749b5e25fSSatish Balay } 74849b5e25fSSatish Balay 749d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 750d71ae5a4SJacob Faibussowitsch { 75149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 752d9ca1df4SBarry Smith PetscScalar *z, x1, x2; 753d9ca1df4SBarry Smith const PetscScalar *x, *xb; 754d9ca1df4SBarry Smith const MatScalar *v; 755d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 756d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 75798c9bda7SSatish Balay PetscInt nonzerorow = 0; 75849b5e25fSSatish Balay 75949b5e25fSSatish Balay PetscFunctionBegin; 7609566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 7613ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 7629566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7639566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 76449b5e25fSSatish Balay 76549b5e25fSSatish Balay v = a->a; 76649b5e25fSSatish Balay xb = x; 76749b5e25fSSatish Balay 76849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 76949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 7709371c9d4SSatish Balay x1 = xb[0]; 7719371c9d4SSatish Balay x2 = xb[1]; 77249b5e25fSSatish Balay ib = aj + *ai; 773831a3094SHong Zhang jmin = 0; 77498c9bda7SSatish Balay nonzerorow += (n > 0); 77559689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 77649b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 77749b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 7789371c9d4SSatish Balay v += 4; 7799371c9d4SSatish Balay jmin++; 7807fbae186SHong Zhang } 781444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 782444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 783831a3094SHong Zhang for (j = jmin; j < n; j++) { 78449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 78549b5e25fSSatish Balay cval = ib[j] * 2; 78649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 78749b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 78849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 78949b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 79049b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 79149b5e25fSSatish Balay v += 4; 79249b5e25fSSatish Balay } 7939371c9d4SSatish Balay xb += 2; 7949371c9d4SSatish Balay ai++; 79549b5e25fSSatish Balay } 7969566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7979566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 79849b5e25fSSatish Balay 7999566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 8003ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 80149b5e25fSSatish Balay } 80249b5e25fSSatish Balay 803d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 804d71ae5a4SJacob Faibussowitsch { 80549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 806d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3; 807d9ca1df4SBarry Smith const PetscScalar *x, *xb; 808d9ca1df4SBarry Smith const MatScalar *v; 809d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 810d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 81198c9bda7SSatish Balay PetscInt nonzerorow = 0; 81249b5e25fSSatish Balay 81349b5e25fSSatish Balay PetscFunctionBegin; 8149566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 8153ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 8169566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8179566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 81849b5e25fSSatish Balay 81949b5e25fSSatish Balay v = a->a; 82049b5e25fSSatish Balay xb = x; 82149b5e25fSSatish Balay 82249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 82349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8249371c9d4SSatish Balay x1 = xb[0]; 8259371c9d4SSatish Balay x2 = xb[1]; 8269371c9d4SSatish Balay x3 = xb[2]; 82749b5e25fSSatish Balay ib = aj + *ai; 828831a3094SHong Zhang jmin = 0; 82998c9bda7SSatish Balay nonzerorow += (n > 0); 83059689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 83149b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 83249b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 83349b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 8349371c9d4SSatish Balay v += 9; 8359371c9d4SSatish Balay jmin++; 8367fbae186SHong Zhang } 837444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 838444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 839831a3094SHong Zhang for (j = jmin; j < n; j++) { 84049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 84149b5e25fSSatish Balay cval = ib[j] * 3; 84249b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 84349b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 84449b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 84549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 84649b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 84749b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 84849b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 84949b5e25fSSatish Balay v += 9; 85049b5e25fSSatish Balay } 8519371c9d4SSatish Balay xb += 3; 8529371c9d4SSatish Balay ai++; 85349b5e25fSSatish Balay } 85449b5e25fSSatish Balay 8559566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8569566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 85749b5e25fSSatish Balay 8589566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 8593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 86049b5e25fSSatish Balay } 86149b5e25fSSatish Balay 862d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 863d71ae5a4SJacob Faibussowitsch { 86449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 865d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4; 866d9ca1df4SBarry Smith const PetscScalar *x, *xb; 867d9ca1df4SBarry Smith const MatScalar *v; 868d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 869d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 87098c9bda7SSatish Balay PetscInt nonzerorow = 0; 87149b5e25fSSatish Balay 87249b5e25fSSatish Balay PetscFunctionBegin; 8739566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 8743ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 8759566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8769566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 87749b5e25fSSatish Balay 87849b5e25fSSatish Balay v = a->a; 87949b5e25fSSatish Balay xb = x; 88049b5e25fSSatish Balay 88149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 88249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8839371c9d4SSatish Balay x1 = xb[0]; 8849371c9d4SSatish Balay x2 = xb[1]; 8859371c9d4SSatish Balay x3 = xb[2]; 8869371c9d4SSatish Balay x4 = xb[3]; 88749b5e25fSSatish Balay ib = aj + *ai; 888831a3094SHong Zhang jmin = 0; 88998c9bda7SSatish Balay nonzerorow += (n > 0); 89059689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 89149b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 89249b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 89349b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 89449b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 8959371c9d4SSatish Balay v += 16; 8969371c9d4SSatish Balay jmin++; 8977fbae186SHong Zhang } 898444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 899444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 900831a3094SHong Zhang for (j = jmin; j < n; j++) { 90149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 90249b5e25fSSatish Balay cval = ib[j] * 4; 90349b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 90449b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 90549b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 90649b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 90749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 90849b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 90949b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 91049b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 91149b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 91249b5e25fSSatish Balay v += 16; 91349b5e25fSSatish Balay } 9149371c9d4SSatish Balay xb += 4; 9159371c9d4SSatish Balay ai++; 91649b5e25fSSatish Balay } 91749b5e25fSSatish Balay 9189566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9199566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 92049b5e25fSSatish Balay 9219566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 9223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 92349b5e25fSSatish Balay } 92449b5e25fSSatish Balay 925d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 926d71ae5a4SJacob Faibussowitsch { 92749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 928d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5; 929d9ca1df4SBarry Smith const PetscScalar *x, *xb; 930d9ca1df4SBarry Smith const MatScalar *v; 931d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 932d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 93398c9bda7SSatish Balay PetscInt nonzerorow = 0; 93449b5e25fSSatish Balay 93549b5e25fSSatish Balay PetscFunctionBegin; 9369566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 9373ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 9389566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9399566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 94049b5e25fSSatish Balay 94149b5e25fSSatish Balay v = a->a; 94249b5e25fSSatish Balay xb = x; 94349b5e25fSSatish Balay 94449b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 94549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 9469371c9d4SSatish Balay x1 = xb[0]; 9479371c9d4SSatish Balay x2 = xb[1]; 9489371c9d4SSatish Balay x3 = xb[2]; 9499371c9d4SSatish Balay x4 = xb[3]; 9509371c9d4SSatish Balay x5 = xb[4]; 95149b5e25fSSatish Balay ib = aj + *ai; 952831a3094SHong Zhang jmin = 0; 95398c9bda7SSatish Balay nonzerorow += (n > 0); 95459689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 95549b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 95649b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 95749b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 95849b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 95949b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 9609371c9d4SSatish Balay v += 25; 9619371c9d4SSatish Balay jmin++; 9627fbae186SHong Zhang } 963444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 964444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 965831a3094SHong Zhang for (j = jmin; j < n; j++) { 96649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 96749b5e25fSSatish Balay cval = ib[j] * 5; 96849b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 96949b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 97049b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 97149b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 97249b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 97349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 97449b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 97549b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 97649b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 97749b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 97849b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 97949b5e25fSSatish Balay v += 25; 98049b5e25fSSatish Balay } 9819371c9d4SSatish Balay xb += 5; 9829371c9d4SSatish Balay ai++; 98349b5e25fSSatish Balay } 98449b5e25fSSatish Balay 9859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9869566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 98749b5e25fSSatish Balay 9889566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 9893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 99049b5e25fSSatish Balay } 991c2916339SPierre Jolivet 992d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 993d71ae5a4SJacob Faibussowitsch { 99449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 995d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6; 996d9ca1df4SBarry Smith const PetscScalar *x, *xb; 997d9ca1df4SBarry Smith const MatScalar *v; 998d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 999d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 100098c9bda7SSatish Balay PetscInt nonzerorow = 0; 100149b5e25fSSatish Balay 100249b5e25fSSatish Balay PetscFunctionBegin; 10039566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 10043ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 10059566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10069566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 100749b5e25fSSatish Balay 100849b5e25fSSatish Balay v = a->a; 100949b5e25fSSatish Balay xb = x; 101049b5e25fSSatish Balay 101149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 101249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10139371c9d4SSatish Balay x1 = xb[0]; 10149371c9d4SSatish Balay x2 = xb[1]; 10159371c9d4SSatish Balay x3 = xb[2]; 10169371c9d4SSatish Balay x4 = xb[3]; 10179371c9d4SSatish Balay x5 = xb[4]; 10189371c9d4SSatish Balay x6 = xb[5]; 101949b5e25fSSatish Balay ib = aj + *ai; 1020831a3094SHong Zhang jmin = 0; 102198c9bda7SSatish Balay nonzerorow += (n > 0); 102259689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 102349b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 102449b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 102549b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 102649b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 102749b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 102849b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 10299371c9d4SSatish Balay v += 36; 10309371c9d4SSatish Balay jmin++; 10317fbae186SHong Zhang } 1032444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1033444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1034831a3094SHong Zhang for (j = jmin; j < n; j++) { 103549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 103649b5e25fSSatish Balay cval = ib[j] * 6; 103749b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 103849b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 103949b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 104049b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 104149b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 104249b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 104349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 104449b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 104549b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 104649b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 104749b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 104849b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 104949b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 105049b5e25fSSatish Balay v += 36; 105149b5e25fSSatish Balay } 10529371c9d4SSatish Balay xb += 6; 10539371c9d4SSatish Balay ai++; 105449b5e25fSSatish Balay } 105549b5e25fSSatish Balay 10569566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10579566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 105849b5e25fSSatish Balay 10599566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 10603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 106149b5e25fSSatish Balay } 106249b5e25fSSatish Balay 1063d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1064d71ae5a4SJacob Faibussowitsch { 106549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1066d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1067d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1068d9ca1df4SBarry Smith const MatScalar *v; 1069d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1070d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 107198c9bda7SSatish Balay PetscInt nonzerorow = 0; 107249b5e25fSSatish Balay 107349b5e25fSSatish Balay PetscFunctionBegin; 10749566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 10753ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 10769566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10779566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 107849b5e25fSSatish Balay 107949b5e25fSSatish Balay v = a->a; 108049b5e25fSSatish Balay xb = x; 108149b5e25fSSatish Balay 108249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 108349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10849371c9d4SSatish Balay x1 = xb[0]; 10859371c9d4SSatish Balay x2 = xb[1]; 10869371c9d4SSatish Balay x3 = xb[2]; 10879371c9d4SSatish Balay x4 = xb[3]; 10889371c9d4SSatish Balay x5 = xb[4]; 10899371c9d4SSatish Balay x6 = xb[5]; 10909371c9d4SSatish Balay x7 = xb[6]; 109149b5e25fSSatish Balay ib = aj + *ai; 1092831a3094SHong Zhang jmin = 0; 109398c9bda7SSatish Balay nonzerorow += (n > 0); 109459689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 109549b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 109649b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 109749b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 109849b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 109949b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 110049b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 110149b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 11029371c9d4SSatish Balay v += 49; 11039371c9d4SSatish Balay jmin++; 11047fbae186SHong Zhang } 1105444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1106444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1107831a3094SHong Zhang for (j = jmin; j < n; j++) { 110849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 110949b5e25fSSatish Balay cval = ib[j] * 7; 111049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 111149b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 111249b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 111349b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 111449b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 111549b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 111649b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 111749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 111849b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 111949b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 112049b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 112149b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 112249b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 112349b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 112449b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 112549b5e25fSSatish Balay v += 49; 112649b5e25fSSatish Balay } 11279371c9d4SSatish Balay xb += 7; 11289371c9d4SSatish Balay ai++; 112949b5e25fSSatish Balay } 113049b5e25fSSatish Balay 11319566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11329566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 113349b5e25fSSatish Balay 11349566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 11353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 113649b5e25fSSatish Balay } 113749b5e25fSSatish Balay 1138d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1139d71ae5a4SJacob Faibussowitsch { 114049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1141f4259b30SLisandro Dalcin PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1142d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 1143d9ca1df4SBarry Smith const MatScalar *v; 1144d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1145d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 114698c9bda7SSatish Balay PetscInt nonzerorow = 0; 114749b5e25fSSatish Balay 114849b5e25fSSatish Balay PetscFunctionBegin; 11499566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 11503ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 11519371c9d4SSatish Balay PetscCall(VecGetArrayRead(xx, &x)); 11529371c9d4SSatish Balay x_ptr = x; 11539371c9d4SSatish Balay PetscCall(VecGetArray(zz, &z)); 11549371c9d4SSatish Balay z_ptr = z; 115549b5e25fSSatish Balay 115649b5e25fSSatish Balay aj = a->j; 115749b5e25fSSatish Balay v = a->a; 115849b5e25fSSatish Balay ii = a->i; 115949b5e25fSSatish Balay 116048a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 116149b5e25fSSatish Balay work = a->mult_work; 116249b5e25fSSatish Balay 116349b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 11649371c9d4SSatish Balay n = ii[1] - ii[0]; 11659371c9d4SSatish Balay ncols = n * bs; 11669371c9d4SSatish Balay workt = work; 11679371c9d4SSatish Balay idx = aj + ii[0]; 116898c9bda7SSatish Balay nonzerorow += (n > 0); 116949b5e25fSSatish Balay 117049b5e25fSSatish Balay /* upper triangular part */ 117149b5e25fSSatish Balay for (j = 0; j < n; j++) { 117249b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 117349b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 117449b5e25fSSatish Balay workt += bs; 117549b5e25fSSatish Balay } 117649b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 117796b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 117849b5e25fSSatish Balay 117949b5e25fSSatish Balay /* strict lower triangular part */ 1180831a3094SHong Zhang idx = aj + ii[0]; 118159689ffbSStefano Zampini if (n && *idx == i) { 11829371c9d4SSatish Balay ncols -= bs; 11839371c9d4SSatish Balay v += bs2; 11849371c9d4SSatish Balay idx++; 11859371c9d4SSatish Balay n--; 1186831a3094SHong Zhang } 118749b5e25fSSatish Balay if (ncols > 0) { 118849b5e25fSSatish Balay workt = work; 11899566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 119096b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1191831a3094SHong Zhang for (j = 0; j < n; j++) { 1192831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 119349b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 119449b5e25fSSatish Balay workt += bs; 119549b5e25fSSatish Balay } 119649b5e25fSSatish Balay } 119749b5e25fSSatish Balay 11989371c9d4SSatish Balay x += bs; 11999371c9d4SSatish Balay v += n * bs2; 12009371c9d4SSatish Balay z += bs; 12019371c9d4SSatish Balay ii++; 120249b5e25fSSatish Balay } 120349b5e25fSSatish Balay 12049566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12059566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 120649b5e25fSSatish Balay 12079566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 12083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 120949b5e25fSSatish Balay } 121049b5e25fSSatish Balay 1211d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1212d71ae5a4SJacob Faibussowitsch { 121349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1214f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1215c5df96a5SBarry Smith PetscBLASInt one = 1, totalnz; 121649b5e25fSSatish Balay 121749b5e25fSSatish Balay PetscFunctionBegin; 12189566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1219792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 12209566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 12213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 122249b5e25fSSatish Balay } 122349b5e25fSSatish Balay 1224d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1225d71ae5a4SJacob Faibussowitsch { 122649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1227d9ca1df4SBarry Smith const MatScalar *v = a->a; 122849b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1229d9ca1df4SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1230d9ca1df4SBarry Smith const PetscInt *aj = a->j, *col; 123149b5e25fSSatish Balay 123249b5e25fSSatish Balay PetscFunctionBegin; 1233c40ae873SPierre Jolivet if (!a->nz) { 1234c40ae873SPierre Jolivet *norm = 0.0; 12353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1236c40ae873SPierre Jolivet } 123749b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 123849b5e25fSSatish Balay for (k = 0; k < mbs; k++) { 123959689ffbSStefano Zampini jmin = a->i[k]; 124059689ffbSStefano Zampini jmax = a->i[k + 1]; 1241831a3094SHong Zhang col = aj + jmin; 124259689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 124349b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 12449371c9d4SSatish Balay sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 12459371c9d4SSatish Balay v++; 124649b5e25fSSatish Balay } 1247831a3094SHong Zhang jmin++; 1248831a3094SHong Zhang } 1249831a3094SHong Zhang for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 125049b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 12519371c9d4SSatish Balay sum_off += PetscRealPart(PetscConj(*v) * (*v)); 12529371c9d4SSatish Balay v++; 125349b5e25fSSatish Balay } 125449b5e25fSSatish Balay } 125549b5e25fSSatish Balay } 12568f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 12579566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 12580b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 12599566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 12600b8dc8d2SHong Zhang for (i = 0; i < mbs; i++) jl[i] = mbs; 12610b8dc8d2SHong Zhang il[0] = 0; 126249b5e25fSSatish Balay 126349b5e25fSSatish Balay *norm = 0.0; 126449b5e25fSSatish Balay for (k = 0; k < mbs; k++) { /* k_th block row */ 126549b5e25fSSatish Balay for (j = 0; j < bs; j++) sum[j] = 0.0; 126649b5e25fSSatish Balay /*-- col sum --*/ 126749b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1268bbea24aaSStefano Zampini /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window) 1269831a3094SHong Zhang at step k */ 127049b5e25fSSatish Balay while (i < mbs) { 127149b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 127249b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 127349b5e25fSSatish Balay for (j = 0; j < bs; j++) { 127449b5e25fSSatish Balay v = a->a + ik * bs2 + j * bs; 127549b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 12769371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 12779371c9d4SSatish Balay v++; 127849b5e25fSSatish Balay } 127949b5e25fSSatish Balay } 128049b5e25fSSatish Balay /* update il, jl */ 1281831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1282831a3094SHong Zhang jmax = a->i[i + 1]; 128349b5e25fSSatish Balay if (jmin < jmax) { 128449b5e25fSSatish Balay il[i] = jmin; 128549b5e25fSSatish Balay j = a->j[jmin]; 12869371c9d4SSatish Balay jl[i] = jl[j]; 12879371c9d4SSatish Balay jl[j] = i; 128849b5e25fSSatish Balay } 128949b5e25fSSatish Balay i = nexti; 129049b5e25fSSatish Balay } 129149b5e25fSSatish Balay /*-- row sum --*/ 129259689ffbSStefano Zampini jmin = a->i[k]; 129359689ffbSStefano Zampini jmax = a->i[k + 1]; 129449b5e25fSSatish Balay for (i = jmin; i < jmax; i++) { 129549b5e25fSSatish Balay for (j = 0; j < bs; j++) { 129649b5e25fSSatish Balay v = a->a + i * bs2 + j; 129749b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 12989371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 12999371c9d4SSatish Balay v += bs; 130049b5e25fSSatish Balay } 130149b5e25fSSatish Balay } 130249b5e25fSSatish Balay } 130349b5e25fSSatish Balay /* add k_th block row to il, jl */ 1304831a3094SHong Zhang col = aj + jmin; 130559689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) jmin++; 130649b5e25fSSatish Balay if (jmin < jmax) { 130749b5e25fSSatish Balay il[k] = jmin; 13089371c9d4SSatish Balay j = a->j[jmin]; 13099371c9d4SSatish Balay jl[k] = jl[j]; 13109371c9d4SSatish Balay jl[j] = k; 131149b5e25fSSatish Balay } 131249b5e25fSSatish Balay for (j = 0; j < bs; j++) { 131349b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 131449b5e25fSSatish Balay } 131549b5e25fSSatish Balay } 13169566063dSJacob Faibussowitsch PetscCall(PetscFree3(sum, il, jl)); 13179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1318f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 13193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 132049b5e25fSSatish Balay } 132149b5e25fSSatish Balay 1322d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1323d71ae5a4SJacob Faibussowitsch { 132449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 132549b5e25fSSatish Balay 132649b5e25fSSatish Balay PetscFunctionBegin; 132749b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1328d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1329ef511fbeSHong Zhang *flg = PETSC_FALSE; 13303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 133149b5e25fSSatish Balay } 133249b5e25fSSatish Balay 133349b5e25fSSatish Balay /* if the a->i are the same */ 13349566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 13353ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 133649b5e25fSSatish Balay 133749b5e25fSSatish Balay /* if a->j are the same */ 13389566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 13393ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 134026fbe8dcSKarl Rupp 134149b5e25fSSatish Balay /* if a->a are the same */ 13429566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 13433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 134449b5e25fSSatish Balay } 134549b5e25fSSatish Balay 1346d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1347d71ae5a4SJacob Faibussowitsch { 134849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1349d9ca1df4SBarry Smith PetscInt i, j, k, row, bs, ambs, bs2; 1350d9ca1df4SBarry Smith const PetscInt *ai, *aj; 135187828ca2SBarry Smith PetscScalar *x, zero = 0.0; 1352d9ca1df4SBarry Smith const MatScalar *aa, *aa_j; 135349b5e25fSSatish Balay 135449b5e25fSSatish Balay PetscFunctionBegin; 1355d0f46423SBarry Smith bs = A->rmap->bs; 1356aed4548fSBarry Smith PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 135782799104SHong Zhang 135849b5e25fSSatish Balay aa = a->a; 13598a0c6efdSHong Zhang ambs = a->mbs; 13608a0c6efdSHong Zhang 13618a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 13628a0c6efdSHong Zhang PetscInt *diag = a->diag; 13638a0c6efdSHong Zhang aa = a->a; 13648a0c6efdSHong Zhang ambs = a->mbs; 13659566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 13668a0c6efdSHong Zhang for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 13679566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 13683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13698a0c6efdSHong Zhang } 13708a0c6efdSHong Zhang 137149b5e25fSSatish Balay ai = a->i; 137249b5e25fSSatish Balay aj = a->j; 137349b5e25fSSatish Balay bs2 = a->bs2; 13749566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 13753ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 13769566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 137749b5e25fSSatish Balay for (i = 0; i < ambs; i++) { 137849b5e25fSSatish Balay j = ai[i]; 137949b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 138049b5e25fSSatish Balay row = i * bs; 138149b5e25fSSatish Balay aa_j = aa + j * bs2; 138249b5e25fSSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 138349b5e25fSSatish Balay } 138449b5e25fSSatish Balay } 13859566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 13863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 138749b5e25fSSatish Balay } 138849b5e25fSSatish Balay 1389d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1390d71ae5a4SJacob Faibussowitsch { 139149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1392d9ca1df4SBarry Smith PetscScalar x; 1393d9ca1df4SBarry Smith const PetscScalar *l, *li, *ri; 139449b5e25fSSatish Balay MatScalar *aa, *v; 1395fff8e43fSBarry Smith PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1396fff8e43fSBarry Smith const PetscInt *ai, *aj; 1397ace3abfcSBarry Smith PetscBool flg; 139849b5e25fSSatish Balay 139949b5e25fSSatish Balay PetscFunctionBegin; 1400b3bf805bSHong Zhang if (ll != rr) { 14019566063dSJacob Faibussowitsch PetscCall(VecEqual(ll, rr, &flg)); 140228b400f6SJacob Faibussowitsch PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1403b3bf805bSHong Zhang } 14043ba16761SJacob Faibussowitsch if (!ll) PetscFunctionReturn(PETSC_SUCCESS); 140549b5e25fSSatish Balay ai = a->i; 140649b5e25fSSatish Balay aj = a->j; 140749b5e25fSSatish Balay aa = a->a; 1408d0f46423SBarry Smith m = A->rmap->N; 1409d0f46423SBarry Smith bs = A->rmap->bs; 141049b5e25fSSatish Balay mbs = a->mbs; 141149b5e25fSSatish Balay bs2 = a->bs2; 141249b5e25fSSatish Balay 14139566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 14149566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 141508401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 141649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 141749b5e25fSSatish Balay M = ai[i + 1] - ai[i]; 141849b5e25fSSatish Balay li = l + i * bs; 141949b5e25fSSatish Balay v = aa + bs2 * ai[i]; 142049b5e25fSSatish Balay for (j = 0; j < M; j++) { /* for each block */ 142149b5e25fSSatish Balay ri = l + bs * aj[ai[i] + j]; 14225e90f9d9SHong Zhang for (k = 0; k < bs; k++) { 142349b5e25fSSatish Balay x = ri[k]; 142449b5e25fSSatish Balay for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 142549b5e25fSSatish Balay } 142649b5e25fSSatish Balay } 142749b5e25fSSatish Balay } 14289566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 14299566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 14303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 143149b5e25fSSatish Balay } 143249b5e25fSSatish Balay 1433d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1434d71ae5a4SJacob Faibussowitsch { 143549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 143649b5e25fSSatish Balay 143749b5e25fSSatish Balay PetscFunctionBegin; 143849b5e25fSSatish Balay info->block_size = a->bs2; 1439ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 14406c6c5352SBarry Smith info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 14413966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 144249b5e25fSSatish Balay info->assemblies = A->num_ass; 14438e58a170SBarry Smith info->mallocs = A->info.mallocs; 14444dfa11a4SJacob Faibussowitsch info->memory = 0; /* REVIEW ME */ 1445d5f3da31SBarry Smith if (A->factortype) { 144649b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 144749b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 144849b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 144949b5e25fSSatish Balay } else { 145049b5e25fSSatish Balay info->fill_ratio_given = 0; 145149b5e25fSSatish Balay info->fill_ratio_needed = 0; 145249b5e25fSSatish Balay info->factor_mallocs = 0; 145349b5e25fSSatish Balay } 14543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 145549b5e25fSSatish Balay } 145649b5e25fSSatish Balay 1457d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1458d71ae5a4SJacob Faibussowitsch { 145949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 146049b5e25fSSatish Balay 146149b5e25fSSatish Balay PetscFunctionBegin; 14629566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 14633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 146449b5e25fSSatish Balay } 1465dc354874SHong Zhang 1466d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1467d71ae5a4SJacob Faibussowitsch { 1468dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1469d9ca1df4SBarry Smith PetscInt i, j, n, row, col, bs, mbs; 1470d9ca1df4SBarry Smith const PetscInt *ai, *aj; 1471c3fca9a7SHong Zhang PetscReal atmp; 1472d9ca1df4SBarry Smith const MatScalar *aa; 1473985db425SBarry Smith PetscScalar *x; 147413f74950SBarry Smith PetscInt ncols, brow, bcol, krow, kcol; 1475dc354874SHong Zhang 1476dc354874SHong Zhang PetscFunctionBegin; 147728b400f6SJacob Faibussowitsch PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 147828b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1479d0f46423SBarry Smith bs = A->rmap->bs; 1480dc354874SHong Zhang aa = a->a; 1481dc354874SHong Zhang ai = a->i; 1482dc354874SHong Zhang aj = a->j; 148344117c81SHong Zhang mbs = a->mbs; 1484dc354874SHong Zhang 14859566063dSJacob Faibussowitsch PetscCall(VecSet(v, 0.0)); 14869566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 14879566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 148808401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 148944117c81SHong Zhang for (i = 0; i < mbs; i++) { 14909371c9d4SSatish Balay ncols = ai[1] - ai[0]; 14919371c9d4SSatish Balay ai++; 1492d0f6400bSHong Zhang brow = bs * i; 149344117c81SHong Zhang for (j = 0; j < ncols; j++) { 1494d0f6400bSHong Zhang bcol = bs * (*aj); 149544117c81SHong Zhang for (kcol = 0; kcol < bs; kcol++) { 1496d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 149744117c81SHong Zhang for (krow = 0; krow < bs; krow++) { 14989371c9d4SSatish Balay atmp = PetscAbsScalar(*aa); 14999371c9d4SSatish Balay aa++; 1500d0f6400bSHong Zhang row = brow + krow; /* row index */ 1501c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1502c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 150344117c81SHong Zhang } 150444117c81SHong Zhang } 1505d0f6400bSHong Zhang aj++; 1506dc354874SHong Zhang } 1507dc354874SHong Zhang } 15089566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 15093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1510dc354874SHong Zhang } 1511c2916339SPierre Jolivet 1512d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1513d71ae5a4SJacob Faibussowitsch { 1514c2916339SPierre Jolivet PetscFunctionBegin; 15159566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 15164222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 15173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1518c2916339SPierre Jolivet } 1519c2916339SPierre Jolivet 1520d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1521d71ae5a4SJacob Faibussowitsch { 1522c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1523c2916339SPierre Jolivet PetscScalar *z = c; 1524c2916339SPierre Jolivet const PetscScalar *xb; 1525c2916339SPierre Jolivet PetscScalar x1; 1526c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1527c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 15283c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 1529b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 15303c2e41e1SStefano Zampini #else 15313c2e41e1SStefano Zampini const int aconj = 0; 15323c2e41e1SStefano Zampini #endif 1533c2916339SPierre Jolivet 1534c2916339SPierre Jolivet PetscFunctionBegin; 1535c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 15369371c9d4SSatish Balay n = ii[1] - ii[0]; 15379371c9d4SSatish Balay ii++; 1538c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1539c2916339SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1540c2916339SPierre Jolivet jj = idx; 1541c2916339SPierre Jolivet vv = v; 1542c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1543c2916339SPierre Jolivet idx = jj; 1544c2916339SPierre Jolivet v = vv; 1545c2916339SPierre Jolivet for (j = 0; j < n; j++) { 15469371c9d4SSatish Balay xb = b + (*idx); 15479371c9d4SSatish Balay x1 = xb[0 + k * bm]; 1548c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1; 15493c2e41e1SStefano Zampini if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1550c2916339SPierre Jolivet v += 1; 1551c2916339SPierre Jolivet ++idx; 1552c2916339SPierre Jolivet } 1553c2916339SPierre Jolivet } 1554c2916339SPierre Jolivet z += 1; 1555c2916339SPierre Jolivet } 15563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1557c2916339SPierre Jolivet } 1558c2916339SPierre Jolivet 1559d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1560d71ae5a4SJacob Faibussowitsch { 1561c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1562c2916339SPierre Jolivet PetscScalar *z = c; 1563c2916339SPierre Jolivet const PetscScalar *xb; 1564c2916339SPierre Jolivet PetscScalar x1, x2; 1565c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1566c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1567c2916339SPierre Jolivet 1568c2916339SPierre Jolivet PetscFunctionBegin; 1569c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 15709371c9d4SSatish Balay n = ii[1] - ii[0]; 15719371c9d4SSatish Balay ii++; 1572c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1573c2916339SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1574c2916339SPierre Jolivet jj = idx; 1575c2916339SPierre Jolivet vv = v; 1576c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1577c2916339SPierre Jolivet idx = jj; 1578c2916339SPierre Jolivet v = vv; 1579c2916339SPierre Jolivet for (j = 0; j < n; j++) { 15809371c9d4SSatish Balay xb = b + 2 * (*idx); 15819371c9d4SSatish Balay x1 = xb[0 + k * bm]; 15829371c9d4SSatish Balay x2 = xb[1 + k * bm]; 1583c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1584c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1585c2916339SPierre Jolivet if (*idx != i) { 1586c2916339SPierre Jolivet c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1587c2916339SPierre Jolivet c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1588c2916339SPierre Jolivet } 1589c2916339SPierre Jolivet v += 4; 1590c2916339SPierre Jolivet ++idx; 1591c2916339SPierre Jolivet } 1592c2916339SPierre Jolivet } 1593c2916339SPierre Jolivet z += 2; 1594c2916339SPierre Jolivet } 15953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1596c2916339SPierre Jolivet } 1597c2916339SPierre Jolivet 1598d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1599d71ae5a4SJacob Faibussowitsch { 1600c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1601c2916339SPierre Jolivet PetscScalar *z = c; 1602c2916339SPierre Jolivet const PetscScalar *xb; 1603c2916339SPierre Jolivet PetscScalar x1, x2, x3; 1604c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1605c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1606c2916339SPierre Jolivet 1607c2916339SPierre Jolivet PetscFunctionBegin; 1608c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16099371c9d4SSatish Balay n = ii[1] - ii[0]; 16109371c9d4SSatish Balay ii++; 1611c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1612c2916339SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1613c2916339SPierre Jolivet jj = idx; 1614c2916339SPierre Jolivet vv = v; 1615c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1616c2916339SPierre Jolivet idx = jj; 1617c2916339SPierre Jolivet v = vv; 1618c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16199371c9d4SSatish Balay xb = b + 3 * (*idx); 16209371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16219371c9d4SSatish Balay x2 = xb[1 + k * bm]; 16229371c9d4SSatish Balay x3 = xb[2 + k * bm]; 1623c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1624c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1625c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1626c2916339SPierre Jolivet if (*idx != i) { 1627c2916339SPierre Jolivet c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1628c2916339SPierre Jolivet c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1629c2916339SPierre Jolivet c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1630c2916339SPierre Jolivet } 1631c2916339SPierre Jolivet v += 9; 1632c2916339SPierre Jolivet ++idx; 1633c2916339SPierre Jolivet } 1634c2916339SPierre Jolivet } 1635c2916339SPierre Jolivet z += 3; 1636c2916339SPierre Jolivet } 16373ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1638c2916339SPierre Jolivet } 1639c2916339SPierre Jolivet 1640d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1641d71ae5a4SJacob Faibussowitsch { 1642c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1643c2916339SPierre Jolivet PetscScalar *z = c; 1644c2916339SPierre Jolivet const PetscScalar *xb; 1645c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4; 1646c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1647c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1648c2916339SPierre Jolivet 1649c2916339SPierre Jolivet PetscFunctionBegin; 1650c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16519371c9d4SSatish Balay n = ii[1] - ii[0]; 16529371c9d4SSatish Balay ii++; 1653c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1654c2916339SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1655c2916339SPierre Jolivet jj = idx; 1656c2916339SPierre Jolivet vv = v; 1657c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1658c2916339SPierre Jolivet idx = jj; 1659c2916339SPierre Jolivet v = vv; 1660c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16619371c9d4SSatish Balay xb = b + 4 * (*idx); 16629371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16639371c9d4SSatish Balay x2 = xb[1 + k * bm]; 16649371c9d4SSatish Balay x3 = xb[2 + k * bm]; 16659371c9d4SSatish Balay x4 = xb[3 + k * bm]; 1666c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1667c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1668c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1669c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1670c2916339SPierre Jolivet if (*idx != i) { 1671c2916339SPierre Jolivet c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1672c2916339SPierre Jolivet c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1673c2916339SPierre Jolivet c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1674c2916339SPierre Jolivet c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1675c2916339SPierre Jolivet } 1676c2916339SPierre Jolivet v += 16; 1677c2916339SPierre Jolivet ++idx; 1678c2916339SPierre Jolivet } 1679c2916339SPierre Jolivet } 1680c2916339SPierre Jolivet z += 4; 1681c2916339SPierre Jolivet } 16823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1683c2916339SPierre Jolivet } 1684c2916339SPierre Jolivet 1685d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1686d71ae5a4SJacob Faibussowitsch { 1687c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1688c2916339SPierre Jolivet PetscScalar *z = c; 1689c2916339SPierre Jolivet const PetscScalar *xb; 1690c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 1691c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1692c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1693c2916339SPierre Jolivet 1694c2916339SPierre Jolivet PetscFunctionBegin; 1695c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16969371c9d4SSatish Balay n = ii[1] - ii[0]; 16979371c9d4SSatish Balay ii++; 1698c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1699c2916339SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1700c2916339SPierre Jolivet jj = idx; 1701c2916339SPierre Jolivet vv = v; 1702c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1703c2916339SPierre Jolivet idx = jj; 1704c2916339SPierre Jolivet v = vv; 1705c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17069371c9d4SSatish Balay xb = b + 5 * (*idx); 17079371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17089371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17099371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17109371c9d4SSatish Balay x4 = xb[3 + k * bm]; 17119371c9d4SSatish Balay x5 = xb[4 + k * cm]; 1712c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1713c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1714c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1715c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1716c2916339SPierre Jolivet z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1717c2916339SPierre Jolivet if (*idx != i) { 1718c2916339SPierre Jolivet c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1719c2916339SPierre Jolivet c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1720c2916339SPierre Jolivet c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1721c2916339SPierre Jolivet c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1722c2916339SPierre Jolivet c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1723c2916339SPierre Jolivet } 1724c2916339SPierre Jolivet v += 25; 1725c2916339SPierre Jolivet ++idx; 1726c2916339SPierre Jolivet } 1727c2916339SPierre Jolivet } 1728c2916339SPierre Jolivet z += 5; 1729c2916339SPierre Jolivet } 17303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1731c2916339SPierre Jolivet } 1732c2916339SPierre Jolivet 1733d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1734d71ae5a4SJacob Faibussowitsch { 1735c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1736c2916339SPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1737281439baSStefano Zampini Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1738c2916339SPierre Jolivet PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1739c2916339SPierre Jolivet PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1740c2916339SPierre Jolivet PetscBLASInt bbs, bcn, bbm, bcm; 1741f4259b30SLisandro Dalcin PetscScalar *z = NULL; 1742c2916339SPierre Jolivet PetscScalar *c, *b; 1743c2916339SPierre Jolivet const MatScalar *v; 1744c2916339SPierre Jolivet const PetscInt *idx, *ii; 1745c2916339SPierre Jolivet PetscScalar _DOne = 1.0; 1746c2916339SPierre Jolivet 1747c2916339SPierre Jolivet PetscFunctionBegin; 17483ba16761SJacob Faibussowitsch if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 174908401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 175008401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 175108401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1752c2916339SPierre Jolivet b = bd->v; 17539566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(C)); 17549566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 1755c2916339SPierre Jolivet switch (bs) { 1756d71ae5a4SJacob Faibussowitsch case 1: 1757d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1758d71ae5a4SJacob Faibussowitsch break; 1759d71ae5a4SJacob Faibussowitsch case 2: 1760d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1761d71ae5a4SJacob Faibussowitsch break; 1762d71ae5a4SJacob Faibussowitsch case 3: 1763d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1764d71ae5a4SJacob Faibussowitsch break; 1765d71ae5a4SJacob Faibussowitsch case 4: 1766d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1767d71ae5a4SJacob Faibussowitsch break; 1768d71ae5a4SJacob Faibussowitsch case 5: 1769d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1770d71ae5a4SJacob Faibussowitsch break; 1771c2916339SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 17729566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bs, &bbs)); 17739566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cn, &bcn)); 17749566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bm, &bbm)); 17759566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cm, &bcm)); 1776c2916339SPierre Jolivet idx = a->j; 1777c2916339SPierre Jolivet v = a->a; 1778c2916339SPierre Jolivet mbs = a->mbs; 1779c2916339SPierre Jolivet ii = a->i; 1780c2916339SPierre Jolivet z = c; 1781c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17829371c9d4SSatish Balay n = ii[1] - ii[0]; 17839371c9d4SSatish Balay ii++; 1784c2916339SPierre Jolivet for (j = 0; j < n; j++) { 1785792fecdfSBarry Smith if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1786792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1787c2916339SPierre Jolivet v += bs2; 1788c2916339SPierre Jolivet } 1789c2916339SPierre Jolivet z += bs; 1790c2916339SPierre Jolivet } 1791c2916339SPierre Jolivet } 17929566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 17939566063dSJacob Faibussowitsch PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 17943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1795c2916339SPierre Jolivet } 1796