149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h> 5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 6c6db04a5SJed Brown #include <petscbt.h> 7c6db04a5SJed Brown #include <petscblaslapack.h> 849b5e25fSSatish Balay 9d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 10d71ae5a4SJacob Faibussowitsch { 115eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 12*7bede89fSBarry Smith PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 135d0c19d7SBarry Smith const PetscInt *idx; 14db41cccfSHong Zhang PetscBT table_out, table_in; 15d94109b8SHong Zhang 16d94109b8SHong Zhang PetscFunctionBegin; 1708401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 18d94109b8SHong Zhang mbs = a->mbs; 19d94109b8SHong Zhang ai = a->i; 20d94109b8SHong Zhang aj = a->j; 21d0f46423SBarry Smith bs = A->rmap->bs; 229566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_out)); 239566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &nidx)); 249566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_in)); 25d94109b8SHong Zhang 26d94109b8SHong Zhang for (i = 0; i < is_max; i++) { /* for each is */ 27d94109b8SHong Zhang isz = 0; 289566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_out)); 29d94109b8SHong Zhang 30d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 319566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 329566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 33d94109b8SHong Zhang 34db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 35dbe03f88SHong Zhang bcol_max = 0; 36d94109b8SHong Zhang for (j = 0; j < n; ++j) { 37d94109b8SHong Zhang brow = idx[j] / bs; /* convert the indices into block indices */ 3808401ef6SPierre Jolivet PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 39db41cccfSHong Zhang if (!PetscBTLookupSet(table_out, brow)) { 40dbe03f88SHong Zhang nidx[isz++] = brow; 41dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 42dbe03f88SHong Zhang } 43d94109b8SHong Zhang } 449566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 459566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 46d94109b8SHong Zhang 47d94109b8SHong Zhang k = 0; 48d94109b8SHong Zhang for (j = 0; j < ov; j++) { /* for each overlap */ 49db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 509566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_in)); 519566063dSJacob Faibussowitsch for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 52d94109b8SHong Zhang 53d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 54d94109b8SHong Zhang for (brow = 0; brow < mbs; brow++) { 559371c9d4SSatish Balay start = ai[brow]; 569371c9d4SSatish Balay end = ai[brow + 1]; 57db41cccfSHong Zhang if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58d94109b8SHong Zhang for (l = start; l < end; l++) { 59d94109b8SHong Zhang bcol = aj[l]; 60d7b97159SHong Zhang if (!PetscBTLookupSet(table_out, bcol)) { 61d7b97159SHong Zhang nidx[isz++] = bcol; 62d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 63d7b97159SHong Zhang } 64d94109b8SHong Zhang } 65d94109b8SHong Zhang k++; 66d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68d94109b8SHong Zhang for (l = start; l < end; l++) { 69d94109b8SHong Zhang bcol = aj[l]; 70dbe03f88SHong Zhang if (bcol > bcol_max) break; 71db41cccfSHong Zhang if (PetscBTLookup(table_in, bcol)) { 7226fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 73d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 74d94109b8SHong Zhang } 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } /* for each overlap */ 79*7bede89fSBarry Smith PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 80*7bede89fSBarry Smith } /* for each is */ 819566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_out)); 829566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 839566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_in)); 845eee224dSHong Zhang PetscFunctionReturn(0); 8549b5e25fSSatish Balay } 8649b5e25fSSatish Balay 87847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 88847daeccSHong Zhang Zero some ops' to avoid invalid usse */ 89d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 90d71ae5a4SJacob Faibussowitsch { 9149b5e25fSSatish Balay PetscFunctionBegin; 929566063dSJacob Faibussowitsch PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 93f4259b30SLisandro Dalcin Bseq->ops->mult = NULL; 94f4259b30SLisandro Dalcin Bseq->ops->multadd = NULL; 95f4259b30SLisandro Dalcin Bseq->ops->multtranspose = NULL; 96f4259b30SLisandro Dalcin Bseq->ops->multtransposeadd = NULL; 97f4259b30SLisandro Dalcin Bseq->ops->lufactor = NULL; 98f4259b30SLisandro Dalcin Bseq->ops->choleskyfactor = NULL; 99f4259b30SLisandro Dalcin Bseq->ops->lufactorsymbolic = NULL; 100f4259b30SLisandro Dalcin Bseq->ops->choleskyfactorsymbolic = NULL; 101f4259b30SLisandro Dalcin Bseq->ops->getinertia = NULL; 10249b5e25fSSatish Balay PetscFunctionReturn(0); 10349b5e25fSSatish Balay } 10449b5e25fSSatish Balay 1057dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 106d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 107d71ae5a4SJacob Faibussowitsch { 108e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c; 109e50a8114SHong Zhang PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 110e50a8114SHong Zhang PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 111e50a8114SHong Zhang const PetscInt *irow, *icol; 112e50a8114SHong Zhang PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 113e50a8114SHong Zhang PetscInt *aj = a->j, *ai = a->i; 114e50a8114SHong Zhang MatScalar *mat_a; 115e50a8114SHong Zhang Mat C; 116e50a8114SHong Zhang PetscBool flag; 117e50a8114SHong Zhang 118e50a8114SHong Zhang PetscFunctionBegin; 119e50a8114SHong Zhang 1209566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1219566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1229566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1239566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 124e50a8114SHong Zhang 1259566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 126e50a8114SHong Zhang ssmap = smap; 1279566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 128e50a8114SHong Zhang for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 129e50a8114SHong Zhang /* determine lens of each row */ 130e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 131e50a8114SHong Zhang kstart = ai[irow[i]]; 132e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 133e50a8114SHong Zhang lens[i] = 0; 134e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 135e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 136e50a8114SHong Zhang } 137e50a8114SHong Zhang } 138e50a8114SHong Zhang /* Create and fill new matrix */ 139e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 140e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)((*B)->data); 141e50a8114SHong Zhang 142aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1439566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 14428b400f6SJacob Faibussowitsch PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 1459566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 146e50a8114SHong Zhang C = *B; 147e50a8114SHong Zhang } else { 1489566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1499566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 1509566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1519566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 152e50a8114SHong Zhang } 153e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)(C->data); 154e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 155e50a8114SHong Zhang row = irow[i]; 156e50a8114SHong Zhang kstart = ai[row]; 157e50a8114SHong Zhang kend = kstart + a->ilen[row]; 158e50a8114SHong Zhang mat_i = c->i[i]; 159e50a8114SHong Zhang mat_j = c->j + mat_i; 160e50a8114SHong Zhang mat_a = c->a + mat_i * bs2; 161e50a8114SHong Zhang mat_ilen = c->ilen + i; 162e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 163e50a8114SHong Zhang if ((tcol = ssmap[a->j[k]])) { 164e50a8114SHong Zhang *mat_j++ = tcol - 1; 1659566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 166e50a8114SHong Zhang mat_a += bs2; 167e50a8114SHong Zhang (*mat_ilen)++; 168e50a8114SHong Zhang } 169e50a8114SHong Zhang } 170e50a8114SHong Zhang } 171e50a8114SHong Zhang /* sort */ 172e50a8114SHong Zhang { 173e50a8114SHong Zhang MatScalar *work; 174e50a8114SHong Zhang 1759566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 176e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 177e50a8114SHong Zhang PetscInt ilen; 178e50a8114SHong Zhang mat_i = c->i[i]; 179e50a8114SHong Zhang mat_j = c->j + mat_i; 180e50a8114SHong Zhang mat_a = c->a + mat_i * bs2; 181e50a8114SHong Zhang ilen = c->ilen[i]; 1829566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 183e50a8114SHong Zhang } 1849566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 185e50a8114SHong Zhang } 186e50a8114SHong Zhang 187e50a8114SHong Zhang /* Free work space */ 1889566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1899566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1909566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1919566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1929566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 193e50a8114SHong Zhang 1949566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 195e50a8114SHong Zhang *B = C; 196e50a8114SHong Zhang PetscFunctionReturn(0); 197e50a8114SHong Zhang } 198e50a8114SHong Zhang 199d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 200d71ae5a4SJacob Faibussowitsch { 201e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 202e50a8114SHong Zhang IS is1, is2; 203e50a8114SHong Zhang PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs; 204e50a8114SHong Zhang const PetscInt *irow, *icol; 20549b5e25fSSatish Balay 20649b5e25fSSatish Balay PetscFunctionBegin; 2079566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 2089566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 2099566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 2109566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 211e50a8114SHong Zhang 212e50a8114SHong Zhang /* Verify if the indices corespond to each element in a block 213e50a8114SHong Zhang and form the IS with compressed IS */ 214e50a8114SHong Zhang maxmnbs = PetscMax(a->mbs, a->nbs); 2159566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 2169566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->mbs)); 217e50a8114SHong Zhang for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 218ad540459SPierre Jolivet for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 219e50a8114SHong Zhang count = 0; 220e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 221e50a8114SHong Zhang PetscInt j = irow[i] / bs; 222e50a8114SHong Zhang if ((vary[j]--) == bs) iary[count++] = j; 223e50a8114SHong Zhang } 2249566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 225e50a8114SHong Zhang 2269566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->nbs)); 227e50a8114SHong Zhang for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 228ad540459SPierre Jolivet for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 229e50a8114SHong Zhang count = 0; 230e50a8114SHong Zhang for (i = 0; i < ncols; i++) { 231e50a8114SHong Zhang PetscInt j = icol[i] / bs; 232e50a8114SHong Zhang if ((vary[j]--) == bs) iary[count++] = j; 233e50a8114SHong Zhang } 2349566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 2359566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 2369566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 2379566063dSJacob Faibussowitsch PetscCall(PetscFree2(vary, iary)); 238e50a8114SHong Zhang 2399566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B)); 2409566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2419566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 242847daeccSHong Zhang 2438f46ffcaSHong Zhang if (isrow != iscol) { 2448f46ffcaSHong Zhang PetscBool isequal; 2459566063dSJacob Faibussowitsch PetscCall(ISEqual(isrow, iscol, &isequal)); 24648a46eb9SPierre Jolivet if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 24749b5e25fSSatish Balay } 24849b5e25fSSatish Balay PetscFunctionReturn(0); 24949b5e25fSSatish Balay } 25049b5e25fSSatish Balay 251d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 252d71ae5a4SJacob Faibussowitsch { 25313f74950SBarry Smith PetscInt i; 25449b5e25fSSatish Balay 25549b5e25fSSatish Balay PetscFunctionBegin; 25648a46eb9SPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 257e50a8114SHong Zhang 25848a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 25949b5e25fSSatish Balay PetscFunctionReturn(0); 26049b5e25fSSatish Balay } 26149b5e25fSSatish Balay 26249b5e25fSSatish Balay /* -------------------------------------------------------*/ 26349b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 26449b5e25fSSatish Balay /* -------------------------------------------------------*/ 26549b5e25fSSatish Balay 266d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 267d71ae5a4SJacob Faibussowitsch { 26849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 269d9ca1df4SBarry Smith PetscScalar *z, x1, x2, zero = 0.0; 270d9ca1df4SBarry Smith const PetscScalar *x, *xb; 271d9ca1df4SBarry Smith const MatScalar *v; 272d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 273d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 27498c9bda7SSatish Balay PetscInt nonzerorow = 0; 27549b5e25fSSatish Balay 27649b5e25fSSatish Balay PetscFunctionBegin; 2779566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 278c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 2799566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2809566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 28149b5e25fSSatish Balay 28249b5e25fSSatish Balay v = a->a; 28349b5e25fSSatish Balay xb = x; 28449b5e25fSSatish Balay 28549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 28649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 2879371c9d4SSatish Balay x1 = xb[0]; 2889371c9d4SSatish Balay x2 = xb[1]; 28949b5e25fSSatish Balay ib = aj + *ai; 290831a3094SHong Zhang jmin = 0; 29198c9bda7SSatish Balay nonzerorow += (n > 0); 2927fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 29349b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 29449b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 2959371c9d4SSatish Balay v += 4; 2969371c9d4SSatish Balay jmin++; 2977fbae186SHong Zhang } 298444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 299444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 300831a3094SHong Zhang for (j = jmin; j < n; j++) { 30149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 30249b5e25fSSatish Balay cval = ib[j] * 2; 30349b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 30449b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 30549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 30649b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 30749b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 30849b5e25fSSatish Balay v += 4; 30949b5e25fSSatish Balay } 3109371c9d4SSatish Balay xb += 2; 3119371c9d4SSatish Balay ai++; 31249b5e25fSSatish Balay } 31349b5e25fSSatish Balay 3149566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3159566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3169566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 31749b5e25fSSatish Balay PetscFunctionReturn(0); 31849b5e25fSSatish Balay } 31949b5e25fSSatish Balay 320d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 321d71ae5a4SJacob Faibussowitsch { 32249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 323d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, zero = 0.0; 324d9ca1df4SBarry Smith const PetscScalar *x, *xb; 325d9ca1df4SBarry Smith const MatScalar *v; 326d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 327d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 32898c9bda7SSatish Balay PetscInt nonzerorow = 0; 32949b5e25fSSatish Balay 33049b5e25fSSatish Balay PetscFunctionBegin; 3319566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 332c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 3339566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3349566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 33549b5e25fSSatish Balay 33649b5e25fSSatish Balay v = a->a; 33749b5e25fSSatish Balay xb = x; 33849b5e25fSSatish Balay 33949b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 34049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3419371c9d4SSatish Balay x1 = xb[0]; 3429371c9d4SSatish Balay x2 = xb[1]; 3439371c9d4SSatish Balay x3 = xb[2]; 34449b5e25fSSatish Balay ib = aj + *ai; 345831a3094SHong Zhang jmin = 0; 34698c9bda7SSatish Balay nonzerorow += (n > 0); 3477fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 34849b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 34949b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 35049b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3519371c9d4SSatish Balay v += 9; 3529371c9d4SSatish Balay jmin++; 3537fbae186SHong Zhang } 354444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 355444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 356831a3094SHong Zhang for (j = jmin; j < n; j++) { 35749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 35849b5e25fSSatish Balay cval = ib[j] * 3; 35949b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 36049b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 36149b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 36249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 36349b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 36449b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 36549b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 36649b5e25fSSatish Balay v += 9; 36749b5e25fSSatish Balay } 3689371c9d4SSatish Balay xb += 3; 3699371c9d4SSatish Balay ai++; 37049b5e25fSSatish Balay } 37149b5e25fSSatish Balay 3729566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3739566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3749566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 37549b5e25fSSatish Balay PetscFunctionReturn(0); 37649b5e25fSSatish Balay } 37749b5e25fSSatish Balay 378d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 379d71ae5a4SJacob Faibussowitsch { 38049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 381d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 382d9ca1df4SBarry Smith const PetscScalar *x, *xb; 383d9ca1df4SBarry Smith const MatScalar *v; 384d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 385d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 38698c9bda7SSatish Balay PetscInt nonzerorow = 0; 38749b5e25fSSatish Balay 38849b5e25fSSatish Balay PetscFunctionBegin; 3899566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 390c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 3919566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3929566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 39349b5e25fSSatish Balay 39449b5e25fSSatish Balay v = a->a; 39549b5e25fSSatish Balay xb = x; 39649b5e25fSSatish Balay 39749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 39849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3999371c9d4SSatish Balay x1 = xb[0]; 4009371c9d4SSatish Balay x2 = xb[1]; 4019371c9d4SSatish Balay x3 = xb[2]; 4029371c9d4SSatish Balay x4 = xb[3]; 40349b5e25fSSatish Balay ib = aj + *ai; 404831a3094SHong Zhang jmin = 0; 40598c9bda7SSatish Balay nonzerorow += (n > 0); 4067fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 40749b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 40849b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 40949b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 41049b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 4119371c9d4SSatish Balay v += 16; 4129371c9d4SSatish Balay jmin++; 4137fbae186SHong Zhang } 414444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 415444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 416831a3094SHong Zhang for (j = jmin; j < n; j++) { 41749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 41849b5e25fSSatish Balay cval = ib[j] * 4; 41949b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 42049b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 42149b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 42249b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 42349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 42449b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 42549b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 42649b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 42749b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 42849b5e25fSSatish Balay v += 16; 42949b5e25fSSatish Balay } 4309371c9d4SSatish Balay xb += 4; 4319371c9d4SSatish Balay ai++; 43249b5e25fSSatish Balay } 43349b5e25fSSatish Balay 4349566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4359566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 4369566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 43749b5e25fSSatish Balay PetscFunctionReturn(0); 43849b5e25fSSatish Balay } 43949b5e25fSSatish Balay 440d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 441d71ae5a4SJacob Faibussowitsch { 44249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 443d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 444d9ca1df4SBarry Smith const PetscScalar *x, *xb; 445d9ca1df4SBarry Smith const MatScalar *v; 446d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 447d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 44898c9bda7SSatish Balay PetscInt nonzerorow = 0; 44949b5e25fSSatish Balay 45049b5e25fSSatish Balay PetscFunctionBegin; 4519566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 452c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 4539566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4549566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 45549b5e25fSSatish Balay 45649b5e25fSSatish Balay v = a->a; 45749b5e25fSSatish Balay xb = x; 45849b5e25fSSatish Balay 45949b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 46049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4619371c9d4SSatish Balay x1 = xb[0]; 4629371c9d4SSatish Balay x2 = xb[1]; 4639371c9d4SSatish Balay x3 = xb[2]; 4649371c9d4SSatish Balay x4 = xb[3]; 4659371c9d4SSatish Balay x5 = xb[4]; 46649b5e25fSSatish Balay ib = aj + *ai; 467831a3094SHong Zhang jmin = 0; 46898c9bda7SSatish Balay nonzerorow += (n > 0); 4697fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 47049b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 47149b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 47249b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 47349b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 47449b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 4759371c9d4SSatish Balay v += 25; 4769371c9d4SSatish Balay jmin++; 4777fbae186SHong Zhang } 478444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 479444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 480831a3094SHong Zhang for (j = jmin; j < n; j++) { 48149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 48249b5e25fSSatish Balay cval = ib[j] * 5; 48349b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 48449b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 48549b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 48649b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 48749b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 48849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 48949b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 49049b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 49149b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 49249b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 49349b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 49449b5e25fSSatish Balay v += 25; 49549b5e25fSSatish Balay } 4969371c9d4SSatish Balay xb += 5; 4979371c9d4SSatish Balay ai++; 49849b5e25fSSatish Balay } 49949b5e25fSSatish Balay 5009566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5019566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5029566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 50349b5e25fSSatish Balay PetscFunctionReturn(0); 50449b5e25fSSatish Balay } 50549b5e25fSSatish Balay 506d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 507d71ae5a4SJacob Faibussowitsch { 50849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 509d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 510d9ca1df4SBarry Smith const PetscScalar *x, *xb; 511d9ca1df4SBarry Smith const MatScalar *v; 512d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 513d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 51498c9bda7SSatish Balay PetscInt nonzerorow = 0; 51549b5e25fSSatish Balay 51649b5e25fSSatish Balay PetscFunctionBegin; 5179566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 518c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 5199566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5209566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 52149b5e25fSSatish Balay 52249b5e25fSSatish Balay v = a->a; 52349b5e25fSSatish Balay xb = x; 52449b5e25fSSatish Balay 52549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 52649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 5279371c9d4SSatish Balay x1 = xb[0]; 5289371c9d4SSatish Balay x2 = xb[1]; 5299371c9d4SSatish Balay x3 = xb[2]; 5309371c9d4SSatish Balay x4 = xb[3]; 5319371c9d4SSatish Balay x5 = xb[4]; 5329371c9d4SSatish Balay x6 = xb[5]; 53349b5e25fSSatish Balay ib = aj + *ai; 534831a3094SHong Zhang jmin = 0; 53598c9bda7SSatish Balay nonzerorow += (n > 0); 5367fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 53749b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 53849b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 53949b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 54049b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 54149b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 54249b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 5439371c9d4SSatish Balay v += 36; 5449371c9d4SSatish Balay jmin++; 5457fbae186SHong Zhang } 546444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 547444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 548831a3094SHong Zhang for (j = jmin; j < n; j++) { 54949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 55049b5e25fSSatish Balay cval = ib[j] * 6; 55149b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 55249b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 55349b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 55449b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 55549b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 55649b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 55749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 55849b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 55949b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 56049b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 56149b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 56249b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 56349b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 56449b5e25fSSatish Balay v += 36; 56549b5e25fSSatish Balay } 5669371c9d4SSatish Balay xb += 6; 5679371c9d4SSatish Balay ai++; 56849b5e25fSSatish Balay } 56949b5e25fSSatish Balay 5709566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5719566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5729566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 57349b5e25fSSatish Balay PetscFunctionReturn(0); 57449b5e25fSSatish Balay } 575c2916339SPierre Jolivet 576d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 577d71ae5a4SJacob Faibussowitsch { 57849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 579d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 580d9ca1df4SBarry Smith const PetscScalar *x, *xb; 581d9ca1df4SBarry Smith const MatScalar *v; 582d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 583d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 58498c9bda7SSatish Balay PetscInt nonzerorow = 0; 58549b5e25fSSatish Balay 58649b5e25fSSatish Balay PetscFunctionBegin; 5879566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 588c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 5899566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5909566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 59149b5e25fSSatish Balay 59249b5e25fSSatish Balay v = a->a; 59349b5e25fSSatish Balay xb = x; 59449b5e25fSSatish Balay 59549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 59649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 5979371c9d4SSatish Balay x1 = xb[0]; 5989371c9d4SSatish Balay x2 = xb[1]; 5999371c9d4SSatish Balay x3 = xb[2]; 6009371c9d4SSatish Balay x4 = xb[3]; 6019371c9d4SSatish Balay x5 = xb[4]; 6029371c9d4SSatish Balay x6 = xb[5]; 6039371c9d4SSatish Balay x7 = xb[6]; 60449b5e25fSSatish Balay ib = aj + *ai; 605831a3094SHong Zhang jmin = 0; 60698c9bda7SSatish Balay nonzerorow += (n > 0); 6077fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 60849b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 60949b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 61049b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 61149b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 61249b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 61349b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 61449b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 6159371c9d4SSatish Balay v += 49; 6169371c9d4SSatish Balay jmin++; 6177fbae186SHong Zhang } 618444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 619444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 620831a3094SHong Zhang for (j = jmin; j < n; j++) { 62149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 62249b5e25fSSatish Balay cval = ib[j] * 7; 62349b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 62449b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 62549b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 62649b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 62749b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 62849b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 62949b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 63049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 63149b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 63249b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 63349b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 63449b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 63549b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 63649b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 63749b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 63849b5e25fSSatish Balay v += 49; 63949b5e25fSSatish Balay } 6409371c9d4SSatish Balay xb += 7; 6419371c9d4SSatish Balay ai++; 64249b5e25fSSatish Balay } 6439566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6449566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 6459566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 64649b5e25fSSatish Balay PetscFunctionReturn(0); 64749b5e25fSSatish Balay } 64849b5e25fSSatish Balay 64949b5e25fSSatish Balay /* 65049b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 65149b5e25fSSatish Balay */ 652d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 653d71ae5a4SJacob Faibussowitsch { 65449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 655d9ca1df4SBarry Smith PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 656d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 657d9ca1df4SBarry Smith const MatScalar *v; 658d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 659d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 66098c9bda7SSatish Balay PetscInt nonzerorow = 0; 66149b5e25fSSatish Balay 66249b5e25fSSatish Balay PetscFunctionBegin; 6639566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 664c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 6659566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6669566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 66759689ffbSStefano Zampini 66859689ffbSStefano Zampini x_ptr = x; 66959689ffbSStefano Zampini z_ptr = z; 67049b5e25fSSatish Balay 67149b5e25fSSatish Balay aj = a->j; 67249b5e25fSSatish Balay v = a->a; 67349b5e25fSSatish Balay ii = a->i; 67449b5e25fSSatish Balay 67548a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 67649b5e25fSSatish Balay work = a->mult_work; 67749b5e25fSSatish Balay 67849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 6799371c9d4SSatish Balay n = ii[1] - ii[0]; 6809371c9d4SSatish Balay ncols = n * bs; 6819371c9d4SSatish Balay workt = work; 6829371c9d4SSatish Balay idx = aj + ii[0]; 68398c9bda7SSatish Balay nonzerorow += (n > 0); 68449b5e25fSSatish Balay 68549b5e25fSSatish Balay /* upper triangular part */ 68649b5e25fSSatish Balay for (j = 0; j < n; j++) { 68749b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 68849b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 68949b5e25fSSatish Balay workt += bs; 69049b5e25fSSatish Balay } 69149b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 69296b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 69349b5e25fSSatish Balay 69449b5e25fSSatish Balay /* strict lower triangular part */ 695831a3094SHong Zhang idx = aj + ii[0]; 69659689ffbSStefano Zampini if (n && *idx == i) { 6979371c9d4SSatish Balay ncols -= bs; 6989371c9d4SSatish Balay v += bs2; 6999371c9d4SSatish Balay idx++; 7009371c9d4SSatish Balay n--; 701831a3094SHong Zhang } 70296b9376eSHong Zhang 70349b5e25fSSatish Balay if (ncols > 0) { 70449b5e25fSSatish Balay workt = work; 7059566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 70696b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 707831a3094SHong Zhang for (j = 0; j < n; j++) { 708831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 70949b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 71049b5e25fSSatish Balay workt += bs; 71149b5e25fSSatish Balay } 71249b5e25fSSatish Balay } 7139371c9d4SSatish Balay x += bs; 7149371c9d4SSatish Balay v += n * bs2; 7159371c9d4SSatish Balay z += bs; 7169371c9d4SSatish Balay ii++; 71749b5e25fSSatish Balay } 71849b5e25fSSatish Balay 7199566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7209566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 7219566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 72249b5e25fSSatish Balay PetscFunctionReturn(0); 72349b5e25fSSatish Balay } 72449b5e25fSSatish Balay 725d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 726d71ae5a4SJacob Faibussowitsch { 72749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 728d9ca1df4SBarry Smith PetscScalar *z, x1; 729d9ca1df4SBarry Smith const PetscScalar *x, *xb; 730d9ca1df4SBarry Smith const MatScalar *v; 731d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 732d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 73398c9bda7SSatish Balay PetscInt nonzerorow = 0; 734eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 735b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 736eb1ec7c1SStefano Zampini #else 737eb1ec7c1SStefano Zampini const int aconj = 0; 738eb1ec7c1SStefano Zampini #endif 73949b5e25fSSatish Balay 74049b5e25fSSatish Balay PetscFunctionBegin; 7419566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 742c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 7439566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7449566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 74549b5e25fSSatish Balay v = a->a; 74649b5e25fSSatish Balay xb = x; 74749b5e25fSSatish Balay 74849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 74949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 75049b5e25fSSatish Balay x1 = xb[0]; 75149b5e25fSSatish Balay ib = aj + *ai; 752831a3094SHong Zhang jmin = 0; 75398c9bda7SSatish Balay nonzerorow += (n > 0); 7543d9ade75SStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 7559371c9d4SSatish Balay z[i] += *v++ * x[*ib++]; 7569371c9d4SSatish Balay jmin++; 757831a3094SHong Zhang } 758eb1ec7c1SStefano Zampini if (aconj) { 759eb1ec7c1SStefano Zampini for (j = jmin; j < n; j++) { 760eb1ec7c1SStefano Zampini cval = *ib; 761eb1ec7c1SStefano Zampini z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 762eb1ec7c1SStefano Zampini z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 763eb1ec7c1SStefano Zampini } 764eb1ec7c1SStefano Zampini } else { 765831a3094SHong Zhang for (j = jmin; j < n; j++) { 76649b5e25fSSatish Balay cval = *ib; 76749b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 76849b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 76949b5e25fSSatish Balay } 770eb1ec7c1SStefano Zampini } 7719371c9d4SSatish Balay xb++; 7729371c9d4SSatish Balay ai++; 77349b5e25fSSatish Balay } 77449b5e25fSSatish Balay 7759566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7769566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 77749b5e25fSSatish Balay 7789566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 77949b5e25fSSatish Balay PetscFunctionReturn(0); 78049b5e25fSSatish Balay } 78149b5e25fSSatish Balay 782d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 783d71ae5a4SJacob Faibussowitsch { 78449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 785d9ca1df4SBarry Smith PetscScalar *z, x1, x2; 786d9ca1df4SBarry Smith const PetscScalar *x, *xb; 787d9ca1df4SBarry Smith const MatScalar *v; 788d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 789d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 79098c9bda7SSatish Balay PetscInt nonzerorow = 0; 79149b5e25fSSatish Balay 79249b5e25fSSatish Balay PetscFunctionBegin; 7939566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 794c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 7959566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7969566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 79749b5e25fSSatish Balay 79849b5e25fSSatish Balay v = a->a; 79949b5e25fSSatish Balay xb = x; 80049b5e25fSSatish Balay 80149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 80249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8039371c9d4SSatish Balay x1 = xb[0]; 8049371c9d4SSatish Balay x2 = xb[1]; 80549b5e25fSSatish Balay ib = aj + *ai; 806831a3094SHong Zhang jmin = 0; 80798c9bda7SSatish Balay nonzerorow += (n > 0); 80859689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 80949b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 81049b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 8119371c9d4SSatish Balay v += 4; 8129371c9d4SSatish Balay jmin++; 8137fbae186SHong Zhang } 814444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 815444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 816831a3094SHong Zhang for (j = jmin; j < n; j++) { 81749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 81849b5e25fSSatish Balay cval = ib[j] * 2; 81949b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 82049b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 82149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 82249b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 82349b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 82449b5e25fSSatish Balay v += 4; 82549b5e25fSSatish Balay } 8269371c9d4SSatish Balay xb += 2; 8279371c9d4SSatish Balay ai++; 82849b5e25fSSatish Balay } 8299566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8309566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 83149b5e25fSSatish Balay 8329566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 83349b5e25fSSatish Balay PetscFunctionReturn(0); 83449b5e25fSSatish Balay } 83549b5e25fSSatish Balay 836d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 837d71ae5a4SJacob Faibussowitsch { 83849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 839d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3; 840d9ca1df4SBarry Smith const PetscScalar *x, *xb; 841d9ca1df4SBarry Smith const MatScalar *v; 842d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 843d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 84498c9bda7SSatish Balay PetscInt nonzerorow = 0; 84549b5e25fSSatish Balay 84649b5e25fSSatish Balay PetscFunctionBegin; 8479566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 848c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 8499566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8509566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 85149b5e25fSSatish Balay 85249b5e25fSSatish Balay v = a->a; 85349b5e25fSSatish Balay xb = x; 85449b5e25fSSatish Balay 85549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 85649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8579371c9d4SSatish Balay x1 = xb[0]; 8589371c9d4SSatish Balay x2 = xb[1]; 8599371c9d4SSatish Balay x3 = xb[2]; 86049b5e25fSSatish Balay ib = aj + *ai; 861831a3094SHong Zhang jmin = 0; 86298c9bda7SSatish Balay nonzerorow += (n > 0); 86359689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 86449b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 86549b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 86649b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 8679371c9d4SSatish Balay v += 9; 8689371c9d4SSatish Balay jmin++; 8697fbae186SHong Zhang } 870444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 871444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 872831a3094SHong Zhang for (j = jmin; j < n; j++) { 87349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 87449b5e25fSSatish Balay cval = ib[j] * 3; 87549b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 87649b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 87749b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 87849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 87949b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 88049b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 88149b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 88249b5e25fSSatish Balay v += 9; 88349b5e25fSSatish Balay } 8849371c9d4SSatish Balay xb += 3; 8859371c9d4SSatish Balay ai++; 88649b5e25fSSatish Balay } 88749b5e25fSSatish Balay 8889566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8899566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 89049b5e25fSSatish Balay 8919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 89249b5e25fSSatish Balay PetscFunctionReturn(0); 89349b5e25fSSatish Balay } 89449b5e25fSSatish Balay 895d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 896d71ae5a4SJacob Faibussowitsch { 89749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 898d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4; 899d9ca1df4SBarry Smith const PetscScalar *x, *xb; 900d9ca1df4SBarry Smith const MatScalar *v; 901d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 902d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 90398c9bda7SSatish Balay PetscInt nonzerorow = 0; 90449b5e25fSSatish Balay 90549b5e25fSSatish Balay PetscFunctionBegin; 9069566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 907c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 9089566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9099566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 91049b5e25fSSatish Balay 91149b5e25fSSatish Balay v = a->a; 91249b5e25fSSatish Balay xb = x; 91349b5e25fSSatish Balay 91449b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 91549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 9169371c9d4SSatish Balay x1 = xb[0]; 9179371c9d4SSatish Balay x2 = xb[1]; 9189371c9d4SSatish Balay x3 = xb[2]; 9199371c9d4SSatish Balay x4 = xb[3]; 92049b5e25fSSatish Balay ib = aj + *ai; 921831a3094SHong Zhang jmin = 0; 92298c9bda7SSatish Balay nonzerorow += (n > 0); 92359689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 92449b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 92549b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 92649b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 92749b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 9289371c9d4SSatish Balay v += 16; 9299371c9d4SSatish Balay jmin++; 9307fbae186SHong Zhang } 931444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 932444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 933831a3094SHong Zhang for (j = jmin; j < n; j++) { 93449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 93549b5e25fSSatish Balay cval = ib[j] * 4; 93649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 93749b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 93849b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 93949b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 94049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 94149b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 94249b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 94349b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 94449b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 94549b5e25fSSatish Balay v += 16; 94649b5e25fSSatish Balay } 9479371c9d4SSatish Balay xb += 4; 9489371c9d4SSatish Balay ai++; 94949b5e25fSSatish Balay } 95049b5e25fSSatish Balay 9519566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9529566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 95349b5e25fSSatish Balay 9549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 95549b5e25fSSatish Balay PetscFunctionReturn(0); 95649b5e25fSSatish Balay } 95749b5e25fSSatish Balay 958d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 959d71ae5a4SJacob Faibussowitsch { 96049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 961d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5; 962d9ca1df4SBarry Smith const PetscScalar *x, *xb; 963d9ca1df4SBarry Smith const MatScalar *v; 964d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 965d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 96698c9bda7SSatish Balay PetscInt nonzerorow = 0; 96749b5e25fSSatish Balay 96849b5e25fSSatish Balay PetscFunctionBegin; 9699566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 970c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 9719566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9729566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 97349b5e25fSSatish Balay 97449b5e25fSSatish Balay v = a->a; 97549b5e25fSSatish Balay xb = x; 97649b5e25fSSatish Balay 97749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 97849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 9799371c9d4SSatish Balay x1 = xb[0]; 9809371c9d4SSatish Balay x2 = xb[1]; 9819371c9d4SSatish Balay x3 = xb[2]; 9829371c9d4SSatish Balay x4 = xb[3]; 9839371c9d4SSatish Balay x5 = xb[4]; 98449b5e25fSSatish Balay ib = aj + *ai; 985831a3094SHong Zhang jmin = 0; 98698c9bda7SSatish Balay nonzerorow += (n > 0); 98759689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 98849b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 98949b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 99049b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 99149b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 99249b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 9939371c9d4SSatish Balay v += 25; 9949371c9d4SSatish Balay jmin++; 9957fbae186SHong Zhang } 996444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 997444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 998831a3094SHong Zhang for (j = jmin; j < n; j++) { 99949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 100049b5e25fSSatish Balay cval = ib[j] * 5; 100149b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 100249b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 100349b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 100449b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 100549b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 100649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 100749b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 100849b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 100949b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 101049b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 101149b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 101249b5e25fSSatish Balay v += 25; 101349b5e25fSSatish Balay } 10149371c9d4SSatish Balay xb += 5; 10159371c9d4SSatish Balay ai++; 101649b5e25fSSatish Balay } 101749b5e25fSSatish Balay 10189566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10199566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 102049b5e25fSSatish Balay 10219566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 102249b5e25fSSatish Balay PetscFunctionReturn(0); 102349b5e25fSSatish Balay } 1024c2916339SPierre Jolivet 1025d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1026d71ae5a4SJacob Faibussowitsch { 102749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1028d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6; 1029d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1030d9ca1df4SBarry Smith const MatScalar *v; 1031d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1032d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 103398c9bda7SSatish Balay PetscInt nonzerorow = 0; 103449b5e25fSSatish Balay 103549b5e25fSSatish Balay PetscFunctionBegin; 10369566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 1037c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 10389566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10399566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 104049b5e25fSSatish Balay 104149b5e25fSSatish Balay v = a->a; 104249b5e25fSSatish Balay xb = x; 104349b5e25fSSatish Balay 104449b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 104549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10469371c9d4SSatish Balay x1 = xb[0]; 10479371c9d4SSatish Balay x2 = xb[1]; 10489371c9d4SSatish Balay x3 = xb[2]; 10499371c9d4SSatish Balay x4 = xb[3]; 10509371c9d4SSatish Balay x5 = xb[4]; 10519371c9d4SSatish Balay x6 = xb[5]; 105249b5e25fSSatish Balay ib = aj + *ai; 1053831a3094SHong Zhang jmin = 0; 105498c9bda7SSatish Balay nonzerorow += (n > 0); 105559689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 105649b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 105749b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 105849b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 105949b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 106049b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 106149b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 10629371c9d4SSatish Balay v += 36; 10639371c9d4SSatish Balay jmin++; 10647fbae186SHong Zhang } 1065444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1066444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1067831a3094SHong Zhang for (j = jmin; j < n; j++) { 106849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 106949b5e25fSSatish Balay cval = ib[j] * 6; 107049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 107149b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 107249b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 107349b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 107449b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 107549b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 107649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 107749b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 107849b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 107949b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 108049b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 108149b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 108249b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 108349b5e25fSSatish Balay v += 36; 108449b5e25fSSatish Balay } 10859371c9d4SSatish Balay xb += 6; 10869371c9d4SSatish Balay ai++; 108749b5e25fSSatish Balay } 108849b5e25fSSatish Balay 10899566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10909566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 109149b5e25fSSatish Balay 10929566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 109349b5e25fSSatish Balay PetscFunctionReturn(0); 109449b5e25fSSatish Balay } 109549b5e25fSSatish Balay 1096d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1097d71ae5a4SJacob Faibussowitsch { 109849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1099d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1100d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1101d9ca1df4SBarry Smith const MatScalar *v; 1102d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1103d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 110498c9bda7SSatish Balay PetscInt nonzerorow = 0; 110549b5e25fSSatish Balay 110649b5e25fSSatish Balay PetscFunctionBegin; 11079566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 1108c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 11099566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 11109566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 111149b5e25fSSatish Balay 111249b5e25fSSatish Balay v = a->a; 111349b5e25fSSatish Balay xb = x; 111449b5e25fSSatish Balay 111549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 111649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 11179371c9d4SSatish Balay x1 = xb[0]; 11189371c9d4SSatish Balay x2 = xb[1]; 11199371c9d4SSatish Balay x3 = xb[2]; 11209371c9d4SSatish Balay x4 = xb[3]; 11219371c9d4SSatish Balay x5 = xb[4]; 11229371c9d4SSatish Balay x6 = xb[5]; 11239371c9d4SSatish Balay x7 = xb[6]; 112449b5e25fSSatish Balay ib = aj + *ai; 1125831a3094SHong Zhang jmin = 0; 112698c9bda7SSatish Balay nonzerorow += (n > 0); 112759689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 112849b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 112949b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 113049b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 113149b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 113249b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 113349b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 113449b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 11359371c9d4SSatish Balay v += 49; 11369371c9d4SSatish Balay jmin++; 11377fbae186SHong Zhang } 1138444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1139444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1140831a3094SHong Zhang for (j = jmin; j < n; j++) { 114149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 114249b5e25fSSatish Balay cval = ib[j] * 7; 114349b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 114449b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 114549b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 114649b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 114749b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 114849b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 114949b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 115049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 115149b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 115249b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 115349b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 115449b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 115549b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 115649b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 115749b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 115849b5e25fSSatish Balay v += 49; 115949b5e25fSSatish Balay } 11609371c9d4SSatish Balay xb += 7; 11619371c9d4SSatish Balay ai++; 116249b5e25fSSatish Balay } 116349b5e25fSSatish Balay 11649566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11659566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 116649b5e25fSSatish Balay 11679566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 116849b5e25fSSatish Balay PetscFunctionReturn(0); 116949b5e25fSSatish Balay } 117049b5e25fSSatish Balay 1171d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1172d71ae5a4SJacob Faibussowitsch { 117349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1174f4259b30SLisandro Dalcin PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1175d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 1176d9ca1df4SBarry Smith const MatScalar *v; 1177d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1178d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 117998c9bda7SSatish Balay PetscInt nonzerorow = 0; 118049b5e25fSSatish Balay 118149b5e25fSSatish Balay PetscFunctionBegin; 11829566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 1183c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 11849371c9d4SSatish Balay PetscCall(VecGetArrayRead(xx, &x)); 11859371c9d4SSatish Balay x_ptr = x; 11869371c9d4SSatish Balay PetscCall(VecGetArray(zz, &z)); 11879371c9d4SSatish Balay z_ptr = z; 118849b5e25fSSatish Balay 118949b5e25fSSatish Balay aj = a->j; 119049b5e25fSSatish Balay v = a->a; 119149b5e25fSSatish Balay ii = a->i; 119249b5e25fSSatish Balay 119348a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 119449b5e25fSSatish Balay work = a->mult_work; 119549b5e25fSSatish Balay 119649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 11979371c9d4SSatish Balay n = ii[1] - ii[0]; 11989371c9d4SSatish Balay ncols = n * bs; 11999371c9d4SSatish Balay workt = work; 12009371c9d4SSatish Balay idx = aj + ii[0]; 120198c9bda7SSatish Balay nonzerorow += (n > 0); 120249b5e25fSSatish Balay 120349b5e25fSSatish Balay /* upper triangular part */ 120449b5e25fSSatish Balay for (j = 0; j < n; j++) { 120549b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 120649b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 120749b5e25fSSatish Balay workt += bs; 120849b5e25fSSatish Balay } 120949b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 121096b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 121149b5e25fSSatish Balay 121249b5e25fSSatish Balay /* strict lower triangular part */ 1213831a3094SHong Zhang idx = aj + ii[0]; 121459689ffbSStefano Zampini if (n && *idx == i) { 12159371c9d4SSatish Balay ncols -= bs; 12169371c9d4SSatish Balay v += bs2; 12179371c9d4SSatish Balay idx++; 12189371c9d4SSatish Balay n--; 1219831a3094SHong Zhang } 122049b5e25fSSatish Balay if (ncols > 0) { 122149b5e25fSSatish Balay workt = work; 12229566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 122396b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1224831a3094SHong Zhang for (j = 0; j < n; j++) { 1225831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 122649b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 122749b5e25fSSatish Balay workt += bs; 122849b5e25fSSatish Balay } 122949b5e25fSSatish Balay } 123049b5e25fSSatish Balay 12319371c9d4SSatish Balay x += bs; 12329371c9d4SSatish Balay v += n * bs2; 12339371c9d4SSatish Balay z += bs; 12349371c9d4SSatish Balay ii++; 123549b5e25fSSatish Balay } 123649b5e25fSSatish Balay 12379566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12389566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 123949b5e25fSSatish Balay 12409566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 124149b5e25fSSatish Balay PetscFunctionReturn(0); 124249b5e25fSSatish Balay } 124349b5e25fSSatish Balay 1244d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1245d71ae5a4SJacob Faibussowitsch { 124649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1247f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1248c5df96a5SBarry Smith PetscBLASInt one = 1, totalnz; 124949b5e25fSSatish Balay 125049b5e25fSSatish Balay PetscFunctionBegin; 12519566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1252792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 12539566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 125449b5e25fSSatish Balay PetscFunctionReturn(0); 125549b5e25fSSatish Balay } 125649b5e25fSSatish Balay 1257d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1258d71ae5a4SJacob Faibussowitsch { 125949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1260d9ca1df4SBarry Smith const MatScalar *v = a->a; 126149b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1262d9ca1df4SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1263d9ca1df4SBarry Smith const PetscInt *aj = a->j, *col; 126449b5e25fSSatish Balay 126549b5e25fSSatish Balay PetscFunctionBegin; 1266c40ae873SPierre Jolivet if (!a->nz) { 1267c40ae873SPierre Jolivet *norm = 0.0; 1268c40ae873SPierre Jolivet PetscFunctionReturn(0); 1269c40ae873SPierre Jolivet } 127049b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 127149b5e25fSSatish Balay for (k = 0; k < mbs; k++) { 127259689ffbSStefano Zampini jmin = a->i[k]; 127359689ffbSStefano Zampini jmax = a->i[k + 1]; 1274831a3094SHong Zhang col = aj + jmin; 127559689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 127649b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 12779371c9d4SSatish Balay sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 12789371c9d4SSatish Balay v++; 127949b5e25fSSatish Balay } 1280831a3094SHong Zhang jmin++; 1281831a3094SHong Zhang } 1282831a3094SHong Zhang for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 128349b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 12849371c9d4SSatish Balay sum_off += PetscRealPart(PetscConj(*v) * (*v)); 12859371c9d4SSatish Balay v++; 128649b5e25fSSatish Balay } 128749b5e25fSSatish Balay } 128849b5e25fSSatish Balay } 12898f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 12909566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 12910b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 12929566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 12930b8dc8d2SHong Zhang for (i = 0; i < mbs; i++) jl[i] = mbs; 12940b8dc8d2SHong Zhang il[0] = 0; 129549b5e25fSSatish Balay 129649b5e25fSSatish Balay *norm = 0.0; 129749b5e25fSSatish Balay for (k = 0; k < mbs; k++) { /* k_th block row */ 129849b5e25fSSatish Balay for (j = 0; j < bs; j++) sum[j] = 0.0; 129949b5e25fSSatish Balay /*-- col sum --*/ 130049b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1301831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1302831a3094SHong Zhang at step k */ 130349b5e25fSSatish Balay while (i < mbs) { 130449b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 130549b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 130649b5e25fSSatish Balay for (j = 0; j < bs; j++) { 130749b5e25fSSatish Balay v = a->a + ik * bs2 + j * bs; 130849b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13099371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13109371c9d4SSatish Balay v++; 131149b5e25fSSatish Balay } 131249b5e25fSSatish Balay } 131349b5e25fSSatish Balay /* update il, jl */ 1314831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1315831a3094SHong Zhang jmax = a->i[i + 1]; 131649b5e25fSSatish Balay if (jmin < jmax) { 131749b5e25fSSatish Balay il[i] = jmin; 131849b5e25fSSatish Balay j = a->j[jmin]; 13199371c9d4SSatish Balay jl[i] = jl[j]; 13209371c9d4SSatish Balay jl[j] = i; 132149b5e25fSSatish Balay } 132249b5e25fSSatish Balay i = nexti; 132349b5e25fSSatish Balay } 132449b5e25fSSatish Balay /*-- row sum --*/ 132559689ffbSStefano Zampini jmin = a->i[k]; 132659689ffbSStefano Zampini jmax = a->i[k + 1]; 132749b5e25fSSatish Balay for (i = jmin; i < jmax; i++) { 132849b5e25fSSatish Balay for (j = 0; j < bs; j++) { 132949b5e25fSSatish Balay v = a->a + i * bs2 + j; 133049b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13319371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13329371c9d4SSatish Balay v += bs; 133349b5e25fSSatish Balay } 133449b5e25fSSatish Balay } 133549b5e25fSSatish Balay } 133649b5e25fSSatish Balay /* add k_th block row to il, jl */ 1337831a3094SHong Zhang col = aj + jmin; 133859689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) jmin++; 133949b5e25fSSatish Balay if (jmin < jmax) { 134049b5e25fSSatish Balay il[k] = jmin; 13419371c9d4SSatish Balay j = a->j[jmin]; 13429371c9d4SSatish Balay jl[k] = jl[j]; 13439371c9d4SSatish Balay jl[j] = k; 134449b5e25fSSatish Balay } 134549b5e25fSSatish Balay for (j = 0; j < bs; j++) { 134649b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 134749b5e25fSSatish Balay } 134849b5e25fSSatish Balay } 13499566063dSJacob Faibussowitsch PetscCall(PetscFree3(sum, il, jl)); 13509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1351f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 135249b5e25fSSatish Balay PetscFunctionReturn(0); 135349b5e25fSSatish Balay } 135449b5e25fSSatish Balay 1355d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1356d71ae5a4SJacob Faibussowitsch { 135749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 135849b5e25fSSatish Balay 135949b5e25fSSatish Balay PetscFunctionBegin; 136049b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1361d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1362ef511fbeSHong Zhang *flg = PETSC_FALSE; 1363ef511fbeSHong Zhang PetscFunctionReturn(0); 136449b5e25fSSatish Balay } 136549b5e25fSSatish Balay 136649b5e25fSSatish Balay /* if the a->i are the same */ 13679566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 136826fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 136949b5e25fSSatish Balay 137049b5e25fSSatish Balay /* if a->j are the same */ 13719566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 137226fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 137326fbe8dcSKarl Rupp 137449b5e25fSSatish Balay /* if a->a are the same */ 13759566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 1376935af2e7SHong Zhang PetscFunctionReturn(0); 137749b5e25fSSatish Balay } 137849b5e25fSSatish Balay 1379d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1380d71ae5a4SJacob Faibussowitsch { 138149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1382d9ca1df4SBarry Smith PetscInt i, j, k, row, bs, ambs, bs2; 1383d9ca1df4SBarry Smith const PetscInt *ai, *aj; 138487828ca2SBarry Smith PetscScalar *x, zero = 0.0; 1385d9ca1df4SBarry Smith const MatScalar *aa, *aa_j; 138649b5e25fSSatish Balay 138749b5e25fSSatish Balay PetscFunctionBegin; 1388d0f46423SBarry Smith bs = A->rmap->bs; 1389aed4548fSBarry Smith PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 139082799104SHong Zhang 139149b5e25fSSatish Balay aa = a->a; 13928a0c6efdSHong Zhang ambs = a->mbs; 13938a0c6efdSHong Zhang 13948a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 13958a0c6efdSHong Zhang PetscInt *diag = a->diag; 13968a0c6efdSHong Zhang aa = a->a; 13978a0c6efdSHong Zhang ambs = a->mbs; 13989566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 13998a0c6efdSHong Zhang for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 14009566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 14018a0c6efdSHong Zhang PetscFunctionReturn(0); 14028a0c6efdSHong Zhang } 14038a0c6efdSHong Zhang 140449b5e25fSSatish Balay ai = a->i; 140549b5e25fSSatish Balay aj = a->j; 140649b5e25fSSatish Balay bs2 = a->bs2; 14079566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 1408c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 14099566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 141049b5e25fSSatish Balay for (i = 0; i < ambs; i++) { 141149b5e25fSSatish Balay j = ai[i]; 141249b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 141349b5e25fSSatish Balay row = i * bs; 141449b5e25fSSatish Balay aa_j = aa + j * bs2; 141549b5e25fSSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 141649b5e25fSSatish Balay } 141749b5e25fSSatish Balay } 14189566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 141949b5e25fSSatish Balay PetscFunctionReturn(0); 142049b5e25fSSatish Balay } 142149b5e25fSSatish Balay 1422d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1423d71ae5a4SJacob Faibussowitsch { 142449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1425d9ca1df4SBarry Smith PetscScalar x; 1426d9ca1df4SBarry Smith const PetscScalar *l, *li, *ri; 142749b5e25fSSatish Balay MatScalar *aa, *v; 1428fff8e43fSBarry Smith PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1429fff8e43fSBarry Smith const PetscInt *ai, *aj; 1430ace3abfcSBarry Smith PetscBool flg; 143149b5e25fSSatish Balay 143249b5e25fSSatish Balay PetscFunctionBegin; 1433b3bf805bSHong Zhang if (ll != rr) { 14349566063dSJacob Faibussowitsch PetscCall(VecEqual(ll, rr, &flg)); 143528b400f6SJacob Faibussowitsch PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1436b3bf805bSHong Zhang } 1437b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 143849b5e25fSSatish Balay ai = a->i; 143949b5e25fSSatish Balay aj = a->j; 144049b5e25fSSatish Balay aa = a->a; 1441d0f46423SBarry Smith m = A->rmap->N; 1442d0f46423SBarry Smith bs = A->rmap->bs; 144349b5e25fSSatish Balay mbs = a->mbs; 144449b5e25fSSatish Balay bs2 = a->bs2; 144549b5e25fSSatish Balay 14469566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 14479566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 144808401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 144949b5e25fSSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 145049b5e25fSSatish Balay M = ai[i + 1] - ai[i]; 145149b5e25fSSatish Balay li = l + i * bs; 145249b5e25fSSatish Balay v = aa + bs2 * ai[i]; 145349b5e25fSSatish Balay for (j = 0; j < M; j++) { /* for each block */ 145449b5e25fSSatish Balay ri = l + bs * aj[ai[i] + j]; 14555e90f9d9SHong Zhang for (k = 0; k < bs; k++) { 145649b5e25fSSatish Balay x = ri[k]; 145749b5e25fSSatish Balay for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 145849b5e25fSSatish Balay } 145949b5e25fSSatish Balay } 146049b5e25fSSatish Balay } 14619566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 14629566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 146349b5e25fSSatish Balay PetscFunctionReturn(0); 146449b5e25fSSatish Balay } 146549b5e25fSSatish Balay 1466d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1467d71ae5a4SJacob Faibussowitsch { 146849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 146949b5e25fSSatish Balay 147049b5e25fSSatish Balay PetscFunctionBegin; 147149b5e25fSSatish Balay info->block_size = a->bs2; 1472ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 14736c6c5352SBarry Smith info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 14743966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 147549b5e25fSSatish Balay info->assemblies = A->num_ass; 14768e58a170SBarry Smith info->mallocs = A->info.mallocs; 14774dfa11a4SJacob Faibussowitsch info->memory = 0; /* REVIEW ME */ 1478d5f3da31SBarry Smith if (A->factortype) { 147949b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 148049b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 148149b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 148249b5e25fSSatish Balay } else { 148349b5e25fSSatish Balay info->fill_ratio_given = 0; 148449b5e25fSSatish Balay info->fill_ratio_needed = 0; 148549b5e25fSSatish Balay info->factor_mallocs = 0; 148649b5e25fSSatish Balay } 148749b5e25fSSatish Balay PetscFunctionReturn(0); 148849b5e25fSSatish Balay } 148949b5e25fSSatish Balay 1490d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1491d71ae5a4SJacob Faibussowitsch { 149249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 149349b5e25fSSatish Balay 149449b5e25fSSatish Balay PetscFunctionBegin; 14959566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 149649b5e25fSSatish Balay PetscFunctionReturn(0); 149749b5e25fSSatish Balay } 1498dc354874SHong Zhang 1499d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1500d71ae5a4SJacob Faibussowitsch { 1501dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1502d9ca1df4SBarry Smith PetscInt i, j, n, row, col, bs, mbs; 1503d9ca1df4SBarry Smith const PetscInt *ai, *aj; 1504c3fca9a7SHong Zhang PetscReal atmp; 1505d9ca1df4SBarry Smith const MatScalar *aa; 1506985db425SBarry Smith PetscScalar *x; 150713f74950SBarry Smith PetscInt ncols, brow, bcol, krow, kcol; 1508dc354874SHong Zhang 1509dc354874SHong Zhang PetscFunctionBegin; 151028b400f6SJacob Faibussowitsch PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 151128b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1512d0f46423SBarry Smith bs = A->rmap->bs; 1513dc354874SHong Zhang aa = a->a; 1514dc354874SHong Zhang ai = a->i; 1515dc354874SHong Zhang aj = a->j; 151644117c81SHong Zhang mbs = a->mbs; 1517dc354874SHong Zhang 15189566063dSJacob Faibussowitsch PetscCall(VecSet(v, 0.0)); 15199566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 15209566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 152108401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 152244117c81SHong Zhang for (i = 0; i < mbs; i++) { 15239371c9d4SSatish Balay ncols = ai[1] - ai[0]; 15249371c9d4SSatish Balay ai++; 1525d0f6400bSHong Zhang brow = bs * i; 152644117c81SHong Zhang for (j = 0; j < ncols; j++) { 1527d0f6400bSHong Zhang bcol = bs * (*aj); 152844117c81SHong Zhang for (kcol = 0; kcol < bs; kcol++) { 1529d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 153044117c81SHong Zhang for (krow = 0; krow < bs; krow++) { 15319371c9d4SSatish Balay atmp = PetscAbsScalar(*aa); 15329371c9d4SSatish Balay aa++; 1533d0f6400bSHong Zhang row = brow + krow; /* row index */ 1534c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1535c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 153644117c81SHong Zhang } 153744117c81SHong Zhang } 1538d0f6400bSHong Zhang aj++; 1539dc354874SHong Zhang } 1540dc354874SHong Zhang } 15419566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 1542dc354874SHong Zhang PetscFunctionReturn(0); 1543dc354874SHong Zhang } 1544c2916339SPierre Jolivet 1545d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1546d71ae5a4SJacob Faibussowitsch { 1547c2916339SPierre Jolivet PetscFunctionBegin; 15489566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 15494222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1550c2916339SPierre Jolivet PetscFunctionReturn(0); 1551c2916339SPierre Jolivet } 1552c2916339SPierre Jolivet 1553d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1554d71ae5a4SJacob Faibussowitsch { 1555c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1556c2916339SPierre Jolivet PetscScalar *z = c; 1557c2916339SPierre Jolivet const PetscScalar *xb; 1558c2916339SPierre Jolivet PetscScalar x1; 1559c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1560c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 15613c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 1562b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 15633c2e41e1SStefano Zampini #else 15643c2e41e1SStefano Zampini const int aconj = 0; 15653c2e41e1SStefano Zampini #endif 1566c2916339SPierre Jolivet 1567c2916339SPierre Jolivet PetscFunctionBegin; 1568c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 15699371c9d4SSatish Balay n = ii[1] - ii[0]; 15709371c9d4SSatish Balay ii++; 1571c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1572c2916339SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1573c2916339SPierre Jolivet jj = idx; 1574c2916339SPierre Jolivet vv = v; 1575c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1576c2916339SPierre Jolivet idx = jj; 1577c2916339SPierre Jolivet v = vv; 1578c2916339SPierre Jolivet for (j = 0; j < n; j++) { 15799371c9d4SSatish Balay xb = b + (*idx); 15809371c9d4SSatish Balay x1 = xb[0 + k * bm]; 1581c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1; 15823c2e41e1SStefano Zampini if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1583c2916339SPierre Jolivet v += 1; 1584c2916339SPierre Jolivet ++idx; 1585c2916339SPierre Jolivet } 1586c2916339SPierre Jolivet } 1587c2916339SPierre Jolivet z += 1; 1588c2916339SPierre Jolivet } 1589c2916339SPierre Jolivet PetscFunctionReturn(0); 1590c2916339SPierre Jolivet } 1591c2916339SPierre Jolivet 1592d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1593d71ae5a4SJacob Faibussowitsch { 1594c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1595c2916339SPierre Jolivet PetscScalar *z = c; 1596c2916339SPierre Jolivet const PetscScalar *xb; 1597c2916339SPierre Jolivet PetscScalar x1, x2; 1598c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1599c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1600c2916339SPierre Jolivet 1601c2916339SPierre Jolivet PetscFunctionBegin; 1602c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16039371c9d4SSatish Balay n = ii[1] - ii[0]; 16049371c9d4SSatish Balay ii++; 1605c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1606c2916339SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1607c2916339SPierre Jolivet jj = idx; 1608c2916339SPierre Jolivet vv = v; 1609c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1610c2916339SPierre Jolivet idx = jj; 1611c2916339SPierre Jolivet v = vv; 1612c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16139371c9d4SSatish Balay xb = b + 2 * (*idx); 16149371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16159371c9d4SSatish Balay x2 = xb[1 + k * bm]; 1616c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1617c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1618c2916339SPierre Jolivet if (*idx != i) { 1619c2916339SPierre Jolivet c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1620c2916339SPierre Jolivet c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1621c2916339SPierre Jolivet } 1622c2916339SPierre Jolivet v += 4; 1623c2916339SPierre Jolivet ++idx; 1624c2916339SPierre Jolivet } 1625c2916339SPierre Jolivet } 1626c2916339SPierre Jolivet z += 2; 1627c2916339SPierre Jolivet } 1628c2916339SPierre Jolivet PetscFunctionReturn(0); 1629c2916339SPierre Jolivet } 1630c2916339SPierre Jolivet 1631d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1632d71ae5a4SJacob Faibussowitsch { 1633c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1634c2916339SPierre Jolivet PetscScalar *z = c; 1635c2916339SPierre Jolivet const PetscScalar *xb; 1636c2916339SPierre Jolivet PetscScalar x1, x2, x3; 1637c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1638c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1639c2916339SPierre Jolivet 1640c2916339SPierre Jolivet PetscFunctionBegin; 1641c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16429371c9d4SSatish Balay n = ii[1] - ii[0]; 16439371c9d4SSatish Balay ii++; 1644c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1645c2916339SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1646c2916339SPierre Jolivet jj = idx; 1647c2916339SPierre Jolivet vv = v; 1648c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1649c2916339SPierre Jolivet idx = jj; 1650c2916339SPierre Jolivet v = vv; 1651c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16529371c9d4SSatish Balay xb = b + 3 * (*idx); 16539371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16549371c9d4SSatish Balay x2 = xb[1 + k * bm]; 16559371c9d4SSatish Balay x3 = xb[2 + k * bm]; 1656c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1657c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1658c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1659c2916339SPierre Jolivet if (*idx != i) { 1660c2916339SPierre Jolivet c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1661c2916339SPierre Jolivet c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1662c2916339SPierre Jolivet c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1663c2916339SPierre Jolivet } 1664c2916339SPierre Jolivet v += 9; 1665c2916339SPierre Jolivet ++idx; 1666c2916339SPierre Jolivet } 1667c2916339SPierre Jolivet } 1668c2916339SPierre Jolivet z += 3; 1669c2916339SPierre Jolivet } 1670c2916339SPierre Jolivet PetscFunctionReturn(0); 1671c2916339SPierre Jolivet } 1672c2916339SPierre Jolivet 1673d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1674d71ae5a4SJacob Faibussowitsch { 1675c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1676c2916339SPierre Jolivet PetscScalar *z = c; 1677c2916339SPierre Jolivet const PetscScalar *xb; 1678c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4; 1679c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1680c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1681c2916339SPierre Jolivet 1682c2916339SPierre Jolivet PetscFunctionBegin; 1683c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16849371c9d4SSatish Balay n = ii[1] - ii[0]; 16859371c9d4SSatish Balay ii++; 1686c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1687c2916339SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1688c2916339SPierre Jolivet jj = idx; 1689c2916339SPierre Jolivet vv = v; 1690c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1691c2916339SPierre Jolivet idx = jj; 1692c2916339SPierre Jolivet v = vv; 1693c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16949371c9d4SSatish Balay xb = b + 4 * (*idx); 16959371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16969371c9d4SSatish Balay x2 = xb[1 + k * bm]; 16979371c9d4SSatish Balay x3 = xb[2 + k * bm]; 16989371c9d4SSatish Balay x4 = xb[3 + k * bm]; 1699c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1700c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1701c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1702c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1703c2916339SPierre Jolivet if (*idx != i) { 1704c2916339SPierre Jolivet c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1705c2916339SPierre Jolivet c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1706c2916339SPierre Jolivet c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1707c2916339SPierre Jolivet c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1708c2916339SPierre Jolivet } 1709c2916339SPierre Jolivet v += 16; 1710c2916339SPierre Jolivet ++idx; 1711c2916339SPierre Jolivet } 1712c2916339SPierre Jolivet } 1713c2916339SPierre Jolivet z += 4; 1714c2916339SPierre Jolivet } 1715c2916339SPierre Jolivet PetscFunctionReturn(0); 1716c2916339SPierre Jolivet } 1717c2916339SPierre Jolivet 1718d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1719d71ae5a4SJacob Faibussowitsch { 1720c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1721c2916339SPierre Jolivet PetscScalar *z = c; 1722c2916339SPierre Jolivet const PetscScalar *xb; 1723c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 1724c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1725c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1726c2916339SPierre Jolivet 1727c2916339SPierre Jolivet PetscFunctionBegin; 1728c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17299371c9d4SSatish Balay n = ii[1] - ii[0]; 17309371c9d4SSatish Balay ii++; 1731c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1732c2916339SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1733c2916339SPierre Jolivet jj = idx; 1734c2916339SPierre Jolivet vv = v; 1735c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1736c2916339SPierre Jolivet idx = jj; 1737c2916339SPierre Jolivet v = vv; 1738c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17399371c9d4SSatish Balay xb = b + 5 * (*idx); 17409371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17419371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17429371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17439371c9d4SSatish Balay x4 = xb[3 + k * bm]; 17449371c9d4SSatish Balay x5 = xb[4 + k * cm]; 1745c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1746c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1747c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1748c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1749c2916339SPierre Jolivet z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1750c2916339SPierre Jolivet if (*idx != i) { 1751c2916339SPierre Jolivet c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1752c2916339SPierre Jolivet c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1753c2916339SPierre Jolivet c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1754c2916339SPierre Jolivet c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1755c2916339SPierre Jolivet c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1756c2916339SPierre Jolivet } 1757c2916339SPierre Jolivet v += 25; 1758c2916339SPierre Jolivet ++idx; 1759c2916339SPierre Jolivet } 1760c2916339SPierre Jolivet } 1761c2916339SPierre Jolivet z += 5; 1762c2916339SPierre Jolivet } 1763c2916339SPierre Jolivet PetscFunctionReturn(0); 1764c2916339SPierre Jolivet } 1765c2916339SPierre Jolivet 1766d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1767d71ae5a4SJacob Faibussowitsch { 1768c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1769c2916339SPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1770281439baSStefano Zampini Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1771c2916339SPierre Jolivet PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1772c2916339SPierre Jolivet PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1773c2916339SPierre Jolivet PetscBLASInt bbs, bcn, bbm, bcm; 1774f4259b30SLisandro Dalcin PetscScalar *z = NULL; 1775c2916339SPierre Jolivet PetscScalar *c, *b; 1776c2916339SPierre Jolivet const MatScalar *v; 1777c2916339SPierre Jolivet const PetscInt *idx, *ii; 1778c2916339SPierre Jolivet PetscScalar _DOne = 1.0; 1779c2916339SPierre Jolivet 1780c2916339SPierre Jolivet PetscFunctionBegin; 1781c2916339SPierre Jolivet if (!cm || !cn) PetscFunctionReturn(0); 178208401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 178308401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 178408401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1785c2916339SPierre Jolivet b = bd->v; 17869566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(C)); 17879566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 1788c2916339SPierre Jolivet switch (bs) { 1789d71ae5a4SJacob Faibussowitsch case 1: 1790d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1791d71ae5a4SJacob Faibussowitsch break; 1792d71ae5a4SJacob Faibussowitsch case 2: 1793d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1794d71ae5a4SJacob Faibussowitsch break; 1795d71ae5a4SJacob Faibussowitsch case 3: 1796d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1797d71ae5a4SJacob Faibussowitsch break; 1798d71ae5a4SJacob Faibussowitsch case 4: 1799d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1800d71ae5a4SJacob Faibussowitsch break; 1801d71ae5a4SJacob Faibussowitsch case 5: 1802d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1803d71ae5a4SJacob Faibussowitsch break; 1804c2916339SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 18059566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bs, &bbs)); 18069566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cn, &bcn)); 18079566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bm, &bbm)); 18089566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cm, &bcm)); 1809c2916339SPierre Jolivet idx = a->j; 1810c2916339SPierre Jolivet v = a->a; 1811c2916339SPierre Jolivet mbs = a->mbs; 1812c2916339SPierre Jolivet ii = a->i; 1813c2916339SPierre Jolivet z = c; 1814c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 18159371c9d4SSatish Balay n = ii[1] - ii[0]; 18169371c9d4SSatish Balay ii++; 1817c2916339SPierre Jolivet for (j = 0; j < n; j++) { 1818792fecdfSBarry Smith if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1819792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1820c2916339SPierre Jolivet v += bs2; 1821c2916339SPierre Jolivet } 1822c2916339SPierre Jolivet z += bs; 1823c2916339SPierre Jolivet } 1824c2916339SPierre Jolivet } 18259566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 18269566063dSJacob Faibussowitsch PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1827c2916339SPierre Jolivet PetscFunctionReturn(0); 1828c2916339SPierre Jolivet } 1829