149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h> 5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 6c6db04a5SJed Brown #include <petscbt.h> 7c6db04a5SJed Brown #include <petscblaslapack.h> 849b5e25fSSatish Balay 99371c9d4SSatish Balay PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) { 105eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 115d0c19d7SBarry Smith PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs, *nidx2; 125d0c19d7SBarry Smith const PetscInt *idx; 13db41cccfSHong Zhang PetscBT table_out, table_in; 14d94109b8SHong Zhang 15d94109b8SHong Zhang PetscFunctionBegin; 1608401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 17d94109b8SHong Zhang mbs = a->mbs; 18d94109b8SHong Zhang ai = a->i; 19d94109b8SHong Zhang aj = a->j; 20d0f46423SBarry Smith bs = A->rmap->bs; 219566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_out)); 229566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &nidx)); 239566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2)); 249566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_in)); 25d94109b8SHong Zhang 26d94109b8SHong Zhang for (i = 0; i < is_max; i++) { /* for each is */ 27d94109b8SHong Zhang isz = 0; 289566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_out)); 29d94109b8SHong Zhang 30d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 319566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 329566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 33d94109b8SHong Zhang 34db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 35dbe03f88SHong Zhang bcol_max = 0; 36d94109b8SHong Zhang for (j = 0; j < n; ++j) { 37d94109b8SHong Zhang brow = idx[j] / bs; /* convert the indices into block indices */ 3808401ef6SPierre Jolivet PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 39db41cccfSHong Zhang if (!PetscBTLookupSet(table_out, brow)) { 40dbe03f88SHong Zhang nidx[isz++] = brow; 41dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 42dbe03f88SHong Zhang } 43d94109b8SHong Zhang } 449566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 459566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 46d94109b8SHong Zhang 47d94109b8SHong Zhang k = 0; 48d94109b8SHong Zhang for (j = 0; j < ov; j++) { /* for each overlap */ 49db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 509566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_in)); 519566063dSJacob Faibussowitsch for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 52d94109b8SHong Zhang 53d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 54d94109b8SHong Zhang for (brow = 0; brow < mbs; brow++) { 559371c9d4SSatish Balay start = ai[brow]; 569371c9d4SSatish Balay end = ai[brow + 1]; 57db41cccfSHong Zhang if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58d94109b8SHong Zhang for (l = start; l < end; l++) { 59d94109b8SHong Zhang bcol = aj[l]; 60d7b97159SHong Zhang if (!PetscBTLookupSet(table_out, bcol)) { 61d7b97159SHong Zhang nidx[isz++] = bcol; 62d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 63d7b97159SHong Zhang } 64d94109b8SHong Zhang } 65d94109b8SHong Zhang k++; 66d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68d94109b8SHong Zhang for (l = start; l < end; l++) { 69d94109b8SHong Zhang bcol = aj[l]; 70dbe03f88SHong Zhang if (bcol > bcol_max) break; 71db41cccfSHong Zhang if (PetscBTLookup(table_in, bcol)) { 7226fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 73d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 74d94109b8SHong Zhang } 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } /* for each overlap */ 79d94109b8SHong Zhang 80d94109b8SHong Zhang /* expand the Index Set */ 81d94109b8SHong Zhang for (j = 0; j < isz; j++) { 8226fbe8dcSKarl Rupp for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k; 83d94109b8SHong Zhang } 849566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i)); 85d94109b8SHong Zhang } 869566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_out)); 879566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 889566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx2)); 899566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_in)); 905eee224dSHong Zhang PetscFunctionReturn(0); 9149b5e25fSSatish Balay } 9249b5e25fSSatish Balay 93847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 94847daeccSHong Zhang Zero some ops' to avoid invalid usse */ 959371c9d4SSatish Balay PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) { 9649b5e25fSSatish Balay PetscFunctionBegin; 979566063dSJacob Faibussowitsch PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 98f4259b30SLisandro Dalcin Bseq->ops->mult = NULL; 99f4259b30SLisandro Dalcin Bseq->ops->multadd = NULL; 100f4259b30SLisandro Dalcin Bseq->ops->multtranspose = NULL; 101f4259b30SLisandro Dalcin Bseq->ops->multtransposeadd = NULL; 102f4259b30SLisandro Dalcin Bseq->ops->lufactor = NULL; 103f4259b30SLisandro Dalcin Bseq->ops->choleskyfactor = NULL; 104f4259b30SLisandro Dalcin Bseq->ops->lufactorsymbolic = NULL; 105f4259b30SLisandro Dalcin Bseq->ops->choleskyfactorsymbolic = NULL; 106f4259b30SLisandro Dalcin Bseq->ops->getinertia = NULL; 10749b5e25fSSatish Balay PetscFunctionReturn(0); 10849b5e25fSSatish Balay } 10949b5e25fSSatish Balay 1107dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 1119371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { 112e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c; 113e50a8114SHong Zhang PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 114e50a8114SHong Zhang PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 115e50a8114SHong Zhang const PetscInt *irow, *icol; 116e50a8114SHong Zhang PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 117e50a8114SHong Zhang PetscInt *aj = a->j, *ai = a->i; 118e50a8114SHong Zhang MatScalar *mat_a; 119e50a8114SHong Zhang Mat C; 120e50a8114SHong Zhang PetscBool flag; 121e50a8114SHong Zhang 122e50a8114SHong Zhang PetscFunctionBegin; 123e50a8114SHong Zhang 1249566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1259566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1269566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1279566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 128e50a8114SHong Zhang 1299566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 130e50a8114SHong Zhang ssmap = smap; 1319566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 132e50a8114SHong Zhang for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 133e50a8114SHong Zhang /* determine lens of each row */ 134e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 135e50a8114SHong Zhang kstart = ai[irow[i]]; 136e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 137e50a8114SHong Zhang lens[i] = 0; 138e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 139e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 140e50a8114SHong Zhang } 141e50a8114SHong Zhang } 142e50a8114SHong Zhang /* Create and fill new matrix */ 143e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 144e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)((*B)->data); 145e50a8114SHong Zhang 146aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1479566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 14828b400f6SJacob Faibussowitsch PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 1499566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 150e50a8114SHong Zhang C = *B; 151e50a8114SHong Zhang } else { 1529566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1539566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 1549566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1559566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 156e50a8114SHong Zhang } 157e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)(C->data); 158e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 159e50a8114SHong Zhang row = irow[i]; 160e50a8114SHong Zhang kstart = ai[row]; 161e50a8114SHong Zhang kend = kstart + a->ilen[row]; 162e50a8114SHong Zhang mat_i = c->i[i]; 163e50a8114SHong Zhang mat_j = c->j + mat_i; 164e50a8114SHong Zhang mat_a = c->a + mat_i * bs2; 165e50a8114SHong Zhang mat_ilen = c->ilen + i; 166e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 167e50a8114SHong Zhang if ((tcol = ssmap[a->j[k]])) { 168e50a8114SHong Zhang *mat_j++ = tcol - 1; 1699566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 170e50a8114SHong Zhang mat_a += bs2; 171e50a8114SHong Zhang (*mat_ilen)++; 172e50a8114SHong Zhang } 173e50a8114SHong Zhang } 174e50a8114SHong Zhang } 175e50a8114SHong Zhang /* sort */ 176e50a8114SHong Zhang { 177e50a8114SHong Zhang MatScalar *work; 178e50a8114SHong Zhang 1799566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 180e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 181e50a8114SHong Zhang PetscInt ilen; 182e50a8114SHong Zhang mat_i = c->i[i]; 183e50a8114SHong Zhang mat_j = c->j + mat_i; 184e50a8114SHong Zhang mat_a = c->a + mat_i * bs2; 185e50a8114SHong Zhang ilen = c->ilen[i]; 1869566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 187e50a8114SHong Zhang } 1889566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 189e50a8114SHong Zhang } 190e50a8114SHong Zhang 191e50a8114SHong Zhang /* Free work space */ 1929566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 1939566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1949566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1959566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1969566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 197e50a8114SHong Zhang 1989566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 199e50a8114SHong Zhang *B = C; 200e50a8114SHong Zhang PetscFunctionReturn(0); 201e50a8114SHong Zhang } 202e50a8114SHong Zhang 2039371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { 204e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 205e50a8114SHong Zhang IS is1, is2; 206e50a8114SHong Zhang PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs; 207e50a8114SHong Zhang const PetscInt *irow, *icol; 20849b5e25fSSatish Balay 20949b5e25fSSatish Balay PetscFunctionBegin; 2109566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 2119566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 2129566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 2139566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 214e50a8114SHong Zhang 215e50a8114SHong Zhang /* Verify if the indices corespond to each element in a block 216e50a8114SHong Zhang and form the IS with compressed IS */ 217e50a8114SHong Zhang maxmnbs = PetscMax(a->mbs, a->nbs); 2189566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 2199566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->mbs)); 220e50a8114SHong Zhang for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 221ad540459SPierre Jolivet for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 222e50a8114SHong Zhang count = 0; 223e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 224e50a8114SHong Zhang PetscInt j = irow[i] / bs; 225e50a8114SHong Zhang if ((vary[j]--) == bs) iary[count++] = j; 226e50a8114SHong Zhang } 2279566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 228e50a8114SHong Zhang 2299566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary, a->nbs)); 230e50a8114SHong Zhang for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 231ad540459SPierre Jolivet for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 232e50a8114SHong Zhang count = 0; 233e50a8114SHong Zhang for (i = 0; i < ncols; i++) { 234e50a8114SHong Zhang PetscInt j = icol[i] / bs; 235e50a8114SHong Zhang if ((vary[j]--) == bs) iary[count++] = j; 236e50a8114SHong Zhang } 2379566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 2389566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 2399566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 2409566063dSJacob Faibussowitsch PetscCall(PetscFree2(vary, iary)); 241e50a8114SHong Zhang 2429566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B)); 2439566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2449566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 245847daeccSHong Zhang 2468f46ffcaSHong Zhang if (isrow != iscol) { 2478f46ffcaSHong Zhang PetscBool isequal; 2489566063dSJacob Faibussowitsch PetscCall(ISEqual(isrow, iscol, &isequal)); 24948a46eb9SPierre Jolivet if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 25049b5e25fSSatish Balay } 25149b5e25fSSatish Balay PetscFunctionReturn(0); 25249b5e25fSSatish Balay } 25349b5e25fSSatish Balay 2549371c9d4SSatish Balay PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) { 25513f74950SBarry Smith PetscInt i; 25649b5e25fSSatish Balay 25749b5e25fSSatish Balay PetscFunctionBegin; 25848a46eb9SPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 259e50a8114SHong Zhang 26048a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 26149b5e25fSSatish Balay PetscFunctionReturn(0); 26249b5e25fSSatish Balay } 26349b5e25fSSatish Balay 26449b5e25fSSatish Balay /* -------------------------------------------------------*/ 26549b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 26649b5e25fSSatish Balay /* -------------------------------------------------------*/ 26749b5e25fSSatish Balay 2689371c9d4SSatish Balay PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) { 26949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 270d9ca1df4SBarry Smith PetscScalar *z, x1, x2, zero = 0.0; 271d9ca1df4SBarry Smith const PetscScalar *x, *xb; 272d9ca1df4SBarry Smith const MatScalar *v; 273d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 274d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 27598c9bda7SSatish Balay PetscInt nonzerorow = 0; 27649b5e25fSSatish Balay 27749b5e25fSSatish Balay PetscFunctionBegin; 2789566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 279c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 2809566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 2819566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 28249b5e25fSSatish Balay 28349b5e25fSSatish Balay v = a->a; 28449b5e25fSSatish Balay xb = x; 28549b5e25fSSatish Balay 28649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 28749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 2889371c9d4SSatish Balay x1 = xb[0]; 2899371c9d4SSatish Balay x2 = xb[1]; 29049b5e25fSSatish Balay ib = aj + *ai; 291831a3094SHong Zhang jmin = 0; 29298c9bda7SSatish Balay nonzerorow += (n > 0); 2937fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 29449b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 29549b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 2969371c9d4SSatish Balay v += 4; 2979371c9d4SSatish Balay jmin++; 2987fbae186SHong Zhang } 299444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 300444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 301831a3094SHong Zhang for (j = jmin; j < n; j++) { 30249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 30349b5e25fSSatish Balay cval = ib[j] * 2; 30449b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 30549b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 30649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 30749b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 30849b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 30949b5e25fSSatish Balay v += 4; 31049b5e25fSSatish Balay } 3119371c9d4SSatish Balay xb += 2; 3129371c9d4SSatish Balay ai++; 31349b5e25fSSatish Balay } 31449b5e25fSSatish Balay 3159566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3169566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 31849b5e25fSSatish Balay PetscFunctionReturn(0); 31949b5e25fSSatish Balay } 32049b5e25fSSatish Balay 3219371c9d4SSatish Balay PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) { 32249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 323d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, zero = 0.0; 324d9ca1df4SBarry Smith const PetscScalar *x, *xb; 325d9ca1df4SBarry Smith const MatScalar *v; 326d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 327d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 32898c9bda7SSatish Balay PetscInt nonzerorow = 0; 32949b5e25fSSatish Balay 33049b5e25fSSatish Balay PetscFunctionBegin; 3319566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 332c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 3339566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3349566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 33549b5e25fSSatish Balay 33649b5e25fSSatish Balay v = a->a; 33749b5e25fSSatish Balay xb = x; 33849b5e25fSSatish Balay 33949b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 34049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3419371c9d4SSatish Balay x1 = xb[0]; 3429371c9d4SSatish Balay x2 = xb[1]; 3439371c9d4SSatish Balay x3 = xb[2]; 34449b5e25fSSatish Balay ib = aj + *ai; 345831a3094SHong Zhang jmin = 0; 34698c9bda7SSatish Balay nonzerorow += (n > 0); 3477fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 34849b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 34949b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 35049b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3519371c9d4SSatish Balay v += 9; 3529371c9d4SSatish Balay jmin++; 3537fbae186SHong Zhang } 354444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 355444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 356831a3094SHong Zhang for (j = jmin; j < n; j++) { 35749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 35849b5e25fSSatish Balay cval = ib[j] * 3; 35949b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 36049b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 36149b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 36249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 36349b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 36449b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 36549b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 36649b5e25fSSatish Balay v += 9; 36749b5e25fSSatish Balay } 3689371c9d4SSatish Balay xb += 3; 3699371c9d4SSatish Balay ai++; 37049b5e25fSSatish Balay } 37149b5e25fSSatish Balay 3729566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3739566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3749566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 37549b5e25fSSatish Balay PetscFunctionReturn(0); 37649b5e25fSSatish Balay } 37749b5e25fSSatish Balay 3789371c9d4SSatish Balay PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) { 37949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 380d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 381d9ca1df4SBarry Smith const PetscScalar *x, *xb; 382d9ca1df4SBarry Smith const MatScalar *v; 383d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 384d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 38598c9bda7SSatish Balay PetscInt nonzerorow = 0; 38649b5e25fSSatish Balay 38749b5e25fSSatish Balay PetscFunctionBegin; 3889566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 389c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 3909566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3919566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 39249b5e25fSSatish Balay 39349b5e25fSSatish Balay v = a->a; 39449b5e25fSSatish Balay xb = x; 39549b5e25fSSatish Balay 39649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 39749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3989371c9d4SSatish Balay x1 = xb[0]; 3999371c9d4SSatish Balay x2 = xb[1]; 4009371c9d4SSatish Balay x3 = xb[2]; 4019371c9d4SSatish Balay x4 = xb[3]; 40249b5e25fSSatish Balay ib = aj + *ai; 403831a3094SHong Zhang jmin = 0; 40498c9bda7SSatish Balay nonzerorow += (n > 0); 4057fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 40649b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 40749b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 40849b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 40949b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 4109371c9d4SSatish Balay v += 16; 4119371c9d4SSatish Balay jmin++; 4127fbae186SHong Zhang } 413444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 414444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 415831a3094SHong Zhang for (j = jmin; j < n; j++) { 41649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 41749b5e25fSSatish Balay cval = ib[j] * 4; 41849b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 41949b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 42049b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 42149b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 42249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 42349b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 42449b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 42549b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 42649b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 42749b5e25fSSatish Balay v += 16; 42849b5e25fSSatish Balay } 4299371c9d4SSatish Balay xb += 4; 4309371c9d4SSatish Balay ai++; 43149b5e25fSSatish Balay } 43249b5e25fSSatish Balay 4339566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4349566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 4359566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 43649b5e25fSSatish Balay PetscFunctionReturn(0); 43749b5e25fSSatish Balay } 43849b5e25fSSatish Balay 4399371c9d4SSatish Balay PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) { 44049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 441d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 442d9ca1df4SBarry Smith const PetscScalar *x, *xb; 443d9ca1df4SBarry Smith const MatScalar *v; 444d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 445d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 44698c9bda7SSatish Balay PetscInt nonzerorow = 0; 44749b5e25fSSatish Balay 44849b5e25fSSatish Balay PetscFunctionBegin; 4499566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 450c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 4519566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4529566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 45349b5e25fSSatish Balay 45449b5e25fSSatish Balay v = a->a; 45549b5e25fSSatish Balay xb = x; 45649b5e25fSSatish Balay 45749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 45849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4599371c9d4SSatish Balay x1 = xb[0]; 4609371c9d4SSatish Balay x2 = xb[1]; 4619371c9d4SSatish Balay x3 = xb[2]; 4629371c9d4SSatish Balay x4 = xb[3]; 4639371c9d4SSatish Balay x5 = xb[4]; 46449b5e25fSSatish Balay ib = aj + *ai; 465831a3094SHong Zhang jmin = 0; 46698c9bda7SSatish Balay nonzerorow += (n > 0); 4677fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 46849b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 46949b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 47049b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 47149b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 47249b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 4739371c9d4SSatish Balay v += 25; 4749371c9d4SSatish Balay jmin++; 4757fbae186SHong Zhang } 476444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 477444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 478831a3094SHong Zhang for (j = jmin; j < n; j++) { 47949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 48049b5e25fSSatish Balay cval = ib[j] * 5; 48149b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 48249b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 48349b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 48449b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 48549b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 48649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 48749b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 48849b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 48949b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 49049b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 49149b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 49249b5e25fSSatish Balay v += 25; 49349b5e25fSSatish Balay } 4949371c9d4SSatish Balay xb += 5; 4959371c9d4SSatish Balay ai++; 49649b5e25fSSatish Balay } 49749b5e25fSSatish Balay 4989566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4999566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5009566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 50149b5e25fSSatish Balay PetscFunctionReturn(0); 50249b5e25fSSatish Balay } 50349b5e25fSSatish Balay 5049371c9d4SSatish Balay PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) { 50549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 506d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 507d9ca1df4SBarry Smith const PetscScalar *x, *xb; 508d9ca1df4SBarry Smith const MatScalar *v; 509d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 510d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 51198c9bda7SSatish Balay PetscInt nonzerorow = 0; 51249b5e25fSSatish Balay 51349b5e25fSSatish Balay PetscFunctionBegin; 5149566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 515c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 5169566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5179566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 51849b5e25fSSatish Balay 51949b5e25fSSatish Balay v = a->a; 52049b5e25fSSatish Balay xb = x; 52149b5e25fSSatish Balay 52249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 52349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 5249371c9d4SSatish Balay x1 = xb[0]; 5259371c9d4SSatish Balay x2 = xb[1]; 5269371c9d4SSatish Balay x3 = xb[2]; 5279371c9d4SSatish Balay x4 = xb[3]; 5289371c9d4SSatish Balay x5 = xb[4]; 5299371c9d4SSatish Balay x6 = xb[5]; 53049b5e25fSSatish Balay ib = aj + *ai; 531831a3094SHong Zhang jmin = 0; 53298c9bda7SSatish Balay nonzerorow += (n > 0); 5337fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 53449b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 53549b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 53649b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 53749b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 53849b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 53949b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 5409371c9d4SSatish Balay v += 36; 5419371c9d4SSatish Balay jmin++; 5427fbae186SHong Zhang } 543444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 544444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 545831a3094SHong Zhang for (j = jmin; j < n; j++) { 54649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 54749b5e25fSSatish Balay cval = ib[j] * 6; 54849b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 54949b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 55049b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 55149b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 55249b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 55349b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 55449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 55549b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 55649b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 55749b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 55849b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 55949b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 56049b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 56149b5e25fSSatish Balay v += 36; 56249b5e25fSSatish Balay } 5639371c9d4SSatish Balay xb += 6; 5649371c9d4SSatish Balay ai++; 56549b5e25fSSatish Balay } 56649b5e25fSSatish Balay 5679566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5689566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5699566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 57049b5e25fSSatish Balay PetscFunctionReturn(0); 57149b5e25fSSatish Balay } 572c2916339SPierre Jolivet 5739371c9d4SSatish Balay PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) { 57449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 575d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 576d9ca1df4SBarry Smith const PetscScalar *x, *xb; 577d9ca1df4SBarry Smith const MatScalar *v; 578d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 579d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 58098c9bda7SSatish Balay PetscInt nonzerorow = 0; 58149b5e25fSSatish Balay 58249b5e25fSSatish Balay PetscFunctionBegin; 5839566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 584c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 5859566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5869566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 58749b5e25fSSatish Balay 58849b5e25fSSatish Balay v = a->a; 58949b5e25fSSatish Balay xb = x; 59049b5e25fSSatish Balay 59149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 59249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 5939371c9d4SSatish Balay x1 = xb[0]; 5949371c9d4SSatish Balay x2 = xb[1]; 5959371c9d4SSatish Balay x3 = xb[2]; 5969371c9d4SSatish Balay x4 = xb[3]; 5979371c9d4SSatish Balay x5 = xb[4]; 5989371c9d4SSatish Balay x6 = xb[5]; 5999371c9d4SSatish Balay x7 = xb[6]; 60049b5e25fSSatish Balay ib = aj + *ai; 601831a3094SHong Zhang jmin = 0; 60298c9bda7SSatish Balay nonzerorow += (n > 0); 6037fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 60449b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 60549b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 60649b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 60749b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 60849b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 60949b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 61049b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 6119371c9d4SSatish Balay v += 49; 6129371c9d4SSatish Balay jmin++; 6137fbae186SHong Zhang } 614444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 615444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 616831a3094SHong Zhang for (j = jmin; j < n; j++) { 61749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 61849b5e25fSSatish Balay cval = ib[j] * 7; 61949b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 62049b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 62149b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 62249b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 62349b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 62449b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 62549b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 62649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 62749b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 62849b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 62949b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 63049b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 63149b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 63249b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 63349b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 63449b5e25fSSatish Balay v += 49; 63549b5e25fSSatish Balay } 6369371c9d4SSatish Balay xb += 7; 6379371c9d4SSatish Balay ai++; 63849b5e25fSSatish Balay } 6399566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6409566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 6419566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 64249b5e25fSSatish Balay PetscFunctionReturn(0); 64349b5e25fSSatish Balay } 64449b5e25fSSatish Balay 64549b5e25fSSatish Balay /* 64649b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 64749b5e25fSSatish Balay */ 6489371c9d4SSatish Balay PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) { 64949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 650d9ca1df4SBarry Smith PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 651d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 652d9ca1df4SBarry Smith const MatScalar *v; 653d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 654d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 65598c9bda7SSatish Balay PetscInt nonzerorow = 0; 65649b5e25fSSatish Balay 65749b5e25fSSatish Balay PetscFunctionBegin; 6589566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 659c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 6609566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6619566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 66259689ffbSStefano Zampini 66359689ffbSStefano Zampini x_ptr = x; 66459689ffbSStefano Zampini z_ptr = z; 66549b5e25fSSatish Balay 66649b5e25fSSatish Balay aj = a->j; 66749b5e25fSSatish Balay v = a->a; 66849b5e25fSSatish Balay ii = a->i; 66949b5e25fSSatish Balay 67048a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 67149b5e25fSSatish Balay work = a->mult_work; 67249b5e25fSSatish Balay 67349b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 6749371c9d4SSatish Balay n = ii[1] - ii[0]; 6759371c9d4SSatish Balay ncols = n * bs; 6769371c9d4SSatish Balay workt = work; 6779371c9d4SSatish Balay idx = aj + ii[0]; 67898c9bda7SSatish Balay nonzerorow += (n > 0); 67949b5e25fSSatish Balay 68049b5e25fSSatish Balay /* upper triangular part */ 68149b5e25fSSatish Balay for (j = 0; j < n; j++) { 68249b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 68349b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 68449b5e25fSSatish Balay workt += bs; 68549b5e25fSSatish Balay } 68649b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 68796b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 68849b5e25fSSatish Balay 68949b5e25fSSatish Balay /* strict lower triangular part */ 690831a3094SHong Zhang idx = aj + ii[0]; 69159689ffbSStefano Zampini if (n && *idx == i) { 6929371c9d4SSatish Balay ncols -= bs; 6939371c9d4SSatish Balay v += bs2; 6949371c9d4SSatish Balay idx++; 6959371c9d4SSatish Balay n--; 696831a3094SHong Zhang } 69796b9376eSHong Zhang 69849b5e25fSSatish Balay if (ncols > 0) { 69949b5e25fSSatish Balay workt = work; 7009566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 70196b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 702831a3094SHong Zhang for (j = 0; j < n; j++) { 703831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 70449b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 70549b5e25fSSatish Balay workt += bs; 70649b5e25fSSatish Balay } 70749b5e25fSSatish Balay } 7089371c9d4SSatish Balay x += bs; 7099371c9d4SSatish Balay v += n * bs2; 7109371c9d4SSatish Balay z += bs; 7119371c9d4SSatish Balay ii++; 71249b5e25fSSatish Balay } 71349b5e25fSSatish Balay 7149566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7159566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 7169566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 71749b5e25fSSatish Balay PetscFunctionReturn(0); 71849b5e25fSSatish Balay } 71949b5e25fSSatish Balay 7209371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) { 72149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 722d9ca1df4SBarry Smith PetscScalar *z, x1; 723d9ca1df4SBarry Smith const PetscScalar *x, *xb; 724d9ca1df4SBarry Smith const MatScalar *v; 725d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 726d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 72798c9bda7SSatish Balay PetscInt nonzerorow = 0; 728eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 729b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 730eb1ec7c1SStefano Zampini #else 731eb1ec7c1SStefano Zampini const int aconj = 0; 732eb1ec7c1SStefano Zampini #endif 73349b5e25fSSatish Balay 73449b5e25fSSatish Balay PetscFunctionBegin; 7359566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 736c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 7379566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7389566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 73949b5e25fSSatish Balay v = a->a; 74049b5e25fSSatish Balay xb = x; 74149b5e25fSSatish Balay 74249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 74349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 74449b5e25fSSatish Balay x1 = xb[0]; 74549b5e25fSSatish Balay ib = aj + *ai; 746831a3094SHong Zhang jmin = 0; 74798c9bda7SSatish Balay nonzerorow += (n > 0); 7483d9ade75SStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 7499371c9d4SSatish Balay z[i] += *v++ * x[*ib++]; 7509371c9d4SSatish Balay jmin++; 751831a3094SHong Zhang } 752eb1ec7c1SStefano Zampini if (aconj) { 753eb1ec7c1SStefano Zampini for (j = jmin; j < n; j++) { 754eb1ec7c1SStefano Zampini cval = *ib; 755eb1ec7c1SStefano Zampini z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 756eb1ec7c1SStefano Zampini z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 757eb1ec7c1SStefano Zampini } 758eb1ec7c1SStefano Zampini } else { 759831a3094SHong Zhang for (j = jmin; j < n; j++) { 76049b5e25fSSatish Balay cval = *ib; 76149b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 76249b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 76349b5e25fSSatish Balay } 764eb1ec7c1SStefano Zampini } 7659371c9d4SSatish Balay xb++; 7669371c9d4SSatish Balay ai++; 76749b5e25fSSatish Balay } 76849b5e25fSSatish Balay 7699566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7709566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 77149b5e25fSSatish Balay 7729566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 77349b5e25fSSatish Balay PetscFunctionReturn(0); 77449b5e25fSSatish Balay } 77549b5e25fSSatish Balay 7769371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) { 77749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 778d9ca1df4SBarry Smith PetscScalar *z, x1, x2; 779d9ca1df4SBarry Smith const PetscScalar *x, *xb; 780d9ca1df4SBarry Smith const MatScalar *v; 781d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 782d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 78398c9bda7SSatish Balay PetscInt nonzerorow = 0; 78449b5e25fSSatish Balay 78549b5e25fSSatish Balay PetscFunctionBegin; 7869566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 787c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 7889566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7899566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 79049b5e25fSSatish Balay 79149b5e25fSSatish Balay v = a->a; 79249b5e25fSSatish Balay xb = x; 79349b5e25fSSatish Balay 79449b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 79549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 7969371c9d4SSatish Balay x1 = xb[0]; 7979371c9d4SSatish Balay x2 = xb[1]; 79849b5e25fSSatish Balay ib = aj + *ai; 799831a3094SHong Zhang jmin = 0; 80098c9bda7SSatish Balay nonzerorow += (n > 0); 80159689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 80249b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 80349b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 8049371c9d4SSatish Balay v += 4; 8059371c9d4SSatish Balay jmin++; 8067fbae186SHong Zhang } 807444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 808444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 809831a3094SHong Zhang for (j = jmin; j < n; j++) { 81049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 81149b5e25fSSatish Balay cval = ib[j] * 2; 81249b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 81349b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 81449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 81549b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 81649b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 81749b5e25fSSatish Balay v += 4; 81849b5e25fSSatish Balay } 8199371c9d4SSatish Balay xb += 2; 8209371c9d4SSatish Balay ai++; 82149b5e25fSSatish Balay } 8229566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8239566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 82449b5e25fSSatish Balay 8259566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 82649b5e25fSSatish Balay PetscFunctionReturn(0); 82749b5e25fSSatish Balay } 82849b5e25fSSatish Balay 8299371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) { 83049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 831d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3; 832d9ca1df4SBarry Smith const PetscScalar *x, *xb; 833d9ca1df4SBarry Smith const MatScalar *v; 834d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 835d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 83698c9bda7SSatish Balay PetscInt nonzerorow = 0; 83749b5e25fSSatish Balay 83849b5e25fSSatish Balay PetscFunctionBegin; 8399566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 840c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 8419566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8429566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 84349b5e25fSSatish Balay 84449b5e25fSSatish Balay v = a->a; 84549b5e25fSSatish Balay xb = x; 84649b5e25fSSatish Balay 84749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 84849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8499371c9d4SSatish Balay x1 = xb[0]; 8509371c9d4SSatish Balay x2 = xb[1]; 8519371c9d4SSatish Balay x3 = xb[2]; 85249b5e25fSSatish Balay ib = aj + *ai; 853831a3094SHong Zhang jmin = 0; 85498c9bda7SSatish Balay nonzerorow += (n > 0); 85559689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 85649b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 85749b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 85849b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 8599371c9d4SSatish Balay v += 9; 8609371c9d4SSatish Balay jmin++; 8617fbae186SHong Zhang } 862444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 863444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 864831a3094SHong Zhang for (j = jmin; j < n; j++) { 86549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 86649b5e25fSSatish Balay cval = ib[j] * 3; 86749b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 86849b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 86949b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 87049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 87149b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 87249b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 87349b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 87449b5e25fSSatish Balay v += 9; 87549b5e25fSSatish Balay } 8769371c9d4SSatish Balay xb += 3; 8779371c9d4SSatish Balay ai++; 87849b5e25fSSatish Balay } 87949b5e25fSSatish Balay 8809566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8819566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 88249b5e25fSSatish Balay 8839566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 88449b5e25fSSatish Balay PetscFunctionReturn(0); 88549b5e25fSSatish Balay } 88649b5e25fSSatish Balay 8879371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) { 88849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 889d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4; 890d9ca1df4SBarry Smith const PetscScalar *x, *xb; 891d9ca1df4SBarry Smith const MatScalar *v; 892d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 893d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 89498c9bda7SSatish Balay PetscInt nonzerorow = 0; 89549b5e25fSSatish Balay 89649b5e25fSSatish Balay PetscFunctionBegin; 8979566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 898c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 8999566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9009566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 90149b5e25fSSatish Balay 90249b5e25fSSatish Balay v = a->a; 90349b5e25fSSatish Balay xb = x; 90449b5e25fSSatish Balay 90549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 90649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 9079371c9d4SSatish Balay x1 = xb[0]; 9089371c9d4SSatish Balay x2 = xb[1]; 9099371c9d4SSatish Balay x3 = xb[2]; 9109371c9d4SSatish Balay x4 = xb[3]; 91149b5e25fSSatish Balay ib = aj + *ai; 912831a3094SHong Zhang jmin = 0; 91398c9bda7SSatish Balay nonzerorow += (n > 0); 91459689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 91549b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 91649b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 91749b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 91849b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 9199371c9d4SSatish Balay v += 16; 9209371c9d4SSatish Balay jmin++; 9217fbae186SHong Zhang } 922444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 923444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 924831a3094SHong Zhang for (j = jmin; j < n; j++) { 92549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 92649b5e25fSSatish Balay cval = ib[j] * 4; 92749b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 92849b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 92949b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 93049b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 93149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 93249b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 93349b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 93449b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 93549b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 93649b5e25fSSatish Balay v += 16; 93749b5e25fSSatish Balay } 9389371c9d4SSatish Balay xb += 4; 9399371c9d4SSatish Balay ai++; 94049b5e25fSSatish Balay } 94149b5e25fSSatish Balay 9429566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9439566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 94449b5e25fSSatish Balay 9459566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 94649b5e25fSSatish Balay PetscFunctionReturn(0); 94749b5e25fSSatish Balay } 94849b5e25fSSatish Balay 9499371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) { 95049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 951d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5; 952d9ca1df4SBarry Smith const PetscScalar *x, *xb; 953d9ca1df4SBarry Smith const MatScalar *v; 954d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 955d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 95698c9bda7SSatish Balay PetscInt nonzerorow = 0; 95749b5e25fSSatish Balay 95849b5e25fSSatish Balay PetscFunctionBegin; 9599566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 960c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 9619566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9629566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 96349b5e25fSSatish Balay 96449b5e25fSSatish Balay v = a->a; 96549b5e25fSSatish Balay xb = x; 96649b5e25fSSatish Balay 96749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 96849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 9699371c9d4SSatish Balay x1 = xb[0]; 9709371c9d4SSatish Balay x2 = xb[1]; 9719371c9d4SSatish Balay x3 = xb[2]; 9729371c9d4SSatish Balay x4 = xb[3]; 9739371c9d4SSatish Balay x5 = xb[4]; 97449b5e25fSSatish Balay ib = aj + *ai; 975831a3094SHong Zhang jmin = 0; 97698c9bda7SSatish Balay nonzerorow += (n > 0); 97759689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 97849b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 97949b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 98049b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 98149b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 98249b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 9839371c9d4SSatish Balay v += 25; 9849371c9d4SSatish Balay jmin++; 9857fbae186SHong Zhang } 986444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 987444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 988831a3094SHong Zhang for (j = jmin; j < n; j++) { 98949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 99049b5e25fSSatish Balay cval = ib[j] * 5; 99149b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 99249b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 99349b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 99449b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 99549b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 99649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 99749b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 99849b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 99949b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 100049b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 100149b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 100249b5e25fSSatish Balay v += 25; 100349b5e25fSSatish Balay } 10049371c9d4SSatish Balay xb += 5; 10059371c9d4SSatish Balay ai++; 100649b5e25fSSatish Balay } 100749b5e25fSSatish Balay 10089566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10099566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 101049b5e25fSSatish Balay 10119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 101249b5e25fSSatish Balay PetscFunctionReturn(0); 101349b5e25fSSatish Balay } 1014c2916339SPierre Jolivet 10159371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) { 101649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1017d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6; 1018d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1019d9ca1df4SBarry Smith const MatScalar *v; 1020d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1021d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 102298c9bda7SSatish Balay PetscInt nonzerorow = 0; 102349b5e25fSSatish Balay 102449b5e25fSSatish Balay PetscFunctionBegin; 10259566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 1026c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 10279566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10289566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 102949b5e25fSSatish Balay 103049b5e25fSSatish Balay v = a->a; 103149b5e25fSSatish Balay xb = x; 103249b5e25fSSatish Balay 103349b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 103449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10359371c9d4SSatish Balay x1 = xb[0]; 10369371c9d4SSatish Balay x2 = xb[1]; 10379371c9d4SSatish Balay x3 = xb[2]; 10389371c9d4SSatish Balay x4 = xb[3]; 10399371c9d4SSatish Balay x5 = xb[4]; 10409371c9d4SSatish Balay x6 = xb[5]; 104149b5e25fSSatish Balay ib = aj + *ai; 1042831a3094SHong Zhang jmin = 0; 104398c9bda7SSatish Balay nonzerorow += (n > 0); 104459689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 104549b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 104649b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 104749b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 104849b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 104949b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 105049b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 10519371c9d4SSatish Balay v += 36; 10529371c9d4SSatish Balay jmin++; 10537fbae186SHong Zhang } 1054444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1055444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1056831a3094SHong Zhang for (j = jmin; j < n; j++) { 105749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 105849b5e25fSSatish Balay cval = ib[j] * 6; 105949b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 106049b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 106149b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 106249b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 106349b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 106449b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 106549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 106649b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 106749b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 106849b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 106949b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 107049b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 107149b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 107249b5e25fSSatish Balay v += 36; 107349b5e25fSSatish Balay } 10749371c9d4SSatish Balay xb += 6; 10759371c9d4SSatish Balay ai++; 107649b5e25fSSatish Balay } 107749b5e25fSSatish Balay 10789566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10799566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 108049b5e25fSSatish Balay 10819566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 108249b5e25fSSatish Balay PetscFunctionReturn(0); 108349b5e25fSSatish Balay } 108449b5e25fSSatish Balay 10859371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) { 108649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1087d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1088d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1089d9ca1df4SBarry Smith const MatScalar *v; 1090d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1091d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 109298c9bda7SSatish Balay PetscInt nonzerorow = 0; 109349b5e25fSSatish Balay 109449b5e25fSSatish Balay PetscFunctionBegin; 10959566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 1096c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 10979566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10989566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 109949b5e25fSSatish Balay 110049b5e25fSSatish Balay v = a->a; 110149b5e25fSSatish Balay xb = x; 110249b5e25fSSatish Balay 110349b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 110449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 11059371c9d4SSatish Balay x1 = xb[0]; 11069371c9d4SSatish Balay x2 = xb[1]; 11079371c9d4SSatish Balay x3 = xb[2]; 11089371c9d4SSatish Balay x4 = xb[3]; 11099371c9d4SSatish Balay x5 = xb[4]; 11109371c9d4SSatish Balay x6 = xb[5]; 11119371c9d4SSatish Balay x7 = xb[6]; 111249b5e25fSSatish Balay ib = aj + *ai; 1113831a3094SHong Zhang jmin = 0; 111498c9bda7SSatish Balay nonzerorow += (n > 0); 111559689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 111649b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 111749b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 111849b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 111949b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 112049b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 112149b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 112249b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 11239371c9d4SSatish Balay v += 49; 11249371c9d4SSatish Balay jmin++; 11257fbae186SHong Zhang } 1126444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1127444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1128831a3094SHong Zhang for (j = jmin; j < n; j++) { 112949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 113049b5e25fSSatish Balay cval = ib[j] * 7; 113149b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 113249b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 113349b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 113449b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 113549b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 113649b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 113749b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 113849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 113949b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 114049b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 114149b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 114249b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 114349b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 114449b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 114549b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 114649b5e25fSSatish Balay v += 49; 114749b5e25fSSatish Balay } 11489371c9d4SSatish Balay xb += 7; 11499371c9d4SSatish Balay ai++; 115049b5e25fSSatish Balay } 115149b5e25fSSatish Balay 11529566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11539566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 115449b5e25fSSatish Balay 11559566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 115649b5e25fSSatish Balay PetscFunctionReturn(0); 115749b5e25fSSatish Balay } 115849b5e25fSSatish Balay 11599371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) { 116049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1161f4259b30SLisandro Dalcin PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1162d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 1163d9ca1df4SBarry Smith const MatScalar *v; 1164d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1165d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 116698c9bda7SSatish Balay PetscInt nonzerorow = 0; 116749b5e25fSSatish Balay 116849b5e25fSSatish Balay PetscFunctionBegin; 11699566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 1170c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 11719371c9d4SSatish Balay PetscCall(VecGetArrayRead(xx, &x)); 11729371c9d4SSatish Balay x_ptr = x; 11739371c9d4SSatish Balay PetscCall(VecGetArray(zz, &z)); 11749371c9d4SSatish Balay z_ptr = z; 117549b5e25fSSatish Balay 117649b5e25fSSatish Balay aj = a->j; 117749b5e25fSSatish Balay v = a->a; 117849b5e25fSSatish Balay ii = a->i; 117949b5e25fSSatish Balay 118048a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 118149b5e25fSSatish Balay work = a->mult_work; 118249b5e25fSSatish Balay 118349b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 11849371c9d4SSatish Balay n = ii[1] - ii[0]; 11859371c9d4SSatish Balay ncols = n * bs; 11869371c9d4SSatish Balay workt = work; 11879371c9d4SSatish Balay idx = aj + ii[0]; 118898c9bda7SSatish Balay nonzerorow += (n > 0); 118949b5e25fSSatish Balay 119049b5e25fSSatish Balay /* upper triangular part */ 119149b5e25fSSatish Balay for (j = 0; j < n; j++) { 119249b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 119349b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 119449b5e25fSSatish Balay workt += bs; 119549b5e25fSSatish Balay } 119649b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 119796b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 119849b5e25fSSatish Balay 119949b5e25fSSatish Balay /* strict lower triangular part */ 1200831a3094SHong Zhang idx = aj + ii[0]; 120159689ffbSStefano Zampini if (n && *idx == i) { 12029371c9d4SSatish Balay ncols -= bs; 12039371c9d4SSatish Balay v += bs2; 12049371c9d4SSatish Balay idx++; 12059371c9d4SSatish Balay n--; 1206831a3094SHong Zhang } 120749b5e25fSSatish Balay if (ncols > 0) { 120849b5e25fSSatish Balay workt = work; 12099566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 121096b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1211831a3094SHong Zhang for (j = 0; j < n; j++) { 1212831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 121349b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 121449b5e25fSSatish Balay workt += bs; 121549b5e25fSSatish Balay } 121649b5e25fSSatish Balay } 121749b5e25fSSatish Balay 12189371c9d4SSatish Balay x += bs; 12199371c9d4SSatish Balay v += n * bs2; 12209371c9d4SSatish Balay z += bs; 12219371c9d4SSatish Balay ii++; 122249b5e25fSSatish Balay } 122349b5e25fSSatish Balay 12249566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12259566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 122649b5e25fSSatish Balay 12279566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 122849b5e25fSSatish Balay PetscFunctionReturn(0); 122949b5e25fSSatish Balay } 123049b5e25fSSatish Balay 12319371c9d4SSatish Balay PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) { 123249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1233f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1234c5df96a5SBarry Smith PetscBLASInt one = 1, totalnz; 123549b5e25fSSatish Balay 123649b5e25fSSatish Balay PetscFunctionBegin; 12379566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1238792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 12399566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 124049b5e25fSSatish Balay PetscFunctionReturn(0); 124149b5e25fSSatish Balay } 124249b5e25fSSatish Balay 12439371c9d4SSatish Balay PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) { 124449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1245d9ca1df4SBarry Smith const MatScalar *v = a->a; 124649b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1247d9ca1df4SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1248d9ca1df4SBarry Smith const PetscInt *aj = a->j, *col; 124949b5e25fSSatish Balay 125049b5e25fSSatish Balay PetscFunctionBegin; 1251c40ae873SPierre Jolivet if (!a->nz) { 1252c40ae873SPierre Jolivet *norm = 0.0; 1253c40ae873SPierre Jolivet PetscFunctionReturn(0); 1254c40ae873SPierre Jolivet } 125549b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 125649b5e25fSSatish Balay for (k = 0; k < mbs; k++) { 125759689ffbSStefano Zampini jmin = a->i[k]; 125859689ffbSStefano Zampini jmax = a->i[k + 1]; 1259831a3094SHong Zhang col = aj + jmin; 126059689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 126149b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 12629371c9d4SSatish Balay sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 12639371c9d4SSatish Balay v++; 126449b5e25fSSatish Balay } 1265831a3094SHong Zhang jmin++; 1266831a3094SHong Zhang } 1267831a3094SHong Zhang for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 126849b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 12699371c9d4SSatish Balay sum_off += PetscRealPart(PetscConj(*v) * (*v)); 12709371c9d4SSatish Balay v++; 127149b5e25fSSatish Balay } 127249b5e25fSSatish Balay } 127349b5e25fSSatish Balay } 12748f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 12759566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 12760b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 12779566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 12780b8dc8d2SHong Zhang for (i = 0; i < mbs; i++) jl[i] = mbs; 12790b8dc8d2SHong Zhang il[0] = 0; 128049b5e25fSSatish Balay 128149b5e25fSSatish Balay *norm = 0.0; 128249b5e25fSSatish Balay for (k = 0; k < mbs; k++) { /* k_th block row */ 128349b5e25fSSatish Balay for (j = 0; j < bs; j++) sum[j] = 0.0; 128449b5e25fSSatish Balay /*-- col sum --*/ 128549b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1286831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1287831a3094SHong Zhang at step k */ 128849b5e25fSSatish Balay while (i < mbs) { 128949b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 129049b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 129149b5e25fSSatish Balay for (j = 0; j < bs; j++) { 129249b5e25fSSatish Balay v = a->a + ik * bs2 + j * bs; 129349b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 12949371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 12959371c9d4SSatish Balay v++; 129649b5e25fSSatish Balay } 129749b5e25fSSatish Balay } 129849b5e25fSSatish Balay /* update il, jl */ 1299831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1300831a3094SHong Zhang jmax = a->i[i + 1]; 130149b5e25fSSatish Balay if (jmin < jmax) { 130249b5e25fSSatish Balay il[i] = jmin; 130349b5e25fSSatish Balay j = a->j[jmin]; 13049371c9d4SSatish Balay jl[i] = jl[j]; 13059371c9d4SSatish Balay jl[j] = i; 130649b5e25fSSatish Balay } 130749b5e25fSSatish Balay i = nexti; 130849b5e25fSSatish Balay } 130949b5e25fSSatish Balay /*-- row sum --*/ 131059689ffbSStefano Zampini jmin = a->i[k]; 131159689ffbSStefano Zampini jmax = a->i[k + 1]; 131249b5e25fSSatish Balay for (i = jmin; i < jmax; i++) { 131349b5e25fSSatish Balay for (j = 0; j < bs; j++) { 131449b5e25fSSatish Balay v = a->a + i * bs2 + j; 131549b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13169371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13179371c9d4SSatish Balay v += bs; 131849b5e25fSSatish Balay } 131949b5e25fSSatish Balay } 132049b5e25fSSatish Balay } 132149b5e25fSSatish Balay /* add k_th block row to il, jl */ 1322831a3094SHong Zhang col = aj + jmin; 132359689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) jmin++; 132449b5e25fSSatish Balay if (jmin < jmax) { 132549b5e25fSSatish Balay il[k] = jmin; 13269371c9d4SSatish Balay j = a->j[jmin]; 13279371c9d4SSatish Balay jl[k] = jl[j]; 13289371c9d4SSatish Balay jl[j] = k; 132949b5e25fSSatish Balay } 133049b5e25fSSatish Balay for (j = 0; j < bs; j++) { 133149b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 133249b5e25fSSatish Balay } 133349b5e25fSSatish Balay } 13349566063dSJacob Faibussowitsch PetscCall(PetscFree3(sum, il, jl)); 13359566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1336f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 133749b5e25fSSatish Balay PetscFunctionReturn(0); 133849b5e25fSSatish Balay } 133949b5e25fSSatish Balay 13409371c9d4SSatish Balay PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) { 134149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 134249b5e25fSSatish Balay 134349b5e25fSSatish Balay PetscFunctionBegin; 134449b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1345d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1346ef511fbeSHong Zhang *flg = PETSC_FALSE; 1347ef511fbeSHong Zhang PetscFunctionReturn(0); 134849b5e25fSSatish Balay } 134949b5e25fSSatish Balay 135049b5e25fSSatish Balay /* if the a->i are the same */ 13519566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 135226fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 135349b5e25fSSatish Balay 135449b5e25fSSatish Balay /* if a->j are the same */ 13559566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 135626fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 135726fbe8dcSKarl Rupp 135849b5e25fSSatish Balay /* if a->a are the same */ 13599566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 1360935af2e7SHong Zhang PetscFunctionReturn(0); 136149b5e25fSSatish Balay } 136249b5e25fSSatish Balay 13639371c9d4SSatish Balay PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) { 136449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1365d9ca1df4SBarry Smith PetscInt i, j, k, row, bs, ambs, bs2; 1366d9ca1df4SBarry Smith const PetscInt *ai, *aj; 136787828ca2SBarry Smith PetscScalar *x, zero = 0.0; 1368d9ca1df4SBarry Smith const MatScalar *aa, *aa_j; 136949b5e25fSSatish Balay 137049b5e25fSSatish Balay PetscFunctionBegin; 1371d0f46423SBarry Smith bs = A->rmap->bs; 1372aed4548fSBarry Smith PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 137382799104SHong Zhang 137449b5e25fSSatish Balay aa = a->a; 13758a0c6efdSHong Zhang ambs = a->mbs; 13768a0c6efdSHong Zhang 13778a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 13788a0c6efdSHong Zhang PetscInt *diag = a->diag; 13798a0c6efdSHong Zhang aa = a->a; 13808a0c6efdSHong Zhang ambs = a->mbs; 13819566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 13828a0c6efdSHong Zhang for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 13839566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 13848a0c6efdSHong Zhang PetscFunctionReturn(0); 13858a0c6efdSHong Zhang } 13868a0c6efdSHong Zhang 138749b5e25fSSatish Balay ai = a->i; 138849b5e25fSSatish Balay aj = a->j; 138949b5e25fSSatish Balay bs2 = a->bs2; 13909566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 1391c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 13929566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 139349b5e25fSSatish Balay for (i = 0; i < ambs; i++) { 139449b5e25fSSatish Balay j = ai[i]; 139549b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 139649b5e25fSSatish Balay row = i * bs; 139749b5e25fSSatish Balay aa_j = aa + j * bs2; 139849b5e25fSSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 139949b5e25fSSatish Balay } 140049b5e25fSSatish Balay } 14019566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 140249b5e25fSSatish Balay PetscFunctionReturn(0); 140349b5e25fSSatish Balay } 140449b5e25fSSatish Balay 14059371c9d4SSatish Balay PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) { 140649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1407d9ca1df4SBarry Smith PetscScalar x; 1408d9ca1df4SBarry Smith const PetscScalar *l, *li, *ri; 140949b5e25fSSatish Balay MatScalar *aa, *v; 1410fff8e43fSBarry Smith PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1411fff8e43fSBarry Smith const PetscInt *ai, *aj; 1412ace3abfcSBarry Smith PetscBool flg; 141349b5e25fSSatish Balay 141449b5e25fSSatish Balay PetscFunctionBegin; 1415b3bf805bSHong Zhang if (ll != rr) { 14169566063dSJacob Faibussowitsch PetscCall(VecEqual(ll, rr, &flg)); 141728b400f6SJacob Faibussowitsch PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1418b3bf805bSHong Zhang } 1419b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 142049b5e25fSSatish Balay ai = a->i; 142149b5e25fSSatish Balay aj = a->j; 142249b5e25fSSatish Balay aa = a->a; 1423d0f46423SBarry Smith m = A->rmap->N; 1424d0f46423SBarry Smith bs = A->rmap->bs; 142549b5e25fSSatish Balay mbs = a->mbs; 142649b5e25fSSatish Balay bs2 = a->bs2; 142749b5e25fSSatish Balay 14289566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 14299566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 143008401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 143149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 143249b5e25fSSatish Balay M = ai[i + 1] - ai[i]; 143349b5e25fSSatish Balay li = l + i * bs; 143449b5e25fSSatish Balay v = aa + bs2 * ai[i]; 143549b5e25fSSatish Balay for (j = 0; j < M; j++) { /* for each block */ 143649b5e25fSSatish Balay ri = l + bs * aj[ai[i] + j]; 14375e90f9d9SHong Zhang for (k = 0; k < bs; k++) { 143849b5e25fSSatish Balay x = ri[k]; 143949b5e25fSSatish Balay for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 144049b5e25fSSatish Balay } 144149b5e25fSSatish Balay } 144249b5e25fSSatish Balay } 14439566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 14449566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 144549b5e25fSSatish Balay PetscFunctionReturn(0); 144649b5e25fSSatish Balay } 144749b5e25fSSatish Balay 14489371c9d4SSatish Balay PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) { 144949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 145049b5e25fSSatish Balay 145149b5e25fSSatish Balay PetscFunctionBegin; 145249b5e25fSSatish Balay info->block_size = a->bs2; 1453ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 14546c6c5352SBarry Smith info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 14553966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 145649b5e25fSSatish Balay info->assemblies = A->num_ass; 14578e58a170SBarry Smith info->mallocs = A->info.mallocs; 1458*4dfa11a4SJacob Faibussowitsch info->memory = 0; /* REVIEW ME */ 1459d5f3da31SBarry Smith if (A->factortype) { 146049b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 146149b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 146249b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 146349b5e25fSSatish Balay } else { 146449b5e25fSSatish Balay info->fill_ratio_given = 0; 146549b5e25fSSatish Balay info->fill_ratio_needed = 0; 146649b5e25fSSatish Balay info->factor_mallocs = 0; 146749b5e25fSSatish Balay } 146849b5e25fSSatish Balay PetscFunctionReturn(0); 146949b5e25fSSatish Balay } 147049b5e25fSSatish Balay 14719371c9d4SSatish Balay PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) { 147249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 147349b5e25fSSatish Balay 147449b5e25fSSatish Balay PetscFunctionBegin; 14759566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 147649b5e25fSSatish Balay PetscFunctionReturn(0); 147749b5e25fSSatish Balay } 1478dc354874SHong Zhang 14799371c9d4SSatish Balay PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) { 1480dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1481d9ca1df4SBarry Smith PetscInt i, j, n, row, col, bs, mbs; 1482d9ca1df4SBarry Smith const PetscInt *ai, *aj; 1483c3fca9a7SHong Zhang PetscReal atmp; 1484d9ca1df4SBarry Smith const MatScalar *aa; 1485985db425SBarry Smith PetscScalar *x; 148613f74950SBarry Smith PetscInt ncols, brow, bcol, krow, kcol; 1487dc354874SHong Zhang 1488dc354874SHong Zhang PetscFunctionBegin; 148928b400f6SJacob Faibussowitsch PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 149028b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1491d0f46423SBarry Smith bs = A->rmap->bs; 1492dc354874SHong Zhang aa = a->a; 1493dc354874SHong Zhang ai = a->i; 1494dc354874SHong Zhang aj = a->j; 149544117c81SHong Zhang mbs = a->mbs; 1496dc354874SHong Zhang 14979566063dSJacob Faibussowitsch PetscCall(VecSet(v, 0.0)); 14989566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 14999566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 150008401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 150144117c81SHong Zhang for (i = 0; i < mbs; i++) { 15029371c9d4SSatish Balay ncols = ai[1] - ai[0]; 15039371c9d4SSatish Balay ai++; 1504d0f6400bSHong Zhang brow = bs * i; 150544117c81SHong Zhang for (j = 0; j < ncols; j++) { 1506d0f6400bSHong Zhang bcol = bs * (*aj); 150744117c81SHong Zhang for (kcol = 0; kcol < bs; kcol++) { 1508d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 150944117c81SHong Zhang for (krow = 0; krow < bs; krow++) { 15109371c9d4SSatish Balay atmp = PetscAbsScalar(*aa); 15119371c9d4SSatish Balay aa++; 1512d0f6400bSHong Zhang row = brow + krow; /* row index */ 1513c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1514c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 151544117c81SHong Zhang } 151644117c81SHong Zhang } 1517d0f6400bSHong Zhang aj++; 1518dc354874SHong Zhang } 1519dc354874SHong Zhang } 15209566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 1521dc354874SHong Zhang PetscFunctionReturn(0); 1522dc354874SHong Zhang } 1523c2916339SPierre Jolivet 15249371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) { 1525c2916339SPierre Jolivet PetscFunctionBegin; 15269566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 15274222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1528c2916339SPierre Jolivet PetscFunctionReturn(0); 1529c2916339SPierre Jolivet } 1530c2916339SPierre Jolivet 15319371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 1532c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1533c2916339SPierre Jolivet PetscScalar *z = c; 1534c2916339SPierre Jolivet const PetscScalar *xb; 1535c2916339SPierre Jolivet PetscScalar x1; 1536c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1537c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 15383c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 1539b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 15403c2e41e1SStefano Zampini #else 15413c2e41e1SStefano Zampini const int aconj = 0; 15423c2e41e1SStefano Zampini #endif 1543c2916339SPierre Jolivet 1544c2916339SPierre Jolivet PetscFunctionBegin; 1545c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 15469371c9d4SSatish Balay n = ii[1] - ii[0]; 15479371c9d4SSatish Balay ii++; 1548c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1549c2916339SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1550c2916339SPierre Jolivet jj = idx; 1551c2916339SPierre Jolivet vv = v; 1552c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1553c2916339SPierre Jolivet idx = jj; 1554c2916339SPierre Jolivet v = vv; 1555c2916339SPierre Jolivet for (j = 0; j < n; j++) { 15569371c9d4SSatish Balay xb = b + (*idx); 15579371c9d4SSatish Balay x1 = xb[0 + k * bm]; 1558c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1; 15593c2e41e1SStefano Zampini if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1560c2916339SPierre Jolivet v += 1; 1561c2916339SPierre Jolivet ++idx; 1562c2916339SPierre Jolivet } 1563c2916339SPierre Jolivet } 1564c2916339SPierre Jolivet z += 1; 1565c2916339SPierre Jolivet } 1566c2916339SPierre Jolivet PetscFunctionReturn(0); 1567c2916339SPierre Jolivet } 1568c2916339SPierre Jolivet 15699371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 1570c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1571c2916339SPierre Jolivet PetscScalar *z = c; 1572c2916339SPierre Jolivet const PetscScalar *xb; 1573c2916339SPierre Jolivet PetscScalar x1, x2; 1574c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1575c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1576c2916339SPierre Jolivet 1577c2916339SPierre Jolivet PetscFunctionBegin; 1578c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 15799371c9d4SSatish Balay n = ii[1] - ii[0]; 15809371c9d4SSatish Balay ii++; 1581c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1582c2916339SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1583c2916339SPierre Jolivet jj = idx; 1584c2916339SPierre Jolivet vv = v; 1585c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1586c2916339SPierre Jolivet idx = jj; 1587c2916339SPierre Jolivet v = vv; 1588c2916339SPierre Jolivet for (j = 0; j < n; j++) { 15899371c9d4SSatish Balay xb = b + 2 * (*idx); 15909371c9d4SSatish Balay x1 = xb[0 + k * bm]; 15919371c9d4SSatish Balay x2 = xb[1 + k * bm]; 1592c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1593c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1594c2916339SPierre Jolivet if (*idx != i) { 1595c2916339SPierre Jolivet c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1596c2916339SPierre Jolivet c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1597c2916339SPierre Jolivet } 1598c2916339SPierre Jolivet v += 4; 1599c2916339SPierre Jolivet ++idx; 1600c2916339SPierre Jolivet } 1601c2916339SPierre Jolivet } 1602c2916339SPierre Jolivet z += 2; 1603c2916339SPierre Jolivet } 1604c2916339SPierre Jolivet PetscFunctionReturn(0); 1605c2916339SPierre Jolivet } 1606c2916339SPierre Jolivet 16079371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 1608c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1609c2916339SPierre Jolivet PetscScalar *z = c; 1610c2916339SPierre Jolivet const PetscScalar *xb; 1611c2916339SPierre Jolivet PetscScalar x1, x2, x3; 1612c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1613c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1614c2916339SPierre Jolivet 1615c2916339SPierre Jolivet PetscFunctionBegin; 1616c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16179371c9d4SSatish Balay n = ii[1] - ii[0]; 16189371c9d4SSatish Balay ii++; 1619c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1620c2916339SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1621c2916339SPierre Jolivet jj = idx; 1622c2916339SPierre Jolivet vv = v; 1623c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1624c2916339SPierre Jolivet idx = jj; 1625c2916339SPierre Jolivet v = vv; 1626c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16279371c9d4SSatish Balay xb = b + 3 * (*idx); 16289371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16299371c9d4SSatish Balay x2 = xb[1 + k * bm]; 16309371c9d4SSatish Balay x3 = xb[2 + k * bm]; 1631c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1632c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1633c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1634c2916339SPierre Jolivet if (*idx != i) { 1635c2916339SPierre Jolivet c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1636c2916339SPierre Jolivet c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1637c2916339SPierre Jolivet c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1638c2916339SPierre Jolivet } 1639c2916339SPierre Jolivet v += 9; 1640c2916339SPierre Jolivet ++idx; 1641c2916339SPierre Jolivet } 1642c2916339SPierre Jolivet } 1643c2916339SPierre Jolivet z += 3; 1644c2916339SPierre Jolivet } 1645c2916339SPierre Jolivet PetscFunctionReturn(0); 1646c2916339SPierre Jolivet } 1647c2916339SPierre Jolivet 16489371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 1649c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1650c2916339SPierre Jolivet PetscScalar *z = c; 1651c2916339SPierre Jolivet const PetscScalar *xb; 1652c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4; 1653c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1654c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1655c2916339SPierre Jolivet 1656c2916339SPierre Jolivet PetscFunctionBegin; 1657c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16589371c9d4SSatish Balay n = ii[1] - ii[0]; 16599371c9d4SSatish Balay ii++; 1660c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1661c2916339SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1662c2916339SPierre Jolivet jj = idx; 1663c2916339SPierre Jolivet vv = v; 1664c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1665c2916339SPierre Jolivet idx = jj; 1666c2916339SPierre Jolivet v = vv; 1667c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16689371c9d4SSatish Balay xb = b + 4 * (*idx); 16699371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16709371c9d4SSatish Balay x2 = xb[1 + k * bm]; 16719371c9d4SSatish Balay x3 = xb[2 + k * bm]; 16729371c9d4SSatish Balay x4 = xb[3 + k * bm]; 1673c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1674c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1675c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1676c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1677c2916339SPierre Jolivet if (*idx != i) { 1678c2916339SPierre Jolivet c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1679c2916339SPierre Jolivet c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1680c2916339SPierre Jolivet c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1681c2916339SPierre Jolivet c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1682c2916339SPierre Jolivet } 1683c2916339SPierre Jolivet v += 16; 1684c2916339SPierre Jolivet ++idx; 1685c2916339SPierre Jolivet } 1686c2916339SPierre Jolivet } 1687c2916339SPierre Jolivet z += 4; 1688c2916339SPierre Jolivet } 1689c2916339SPierre Jolivet PetscFunctionReturn(0); 1690c2916339SPierre Jolivet } 1691c2916339SPierre Jolivet 16929371c9d4SSatish Balay PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 1693c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1694c2916339SPierre Jolivet PetscScalar *z = c; 1695c2916339SPierre Jolivet const PetscScalar *xb; 1696c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 1697c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1698c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1699c2916339SPierre Jolivet 1700c2916339SPierre Jolivet PetscFunctionBegin; 1701c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17029371c9d4SSatish Balay n = ii[1] - ii[0]; 17039371c9d4SSatish Balay ii++; 1704c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1705c2916339SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1706c2916339SPierre Jolivet jj = idx; 1707c2916339SPierre Jolivet vv = v; 1708c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1709c2916339SPierre Jolivet idx = jj; 1710c2916339SPierre Jolivet v = vv; 1711c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17129371c9d4SSatish Balay xb = b + 5 * (*idx); 17139371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17149371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17159371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17169371c9d4SSatish Balay x4 = xb[3 + k * bm]; 17179371c9d4SSatish Balay x5 = xb[4 + k * cm]; 1718c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1719c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1720c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1721c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1722c2916339SPierre Jolivet z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1723c2916339SPierre Jolivet if (*idx != i) { 1724c2916339SPierre Jolivet c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1725c2916339SPierre Jolivet c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1726c2916339SPierre Jolivet c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1727c2916339SPierre Jolivet c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1728c2916339SPierre Jolivet c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1729c2916339SPierre Jolivet } 1730c2916339SPierre Jolivet v += 25; 1731c2916339SPierre Jolivet ++idx; 1732c2916339SPierre Jolivet } 1733c2916339SPierre Jolivet } 1734c2916339SPierre Jolivet z += 5; 1735c2916339SPierre Jolivet } 1736c2916339SPierre Jolivet PetscFunctionReturn(0); 1737c2916339SPierre Jolivet } 1738c2916339SPierre Jolivet 17399371c9d4SSatish Balay PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) { 1740c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1741c2916339SPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1742281439baSStefano Zampini Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1743c2916339SPierre Jolivet PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1744c2916339SPierre Jolivet PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1745c2916339SPierre Jolivet PetscBLASInt bbs, bcn, bbm, bcm; 1746f4259b30SLisandro Dalcin PetscScalar *z = NULL; 1747c2916339SPierre Jolivet PetscScalar *c, *b; 1748c2916339SPierre Jolivet const MatScalar *v; 1749c2916339SPierre Jolivet const PetscInt *idx, *ii; 1750c2916339SPierre Jolivet PetscScalar _DOne = 1.0; 1751c2916339SPierre Jolivet 1752c2916339SPierre Jolivet PetscFunctionBegin; 1753c2916339SPierre Jolivet if (!cm || !cn) PetscFunctionReturn(0); 175408401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 175508401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 175608401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1757c2916339SPierre Jolivet b = bd->v; 17589566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(C)); 17599566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 1760c2916339SPierre Jolivet switch (bs) { 17619371c9d4SSatish Balay case 1: PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); break; 17629371c9d4SSatish Balay case 2: PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); break; 17639371c9d4SSatish Balay case 3: PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); break; 17649371c9d4SSatish Balay case 4: PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); break; 17659371c9d4SSatish Balay case 5: PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); break; 1766c2916339SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 17679566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bs, &bbs)); 17689566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cn, &bcn)); 17699566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bm, &bbm)); 17709566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cm, &bcm)); 1771c2916339SPierre Jolivet idx = a->j; 1772c2916339SPierre Jolivet v = a->a; 1773c2916339SPierre Jolivet mbs = a->mbs; 1774c2916339SPierre Jolivet ii = a->i; 1775c2916339SPierre Jolivet z = c; 1776c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17779371c9d4SSatish Balay n = ii[1] - ii[0]; 17789371c9d4SSatish Balay ii++; 1779c2916339SPierre Jolivet for (j = 0; j < n; j++) { 1780792fecdfSBarry Smith if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1781792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1782c2916339SPierre Jolivet v += bs2; 1783c2916339SPierre Jolivet } 1784c2916339SPierre Jolivet z += bs; 1785c2916339SPierre Jolivet } 1786c2916339SPierre Jolivet } 17879566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 17889566063dSJacob Faibussowitsch PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1789c2916339SPierre Jolivet PetscFunctionReturn(0); 1790c2916339SPierre Jolivet } 1791