1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 2c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 3c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h> 4af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 5c6db04a5SJed Brown #include <petscbt.h> 6c6db04a5SJed Brown #include <petscblaslapack.h> 749b5e25fSSatish Balay 8d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 9d71ae5a4SJacob Faibussowitsch { 105eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 117bede89fSBarry Smith PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 125d0c19d7SBarry Smith const PetscInt *idx; 13db41cccfSHong Zhang PetscBT table_out, table_in; 14d94109b8SHong Zhang 15d94109b8SHong Zhang PetscFunctionBegin; 1608401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 17d94109b8SHong Zhang mbs = a->mbs; 18d94109b8SHong Zhang ai = a->i; 19d94109b8SHong Zhang aj = a->j; 20d0f46423SBarry Smith bs = A->rmap->bs; 219566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_out)); 229566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &nidx)); 239566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_in)); 24d94109b8SHong Zhang 25d94109b8SHong Zhang for (i = 0; i < is_max; i++) { /* for each is */ 26d94109b8SHong Zhang isz = 0; 279566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_out)); 28d94109b8SHong Zhang 29d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 309566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 319566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 32d94109b8SHong Zhang 33db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 34dbe03f88SHong Zhang bcol_max = 0; 35d94109b8SHong Zhang for (j = 0; j < n; ++j) { 36d94109b8SHong Zhang brow = idx[j] / bs; /* convert the indices into block indices */ 3708401ef6SPierre Jolivet PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 38db41cccfSHong Zhang if (!PetscBTLookupSet(table_out, brow)) { 39dbe03f88SHong Zhang nidx[isz++] = brow; 40dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 41dbe03f88SHong Zhang } 42d94109b8SHong Zhang } 439566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 449566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 45d94109b8SHong Zhang 46d94109b8SHong Zhang k = 0; 47d94109b8SHong Zhang for (j = 0; j < ov; j++) { /* for each overlap */ 48db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 499566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_in)); 509566063dSJacob Faibussowitsch for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 51d94109b8SHong Zhang 52d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 53d94109b8SHong Zhang for (brow = 0; brow < mbs; brow++) { 549371c9d4SSatish Balay start = ai[brow]; 559371c9d4SSatish Balay end = ai[brow + 1]; 56db41cccfSHong Zhang if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 57d94109b8SHong Zhang for (l = start; l < end; l++) { 58d94109b8SHong Zhang bcol = aj[l]; 59d7b97159SHong Zhang if (!PetscBTLookupSet(table_out, bcol)) { 60d7b97159SHong Zhang nidx[isz++] = bcol; 61d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 62d7b97159SHong Zhang } 63d94109b8SHong Zhang } 64d94109b8SHong Zhang k++; 65d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66da81f932SPierre Jolivet } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */ 67d94109b8SHong Zhang for (l = start; l < end; l++) { 68d94109b8SHong Zhang bcol = aj[l]; 69dbe03f88SHong Zhang if (bcol > bcol_max) break; 70db41cccfSHong Zhang if (PetscBTLookup(table_in, bcol)) { 7126fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 72d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 73d94109b8SHong Zhang } 74d94109b8SHong Zhang } 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } /* for each overlap */ 787bede89fSBarry Smith PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 797bede89fSBarry Smith } /* for each is */ 809566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_out)); 819566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 829566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_in)); 833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 8449b5e25fSSatish Balay } 8549b5e25fSSatish Balay 86847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 877cf5f706SPierre Jolivet Zero some ops' to avoid invalid use */ 88d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 89d71ae5a4SJacob Faibussowitsch { 9049b5e25fSSatish Balay PetscFunctionBegin; 919566063dSJacob Faibussowitsch PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 92f4259b30SLisandro Dalcin Bseq->ops->mult = NULL; 93f4259b30SLisandro Dalcin Bseq->ops->multadd = NULL; 94f4259b30SLisandro Dalcin Bseq->ops->multtranspose = NULL; 95f4259b30SLisandro Dalcin Bseq->ops->multtransposeadd = NULL; 96f4259b30SLisandro Dalcin Bseq->ops->lufactor = NULL; 97f4259b30SLisandro Dalcin Bseq->ops->choleskyfactor = NULL; 98f4259b30SLisandro Dalcin Bseq->ops->lufactorsymbolic = NULL; 99f4259b30SLisandro Dalcin Bseq->ops->choleskyfactorsymbolic = NULL; 100f4259b30SLisandro Dalcin Bseq->ops->getinertia = NULL; 1013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10249b5e25fSSatish Balay } 10349b5e25fSSatish Balay 1047dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 105fdfbdca6SPierre Jolivet static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym) 106d71ae5a4SJacob Faibussowitsch { 107fdfbdca6SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL; 108fdfbdca6SPierre Jolivet Mat_SeqBAIJ *d = NULL; 109e50a8114SHong Zhang PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 110e50a8114SHong Zhang PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 111e50a8114SHong Zhang const PetscInt *irow, *icol; 112e50a8114SHong Zhang PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 113e50a8114SHong Zhang PetscInt *aj = a->j, *ai = a->i; 114e50a8114SHong Zhang MatScalar *mat_a; 115e50a8114SHong Zhang Mat C; 116e50a8114SHong Zhang PetscBool flag; 117e50a8114SHong Zhang 118e50a8114SHong Zhang PetscFunctionBegin; 119e50a8114SHong Zhang 1209566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1219566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1229566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1239566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 124e50a8114SHong Zhang 1259566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 126e50a8114SHong Zhang ssmap = smap; 1279566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 128e50a8114SHong Zhang for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 129e50a8114SHong Zhang /* determine lens of each row */ 130e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 131e50a8114SHong Zhang kstart = ai[irow[i]]; 132e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 133e50a8114SHong Zhang lens[i] = 0; 134e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 135e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 136e50a8114SHong Zhang } 137e50a8114SHong Zhang } 138e50a8114SHong Zhang /* Create and fill new matrix */ 139e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 140fdfbdca6SPierre Jolivet if (sym) { 141e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)((*B)->data); 142e50a8114SHong Zhang 143aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1449566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 145fdfbdca6SPierre Jolivet PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 1469566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 147fdfbdca6SPierre Jolivet } else { 148fdfbdca6SPierre Jolivet d = (Mat_SeqBAIJ *)((*B)->data); 149fdfbdca6SPierre Jolivet 150fdfbdca6SPierre Jolivet PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 151fdfbdca6SPierre Jolivet PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag)); 152fdfbdca6SPierre Jolivet PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 153fdfbdca6SPierre Jolivet PetscCall(PetscArrayzero(d->ilen, d->mbs)); 154fdfbdca6SPierre Jolivet } 155e50a8114SHong Zhang C = *B; 156e50a8114SHong Zhang } else { 1579566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1589566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 159fdfbdca6SPierre Jolivet if (sym) { 1609566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1619566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 162fdfbdca6SPierre Jolivet } else { 163fdfbdca6SPierre Jolivet PetscCall(MatSetType(C, MATSEQBAIJ)); 164fdfbdca6SPierre Jolivet PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 165e50a8114SHong Zhang } 166fdfbdca6SPierre Jolivet } 167fdfbdca6SPierre Jolivet if (sym) c = (Mat_SeqSBAIJ *)(C->data); 168fdfbdca6SPierre Jolivet else d = (Mat_SeqBAIJ *)(C->data); 169e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 170e50a8114SHong Zhang row = irow[i]; 171e50a8114SHong Zhang kstart = ai[row]; 172e50a8114SHong Zhang kend = kstart + a->ilen[row]; 173fdfbdca6SPierre Jolivet if (sym) { 174e50a8114SHong Zhang mat_i = c->i[i]; 175*8e3a54c0SPierre Jolivet mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 176*8e3a54c0SPierre Jolivet mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 177e50a8114SHong Zhang mat_ilen = c->ilen + i; 178fdfbdca6SPierre Jolivet } else { 179fdfbdca6SPierre Jolivet mat_i = d->i[i]; 180*8e3a54c0SPierre Jolivet mat_j = PetscSafePointerPlusOffset(d->j, mat_i); 181*8e3a54c0SPierre Jolivet mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2); 182fdfbdca6SPierre Jolivet mat_ilen = d->ilen + i; 183fdfbdca6SPierre Jolivet } 184e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 185e50a8114SHong Zhang if ((tcol = ssmap[a->j[k]])) { 186e50a8114SHong Zhang *mat_j++ = tcol - 1; 1879566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 188e50a8114SHong Zhang mat_a += bs2; 189e50a8114SHong Zhang (*mat_ilen)++; 190e50a8114SHong Zhang } 191e50a8114SHong Zhang } 192e50a8114SHong Zhang } 193e50a8114SHong Zhang /* sort */ 194e50a8114SHong Zhang { 195e50a8114SHong Zhang MatScalar *work; 196e50a8114SHong Zhang 1979566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 198e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 199e50a8114SHong Zhang PetscInt ilen; 200fdfbdca6SPierre Jolivet if (sym) { 201e50a8114SHong Zhang mat_i = c->i[i]; 202*8e3a54c0SPierre Jolivet mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 203*8e3a54c0SPierre Jolivet mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 204e50a8114SHong Zhang ilen = c->ilen[i]; 205fdfbdca6SPierre Jolivet } else { 206fdfbdca6SPierre Jolivet mat_i = d->i[i]; 207*8e3a54c0SPierre Jolivet mat_j = PetscSafePointerPlusOffset(d->j, mat_i); 208*8e3a54c0SPierre Jolivet mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2); 209fdfbdca6SPierre Jolivet ilen = d->ilen[i]; 210fdfbdca6SPierre Jolivet } 2119566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 212e50a8114SHong Zhang } 2139566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 214e50a8114SHong Zhang } 215e50a8114SHong Zhang 216e50a8114SHong Zhang /* Free work space */ 2179566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 2189566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 2199566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 2209566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 2219566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 222e50a8114SHong Zhang 2239566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 224e50a8114SHong Zhang *B = C; 2253ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 226e50a8114SHong Zhang } 227e50a8114SHong Zhang 228d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 229d71ae5a4SJacob Faibussowitsch { 230fdfbdca6SPierre Jolivet Mat C[2]; 231fdfbdca6SPierre Jolivet IS is1, is2, intersect = NULL; 232fdfbdca6SPierre Jolivet PetscInt n1, n2, ni; 233fdfbdca6SPierre Jolivet PetscBool sym = PETSC_TRUE; 23449b5e25fSSatish Balay 23549b5e25fSSatish Balay PetscFunctionBegin; 236f9a48b90SPierre Jolivet PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 237f9a48b90SPierre Jolivet if (isrow == iscol) { 238f9a48b90SPierre Jolivet is2 = is1; 239f9a48b90SPierre Jolivet PetscCall(PetscObjectReference((PetscObject)is2)); 240fdfbdca6SPierre Jolivet } else { 241fdfbdca6SPierre Jolivet PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 242fdfbdca6SPierre Jolivet PetscCall(ISIntersect(is1, is2, &intersect)); 243fdfbdca6SPierre Jolivet PetscCall(ISGetLocalSize(intersect, &ni)); 244fdfbdca6SPierre Jolivet PetscCall(ISDestroy(&intersect)); 245fdfbdca6SPierre Jolivet if (ni == 0) sym = PETSC_FALSE; 246fdfbdca6SPierre Jolivet else if (PetscDefined(USE_DEBUG)) { 247fdfbdca6SPierre Jolivet PetscCall(ISGetLocalSize(is1, &n1)); 248fdfbdca6SPierre Jolivet PetscCall(ISGetLocalSize(is2, &n2)); 249fdfbdca6SPierre Jolivet PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix"); 250fdfbdca6SPierre Jolivet } 251fdfbdca6SPierre Jolivet } 252fdfbdca6SPierre Jolivet if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym)); 253fdfbdca6SPierre Jolivet else { 254fdfbdca6SPierre Jolivet PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym)); 255fdfbdca6SPierre Jolivet PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym)); 256fdfbdca6SPierre Jolivet PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1)); 257fdfbdca6SPierre Jolivet PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN)); 258fdfbdca6SPierre Jolivet PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 259fdfbdca6SPierre Jolivet if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 260fdfbdca6SPierre Jolivet else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B)); 261fdfbdca6SPierre Jolivet else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 262fdfbdca6SPierre Jolivet PetscCall(MatDestroy(C)); 263fdfbdca6SPierre Jolivet PetscCall(MatDestroy(C + 1)); 264fdfbdca6SPierre Jolivet } 2659566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2669566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 267847daeccSHong Zhang 268fdfbdca6SPierre Jolivet if (sym && isrow != iscol) { 2698f46ffcaSHong Zhang PetscBool isequal; 2709566063dSJacob Faibussowitsch PetscCall(ISEqual(isrow, iscol, &isequal)); 27148a46eb9SPierre Jolivet if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 27249b5e25fSSatish Balay } 2733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 27449b5e25fSSatish Balay } 27549b5e25fSSatish Balay 276d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 277d71ae5a4SJacob Faibussowitsch { 27813f74950SBarry Smith PetscInt i; 27949b5e25fSSatish Balay 28049b5e25fSSatish Balay PetscFunctionBegin; 281314f503fSPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B)); 282e50a8114SHong Zhang 28348a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 2843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28549b5e25fSSatish Balay } 28649b5e25fSSatish Balay 28749b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 288d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 289d71ae5a4SJacob Faibussowitsch { 29049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 291d9ca1df4SBarry Smith PetscScalar *z, x1, x2, zero = 0.0; 292d9ca1df4SBarry Smith const PetscScalar *x, *xb; 293d9ca1df4SBarry Smith const MatScalar *v; 294d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 295d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 29698c9bda7SSatish Balay PetscInt nonzerorow = 0; 29749b5e25fSSatish Balay 29849b5e25fSSatish Balay PetscFunctionBegin; 2999566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 3003ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 3019566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3029566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 30349b5e25fSSatish Balay 30449b5e25fSSatish Balay v = a->a; 30549b5e25fSSatish Balay xb = x; 30649b5e25fSSatish Balay 30749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 30849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3099371c9d4SSatish Balay x1 = xb[0]; 3109371c9d4SSatish Balay x2 = xb[1]; 31149b5e25fSSatish Balay ib = aj + *ai; 312831a3094SHong Zhang jmin = 0; 31398c9bda7SSatish Balay nonzerorow += (n > 0); 3147fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 31549b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 31649b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 3179371c9d4SSatish Balay v += 4; 3189371c9d4SSatish Balay jmin++; 3197fbae186SHong Zhang } 320444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 321444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 322831a3094SHong Zhang for (j = jmin; j < n; j++) { 32349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 32449b5e25fSSatish Balay cval = ib[j] * 2; 32549b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 32649b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 32749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 32849b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 32949b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 33049b5e25fSSatish Balay v += 4; 33149b5e25fSSatish Balay } 3329371c9d4SSatish Balay xb += 2; 3339371c9d4SSatish Balay ai++; 33449b5e25fSSatish Balay } 33549b5e25fSSatish Balay 3369566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3379566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3389566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 3393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 34049b5e25fSSatish Balay } 34149b5e25fSSatish Balay 342d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 343d71ae5a4SJacob Faibussowitsch { 34449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 345d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, zero = 0.0; 346d9ca1df4SBarry Smith const PetscScalar *x, *xb; 347d9ca1df4SBarry Smith const MatScalar *v; 348d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 349d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 35098c9bda7SSatish Balay PetscInt nonzerorow = 0; 35149b5e25fSSatish Balay 35249b5e25fSSatish Balay PetscFunctionBegin; 3539566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 3543ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 3559566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3569566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 35749b5e25fSSatish Balay 35849b5e25fSSatish Balay v = a->a; 35949b5e25fSSatish Balay xb = x; 36049b5e25fSSatish Balay 36149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 36249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3639371c9d4SSatish Balay x1 = xb[0]; 3649371c9d4SSatish Balay x2 = xb[1]; 3659371c9d4SSatish Balay x3 = xb[2]; 36649b5e25fSSatish Balay ib = aj + *ai; 367831a3094SHong Zhang jmin = 0; 36898c9bda7SSatish Balay nonzerorow += (n > 0); 3697fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 37049b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 37149b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 37249b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3739371c9d4SSatish Balay v += 9; 3749371c9d4SSatish Balay jmin++; 3757fbae186SHong Zhang } 376444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 377444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 378831a3094SHong Zhang for (j = jmin; j < n; j++) { 37949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 38049b5e25fSSatish Balay cval = ib[j] * 3; 38149b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 38249b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 38349b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 38449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 38549b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 38649b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 38749b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 38849b5e25fSSatish Balay v += 9; 38949b5e25fSSatish Balay } 3909371c9d4SSatish Balay xb += 3; 3919371c9d4SSatish Balay ai++; 39249b5e25fSSatish Balay } 39349b5e25fSSatish Balay 3949566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3959566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3969566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 3973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 39849b5e25fSSatish Balay } 39949b5e25fSSatish Balay 400d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 401d71ae5a4SJacob Faibussowitsch { 40249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 403d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 404d9ca1df4SBarry Smith const PetscScalar *x, *xb; 405d9ca1df4SBarry Smith const MatScalar *v; 406d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 407d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 40898c9bda7SSatish Balay PetscInt nonzerorow = 0; 40949b5e25fSSatish Balay 41049b5e25fSSatish Balay PetscFunctionBegin; 4119566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 4123ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 4139566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4149566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 41549b5e25fSSatish Balay 41649b5e25fSSatish Balay v = a->a; 41749b5e25fSSatish Balay xb = x; 41849b5e25fSSatish Balay 41949b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 42049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4219371c9d4SSatish Balay x1 = xb[0]; 4229371c9d4SSatish Balay x2 = xb[1]; 4239371c9d4SSatish Balay x3 = xb[2]; 4249371c9d4SSatish Balay x4 = xb[3]; 42549b5e25fSSatish Balay ib = aj + *ai; 426831a3094SHong Zhang jmin = 0; 42798c9bda7SSatish Balay nonzerorow += (n > 0); 4287fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 42949b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 43049b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 43149b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 43249b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 4339371c9d4SSatish Balay v += 16; 4349371c9d4SSatish Balay jmin++; 4357fbae186SHong Zhang } 436444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 437444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 438831a3094SHong Zhang for (j = jmin; j < n; j++) { 43949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 44049b5e25fSSatish Balay cval = ib[j] * 4; 44149b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 44249b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 44349b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 44449b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 44549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 44649b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 44749b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 44849b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 44949b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 45049b5e25fSSatish Balay v += 16; 45149b5e25fSSatish Balay } 4529371c9d4SSatish Balay xb += 4; 4539371c9d4SSatish Balay ai++; 45449b5e25fSSatish Balay } 45549b5e25fSSatish Balay 4569566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4579566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 4589566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 4593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 46049b5e25fSSatish Balay } 46149b5e25fSSatish Balay 462d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 463d71ae5a4SJacob Faibussowitsch { 46449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 465d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 466d9ca1df4SBarry Smith const PetscScalar *x, *xb; 467d9ca1df4SBarry Smith const MatScalar *v; 468d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 469d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 47098c9bda7SSatish Balay PetscInt nonzerorow = 0; 47149b5e25fSSatish Balay 47249b5e25fSSatish Balay PetscFunctionBegin; 4739566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 4743ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 4759566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4769566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 47749b5e25fSSatish Balay 47849b5e25fSSatish Balay v = a->a; 47949b5e25fSSatish Balay xb = x; 48049b5e25fSSatish Balay 48149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 48249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4839371c9d4SSatish Balay x1 = xb[0]; 4849371c9d4SSatish Balay x2 = xb[1]; 4859371c9d4SSatish Balay x3 = xb[2]; 4869371c9d4SSatish Balay x4 = xb[3]; 4879371c9d4SSatish Balay x5 = xb[4]; 48849b5e25fSSatish Balay ib = aj + *ai; 489831a3094SHong Zhang jmin = 0; 49098c9bda7SSatish Balay nonzerorow += (n > 0); 4917fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 49249b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 49349b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 49449b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 49549b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 49649b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 4979371c9d4SSatish Balay v += 25; 4989371c9d4SSatish Balay jmin++; 4997fbae186SHong Zhang } 500444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 501444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 502831a3094SHong Zhang for (j = jmin; j < n; j++) { 50349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 50449b5e25fSSatish Balay cval = ib[j] * 5; 50549b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 50649b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 50749b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 50849b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 50949b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 51049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 51149b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 51249b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 51349b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 51449b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 51549b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 51649b5e25fSSatish Balay v += 25; 51749b5e25fSSatish Balay } 5189371c9d4SSatish Balay xb += 5; 5199371c9d4SSatish Balay ai++; 52049b5e25fSSatish Balay } 52149b5e25fSSatish Balay 5229566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5239566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5249566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 5253ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 52649b5e25fSSatish Balay } 52749b5e25fSSatish Balay 528d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 529d71ae5a4SJacob Faibussowitsch { 53049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 531d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 532d9ca1df4SBarry Smith const PetscScalar *x, *xb; 533d9ca1df4SBarry Smith const MatScalar *v; 534d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 535d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 53698c9bda7SSatish Balay PetscInt nonzerorow = 0; 53749b5e25fSSatish Balay 53849b5e25fSSatish Balay PetscFunctionBegin; 5399566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 5403ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 5419566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5429566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 54349b5e25fSSatish Balay 54449b5e25fSSatish Balay v = a->a; 54549b5e25fSSatish Balay xb = x; 54649b5e25fSSatish Balay 54749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 54849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 5499371c9d4SSatish Balay x1 = xb[0]; 5509371c9d4SSatish Balay x2 = xb[1]; 5519371c9d4SSatish Balay x3 = xb[2]; 5529371c9d4SSatish Balay x4 = xb[3]; 5539371c9d4SSatish Balay x5 = xb[4]; 5549371c9d4SSatish Balay x6 = xb[5]; 55549b5e25fSSatish Balay ib = aj + *ai; 556831a3094SHong Zhang jmin = 0; 55798c9bda7SSatish Balay nonzerorow += (n > 0); 5587fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 55949b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 56049b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 56149b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 56249b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 56349b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 56449b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 5659371c9d4SSatish Balay v += 36; 5669371c9d4SSatish Balay jmin++; 5677fbae186SHong Zhang } 568444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 569444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 570831a3094SHong Zhang for (j = jmin; j < n; j++) { 57149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 57249b5e25fSSatish Balay cval = ib[j] * 6; 57349b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 57449b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 57549b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 57649b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 57749b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 57849b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 57949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 58049b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 58149b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 58249b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 58349b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 58449b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 58549b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 58649b5e25fSSatish Balay v += 36; 58749b5e25fSSatish Balay } 5889371c9d4SSatish Balay xb += 6; 5899371c9d4SSatish Balay ai++; 59049b5e25fSSatish Balay } 59149b5e25fSSatish Balay 5929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5939566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5949566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 5953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 59649b5e25fSSatish Balay } 597c2916339SPierre Jolivet 598d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 599d71ae5a4SJacob Faibussowitsch { 60049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 601d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 602d9ca1df4SBarry Smith const PetscScalar *x, *xb; 603d9ca1df4SBarry Smith const MatScalar *v; 604d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 605d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 60698c9bda7SSatish Balay PetscInt nonzerorow = 0; 60749b5e25fSSatish Balay 60849b5e25fSSatish Balay PetscFunctionBegin; 6099566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 6103ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 6119566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6129566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 61349b5e25fSSatish Balay 61449b5e25fSSatish Balay v = a->a; 61549b5e25fSSatish Balay xb = x; 61649b5e25fSSatish Balay 61749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 61849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 6199371c9d4SSatish Balay x1 = xb[0]; 6209371c9d4SSatish Balay x2 = xb[1]; 6219371c9d4SSatish Balay x3 = xb[2]; 6229371c9d4SSatish Balay x4 = xb[3]; 6239371c9d4SSatish Balay x5 = xb[4]; 6249371c9d4SSatish Balay x6 = xb[5]; 6259371c9d4SSatish Balay x7 = xb[6]; 62649b5e25fSSatish Balay ib = aj + *ai; 627831a3094SHong Zhang jmin = 0; 62898c9bda7SSatish Balay nonzerorow += (n > 0); 6297fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 63049b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 63149b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 63249b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 63349b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 63449b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 63549b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 63649b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 6379371c9d4SSatish Balay v += 49; 6389371c9d4SSatish Balay jmin++; 6397fbae186SHong Zhang } 640444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 641444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 642831a3094SHong Zhang for (j = jmin; j < n; j++) { 64349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 64449b5e25fSSatish Balay cval = ib[j] * 7; 64549b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 64649b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 64749b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 64849b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 64949b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 65049b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 65149b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 65249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 65349b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 65449b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 65549b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 65649b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 65749b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 65849b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 65949b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 66049b5e25fSSatish Balay v += 49; 66149b5e25fSSatish Balay } 6629371c9d4SSatish Balay xb += 7; 6639371c9d4SSatish Balay ai++; 66449b5e25fSSatish Balay } 6659566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6669566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 6679566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 6683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 66949b5e25fSSatish Balay } 67049b5e25fSSatish Balay 67149b5e25fSSatish Balay /* 67249b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 67349b5e25fSSatish Balay */ 674d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 675d71ae5a4SJacob Faibussowitsch { 67649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 677d9ca1df4SBarry Smith PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 678d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 679d9ca1df4SBarry Smith const MatScalar *v; 680d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 681d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 68298c9bda7SSatish Balay PetscInt nonzerorow = 0; 68349b5e25fSSatish Balay 68449b5e25fSSatish Balay PetscFunctionBegin; 6859566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 6863ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 6879566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6889566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 68959689ffbSStefano Zampini 69059689ffbSStefano Zampini x_ptr = x; 69159689ffbSStefano Zampini z_ptr = z; 69249b5e25fSSatish Balay 69349b5e25fSSatish Balay aj = a->j; 69449b5e25fSSatish Balay v = a->a; 69549b5e25fSSatish Balay ii = a->i; 69649b5e25fSSatish Balay 69748a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 69849b5e25fSSatish Balay work = a->mult_work; 69949b5e25fSSatish Balay 70049b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 7019371c9d4SSatish Balay n = ii[1] - ii[0]; 7029371c9d4SSatish Balay ncols = n * bs; 7039371c9d4SSatish Balay workt = work; 7049371c9d4SSatish Balay idx = aj + ii[0]; 70598c9bda7SSatish Balay nonzerorow += (n > 0); 70649b5e25fSSatish Balay 70749b5e25fSSatish Balay /* upper triangular part */ 70849b5e25fSSatish Balay for (j = 0; j < n; j++) { 70949b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 71049b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 71149b5e25fSSatish Balay workt += bs; 71249b5e25fSSatish Balay } 71349b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 71496b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 71549b5e25fSSatish Balay 71649b5e25fSSatish Balay /* strict lower triangular part */ 717831a3094SHong Zhang idx = aj + ii[0]; 71859689ffbSStefano Zampini if (n && *idx == i) { 7199371c9d4SSatish Balay ncols -= bs; 7209371c9d4SSatish Balay v += bs2; 7219371c9d4SSatish Balay idx++; 7229371c9d4SSatish Balay n--; 723831a3094SHong Zhang } 72496b9376eSHong Zhang 72549b5e25fSSatish Balay if (ncols > 0) { 72649b5e25fSSatish Balay workt = work; 7279566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 72896b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 729831a3094SHong Zhang for (j = 0; j < n; j++) { 730831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 73149b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 73249b5e25fSSatish Balay workt += bs; 73349b5e25fSSatish Balay } 73449b5e25fSSatish Balay } 7359371c9d4SSatish Balay x += bs; 7369371c9d4SSatish Balay v += n * bs2; 7379371c9d4SSatish Balay z += bs; 7389371c9d4SSatish Balay ii++; 73949b5e25fSSatish Balay } 74049b5e25fSSatish Balay 7419566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7429566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 7439566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 7443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 74549b5e25fSSatish Balay } 74649b5e25fSSatish Balay 747d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 748d71ae5a4SJacob Faibussowitsch { 74949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 750d9ca1df4SBarry Smith PetscScalar *z, x1; 751d9ca1df4SBarry Smith const PetscScalar *x, *xb; 752d9ca1df4SBarry Smith const MatScalar *v; 753d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 754d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 75598c9bda7SSatish Balay PetscInt nonzerorow = 0; 756eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 757b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 758eb1ec7c1SStefano Zampini #else 759eb1ec7c1SStefano Zampini const int aconj = 0; 760eb1ec7c1SStefano Zampini #endif 76149b5e25fSSatish Balay 76249b5e25fSSatish Balay PetscFunctionBegin; 7639566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 7643ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 7659566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7669566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 76749b5e25fSSatish Balay v = a->a; 76849b5e25fSSatish Balay xb = x; 76949b5e25fSSatish Balay 77049b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 77149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 77249b5e25fSSatish Balay x1 = xb[0]; 77349b5e25fSSatish Balay ib = aj + *ai; 774831a3094SHong Zhang jmin = 0; 77598c9bda7SSatish Balay nonzerorow += (n > 0); 7763d9ade75SStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 7779371c9d4SSatish Balay z[i] += *v++ * x[*ib++]; 7789371c9d4SSatish Balay jmin++; 779831a3094SHong Zhang } 780eb1ec7c1SStefano Zampini if (aconj) { 781eb1ec7c1SStefano Zampini for (j = jmin; j < n; j++) { 782eb1ec7c1SStefano Zampini cval = *ib; 783eb1ec7c1SStefano Zampini z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 784eb1ec7c1SStefano Zampini z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 785eb1ec7c1SStefano Zampini } 786eb1ec7c1SStefano Zampini } else { 787831a3094SHong Zhang for (j = jmin; j < n; j++) { 78849b5e25fSSatish Balay cval = *ib; 78949b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 79049b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 79149b5e25fSSatish Balay } 792eb1ec7c1SStefano Zampini } 7939371c9d4SSatish Balay xb++; 7949371c9d4SSatish Balay ai++; 79549b5e25fSSatish Balay } 79649b5e25fSSatish Balay 7979566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7989566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 79949b5e25fSSatish Balay 8009566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 8013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 80249b5e25fSSatish Balay } 80349b5e25fSSatish Balay 804d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 805d71ae5a4SJacob Faibussowitsch { 80649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 807d9ca1df4SBarry Smith PetscScalar *z, x1, x2; 808d9ca1df4SBarry Smith const PetscScalar *x, *xb; 809d9ca1df4SBarry Smith const MatScalar *v; 810d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 811d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 81298c9bda7SSatish Balay PetscInt nonzerorow = 0; 81349b5e25fSSatish Balay 81449b5e25fSSatish Balay PetscFunctionBegin; 8159566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 8163ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 8179566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8189566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 81949b5e25fSSatish Balay 82049b5e25fSSatish Balay v = a->a; 82149b5e25fSSatish Balay xb = x; 82249b5e25fSSatish Balay 82349b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 82449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8259371c9d4SSatish Balay x1 = xb[0]; 8269371c9d4SSatish Balay x2 = xb[1]; 82749b5e25fSSatish Balay ib = aj + *ai; 828831a3094SHong Zhang jmin = 0; 82998c9bda7SSatish Balay nonzerorow += (n > 0); 83059689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 83149b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 83249b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 8339371c9d4SSatish Balay v += 4; 8349371c9d4SSatish Balay jmin++; 8357fbae186SHong Zhang } 836444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 837444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 838831a3094SHong Zhang for (j = jmin; j < n; j++) { 83949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 84049b5e25fSSatish Balay cval = ib[j] * 2; 84149b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 84249b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 84349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 84449b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 84549b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 84649b5e25fSSatish Balay v += 4; 84749b5e25fSSatish Balay } 8489371c9d4SSatish Balay xb += 2; 8499371c9d4SSatish Balay ai++; 85049b5e25fSSatish Balay } 8519566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8529566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 85349b5e25fSSatish Balay 8549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 8553ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 85649b5e25fSSatish Balay } 85749b5e25fSSatish Balay 858d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 859d71ae5a4SJacob Faibussowitsch { 86049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 861d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3; 862d9ca1df4SBarry Smith const PetscScalar *x, *xb; 863d9ca1df4SBarry Smith const MatScalar *v; 864d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 865d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 86698c9bda7SSatish Balay PetscInt nonzerorow = 0; 86749b5e25fSSatish Balay 86849b5e25fSSatish Balay PetscFunctionBegin; 8699566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 8703ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 8719566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8729566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 87349b5e25fSSatish Balay 87449b5e25fSSatish Balay v = a->a; 87549b5e25fSSatish Balay xb = x; 87649b5e25fSSatish Balay 87749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 87849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8799371c9d4SSatish Balay x1 = xb[0]; 8809371c9d4SSatish Balay x2 = xb[1]; 8819371c9d4SSatish Balay x3 = xb[2]; 88249b5e25fSSatish Balay ib = aj + *ai; 883831a3094SHong Zhang jmin = 0; 88498c9bda7SSatish Balay nonzerorow += (n > 0); 88559689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 88649b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 88749b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 88849b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 8899371c9d4SSatish Balay v += 9; 8909371c9d4SSatish Balay jmin++; 8917fbae186SHong Zhang } 892444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 893444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 894831a3094SHong Zhang for (j = jmin; j < n; j++) { 89549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 89649b5e25fSSatish Balay cval = ib[j] * 3; 89749b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 89849b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 89949b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 90049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 90149b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 90249b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 90349b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 90449b5e25fSSatish Balay v += 9; 90549b5e25fSSatish Balay } 9069371c9d4SSatish Balay xb += 3; 9079371c9d4SSatish Balay ai++; 90849b5e25fSSatish Balay } 90949b5e25fSSatish Balay 9109566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9119566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 91249b5e25fSSatish Balay 9139566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 9143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 91549b5e25fSSatish Balay } 91649b5e25fSSatish Balay 917d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 918d71ae5a4SJacob Faibussowitsch { 91949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 920d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4; 921d9ca1df4SBarry Smith const PetscScalar *x, *xb; 922d9ca1df4SBarry Smith const MatScalar *v; 923d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 924d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 92598c9bda7SSatish Balay PetscInt nonzerorow = 0; 92649b5e25fSSatish Balay 92749b5e25fSSatish Balay PetscFunctionBegin; 9289566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 9293ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 9309566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9319566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 93249b5e25fSSatish Balay 93349b5e25fSSatish Balay v = a->a; 93449b5e25fSSatish Balay xb = x; 93549b5e25fSSatish Balay 93649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 93749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 9389371c9d4SSatish Balay x1 = xb[0]; 9399371c9d4SSatish Balay x2 = xb[1]; 9409371c9d4SSatish Balay x3 = xb[2]; 9419371c9d4SSatish Balay x4 = xb[3]; 94249b5e25fSSatish Balay ib = aj + *ai; 943831a3094SHong Zhang jmin = 0; 94498c9bda7SSatish Balay nonzerorow += (n > 0); 94559689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 94649b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 94749b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 94849b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 94949b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 9509371c9d4SSatish Balay v += 16; 9519371c9d4SSatish Balay jmin++; 9527fbae186SHong Zhang } 953444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 954444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 955831a3094SHong Zhang for (j = jmin; j < n; j++) { 95649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 95749b5e25fSSatish Balay cval = ib[j] * 4; 95849b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 95949b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 96049b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 96149b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 96249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 96349b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 96449b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 96549b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 96649b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 96749b5e25fSSatish Balay v += 16; 96849b5e25fSSatish Balay } 9699371c9d4SSatish Balay xb += 4; 9709371c9d4SSatish Balay ai++; 97149b5e25fSSatish Balay } 97249b5e25fSSatish Balay 9739566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9749566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 97549b5e25fSSatish Balay 9769566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 9773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 97849b5e25fSSatish Balay } 97949b5e25fSSatish Balay 980d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 981d71ae5a4SJacob Faibussowitsch { 98249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 983d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5; 984d9ca1df4SBarry Smith const PetscScalar *x, *xb; 985d9ca1df4SBarry Smith const MatScalar *v; 986d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 987d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 98898c9bda7SSatish Balay PetscInt nonzerorow = 0; 98949b5e25fSSatish Balay 99049b5e25fSSatish Balay PetscFunctionBegin; 9919566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 9923ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 9939566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9949566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 99549b5e25fSSatish Balay 99649b5e25fSSatish Balay v = a->a; 99749b5e25fSSatish Balay xb = x; 99849b5e25fSSatish Balay 99949b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 100049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10019371c9d4SSatish Balay x1 = xb[0]; 10029371c9d4SSatish Balay x2 = xb[1]; 10039371c9d4SSatish Balay x3 = xb[2]; 10049371c9d4SSatish Balay x4 = xb[3]; 10059371c9d4SSatish Balay x5 = xb[4]; 100649b5e25fSSatish Balay ib = aj + *ai; 1007831a3094SHong Zhang jmin = 0; 100898c9bda7SSatish Balay nonzerorow += (n > 0); 100959689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 101049b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 101149b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 101249b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 101349b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 101449b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 10159371c9d4SSatish Balay v += 25; 10169371c9d4SSatish Balay jmin++; 10177fbae186SHong Zhang } 1018444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1019444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1020831a3094SHong Zhang for (j = jmin; j < n; j++) { 102149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 102249b5e25fSSatish Balay cval = ib[j] * 5; 102349b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 102449b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 102549b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 102649b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 102749b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 102849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 102949b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 103049b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 103149b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 103249b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 103349b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 103449b5e25fSSatish Balay v += 25; 103549b5e25fSSatish Balay } 10369371c9d4SSatish Balay xb += 5; 10379371c9d4SSatish Balay ai++; 103849b5e25fSSatish Balay } 103949b5e25fSSatish Balay 10409566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10419566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 104249b5e25fSSatish Balay 10439566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 10443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 104549b5e25fSSatish Balay } 1046c2916339SPierre Jolivet 1047d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1048d71ae5a4SJacob Faibussowitsch { 104949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1050d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6; 1051d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1052d9ca1df4SBarry Smith const MatScalar *v; 1053d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1054d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 105598c9bda7SSatish Balay PetscInt nonzerorow = 0; 105649b5e25fSSatish Balay 105749b5e25fSSatish Balay PetscFunctionBegin; 10589566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 10593ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 10609566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10619566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 106249b5e25fSSatish Balay 106349b5e25fSSatish Balay v = a->a; 106449b5e25fSSatish Balay xb = x; 106549b5e25fSSatish Balay 106649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 106749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10689371c9d4SSatish Balay x1 = xb[0]; 10699371c9d4SSatish Balay x2 = xb[1]; 10709371c9d4SSatish Balay x3 = xb[2]; 10719371c9d4SSatish Balay x4 = xb[3]; 10729371c9d4SSatish Balay x5 = xb[4]; 10739371c9d4SSatish Balay x6 = xb[5]; 107449b5e25fSSatish Balay ib = aj + *ai; 1075831a3094SHong Zhang jmin = 0; 107698c9bda7SSatish Balay nonzerorow += (n > 0); 107759689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 107849b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 107949b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 108049b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 108149b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 108249b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 108349b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 10849371c9d4SSatish Balay v += 36; 10859371c9d4SSatish Balay jmin++; 10867fbae186SHong Zhang } 1087444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1088444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1089831a3094SHong Zhang for (j = jmin; j < n; j++) { 109049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 109149b5e25fSSatish Balay cval = ib[j] * 6; 109249b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 109349b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 109449b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 109549b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 109649b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 109749b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 109849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 109949b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 110049b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 110149b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 110249b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 110349b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 110449b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 110549b5e25fSSatish Balay v += 36; 110649b5e25fSSatish Balay } 11079371c9d4SSatish Balay xb += 6; 11089371c9d4SSatish Balay ai++; 110949b5e25fSSatish Balay } 111049b5e25fSSatish Balay 11119566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11129566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 111349b5e25fSSatish Balay 11149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 11153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 111649b5e25fSSatish Balay } 111749b5e25fSSatish Balay 1118d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1119d71ae5a4SJacob Faibussowitsch { 112049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1121d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1122d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1123d9ca1df4SBarry Smith const MatScalar *v; 1124d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1125d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 112698c9bda7SSatish Balay PetscInt nonzerorow = 0; 112749b5e25fSSatish Balay 112849b5e25fSSatish Balay PetscFunctionBegin; 11299566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 11303ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 11319566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 11329566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 113349b5e25fSSatish Balay 113449b5e25fSSatish Balay v = a->a; 113549b5e25fSSatish Balay xb = x; 113649b5e25fSSatish Balay 113749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 113849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 11399371c9d4SSatish Balay x1 = xb[0]; 11409371c9d4SSatish Balay x2 = xb[1]; 11419371c9d4SSatish Balay x3 = xb[2]; 11429371c9d4SSatish Balay x4 = xb[3]; 11439371c9d4SSatish Balay x5 = xb[4]; 11449371c9d4SSatish Balay x6 = xb[5]; 11459371c9d4SSatish Balay x7 = xb[6]; 114649b5e25fSSatish Balay ib = aj + *ai; 1147831a3094SHong Zhang jmin = 0; 114898c9bda7SSatish Balay nonzerorow += (n > 0); 114959689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 115049b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 115149b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 115249b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 115349b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 115449b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 115549b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 115649b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 11579371c9d4SSatish Balay v += 49; 11589371c9d4SSatish Balay jmin++; 11597fbae186SHong Zhang } 1160444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1161444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1162831a3094SHong Zhang for (j = jmin; j < n; j++) { 116349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 116449b5e25fSSatish Balay cval = ib[j] * 7; 116549b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 116649b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 116749b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 116849b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 116949b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 117049b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 117149b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 117249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 117349b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 117449b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 117549b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 117649b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 117749b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 117849b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 117949b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 118049b5e25fSSatish Balay v += 49; 118149b5e25fSSatish Balay } 11829371c9d4SSatish Balay xb += 7; 11839371c9d4SSatish Balay ai++; 118449b5e25fSSatish Balay } 118549b5e25fSSatish Balay 11869566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11879566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 118849b5e25fSSatish Balay 11899566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 11903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 119149b5e25fSSatish Balay } 119249b5e25fSSatish Balay 1193d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1194d71ae5a4SJacob Faibussowitsch { 119549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1196f4259b30SLisandro Dalcin PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1197d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 1198d9ca1df4SBarry Smith const MatScalar *v; 1199d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1200d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 120198c9bda7SSatish Balay PetscInt nonzerorow = 0; 120249b5e25fSSatish Balay 120349b5e25fSSatish Balay PetscFunctionBegin; 12049566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 12053ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 12069371c9d4SSatish Balay PetscCall(VecGetArrayRead(xx, &x)); 12079371c9d4SSatish Balay x_ptr = x; 12089371c9d4SSatish Balay PetscCall(VecGetArray(zz, &z)); 12099371c9d4SSatish Balay z_ptr = z; 121049b5e25fSSatish Balay 121149b5e25fSSatish Balay aj = a->j; 121249b5e25fSSatish Balay v = a->a; 121349b5e25fSSatish Balay ii = a->i; 121449b5e25fSSatish Balay 121548a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 121649b5e25fSSatish Balay work = a->mult_work; 121749b5e25fSSatish Balay 121849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 12199371c9d4SSatish Balay n = ii[1] - ii[0]; 12209371c9d4SSatish Balay ncols = n * bs; 12219371c9d4SSatish Balay workt = work; 12229371c9d4SSatish Balay idx = aj + ii[0]; 122398c9bda7SSatish Balay nonzerorow += (n > 0); 122449b5e25fSSatish Balay 122549b5e25fSSatish Balay /* upper triangular part */ 122649b5e25fSSatish Balay for (j = 0; j < n; j++) { 122749b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 122849b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 122949b5e25fSSatish Balay workt += bs; 123049b5e25fSSatish Balay } 123149b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 123296b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 123349b5e25fSSatish Balay 123449b5e25fSSatish Balay /* strict lower triangular part */ 1235831a3094SHong Zhang idx = aj + ii[0]; 123659689ffbSStefano Zampini if (n && *idx == i) { 12379371c9d4SSatish Balay ncols -= bs; 12389371c9d4SSatish Balay v += bs2; 12399371c9d4SSatish Balay idx++; 12409371c9d4SSatish Balay n--; 1241831a3094SHong Zhang } 124249b5e25fSSatish Balay if (ncols > 0) { 124349b5e25fSSatish Balay workt = work; 12449566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 124596b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1246831a3094SHong Zhang for (j = 0; j < n; j++) { 1247831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 124849b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 124949b5e25fSSatish Balay workt += bs; 125049b5e25fSSatish Balay } 125149b5e25fSSatish Balay } 125249b5e25fSSatish Balay 12539371c9d4SSatish Balay x += bs; 12549371c9d4SSatish Balay v += n * bs2; 12559371c9d4SSatish Balay z += bs; 12569371c9d4SSatish Balay ii++; 125749b5e25fSSatish Balay } 125849b5e25fSSatish Balay 12599566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12609566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 126149b5e25fSSatish Balay 12629566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 12633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 126449b5e25fSSatish Balay } 126549b5e25fSSatish Balay 1266d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1267d71ae5a4SJacob Faibussowitsch { 126849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1269f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1270c5df96a5SBarry Smith PetscBLASInt one = 1, totalnz; 127149b5e25fSSatish Balay 127249b5e25fSSatish Balay PetscFunctionBegin; 12739566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1274792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 12759566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 12763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 127749b5e25fSSatish Balay } 127849b5e25fSSatish Balay 1279d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1280d71ae5a4SJacob Faibussowitsch { 128149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1282d9ca1df4SBarry Smith const MatScalar *v = a->a; 128349b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1284d9ca1df4SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1285d9ca1df4SBarry Smith const PetscInt *aj = a->j, *col; 128649b5e25fSSatish Balay 128749b5e25fSSatish Balay PetscFunctionBegin; 1288c40ae873SPierre Jolivet if (!a->nz) { 1289c40ae873SPierre Jolivet *norm = 0.0; 12903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1291c40ae873SPierre Jolivet } 129249b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 129349b5e25fSSatish Balay for (k = 0; k < mbs; k++) { 129459689ffbSStefano Zampini jmin = a->i[k]; 129559689ffbSStefano Zampini jmax = a->i[k + 1]; 1296831a3094SHong Zhang col = aj + jmin; 129759689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 129849b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 12999371c9d4SSatish Balay sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 13009371c9d4SSatish Balay v++; 130149b5e25fSSatish Balay } 1302831a3094SHong Zhang jmin++; 1303831a3094SHong Zhang } 1304831a3094SHong Zhang for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 130549b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 13069371c9d4SSatish Balay sum_off += PetscRealPart(PetscConj(*v) * (*v)); 13079371c9d4SSatish Balay v++; 130849b5e25fSSatish Balay } 130949b5e25fSSatish Balay } 131049b5e25fSSatish Balay } 13118f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 13129566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 13130b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 13149566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 13150b8dc8d2SHong Zhang for (i = 0; i < mbs; i++) jl[i] = mbs; 13160b8dc8d2SHong Zhang il[0] = 0; 131749b5e25fSSatish Balay 131849b5e25fSSatish Balay *norm = 0.0; 131949b5e25fSSatish Balay for (k = 0; k < mbs; k++) { /* k_th block row */ 132049b5e25fSSatish Balay for (j = 0; j < bs; j++) sum[j] = 0.0; 132149b5e25fSSatish Balay /*-- col sum --*/ 132249b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1323bbea24aaSStefano Zampini /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window) 1324831a3094SHong Zhang at step k */ 132549b5e25fSSatish Balay while (i < mbs) { 132649b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 132749b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 132849b5e25fSSatish Balay for (j = 0; j < bs; j++) { 132949b5e25fSSatish Balay v = a->a + ik * bs2 + j * bs; 133049b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13319371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13329371c9d4SSatish Balay v++; 133349b5e25fSSatish Balay } 133449b5e25fSSatish Balay } 133549b5e25fSSatish Balay /* update il, jl */ 1336831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1337831a3094SHong Zhang jmax = a->i[i + 1]; 133849b5e25fSSatish Balay if (jmin < jmax) { 133949b5e25fSSatish Balay il[i] = jmin; 134049b5e25fSSatish Balay j = a->j[jmin]; 13419371c9d4SSatish Balay jl[i] = jl[j]; 13429371c9d4SSatish Balay jl[j] = i; 134349b5e25fSSatish Balay } 134449b5e25fSSatish Balay i = nexti; 134549b5e25fSSatish Balay } 134649b5e25fSSatish Balay /*-- row sum --*/ 134759689ffbSStefano Zampini jmin = a->i[k]; 134859689ffbSStefano Zampini jmax = a->i[k + 1]; 134949b5e25fSSatish Balay for (i = jmin; i < jmax; i++) { 135049b5e25fSSatish Balay for (j = 0; j < bs; j++) { 135149b5e25fSSatish Balay v = a->a + i * bs2 + j; 135249b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13539371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13549371c9d4SSatish Balay v += bs; 135549b5e25fSSatish Balay } 135649b5e25fSSatish Balay } 135749b5e25fSSatish Balay } 135849b5e25fSSatish Balay /* add k_th block row to il, jl */ 1359831a3094SHong Zhang col = aj + jmin; 136059689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) jmin++; 136149b5e25fSSatish Balay if (jmin < jmax) { 136249b5e25fSSatish Balay il[k] = jmin; 13639371c9d4SSatish Balay j = a->j[jmin]; 13649371c9d4SSatish Balay jl[k] = jl[j]; 13659371c9d4SSatish Balay jl[j] = k; 136649b5e25fSSatish Balay } 136749b5e25fSSatish Balay for (j = 0; j < bs; j++) { 136849b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 136949b5e25fSSatish Balay } 137049b5e25fSSatish Balay } 13719566063dSJacob Faibussowitsch PetscCall(PetscFree3(sum, il, jl)); 13729566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1373f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 13743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 137549b5e25fSSatish Balay } 137649b5e25fSSatish Balay 1377d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1378d71ae5a4SJacob Faibussowitsch { 137949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 138049b5e25fSSatish Balay 138149b5e25fSSatish Balay PetscFunctionBegin; 138249b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1383d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1384ef511fbeSHong Zhang *flg = PETSC_FALSE; 13853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 138649b5e25fSSatish Balay } 138749b5e25fSSatish Balay 138849b5e25fSSatish Balay /* if the a->i are the same */ 13899566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 13903ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 139149b5e25fSSatish Balay 139249b5e25fSSatish Balay /* if a->j are the same */ 13939566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 13943ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 139526fbe8dcSKarl Rupp 139649b5e25fSSatish Balay /* if a->a are the same */ 13979566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 13983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 139949b5e25fSSatish Balay } 140049b5e25fSSatish Balay 1401d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1402d71ae5a4SJacob Faibussowitsch { 140349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1404d9ca1df4SBarry Smith PetscInt i, j, k, row, bs, ambs, bs2; 1405d9ca1df4SBarry Smith const PetscInt *ai, *aj; 140687828ca2SBarry Smith PetscScalar *x, zero = 0.0; 1407d9ca1df4SBarry Smith const MatScalar *aa, *aa_j; 140849b5e25fSSatish Balay 140949b5e25fSSatish Balay PetscFunctionBegin; 1410d0f46423SBarry Smith bs = A->rmap->bs; 1411aed4548fSBarry Smith PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 141282799104SHong Zhang 141349b5e25fSSatish Balay aa = a->a; 14148a0c6efdSHong Zhang ambs = a->mbs; 14158a0c6efdSHong Zhang 14168a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 14178a0c6efdSHong Zhang PetscInt *diag = a->diag; 14188a0c6efdSHong Zhang aa = a->a; 14198a0c6efdSHong Zhang ambs = a->mbs; 14209566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 14218a0c6efdSHong Zhang for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 14229566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 14233ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14248a0c6efdSHong Zhang } 14258a0c6efdSHong Zhang 142649b5e25fSSatish Balay ai = a->i; 142749b5e25fSSatish Balay aj = a->j; 142849b5e25fSSatish Balay bs2 = a->bs2; 14299566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 14303ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 14319566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 143249b5e25fSSatish Balay for (i = 0; i < ambs; i++) { 143349b5e25fSSatish Balay j = ai[i]; 143449b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 143549b5e25fSSatish Balay row = i * bs; 143649b5e25fSSatish Balay aa_j = aa + j * bs2; 143749b5e25fSSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 143849b5e25fSSatish Balay } 143949b5e25fSSatish Balay } 14409566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 14413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 144249b5e25fSSatish Balay } 144349b5e25fSSatish Balay 1444d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1445d71ae5a4SJacob Faibussowitsch { 144649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1447d9ca1df4SBarry Smith PetscScalar x; 1448d9ca1df4SBarry Smith const PetscScalar *l, *li, *ri; 144949b5e25fSSatish Balay MatScalar *aa, *v; 1450fff8e43fSBarry Smith PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1451fff8e43fSBarry Smith const PetscInt *ai, *aj; 1452ace3abfcSBarry Smith PetscBool flg; 145349b5e25fSSatish Balay 145449b5e25fSSatish Balay PetscFunctionBegin; 1455b3bf805bSHong Zhang if (ll != rr) { 14569566063dSJacob Faibussowitsch PetscCall(VecEqual(ll, rr, &flg)); 145728b400f6SJacob Faibussowitsch PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1458b3bf805bSHong Zhang } 14593ba16761SJacob Faibussowitsch if (!ll) PetscFunctionReturn(PETSC_SUCCESS); 146049b5e25fSSatish Balay ai = a->i; 146149b5e25fSSatish Balay aj = a->j; 146249b5e25fSSatish Balay aa = a->a; 1463d0f46423SBarry Smith m = A->rmap->N; 1464d0f46423SBarry Smith bs = A->rmap->bs; 146549b5e25fSSatish Balay mbs = a->mbs; 146649b5e25fSSatish Balay bs2 = a->bs2; 146749b5e25fSSatish Balay 14689566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 14699566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 147008401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 147149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 147249b5e25fSSatish Balay M = ai[i + 1] - ai[i]; 147349b5e25fSSatish Balay li = l + i * bs; 147449b5e25fSSatish Balay v = aa + bs2 * ai[i]; 147549b5e25fSSatish Balay for (j = 0; j < M; j++) { /* for each block */ 147649b5e25fSSatish Balay ri = l + bs * aj[ai[i] + j]; 14775e90f9d9SHong Zhang for (k = 0; k < bs; k++) { 147849b5e25fSSatish Balay x = ri[k]; 147949b5e25fSSatish Balay for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 148049b5e25fSSatish Balay } 148149b5e25fSSatish Balay } 148249b5e25fSSatish Balay } 14839566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 14849566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 14853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 148649b5e25fSSatish Balay } 148749b5e25fSSatish Balay 1488d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1489d71ae5a4SJacob Faibussowitsch { 149049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 149149b5e25fSSatish Balay 149249b5e25fSSatish Balay PetscFunctionBegin; 149349b5e25fSSatish Balay info->block_size = a->bs2; 1494ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 14956c6c5352SBarry Smith info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 14963966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 149749b5e25fSSatish Balay info->assemblies = A->num_ass; 14988e58a170SBarry Smith info->mallocs = A->info.mallocs; 14994dfa11a4SJacob Faibussowitsch info->memory = 0; /* REVIEW ME */ 1500d5f3da31SBarry Smith if (A->factortype) { 150149b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 150249b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 150349b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 150449b5e25fSSatish Balay } else { 150549b5e25fSSatish Balay info->fill_ratio_given = 0; 150649b5e25fSSatish Balay info->fill_ratio_needed = 0; 150749b5e25fSSatish Balay info->factor_mallocs = 0; 150849b5e25fSSatish Balay } 15093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 151049b5e25fSSatish Balay } 151149b5e25fSSatish Balay 1512d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1513d71ae5a4SJacob Faibussowitsch { 151449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 151549b5e25fSSatish Balay 151649b5e25fSSatish Balay PetscFunctionBegin; 15179566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 15183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 151949b5e25fSSatish Balay } 1520dc354874SHong Zhang 1521d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1522d71ae5a4SJacob Faibussowitsch { 1523dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1524d9ca1df4SBarry Smith PetscInt i, j, n, row, col, bs, mbs; 1525d9ca1df4SBarry Smith const PetscInt *ai, *aj; 1526c3fca9a7SHong Zhang PetscReal atmp; 1527d9ca1df4SBarry Smith const MatScalar *aa; 1528985db425SBarry Smith PetscScalar *x; 152913f74950SBarry Smith PetscInt ncols, brow, bcol, krow, kcol; 1530dc354874SHong Zhang 1531dc354874SHong Zhang PetscFunctionBegin; 153228b400f6SJacob Faibussowitsch PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 153328b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1534d0f46423SBarry Smith bs = A->rmap->bs; 1535dc354874SHong Zhang aa = a->a; 1536dc354874SHong Zhang ai = a->i; 1537dc354874SHong Zhang aj = a->j; 153844117c81SHong Zhang mbs = a->mbs; 1539dc354874SHong Zhang 15409566063dSJacob Faibussowitsch PetscCall(VecSet(v, 0.0)); 15419566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 15429566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 154308401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 154444117c81SHong Zhang for (i = 0; i < mbs; i++) { 15459371c9d4SSatish Balay ncols = ai[1] - ai[0]; 15469371c9d4SSatish Balay ai++; 1547d0f6400bSHong Zhang brow = bs * i; 154844117c81SHong Zhang for (j = 0; j < ncols; j++) { 1549d0f6400bSHong Zhang bcol = bs * (*aj); 155044117c81SHong Zhang for (kcol = 0; kcol < bs; kcol++) { 1551d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 155244117c81SHong Zhang for (krow = 0; krow < bs; krow++) { 15539371c9d4SSatish Balay atmp = PetscAbsScalar(*aa); 15549371c9d4SSatish Balay aa++; 1555d0f6400bSHong Zhang row = brow + krow; /* row index */ 1556c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1557c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 155844117c81SHong Zhang } 155944117c81SHong Zhang } 1560d0f6400bSHong Zhang aj++; 1561dc354874SHong Zhang } 1562dc354874SHong Zhang } 15639566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 15643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1565dc354874SHong Zhang } 1566c2916339SPierre Jolivet 1567d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1568d71ae5a4SJacob Faibussowitsch { 1569c2916339SPierre Jolivet PetscFunctionBegin; 15709566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 15714222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 15723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1573c2916339SPierre Jolivet } 1574c2916339SPierre Jolivet 157566976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1576d71ae5a4SJacob Faibussowitsch { 1577c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1578c2916339SPierre Jolivet PetscScalar *z = c; 1579c2916339SPierre Jolivet const PetscScalar *xb; 1580c2916339SPierre Jolivet PetscScalar x1; 1581c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1582c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 15833c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 1584b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 15853c2e41e1SStefano Zampini #else 15863c2e41e1SStefano Zampini const int aconj = 0; 15873c2e41e1SStefano Zampini #endif 1588c2916339SPierre Jolivet 1589c2916339SPierre Jolivet PetscFunctionBegin; 1590c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 15919371c9d4SSatish Balay n = ii[1] - ii[0]; 15929371c9d4SSatish Balay ii++; 1593c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1594c2916339SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1595c2916339SPierre Jolivet jj = idx; 1596c2916339SPierre Jolivet vv = v; 1597c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1598c2916339SPierre Jolivet idx = jj; 1599c2916339SPierre Jolivet v = vv; 1600c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16019371c9d4SSatish Balay xb = b + (*idx); 16029371c9d4SSatish Balay x1 = xb[0 + k * bm]; 1603c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1; 16043c2e41e1SStefano Zampini if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1605c2916339SPierre Jolivet v += 1; 1606c2916339SPierre Jolivet ++idx; 1607c2916339SPierre Jolivet } 1608c2916339SPierre Jolivet } 1609c2916339SPierre Jolivet z += 1; 1610c2916339SPierre Jolivet } 16113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1612c2916339SPierre Jolivet } 1613c2916339SPierre Jolivet 161466976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1615d71ae5a4SJacob Faibussowitsch { 1616c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1617c2916339SPierre Jolivet PetscScalar *z = c; 1618c2916339SPierre Jolivet const PetscScalar *xb; 1619c2916339SPierre Jolivet PetscScalar x1, x2; 1620c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1621c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1622c2916339SPierre Jolivet 1623c2916339SPierre Jolivet PetscFunctionBegin; 1624c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16259371c9d4SSatish Balay n = ii[1] - ii[0]; 16269371c9d4SSatish Balay ii++; 1627c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1628c2916339SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1629c2916339SPierre Jolivet jj = idx; 1630c2916339SPierre Jolivet vv = v; 1631c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1632c2916339SPierre Jolivet idx = jj; 1633c2916339SPierre Jolivet v = vv; 1634c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16359371c9d4SSatish Balay xb = b + 2 * (*idx); 16369371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16379371c9d4SSatish Balay x2 = xb[1 + k * bm]; 1638c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1639c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1640c2916339SPierre Jolivet if (*idx != i) { 1641c2916339SPierre Jolivet c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1642c2916339SPierre Jolivet c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1643c2916339SPierre Jolivet } 1644c2916339SPierre Jolivet v += 4; 1645c2916339SPierre Jolivet ++idx; 1646c2916339SPierre Jolivet } 1647c2916339SPierre Jolivet } 1648c2916339SPierre Jolivet z += 2; 1649c2916339SPierre Jolivet } 16503ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1651c2916339SPierre Jolivet } 1652c2916339SPierre Jolivet 165366976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1654d71ae5a4SJacob Faibussowitsch { 1655c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1656c2916339SPierre Jolivet PetscScalar *z = c; 1657c2916339SPierre Jolivet const PetscScalar *xb; 1658c2916339SPierre Jolivet PetscScalar x1, x2, x3; 1659c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1660c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1661c2916339SPierre Jolivet 1662c2916339SPierre Jolivet PetscFunctionBegin; 1663c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16649371c9d4SSatish Balay n = ii[1] - ii[0]; 16659371c9d4SSatish Balay ii++; 1666c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1667c2916339SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1668c2916339SPierre Jolivet jj = idx; 1669c2916339SPierre Jolivet vv = v; 1670c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1671c2916339SPierre Jolivet idx = jj; 1672c2916339SPierre Jolivet v = vv; 1673c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16749371c9d4SSatish Balay xb = b + 3 * (*idx); 16759371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16769371c9d4SSatish Balay x2 = xb[1 + k * bm]; 16779371c9d4SSatish Balay x3 = xb[2 + k * bm]; 1678c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1679c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1680c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1681c2916339SPierre Jolivet if (*idx != i) { 1682c2916339SPierre Jolivet c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1683c2916339SPierre Jolivet c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1684c2916339SPierre Jolivet c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1685c2916339SPierre Jolivet } 1686c2916339SPierre Jolivet v += 9; 1687c2916339SPierre Jolivet ++idx; 1688c2916339SPierre Jolivet } 1689c2916339SPierre Jolivet } 1690c2916339SPierre Jolivet z += 3; 1691c2916339SPierre Jolivet } 16923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1693c2916339SPierre Jolivet } 1694c2916339SPierre Jolivet 169566976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1696d71ae5a4SJacob Faibussowitsch { 1697c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1698c2916339SPierre Jolivet PetscScalar *z = c; 1699c2916339SPierre Jolivet const PetscScalar *xb; 1700c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4; 1701c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1702c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1703c2916339SPierre Jolivet 1704c2916339SPierre Jolivet PetscFunctionBegin; 1705c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17069371c9d4SSatish Balay n = ii[1] - ii[0]; 17079371c9d4SSatish Balay ii++; 1708c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1709c2916339SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1710c2916339SPierre Jolivet jj = idx; 1711c2916339SPierre Jolivet vv = v; 1712c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1713c2916339SPierre Jolivet idx = jj; 1714c2916339SPierre Jolivet v = vv; 1715c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17169371c9d4SSatish Balay xb = b + 4 * (*idx); 17179371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17189371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17199371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17209371c9d4SSatish Balay x4 = xb[3 + k * bm]; 1721c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1722c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1723c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1724c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1725c2916339SPierre Jolivet if (*idx != i) { 1726c2916339SPierre Jolivet c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1727c2916339SPierre Jolivet c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1728c2916339SPierre Jolivet c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1729c2916339SPierre Jolivet c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1730c2916339SPierre Jolivet } 1731c2916339SPierre Jolivet v += 16; 1732c2916339SPierre Jolivet ++idx; 1733c2916339SPierre Jolivet } 1734c2916339SPierre Jolivet } 1735c2916339SPierre Jolivet z += 4; 1736c2916339SPierre Jolivet } 17373ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1738c2916339SPierre Jolivet } 1739c2916339SPierre Jolivet 174066976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1741d71ae5a4SJacob Faibussowitsch { 1742c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1743c2916339SPierre Jolivet PetscScalar *z = c; 1744c2916339SPierre Jolivet const PetscScalar *xb; 1745c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 1746c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1747c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1748c2916339SPierre Jolivet 1749c2916339SPierre Jolivet PetscFunctionBegin; 1750c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17519371c9d4SSatish Balay n = ii[1] - ii[0]; 17529371c9d4SSatish Balay ii++; 1753c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1754c2916339SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1755c2916339SPierre Jolivet jj = idx; 1756c2916339SPierre Jolivet vv = v; 1757c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1758c2916339SPierre Jolivet idx = jj; 1759c2916339SPierre Jolivet v = vv; 1760c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17619371c9d4SSatish Balay xb = b + 5 * (*idx); 17629371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17639371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17649371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17659371c9d4SSatish Balay x4 = xb[3 + k * bm]; 17669371c9d4SSatish Balay x5 = xb[4 + k * cm]; 1767c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1768c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1769c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1770c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1771c2916339SPierre Jolivet z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1772c2916339SPierre Jolivet if (*idx != i) { 1773c2916339SPierre Jolivet c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1774c2916339SPierre Jolivet c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1775c2916339SPierre Jolivet c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1776c2916339SPierre Jolivet c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1777c2916339SPierre Jolivet c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1778c2916339SPierre Jolivet } 1779c2916339SPierre Jolivet v += 25; 1780c2916339SPierre Jolivet ++idx; 1781c2916339SPierre Jolivet } 1782c2916339SPierre Jolivet } 1783c2916339SPierre Jolivet z += 5; 1784c2916339SPierre Jolivet } 17853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1786c2916339SPierre Jolivet } 1787c2916339SPierre Jolivet 1788d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1789d71ae5a4SJacob Faibussowitsch { 1790c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1791c2916339SPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1792281439baSStefano Zampini Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1793c2916339SPierre Jolivet PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1794c2916339SPierre Jolivet PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1795c2916339SPierre Jolivet PetscBLASInt bbs, bcn, bbm, bcm; 1796f4259b30SLisandro Dalcin PetscScalar *z = NULL; 1797c2916339SPierre Jolivet PetscScalar *c, *b; 1798c2916339SPierre Jolivet const MatScalar *v; 1799c2916339SPierre Jolivet const PetscInt *idx, *ii; 1800c2916339SPierre Jolivet PetscScalar _DOne = 1.0; 1801c2916339SPierre Jolivet 1802c2916339SPierre Jolivet PetscFunctionBegin; 18033ba16761SJacob Faibussowitsch if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 180408401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 180508401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 180608401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1807c2916339SPierre Jolivet b = bd->v; 18089566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(C)); 18099566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 1810c2916339SPierre Jolivet switch (bs) { 1811d71ae5a4SJacob Faibussowitsch case 1: 1812d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1813d71ae5a4SJacob Faibussowitsch break; 1814d71ae5a4SJacob Faibussowitsch case 2: 1815d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1816d71ae5a4SJacob Faibussowitsch break; 1817d71ae5a4SJacob Faibussowitsch case 3: 1818d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1819d71ae5a4SJacob Faibussowitsch break; 1820d71ae5a4SJacob Faibussowitsch case 4: 1821d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1822d71ae5a4SJacob Faibussowitsch break; 1823d71ae5a4SJacob Faibussowitsch case 5: 1824d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1825d71ae5a4SJacob Faibussowitsch break; 1826c2916339SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 18279566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bs, &bbs)); 18289566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cn, &bcn)); 18299566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bm, &bbm)); 18309566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cm, &bcm)); 1831c2916339SPierre Jolivet idx = a->j; 1832c2916339SPierre Jolivet v = a->a; 1833c2916339SPierre Jolivet mbs = a->mbs; 1834c2916339SPierre Jolivet ii = a->i; 1835c2916339SPierre Jolivet z = c; 1836c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 18379371c9d4SSatish Balay n = ii[1] - ii[0]; 18389371c9d4SSatish Balay ii++; 1839c2916339SPierre Jolivet for (j = 0; j < n; j++) { 1840792fecdfSBarry Smith if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1841792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1842c2916339SPierre Jolivet v += bs2; 1843c2916339SPierre Jolivet } 1844c2916339SPierre Jolivet z += bs; 1845c2916339SPierre Jolivet } 1846c2916339SPierre Jolivet } 18479566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 18489566063dSJacob Faibussowitsch PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 18493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1850c2916339SPierre Jolivet } 1851