1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 2c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 3c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h> 4af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 5c6db04a5SJed Brown #include <petscbt.h> 6c6db04a5SJed Brown #include <petscblaslapack.h> 749b5e25fSSatish Balay 8d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 9d71ae5a4SJacob Faibussowitsch { 105eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 117bede89fSBarry Smith PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 125d0c19d7SBarry Smith const PetscInt *idx; 13db41cccfSHong Zhang PetscBT table_out, table_in; 14d94109b8SHong Zhang 15d94109b8SHong Zhang PetscFunctionBegin; 1608401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 17d94109b8SHong Zhang mbs = a->mbs; 18d94109b8SHong Zhang ai = a->i; 19d94109b8SHong Zhang aj = a->j; 20d0f46423SBarry Smith bs = A->rmap->bs; 219566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_out)); 229566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &nidx)); 239566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_in)); 24d94109b8SHong Zhang 25d94109b8SHong Zhang for (i = 0; i < is_max; i++) { /* for each is */ 26d94109b8SHong Zhang isz = 0; 279566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_out)); 28d94109b8SHong Zhang 29d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 309566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 319566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 32d94109b8SHong Zhang 33db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 34dbe03f88SHong Zhang bcol_max = 0; 35d94109b8SHong Zhang for (j = 0; j < n; ++j) { 36d94109b8SHong Zhang brow = idx[j] / bs; /* convert the indices into block indices */ 3708401ef6SPierre Jolivet PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 38db41cccfSHong Zhang if (!PetscBTLookupSet(table_out, brow)) { 39dbe03f88SHong Zhang nidx[isz++] = brow; 40dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 41dbe03f88SHong Zhang } 42d94109b8SHong Zhang } 439566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 449566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 45d94109b8SHong Zhang 46d94109b8SHong Zhang k = 0; 47d94109b8SHong Zhang for (j = 0; j < ov; j++) { /* for each overlap */ 48db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 499566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_in)); 509566063dSJacob Faibussowitsch for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 51d94109b8SHong Zhang 52d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 53d94109b8SHong Zhang for (brow = 0; brow < mbs; brow++) { 549371c9d4SSatish Balay start = ai[brow]; 559371c9d4SSatish Balay end = ai[brow + 1]; 56db41cccfSHong Zhang if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 57d94109b8SHong Zhang for (l = start; l < end; l++) { 58d94109b8SHong Zhang bcol = aj[l]; 59d7b97159SHong Zhang if (!PetscBTLookupSet(table_out, bcol)) { 60d7b97159SHong Zhang nidx[isz++] = bcol; 61d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 62d7b97159SHong Zhang } 63d94109b8SHong Zhang } 64d94109b8SHong Zhang k++; 65d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66da81f932SPierre Jolivet } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */ 67d94109b8SHong Zhang for (l = start; l < end; l++) { 68d94109b8SHong Zhang bcol = aj[l]; 69dbe03f88SHong Zhang if (bcol > bcol_max) break; 70db41cccfSHong Zhang if (PetscBTLookup(table_in, bcol)) { 7126fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 72d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 73d94109b8SHong Zhang } 74d94109b8SHong Zhang } 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } /* for each overlap */ 787bede89fSBarry Smith PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 797bede89fSBarry Smith } /* for each is */ 809566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_out)); 819566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 829566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_in)); 833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 8449b5e25fSSatish Balay } 8549b5e25fSSatish Balay 86847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 877cf5f706SPierre Jolivet Zero some ops' to avoid invalid use */ 88d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 89d71ae5a4SJacob Faibussowitsch { 9049b5e25fSSatish Balay PetscFunctionBegin; 919566063dSJacob Faibussowitsch PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 92f4259b30SLisandro Dalcin Bseq->ops->mult = NULL; 93f4259b30SLisandro Dalcin Bseq->ops->multadd = NULL; 94f4259b30SLisandro Dalcin Bseq->ops->multtranspose = NULL; 95f4259b30SLisandro Dalcin Bseq->ops->multtransposeadd = NULL; 96f4259b30SLisandro Dalcin Bseq->ops->lufactor = NULL; 97f4259b30SLisandro Dalcin Bseq->ops->choleskyfactor = NULL; 98f4259b30SLisandro Dalcin Bseq->ops->lufactorsymbolic = NULL; 99f4259b30SLisandro Dalcin Bseq->ops->choleskyfactorsymbolic = NULL; 100f4259b30SLisandro Dalcin Bseq->ops->getinertia = NULL; 1013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10249b5e25fSSatish Balay } 10349b5e25fSSatish Balay 1047dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 105fdfbdca6SPierre Jolivet static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym) 106d71ae5a4SJacob Faibussowitsch { 107fdfbdca6SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL; 108fdfbdca6SPierre Jolivet Mat_SeqBAIJ *d = NULL; 109e50a8114SHong Zhang PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 110e50a8114SHong Zhang PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 111e50a8114SHong Zhang const PetscInt *irow, *icol; 112e50a8114SHong Zhang PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 113e50a8114SHong Zhang PetscInt *aj = a->j, *ai = a->i; 114e50a8114SHong Zhang MatScalar *mat_a; 115e50a8114SHong Zhang Mat C; 116e50a8114SHong Zhang PetscBool flag; 117e50a8114SHong Zhang 118e50a8114SHong Zhang PetscFunctionBegin; 1199566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1209566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1219566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1229566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 123e50a8114SHong Zhang 1249566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 125e50a8114SHong Zhang ssmap = smap; 1269566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 127e50a8114SHong Zhang for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 128e50a8114SHong Zhang /* determine lens of each row */ 129e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 130e50a8114SHong Zhang kstart = ai[irow[i]]; 131e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 132e50a8114SHong Zhang lens[i] = 0; 133e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 134e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 135e50a8114SHong Zhang } 136e50a8114SHong Zhang } 137e50a8114SHong Zhang /* Create and fill new matrix */ 138e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 139fdfbdca6SPierre Jolivet if (sym) { 140e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)((*B)->data); 141e50a8114SHong Zhang 142aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1439566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 144fdfbdca6SPierre Jolivet PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 1459566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 146fdfbdca6SPierre Jolivet } else { 147fdfbdca6SPierre Jolivet d = (Mat_SeqBAIJ *)((*B)->data); 148fdfbdca6SPierre Jolivet 149fdfbdca6SPierre Jolivet PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 150fdfbdca6SPierre Jolivet PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag)); 151fdfbdca6SPierre Jolivet PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 152fdfbdca6SPierre Jolivet PetscCall(PetscArrayzero(d->ilen, d->mbs)); 153fdfbdca6SPierre Jolivet } 154e50a8114SHong Zhang C = *B; 155e50a8114SHong Zhang } else { 1569566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1579566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 158fdfbdca6SPierre Jolivet if (sym) { 1599566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1609566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 161fdfbdca6SPierre Jolivet } else { 162fdfbdca6SPierre Jolivet PetscCall(MatSetType(C, MATSEQBAIJ)); 163fdfbdca6SPierre Jolivet PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 164e50a8114SHong Zhang } 165fdfbdca6SPierre Jolivet } 166*f4f49eeaSPierre Jolivet if (sym) c = (Mat_SeqSBAIJ *)C->data; 167*f4f49eeaSPierre Jolivet else d = (Mat_SeqBAIJ *)C->data; 168e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 169e50a8114SHong Zhang row = irow[i]; 170e50a8114SHong Zhang kstart = ai[row]; 171e50a8114SHong Zhang kend = kstart + a->ilen[row]; 172fdfbdca6SPierre Jolivet if (sym) { 173e50a8114SHong Zhang mat_i = c->i[i]; 1748e3a54c0SPierre Jolivet mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 1758e3a54c0SPierre Jolivet mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 176e50a8114SHong Zhang mat_ilen = c->ilen + i; 177fdfbdca6SPierre Jolivet } else { 178fdfbdca6SPierre Jolivet mat_i = d->i[i]; 1798e3a54c0SPierre Jolivet mat_j = PetscSafePointerPlusOffset(d->j, mat_i); 1808e3a54c0SPierre Jolivet mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2); 181fdfbdca6SPierre Jolivet mat_ilen = d->ilen + i; 182fdfbdca6SPierre Jolivet } 183e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 184e50a8114SHong Zhang if ((tcol = ssmap[a->j[k]])) { 185e50a8114SHong Zhang *mat_j++ = tcol - 1; 1869566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 187e50a8114SHong Zhang mat_a += bs2; 188e50a8114SHong Zhang (*mat_ilen)++; 189e50a8114SHong Zhang } 190e50a8114SHong Zhang } 191e50a8114SHong Zhang } 192e50a8114SHong Zhang /* sort */ 193e50a8114SHong Zhang { 194e50a8114SHong Zhang MatScalar *work; 195e50a8114SHong Zhang 1969566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 197e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 198e50a8114SHong Zhang PetscInt ilen; 199fdfbdca6SPierre Jolivet if (sym) { 200e50a8114SHong Zhang mat_i = c->i[i]; 2018e3a54c0SPierre Jolivet mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 2028e3a54c0SPierre Jolivet mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 203e50a8114SHong Zhang ilen = c->ilen[i]; 204fdfbdca6SPierre Jolivet } else { 205fdfbdca6SPierre Jolivet mat_i = d->i[i]; 2068e3a54c0SPierre Jolivet mat_j = PetscSafePointerPlusOffset(d->j, mat_i); 2078e3a54c0SPierre Jolivet mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2); 208fdfbdca6SPierre Jolivet ilen = d->ilen[i]; 209fdfbdca6SPierre Jolivet } 2109566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 211e50a8114SHong Zhang } 2129566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 213e50a8114SHong Zhang } 214e50a8114SHong Zhang 215e50a8114SHong Zhang /* Free work space */ 2169566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 2179566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 2189566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 2199566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 2209566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 221e50a8114SHong Zhang 2229566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 223e50a8114SHong Zhang *B = C; 2243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 225e50a8114SHong Zhang } 226e50a8114SHong Zhang 227d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 228d71ae5a4SJacob Faibussowitsch { 229fdfbdca6SPierre Jolivet Mat C[2]; 230fdfbdca6SPierre Jolivet IS is1, is2, intersect = NULL; 231fdfbdca6SPierre Jolivet PetscInt n1, n2, ni; 232fdfbdca6SPierre Jolivet PetscBool sym = PETSC_TRUE; 23349b5e25fSSatish Balay 23449b5e25fSSatish Balay PetscFunctionBegin; 235f9a48b90SPierre Jolivet PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 236f9a48b90SPierre Jolivet if (isrow == iscol) { 237f9a48b90SPierre Jolivet is2 = is1; 238f9a48b90SPierre Jolivet PetscCall(PetscObjectReference((PetscObject)is2)); 239fdfbdca6SPierre Jolivet } else { 240fdfbdca6SPierre Jolivet PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 241fdfbdca6SPierre Jolivet PetscCall(ISIntersect(is1, is2, &intersect)); 242fdfbdca6SPierre Jolivet PetscCall(ISGetLocalSize(intersect, &ni)); 243fdfbdca6SPierre Jolivet PetscCall(ISDestroy(&intersect)); 244fdfbdca6SPierre Jolivet if (ni == 0) sym = PETSC_FALSE; 245fdfbdca6SPierre Jolivet else if (PetscDefined(USE_DEBUG)) { 246fdfbdca6SPierre Jolivet PetscCall(ISGetLocalSize(is1, &n1)); 247fdfbdca6SPierre Jolivet PetscCall(ISGetLocalSize(is2, &n2)); 248fdfbdca6SPierre Jolivet PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix"); 249fdfbdca6SPierre Jolivet } 250fdfbdca6SPierre Jolivet } 251fdfbdca6SPierre Jolivet if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym)); 252fdfbdca6SPierre Jolivet else { 253fdfbdca6SPierre Jolivet PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym)); 254fdfbdca6SPierre Jolivet PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym)); 255fdfbdca6SPierre Jolivet PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1)); 256fdfbdca6SPierre Jolivet PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN)); 257fdfbdca6SPierre Jolivet PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 258fdfbdca6SPierre Jolivet if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 259fdfbdca6SPierre Jolivet else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B)); 260fdfbdca6SPierre Jolivet else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 261fdfbdca6SPierre Jolivet PetscCall(MatDestroy(C)); 262fdfbdca6SPierre Jolivet PetscCall(MatDestroy(C + 1)); 263fdfbdca6SPierre Jolivet } 2649566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2659566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 266847daeccSHong Zhang 267fdfbdca6SPierre Jolivet if (sym && isrow != iscol) { 2688f46ffcaSHong Zhang PetscBool isequal; 2699566063dSJacob Faibussowitsch PetscCall(ISEqual(isrow, iscol, &isequal)); 27048a46eb9SPierre Jolivet if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 27149b5e25fSSatish Balay } 2723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 27349b5e25fSSatish Balay } 27449b5e25fSSatish Balay 275d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 276d71ae5a4SJacob Faibussowitsch { 27713f74950SBarry Smith PetscInt i; 27849b5e25fSSatish Balay 27949b5e25fSSatish Balay PetscFunctionBegin; 280314f503fSPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B)); 281e50a8114SHong Zhang 28248a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 2833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28449b5e25fSSatish Balay } 28549b5e25fSSatish Balay 28649b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 287d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 288d71ae5a4SJacob Faibussowitsch { 28949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 290d9ca1df4SBarry Smith PetscScalar *z, x1, x2, zero = 0.0; 291d9ca1df4SBarry Smith const PetscScalar *x, *xb; 292d9ca1df4SBarry Smith const MatScalar *v; 293d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 294d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 29598c9bda7SSatish Balay PetscInt nonzerorow = 0; 29649b5e25fSSatish Balay 29749b5e25fSSatish Balay PetscFunctionBegin; 2989566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 2993ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 3009566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3019566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 30249b5e25fSSatish Balay 30349b5e25fSSatish Balay v = a->a; 30449b5e25fSSatish Balay xb = x; 30549b5e25fSSatish Balay 30649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 30749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3089371c9d4SSatish Balay x1 = xb[0]; 3099371c9d4SSatish Balay x2 = xb[1]; 31049b5e25fSSatish Balay ib = aj + *ai; 311831a3094SHong Zhang jmin = 0; 31298c9bda7SSatish Balay nonzerorow += (n > 0); 3137fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 31449b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 31549b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 3169371c9d4SSatish Balay v += 4; 3179371c9d4SSatish Balay jmin++; 3187fbae186SHong Zhang } 319444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 320444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 321831a3094SHong Zhang for (j = jmin; j < n; j++) { 32249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 32349b5e25fSSatish Balay cval = ib[j] * 2; 32449b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 32549b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 32649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 32749b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 32849b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 32949b5e25fSSatish Balay v += 4; 33049b5e25fSSatish Balay } 3319371c9d4SSatish Balay xb += 2; 3329371c9d4SSatish Balay ai++; 33349b5e25fSSatish Balay } 33449b5e25fSSatish Balay 3359566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3369566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3379566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 3383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33949b5e25fSSatish Balay } 34049b5e25fSSatish Balay 341d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 342d71ae5a4SJacob Faibussowitsch { 34349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 344d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, zero = 0.0; 345d9ca1df4SBarry Smith const PetscScalar *x, *xb; 346d9ca1df4SBarry Smith const MatScalar *v; 347d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 348d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 34998c9bda7SSatish Balay PetscInt nonzerorow = 0; 35049b5e25fSSatish Balay 35149b5e25fSSatish Balay PetscFunctionBegin; 3529566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 3533ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 3549566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3559566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 35649b5e25fSSatish Balay 35749b5e25fSSatish Balay v = a->a; 35849b5e25fSSatish Balay xb = x; 35949b5e25fSSatish Balay 36049b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 36149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3629371c9d4SSatish Balay x1 = xb[0]; 3639371c9d4SSatish Balay x2 = xb[1]; 3649371c9d4SSatish Balay x3 = xb[2]; 36549b5e25fSSatish Balay ib = aj + *ai; 366831a3094SHong Zhang jmin = 0; 36798c9bda7SSatish Balay nonzerorow += (n > 0); 3687fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 36949b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 37049b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 37149b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3729371c9d4SSatish Balay v += 9; 3739371c9d4SSatish Balay jmin++; 3747fbae186SHong Zhang } 375444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 376444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 377831a3094SHong Zhang for (j = jmin; j < n; j++) { 37849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 37949b5e25fSSatish Balay cval = ib[j] * 3; 38049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 38149b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 38249b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 38349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 38449b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 38549b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 38649b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 38749b5e25fSSatish Balay v += 9; 38849b5e25fSSatish Balay } 3899371c9d4SSatish Balay xb += 3; 3909371c9d4SSatish Balay ai++; 39149b5e25fSSatish Balay } 39249b5e25fSSatish Balay 3939566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3949566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3959566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 3963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 39749b5e25fSSatish Balay } 39849b5e25fSSatish Balay 399d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 400d71ae5a4SJacob Faibussowitsch { 40149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 402d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 403d9ca1df4SBarry Smith const PetscScalar *x, *xb; 404d9ca1df4SBarry Smith const MatScalar *v; 405d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 406d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 40798c9bda7SSatish Balay PetscInt nonzerorow = 0; 40849b5e25fSSatish Balay 40949b5e25fSSatish Balay PetscFunctionBegin; 4109566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 4113ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 4129566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4139566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 41449b5e25fSSatish Balay 41549b5e25fSSatish Balay v = a->a; 41649b5e25fSSatish Balay xb = x; 41749b5e25fSSatish Balay 41849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 41949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4209371c9d4SSatish Balay x1 = xb[0]; 4219371c9d4SSatish Balay x2 = xb[1]; 4229371c9d4SSatish Balay x3 = xb[2]; 4239371c9d4SSatish Balay x4 = xb[3]; 42449b5e25fSSatish Balay ib = aj + *ai; 425831a3094SHong Zhang jmin = 0; 42698c9bda7SSatish Balay nonzerorow += (n > 0); 4277fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 42849b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 42949b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 43049b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 43149b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 4329371c9d4SSatish Balay v += 16; 4339371c9d4SSatish Balay jmin++; 4347fbae186SHong Zhang } 435444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 436444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 437831a3094SHong Zhang for (j = jmin; j < n; j++) { 43849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 43949b5e25fSSatish Balay cval = ib[j] * 4; 44049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 44149b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 44249b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 44349b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 44449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 44549b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 44649b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 44749b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 44849b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 44949b5e25fSSatish Balay v += 16; 45049b5e25fSSatish Balay } 4519371c9d4SSatish Balay xb += 4; 4529371c9d4SSatish Balay ai++; 45349b5e25fSSatish Balay } 45449b5e25fSSatish Balay 4559566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4569566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 4579566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 4583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 45949b5e25fSSatish Balay } 46049b5e25fSSatish Balay 461d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 462d71ae5a4SJacob Faibussowitsch { 46349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 464d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 465d9ca1df4SBarry Smith const PetscScalar *x, *xb; 466d9ca1df4SBarry Smith const MatScalar *v; 467d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 468d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 46998c9bda7SSatish Balay PetscInt nonzerorow = 0; 47049b5e25fSSatish Balay 47149b5e25fSSatish Balay PetscFunctionBegin; 4729566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 4733ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 4749566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4759566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 47649b5e25fSSatish Balay 47749b5e25fSSatish Balay v = a->a; 47849b5e25fSSatish Balay xb = x; 47949b5e25fSSatish Balay 48049b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 48149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4829371c9d4SSatish Balay x1 = xb[0]; 4839371c9d4SSatish Balay x2 = xb[1]; 4849371c9d4SSatish Balay x3 = xb[2]; 4859371c9d4SSatish Balay x4 = xb[3]; 4869371c9d4SSatish Balay x5 = xb[4]; 48749b5e25fSSatish Balay ib = aj + *ai; 488831a3094SHong Zhang jmin = 0; 48998c9bda7SSatish Balay nonzerorow += (n > 0); 4907fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 49149b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 49249b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 49349b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 49449b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 49549b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 4969371c9d4SSatish Balay v += 25; 4979371c9d4SSatish Balay jmin++; 4987fbae186SHong Zhang } 499444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 500444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 501831a3094SHong Zhang for (j = jmin; j < n; j++) { 50249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 50349b5e25fSSatish Balay cval = ib[j] * 5; 50449b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 50549b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 50649b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 50749b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 50849b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 50949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 51049b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 51149b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 51249b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 51349b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 51449b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 51549b5e25fSSatish Balay v += 25; 51649b5e25fSSatish Balay } 5179371c9d4SSatish Balay xb += 5; 5189371c9d4SSatish Balay ai++; 51949b5e25fSSatish Balay } 52049b5e25fSSatish Balay 5219566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5229566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5239566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 5243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 52549b5e25fSSatish Balay } 52649b5e25fSSatish Balay 527d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 528d71ae5a4SJacob Faibussowitsch { 52949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 530d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 531d9ca1df4SBarry Smith const PetscScalar *x, *xb; 532d9ca1df4SBarry Smith const MatScalar *v; 533d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 534d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 53598c9bda7SSatish Balay PetscInt nonzerorow = 0; 53649b5e25fSSatish Balay 53749b5e25fSSatish Balay PetscFunctionBegin; 5389566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 5393ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 5409566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5419566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 54249b5e25fSSatish Balay 54349b5e25fSSatish Balay v = a->a; 54449b5e25fSSatish Balay xb = x; 54549b5e25fSSatish Balay 54649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 54749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 5489371c9d4SSatish Balay x1 = xb[0]; 5499371c9d4SSatish Balay x2 = xb[1]; 5509371c9d4SSatish Balay x3 = xb[2]; 5519371c9d4SSatish Balay x4 = xb[3]; 5529371c9d4SSatish Balay x5 = xb[4]; 5539371c9d4SSatish Balay x6 = xb[5]; 55449b5e25fSSatish Balay ib = aj + *ai; 555831a3094SHong Zhang jmin = 0; 55698c9bda7SSatish Balay nonzerorow += (n > 0); 5577fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 55849b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 55949b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 56049b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 56149b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 56249b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 56349b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 5649371c9d4SSatish Balay v += 36; 5659371c9d4SSatish Balay jmin++; 5667fbae186SHong Zhang } 567444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 568444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 569831a3094SHong Zhang for (j = jmin; j < n; j++) { 57049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 57149b5e25fSSatish Balay cval = ib[j] * 6; 57249b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 57349b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 57449b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 57549b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 57649b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 57749b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 57849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 57949b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 58049b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 58149b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 58249b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 58349b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 58449b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 58549b5e25fSSatish Balay v += 36; 58649b5e25fSSatish Balay } 5879371c9d4SSatish Balay xb += 6; 5889371c9d4SSatish Balay ai++; 58949b5e25fSSatish Balay } 59049b5e25fSSatish Balay 5919566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5929566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5939566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 5943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 59549b5e25fSSatish Balay } 596c2916339SPierre Jolivet 597d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 598d71ae5a4SJacob Faibussowitsch { 59949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 600d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 601d9ca1df4SBarry Smith const PetscScalar *x, *xb; 602d9ca1df4SBarry Smith const MatScalar *v; 603d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 604d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 60598c9bda7SSatish Balay PetscInt nonzerorow = 0; 60649b5e25fSSatish Balay 60749b5e25fSSatish Balay PetscFunctionBegin; 6089566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 6093ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 6109566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6119566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 61249b5e25fSSatish Balay 61349b5e25fSSatish Balay v = a->a; 61449b5e25fSSatish Balay xb = x; 61549b5e25fSSatish Balay 61649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 61749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 6189371c9d4SSatish Balay x1 = xb[0]; 6199371c9d4SSatish Balay x2 = xb[1]; 6209371c9d4SSatish Balay x3 = xb[2]; 6219371c9d4SSatish Balay x4 = xb[3]; 6229371c9d4SSatish Balay x5 = xb[4]; 6239371c9d4SSatish Balay x6 = xb[5]; 6249371c9d4SSatish Balay x7 = xb[6]; 62549b5e25fSSatish Balay ib = aj + *ai; 626831a3094SHong Zhang jmin = 0; 62798c9bda7SSatish Balay nonzerorow += (n > 0); 6287fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 62949b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 63049b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 63149b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 63249b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 63349b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 63449b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 63549b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 6369371c9d4SSatish Balay v += 49; 6379371c9d4SSatish Balay jmin++; 6387fbae186SHong Zhang } 639444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 640444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 641831a3094SHong Zhang for (j = jmin; j < n; j++) { 64249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 64349b5e25fSSatish Balay cval = ib[j] * 7; 64449b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 64549b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 64649b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 64749b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 64849b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 64949b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 65049b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 65149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 65249b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 65349b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 65449b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 65549b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 65649b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 65749b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 65849b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 65949b5e25fSSatish Balay v += 49; 66049b5e25fSSatish Balay } 6619371c9d4SSatish Balay xb += 7; 6629371c9d4SSatish Balay ai++; 66349b5e25fSSatish Balay } 6649566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6659566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 6669566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 6673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 66849b5e25fSSatish Balay } 66949b5e25fSSatish Balay 67049b5e25fSSatish Balay /* 67149b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 67249b5e25fSSatish Balay */ 673d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 674d71ae5a4SJacob Faibussowitsch { 67549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 676d9ca1df4SBarry Smith PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 677d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 678d9ca1df4SBarry Smith const MatScalar *v; 679d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 680d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 68198c9bda7SSatish Balay PetscInt nonzerorow = 0; 68249b5e25fSSatish Balay 68349b5e25fSSatish Balay PetscFunctionBegin; 6849566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 6853ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 6869566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6879566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 68859689ffbSStefano Zampini 68959689ffbSStefano Zampini x_ptr = x; 69059689ffbSStefano Zampini z_ptr = z; 69149b5e25fSSatish Balay 69249b5e25fSSatish Balay aj = a->j; 69349b5e25fSSatish Balay v = a->a; 69449b5e25fSSatish Balay ii = a->i; 69549b5e25fSSatish Balay 69648a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 69749b5e25fSSatish Balay work = a->mult_work; 69849b5e25fSSatish Balay 69949b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 7009371c9d4SSatish Balay n = ii[1] - ii[0]; 7019371c9d4SSatish Balay ncols = n * bs; 7029371c9d4SSatish Balay workt = work; 7039371c9d4SSatish Balay idx = aj + ii[0]; 70498c9bda7SSatish Balay nonzerorow += (n > 0); 70549b5e25fSSatish Balay 70649b5e25fSSatish Balay /* upper triangular part */ 70749b5e25fSSatish Balay for (j = 0; j < n; j++) { 70849b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 70949b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 71049b5e25fSSatish Balay workt += bs; 71149b5e25fSSatish Balay } 71249b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 71396b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 71449b5e25fSSatish Balay 71549b5e25fSSatish Balay /* strict lower triangular part */ 716831a3094SHong Zhang idx = aj + ii[0]; 71759689ffbSStefano Zampini if (n && *idx == i) { 7189371c9d4SSatish Balay ncols -= bs; 7199371c9d4SSatish Balay v += bs2; 7209371c9d4SSatish Balay idx++; 7219371c9d4SSatish Balay n--; 722831a3094SHong Zhang } 72396b9376eSHong Zhang 72449b5e25fSSatish Balay if (ncols > 0) { 72549b5e25fSSatish Balay workt = work; 7269566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 72796b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 728831a3094SHong Zhang for (j = 0; j < n; j++) { 729831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 73049b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 73149b5e25fSSatish Balay workt += bs; 73249b5e25fSSatish Balay } 73349b5e25fSSatish Balay } 7349371c9d4SSatish Balay x += bs; 7359371c9d4SSatish Balay v += n * bs2; 7369371c9d4SSatish Balay z += bs; 7379371c9d4SSatish Balay ii++; 73849b5e25fSSatish Balay } 73949b5e25fSSatish Balay 7409566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7419566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 7429566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 7433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 74449b5e25fSSatish Balay } 74549b5e25fSSatish Balay 746d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 747d71ae5a4SJacob Faibussowitsch { 74849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 749d9ca1df4SBarry Smith PetscScalar *z, x1; 750d9ca1df4SBarry Smith const PetscScalar *x, *xb; 751d9ca1df4SBarry Smith const MatScalar *v; 752d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 753d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 75498c9bda7SSatish Balay PetscInt nonzerorow = 0; 755eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 756b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 757eb1ec7c1SStefano Zampini #else 758eb1ec7c1SStefano Zampini const int aconj = 0; 759eb1ec7c1SStefano Zampini #endif 76049b5e25fSSatish Balay 76149b5e25fSSatish Balay PetscFunctionBegin; 7629566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 7633ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 7649566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7659566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 76649b5e25fSSatish Balay v = a->a; 76749b5e25fSSatish Balay xb = x; 76849b5e25fSSatish Balay 76949b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 77049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 77149b5e25fSSatish Balay x1 = xb[0]; 77249b5e25fSSatish Balay ib = aj + *ai; 773831a3094SHong Zhang jmin = 0; 77498c9bda7SSatish Balay nonzerorow += (n > 0); 7753d9ade75SStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 7769371c9d4SSatish Balay z[i] += *v++ * x[*ib++]; 7779371c9d4SSatish Balay jmin++; 778831a3094SHong Zhang } 779eb1ec7c1SStefano Zampini if (aconj) { 780eb1ec7c1SStefano Zampini for (j = jmin; j < n; j++) { 781eb1ec7c1SStefano Zampini cval = *ib; 782eb1ec7c1SStefano Zampini z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 783eb1ec7c1SStefano Zampini z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 784eb1ec7c1SStefano Zampini } 785eb1ec7c1SStefano Zampini } else { 786831a3094SHong Zhang for (j = jmin; j < n; j++) { 78749b5e25fSSatish Balay cval = *ib; 78849b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 78949b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 79049b5e25fSSatish Balay } 791eb1ec7c1SStefano Zampini } 7929371c9d4SSatish Balay xb++; 7939371c9d4SSatish Balay ai++; 79449b5e25fSSatish Balay } 79549b5e25fSSatish Balay 7969566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7979566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 79849b5e25fSSatish Balay 7999566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 8003ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 80149b5e25fSSatish Balay } 80249b5e25fSSatish Balay 803d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 804d71ae5a4SJacob Faibussowitsch { 80549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 806d9ca1df4SBarry Smith PetscScalar *z, x1, x2; 807d9ca1df4SBarry Smith const PetscScalar *x, *xb; 808d9ca1df4SBarry Smith const MatScalar *v; 809d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 810d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 81198c9bda7SSatish Balay PetscInt nonzerorow = 0; 81249b5e25fSSatish Balay 81349b5e25fSSatish Balay PetscFunctionBegin; 8149566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 8153ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 8169566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8179566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 81849b5e25fSSatish Balay 81949b5e25fSSatish Balay v = a->a; 82049b5e25fSSatish Balay xb = x; 82149b5e25fSSatish Balay 82249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 82349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8249371c9d4SSatish Balay x1 = xb[0]; 8259371c9d4SSatish Balay x2 = xb[1]; 82649b5e25fSSatish Balay ib = aj + *ai; 827831a3094SHong Zhang jmin = 0; 82898c9bda7SSatish Balay nonzerorow += (n > 0); 82959689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 83049b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 83149b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 8329371c9d4SSatish Balay v += 4; 8339371c9d4SSatish Balay jmin++; 8347fbae186SHong Zhang } 835444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 836444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 837831a3094SHong Zhang for (j = jmin; j < n; j++) { 83849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 83949b5e25fSSatish Balay cval = ib[j] * 2; 84049b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 84149b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 84249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 84349b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 84449b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 84549b5e25fSSatish Balay v += 4; 84649b5e25fSSatish Balay } 8479371c9d4SSatish Balay xb += 2; 8489371c9d4SSatish Balay ai++; 84949b5e25fSSatish Balay } 8509566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8519566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 85249b5e25fSSatish Balay 8539566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 8543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 85549b5e25fSSatish Balay } 85649b5e25fSSatish Balay 857d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 858d71ae5a4SJacob Faibussowitsch { 85949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 860d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3; 861d9ca1df4SBarry Smith const PetscScalar *x, *xb; 862d9ca1df4SBarry Smith const MatScalar *v; 863d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 864d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 86598c9bda7SSatish Balay PetscInt nonzerorow = 0; 86649b5e25fSSatish Balay 86749b5e25fSSatish Balay PetscFunctionBegin; 8689566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 8693ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 8709566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8719566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 87249b5e25fSSatish Balay 87349b5e25fSSatish Balay v = a->a; 87449b5e25fSSatish Balay xb = x; 87549b5e25fSSatish Balay 87649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 87749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8789371c9d4SSatish Balay x1 = xb[0]; 8799371c9d4SSatish Balay x2 = xb[1]; 8809371c9d4SSatish Balay x3 = xb[2]; 88149b5e25fSSatish Balay ib = aj + *ai; 882831a3094SHong Zhang jmin = 0; 88398c9bda7SSatish Balay nonzerorow += (n > 0); 88459689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 88549b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 88649b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 88749b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 8889371c9d4SSatish Balay v += 9; 8899371c9d4SSatish Balay jmin++; 8907fbae186SHong Zhang } 891444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 892444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 893831a3094SHong Zhang for (j = jmin; j < n; j++) { 89449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 89549b5e25fSSatish Balay cval = ib[j] * 3; 89649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 89749b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 89849b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 89949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 90049b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 90149b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 90249b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 90349b5e25fSSatish Balay v += 9; 90449b5e25fSSatish Balay } 9059371c9d4SSatish Balay xb += 3; 9069371c9d4SSatish Balay ai++; 90749b5e25fSSatish Balay } 90849b5e25fSSatish Balay 9099566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9109566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 91149b5e25fSSatish Balay 9129566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 9133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 91449b5e25fSSatish Balay } 91549b5e25fSSatish Balay 916d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 917d71ae5a4SJacob Faibussowitsch { 91849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 919d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4; 920d9ca1df4SBarry Smith const PetscScalar *x, *xb; 921d9ca1df4SBarry Smith const MatScalar *v; 922d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 923d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 92498c9bda7SSatish Balay PetscInt nonzerorow = 0; 92549b5e25fSSatish Balay 92649b5e25fSSatish Balay PetscFunctionBegin; 9279566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 9283ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 9299566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9309566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 93149b5e25fSSatish Balay 93249b5e25fSSatish Balay v = a->a; 93349b5e25fSSatish Balay xb = x; 93449b5e25fSSatish Balay 93549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 93649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 9379371c9d4SSatish Balay x1 = xb[0]; 9389371c9d4SSatish Balay x2 = xb[1]; 9399371c9d4SSatish Balay x3 = xb[2]; 9409371c9d4SSatish Balay x4 = xb[3]; 94149b5e25fSSatish Balay ib = aj + *ai; 942831a3094SHong Zhang jmin = 0; 94398c9bda7SSatish Balay nonzerorow += (n > 0); 94459689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 94549b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 94649b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 94749b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 94849b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 9499371c9d4SSatish Balay v += 16; 9509371c9d4SSatish Balay jmin++; 9517fbae186SHong Zhang } 952444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 953444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 954831a3094SHong Zhang for (j = jmin; j < n; j++) { 95549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 95649b5e25fSSatish Balay cval = ib[j] * 4; 95749b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 95849b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 95949b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 96049b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 96149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 96249b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 96349b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 96449b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 96549b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 96649b5e25fSSatish Balay v += 16; 96749b5e25fSSatish Balay } 9689371c9d4SSatish Balay xb += 4; 9699371c9d4SSatish Balay ai++; 97049b5e25fSSatish Balay } 97149b5e25fSSatish Balay 9729566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9739566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 97449b5e25fSSatish Balay 9759566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 9763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 97749b5e25fSSatish Balay } 97849b5e25fSSatish Balay 979d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 980d71ae5a4SJacob Faibussowitsch { 98149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 982d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5; 983d9ca1df4SBarry Smith const PetscScalar *x, *xb; 984d9ca1df4SBarry Smith const MatScalar *v; 985d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 986d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 98798c9bda7SSatish Balay PetscInt nonzerorow = 0; 98849b5e25fSSatish Balay 98949b5e25fSSatish Balay PetscFunctionBegin; 9909566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 9913ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 9929566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9939566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 99449b5e25fSSatish Balay 99549b5e25fSSatish Balay v = a->a; 99649b5e25fSSatish Balay xb = x; 99749b5e25fSSatish Balay 99849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 99949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10009371c9d4SSatish Balay x1 = xb[0]; 10019371c9d4SSatish Balay x2 = xb[1]; 10029371c9d4SSatish Balay x3 = xb[2]; 10039371c9d4SSatish Balay x4 = xb[3]; 10049371c9d4SSatish Balay x5 = xb[4]; 100549b5e25fSSatish Balay ib = aj + *ai; 1006831a3094SHong Zhang jmin = 0; 100798c9bda7SSatish Balay nonzerorow += (n > 0); 100859689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 100949b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 101049b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 101149b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 101249b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 101349b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 10149371c9d4SSatish Balay v += 25; 10159371c9d4SSatish Balay jmin++; 10167fbae186SHong Zhang } 1017444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1018444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1019831a3094SHong Zhang for (j = jmin; j < n; j++) { 102049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 102149b5e25fSSatish Balay cval = ib[j] * 5; 102249b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 102349b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 102449b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 102549b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 102649b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 102749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 102849b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 102949b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 103049b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 103149b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 103249b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 103349b5e25fSSatish Balay v += 25; 103449b5e25fSSatish Balay } 10359371c9d4SSatish Balay xb += 5; 10369371c9d4SSatish Balay ai++; 103749b5e25fSSatish Balay } 103849b5e25fSSatish Balay 10399566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10409566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 104149b5e25fSSatish Balay 10429566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 10433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 104449b5e25fSSatish Balay } 1045c2916339SPierre Jolivet 1046d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1047d71ae5a4SJacob Faibussowitsch { 104849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1049d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6; 1050d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1051d9ca1df4SBarry Smith const MatScalar *v; 1052d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1053d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 105498c9bda7SSatish Balay PetscInt nonzerorow = 0; 105549b5e25fSSatish Balay 105649b5e25fSSatish Balay PetscFunctionBegin; 10579566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 10583ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 10599566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10609566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 106149b5e25fSSatish Balay 106249b5e25fSSatish Balay v = a->a; 106349b5e25fSSatish Balay xb = x; 106449b5e25fSSatish Balay 106549b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 106649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10679371c9d4SSatish Balay x1 = xb[0]; 10689371c9d4SSatish Balay x2 = xb[1]; 10699371c9d4SSatish Balay x3 = xb[2]; 10709371c9d4SSatish Balay x4 = xb[3]; 10719371c9d4SSatish Balay x5 = xb[4]; 10729371c9d4SSatish Balay x6 = xb[5]; 107349b5e25fSSatish Balay ib = aj + *ai; 1074831a3094SHong Zhang jmin = 0; 107598c9bda7SSatish Balay nonzerorow += (n > 0); 107659689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 107749b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 107849b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 107949b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 108049b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 108149b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 108249b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 10839371c9d4SSatish Balay v += 36; 10849371c9d4SSatish Balay jmin++; 10857fbae186SHong Zhang } 1086444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1087444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1088831a3094SHong Zhang for (j = jmin; j < n; j++) { 108949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 109049b5e25fSSatish Balay cval = ib[j] * 6; 109149b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 109249b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 109349b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 109449b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 109549b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 109649b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 109749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 109849b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 109949b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 110049b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 110149b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 110249b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 110349b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 110449b5e25fSSatish Balay v += 36; 110549b5e25fSSatish Balay } 11069371c9d4SSatish Balay xb += 6; 11079371c9d4SSatish Balay ai++; 110849b5e25fSSatish Balay } 110949b5e25fSSatish Balay 11109566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11119566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 111249b5e25fSSatish Balay 11139566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 11143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 111549b5e25fSSatish Balay } 111649b5e25fSSatish Balay 1117d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1118d71ae5a4SJacob Faibussowitsch { 111949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1120d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1121d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1122d9ca1df4SBarry Smith const MatScalar *v; 1123d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1124d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 112598c9bda7SSatish Balay PetscInt nonzerorow = 0; 112649b5e25fSSatish Balay 112749b5e25fSSatish Balay PetscFunctionBegin; 11289566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 11293ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 11309566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 11319566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 113249b5e25fSSatish Balay 113349b5e25fSSatish Balay v = a->a; 113449b5e25fSSatish Balay xb = x; 113549b5e25fSSatish Balay 113649b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 113749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 11389371c9d4SSatish Balay x1 = xb[0]; 11399371c9d4SSatish Balay x2 = xb[1]; 11409371c9d4SSatish Balay x3 = xb[2]; 11419371c9d4SSatish Balay x4 = xb[3]; 11429371c9d4SSatish Balay x5 = xb[4]; 11439371c9d4SSatish Balay x6 = xb[5]; 11449371c9d4SSatish Balay x7 = xb[6]; 114549b5e25fSSatish Balay ib = aj + *ai; 1146831a3094SHong Zhang jmin = 0; 114798c9bda7SSatish Balay nonzerorow += (n > 0); 114859689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 114949b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 115049b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 115149b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 115249b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 115349b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 115449b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 115549b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 11569371c9d4SSatish Balay v += 49; 11579371c9d4SSatish Balay jmin++; 11587fbae186SHong Zhang } 1159444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1160444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1161831a3094SHong Zhang for (j = jmin; j < n; j++) { 116249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 116349b5e25fSSatish Balay cval = ib[j] * 7; 116449b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 116549b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 116649b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 116749b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 116849b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 116949b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 117049b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 117149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 117249b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 117349b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 117449b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 117549b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 117649b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 117749b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 117849b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 117949b5e25fSSatish Balay v += 49; 118049b5e25fSSatish Balay } 11819371c9d4SSatish Balay xb += 7; 11829371c9d4SSatish Balay ai++; 118349b5e25fSSatish Balay } 118449b5e25fSSatish Balay 11859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11869566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 118749b5e25fSSatish Balay 11889566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 11893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 119049b5e25fSSatish Balay } 119149b5e25fSSatish Balay 1192d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1193d71ae5a4SJacob Faibussowitsch { 119449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1195f4259b30SLisandro Dalcin PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1196d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 1197d9ca1df4SBarry Smith const MatScalar *v; 1198d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1199d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 120098c9bda7SSatish Balay PetscInt nonzerorow = 0; 120149b5e25fSSatish Balay 120249b5e25fSSatish Balay PetscFunctionBegin; 12039566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 12043ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 12059371c9d4SSatish Balay PetscCall(VecGetArrayRead(xx, &x)); 12069371c9d4SSatish Balay x_ptr = x; 12079371c9d4SSatish Balay PetscCall(VecGetArray(zz, &z)); 12089371c9d4SSatish Balay z_ptr = z; 120949b5e25fSSatish Balay 121049b5e25fSSatish Balay aj = a->j; 121149b5e25fSSatish Balay v = a->a; 121249b5e25fSSatish Balay ii = a->i; 121349b5e25fSSatish Balay 121448a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 121549b5e25fSSatish Balay work = a->mult_work; 121649b5e25fSSatish Balay 121749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 12189371c9d4SSatish Balay n = ii[1] - ii[0]; 12199371c9d4SSatish Balay ncols = n * bs; 12209371c9d4SSatish Balay workt = work; 12219371c9d4SSatish Balay idx = aj + ii[0]; 122298c9bda7SSatish Balay nonzerorow += (n > 0); 122349b5e25fSSatish Balay 122449b5e25fSSatish Balay /* upper triangular part */ 122549b5e25fSSatish Balay for (j = 0; j < n; j++) { 122649b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 122749b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 122849b5e25fSSatish Balay workt += bs; 122949b5e25fSSatish Balay } 123049b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 123196b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 123249b5e25fSSatish Balay 123349b5e25fSSatish Balay /* strict lower triangular part */ 1234831a3094SHong Zhang idx = aj + ii[0]; 123559689ffbSStefano Zampini if (n && *idx == i) { 12369371c9d4SSatish Balay ncols -= bs; 12379371c9d4SSatish Balay v += bs2; 12389371c9d4SSatish Balay idx++; 12399371c9d4SSatish Balay n--; 1240831a3094SHong Zhang } 124149b5e25fSSatish Balay if (ncols > 0) { 124249b5e25fSSatish Balay workt = work; 12439566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 124496b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1245831a3094SHong Zhang for (j = 0; j < n; j++) { 1246831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 124749b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 124849b5e25fSSatish Balay workt += bs; 124949b5e25fSSatish Balay } 125049b5e25fSSatish Balay } 125149b5e25fSSatish Balay 12529371c9d4SSatish Balay x += bs; 12539371c9d4SSatish Balay v += n * bs2; 12549371c9d4SSatish Balay z += bs; 12559371c9d4SSatish Balay ii++; 125649b5e25fSSatish Balay } 125749b5e25fSSatish Balay 12589566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12599566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 126049b5e25fSSatish Balay 12619566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 12623ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 126349b5e25fSSatish Balay } 126449b5e25fSSatish Balay 1265d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1266d71ae5a4SJacob Faibussowitsch { 126749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1268f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1269c5df96a5SBarry Smith PetscBLASInt one = 1, totalnz; 127049b5e25fSSatish Balay 127149b5e25fSSatish Balay PetscFunctionBegin; 12729566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1273792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 12749566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 12753ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 127649b5e25fSSatish Balay } 127749b5e25fSSatish Balay 1278d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1279d71ae5a4SJacob Faibussowitsch { 128049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1281d9ca1df4SBarry Smith const MatScalar *v = a->a; 128249b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1283d9ca1df4SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1284d9ca1df4SBarry Smith const PetscInt *aj = a->j, *col; 128549b5e25fSSatish Balay 128649b5e25fSSatish Balay PetscFunctionBegin; 1287c40ae873SPierre Jolivet if (!a->nz) { 1288c40ae873SPierre Jolivet *norm = 0.0; 12893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1290c40ae873SPierre Jolivet } 129149b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 129249b5e25fSSatish Balay for (k = 0; k < mbs; k++) { 129359689ffbSStefano Zampini jmin = a->i[k]; 129459689ffbSStefano Zampini jmax = a->i[k + 1]; 1295831a3094SHong Zhang col = aj + jmin; 129659689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 129749b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 12989371c9d4SSatish Balay sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 12999371c9d4SSatish Balay v++; 130049b5e25fSSatish Balay } 1301831a3094SHong Zhang jmin++; 1302831a3094SHong Zhang } 1303831a3094SHong Zhang for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 130449b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 13059371c9d4SSatish Balay sum_off += PetscRealPart(PetscConj(*v) * (*v)); 13069371c9d4SSatish Balay v++; 130749b5e25fSSatish Balay } 130849b5e25fSSatish Balay } 130949b5e25fSSatish Balay } 13108f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 13119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 13120b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 13139566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 13140b8dc8d2SHong Zhang for (i = 0; i < mbs; i++) jl[i] = mbs; 13150b8dc8d2SHong Zhang il[0] = 0; 131649b5e25fSSatish Balay 131749b5e25fSSatish Balay *norm = 0.0; 131849b5e25fSSatish Balay for (k = 0; k < mbs; k++) { /* k_th block row */ 131949b5e25fSSatish Balay for (j = 0; j < bs; j++) sum[j] = 0.0; 132049b5e25fSSatish Balay /*-- col sum --*/ 132149b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1322bbea24aaSStefano Zampini /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window) 1323831a3094SHong Zhang at step k */ 132449b5e25fSSatish Balay while (i < mbs) { 132549b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 132649b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 132749b5e25fSSatish Balay for (j = 0; j < bs; j++) { 132849b5e25fSSatish Balay v = a->a + ik * bs2 + j * bs; 132949b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13309371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13319371c9d4SSatish Balay v++; 133249b5e25fSSatish Balay } 133349b5e25fSSatish Balay } 133449b5e25fSSatish Balay /* update il, jl */ 1335831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1336831a3094SHong Zhang jmax = a->i[i + 1]; 133749b5e25fSSatish Balay if (jmin < jmax) { 133849b5e25fSSatish Balay il[i] = jmin; 133949b5e25fSSatish Balay j = a->j[jmin]; 13409371c9d4SSatish Balay jl[i] = jl[j]; 13419371c9d4SSatish Balay jl[j] = i; 134249b5e25fSSatish Balay } 134349b5e25fSSatish Balay i = nexti; 134449b5e25fSSatish Balay } 134549b5e25fSSatish Balay /*-- row sum --*/ 134659689ffbSStefano Zampini jmin = a->i[k]; 134759689ffbSStefano Zampini jmax = a->i[k + 1]; 134849b5e25fSSatish Balay for (i = jmin; i < jmax; i++) { 134949b5e25fSSatish Balay for (j = 0; j < bs; j++) { 135049b5e25fSSatish Balay v = a->a + i * bs2 + j; 135149b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13529371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13539371c9d4SSatish Balay v += bs; 135449b5e25fSSatish Balay } 135549b5e25fSSatish Balay } 135649b5e25fSSatish Balay } 135749b5e25fSSatish Balay /* add k_th block row to il, jl */ 1358831a3094SHong Zhang col = aj + jmin; 135959689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) jmin++; 136049b5e25fSSatish Balay if (jmin < jmax) { 136149b5e25fSSatish Balay il[k] = jmin; 13629371c9d4SSatish Balay j = a->j[jmin]; 13639371c9d4SSatish Balay jl[k] = jl[j]; 13649371c9d4SSatish Balay jl[j] = k; 136549b5e25fSSatish Balay } 136649b5e25fSSatish Balay for (j = 0; j < bs; j++) { 136749b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 136849b5e25fSSatish Balay } 136949b5e25fSSatish Balay } 13709566063dSJacob Faibussowitsch PetscCall(PetscFree3(sum, il, jl)); 13719566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1372f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 13733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 137449b5e25fSSatish Balay } 137549b5e25fSSatish Balay 1376d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1377d71ae5a4SJacob Faibussowitsch { 137849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 137949b5e25fSSatish Balay 138049b5e25fSSatish Balay PetscFunctionBegin; 138149b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1382d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1383ef511fbeSHong Zhang *flg = PETSC_FALSE; 13843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 138549b5e25fSSatish Balay } 138649b5e25fSSatish Balay 138749b5e25fSSatish Balay /* if the a->i are the same */ 13889566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 13893ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 139049b5e25fSSatish Balay 139149b5e25fSSatish Balay /* if a->j are the same */ 13929566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 13933ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 139426fbe8dcSKarl Rupp 139549b5e25fSSatish Balay /* if a->a are the same */ 13969566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 13973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 139849b5e25fSSatish Balay } 139949b5e25fSSatish Balay 1400d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1401d71ae5a4SJacob Faibussowitsch { 140249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1403d9ca1df4SBarry Smith PetscInt i, j, k, row, bs, ambs, bs2; 1404d9ca1df4SBarry Smith const PetscInt *ai, *aj; 140587828ca2SBarry Smith PetscScalar *x, zero = 0.0; 1406d9ca1df4SBarry Smith const MatScalar *aa, *aa_j; 140749b5e25fSSatish Balay 140849b5e25fSSatish Balay PetscFunctionBegin; 1409d0f46423SBarry Smith bs = A->rmap->bs; 1410aed4548fSBarry Smith PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 141182799104SHong Zhang 141249b5e25fSSatish Balay aa = a->a; 14138a0c6efdSHong Zhang ambs = a->mbs; 14148a0c6efdSHong Zhang 14158a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 14168a0c6efdSHong Zhang PetscInt *diag = a->diag; 14178a0c6efdSHong Zhang aa = a->a; 14188a0c6efdSHong Zhang ambs = a->mbs; 14199566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 14208a0c6efdSHong Zhang for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 14219566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 14223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14238a0c6efdSHong Zhang } 14248a0c6efdSHong Zhang 142549b5e25fSSatish Balay ai = a->i; 142649b5e25fSSatish Balay aj = a->j; 142749b5e25fSSatish Balay bs2 = a->bs2; 14289566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 14293ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 14309566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 143149b5e25fSSatish Balay for (i = 0; i < ambs; i++) { 143249b5e25fSSatish Balay j = ai[i]; 143349b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 143449b5e25fSSatish Balay row = i * bs; 143549b5e25fSSatish Balay aa_j = aa + j * bs2; 143649b5e25fSSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 143749b5e25fSSatish Balay } 143849b5e25fSSatish Balay } 14399566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 14403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 144149b5e25fSSatish Balay } 144249b5e25fSSatish Balay 1443d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1444d71ae5a4SJacob Faibussowitsch { 144549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1446d9ca1df4SBarry Smith PetscScalar x; 1447d9ca1df4SBarry Smith const PetscScalar *l, *li, *ri; 144849b5e25fSSatish Balay MatScalar *aa, *v; 1449fff8e43fSBarry Smith PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1450fff8e43fSBarry Smith const PetscInt *ai, *aj; 1451ace3abfcSBarry Smith PetscBool flg; 145249b5e25fSSatish Balay 145349b5e25fSSatish Balay PetscFunctionBegin; 1454b3bf805bSHong Zhang if (ll != rr) { 14559566063dSJacob Faibussowitsch PetscCall(VecEqual(ll, rr, &flg)); 145628b400f6SJacob Faibussowitsch PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1457b3bf805bSHong Zhang } 14583ba16761SJacob Faibussowitsch if (!ll) PetscFunctionReturn(PETSC_SUCCESS); 145949b5e25fSSatish Balay ai = a->i; 146049b5e25fSSatish Balay aj = a->j; 146149b5e25fSSatish Balay aa = a->a; 1462d0f46423SBarry Smith m = A->rmap->N; 1463d0f46423SBarry Smith bs = A->rmap->bs; 146449b5e25fSSatish Balay mbs = a->mbs; 146549b5e25fSSatish Balay bs2 = a->bs2; 146649b5e25fSSatish Balay 14679566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 14689566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 146908401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 147049b5e25fSSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 147149b5e25fSSatish Balay M = ai[i + 1] - ai[i]; 147249b5e25fSSatish Balay li = l + i * bs; 147349b5e25fSSatish Balay v = aa + bs2 * ai[i]; 147449b5e25fSSatish Balay for (j = 0; j < M; j++) { /* for each block */ 147549b5e25fSSatish Balay ri = l + bs * aj[ai[i] + j]; 14765e90f9d9SHong Zhang for (k = 0; k < bs; k++) { 147749b5e25fSSatish Balay x = ri[k]; 147849b5e25fSSatish Balay for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 147949b5e25fSSatish Balay } 148049b5e25fSSatish Balay } 148149b5e25fSSatish Balay } 14829566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 14839566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 14843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 148549b5e25fSSatish Balay } 148649b5e25fSSatish Balay 1487d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1488d71ae5a4SJacob Faibussowitsch { 148949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 149049b5e25fSSatish Balay 149149b5e25fSSatish Balay PetscFunctionBegin; 149249b5e25fSSatish Balay info->block_size = a->bs2; 1493ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 14946c6c5352SBarry Smith info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 14953966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 149649b5e25fSSatish Balay info->assemblies = A->num_ass; 14978e58a170SBarry Smith info->mallocs = A->info.mallocs; 14984dfa11a4SJacob Faibussowitsch info->memory = 0; /* REVIEW ME */ 1499d5f3da31SBarry Smith if (A->factortype) { 150049b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 150149b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 150249b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 150349b5e25fSSatish Balay } else { 150449b5e25fSSatish Balay info->fill_ratio_given = 0; 150549b5e25fSSatish Balay info->fill_ratio_needed = 0; 150649b5e25fSSatish Balay info->factor_mallocs = 0; 150749b5e25fSSatish Balay } 15083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 150949b5e25fSSatish Balay } 151049b5e25fSSatish Balay 1511d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1512d71ae5a4SJacob Faibussowitsch { 151349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 151449b5e25fSSatish Balay 151549b5e25fSSatish Balay PetscFunctionBegin; 15169566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 15173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 151849b5e25fSSatish Balay } 1519dc354874SHong Zhang 1520d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1521d71ae5a4SJacob Faibussowitsch { 1522dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1523d9ca1df4SBarry Smith PetscInt i, j, n, row, col, bs, mbs; 1524d9ca1df4SBarry Smith const PetscInt *ai, *aj; 1525c3fca9a7SHong Zhang PetscReal atmp; 1526d9ca1df4SBarry Smith const MatScalar *aa; 1527985db425SBarry Smith PetscScalar *x; 152813f74950SBarry Smith PetscInt ncols, brow, bcol, krow, kcol; 1529dc354874SHong Zhang 1530dc354874SHong Zhang PetscFunctionBegin; 153128b400f6SJacob Faibussowitsch PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 153228b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1533d0f46423SBarry Smith bs = A->rmap->bs; 1534dc354874SHong Zhang aa = a->a; 1535dc354874SHong Zhang ai = a->i; 1536dc354874SHong Zhang aj = a->j; 153744117c81SHong Zhang mbs = a->mbs; 1538dc354874SHong Zhang 15399566063dSJacob Faibussowitsch PetscCall(VecSet(v, 0.0)); 15409566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 15419566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 154208401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 154344117c81SHong Zhang for (i = 0; i < mbs; i++) { 15449371c9d4SSatish Balay ncols = ai[1] - ai[0]; 15459371c9d4SSatish Balay ai++; 1546d0f6400bSHong Zhang brow = bs * i; 154744117c81SHong Zhang for (j = 0; j < ncols; j++) { 1548d0f6400bSHong Zhang bcol = bs * (*aj); 154944117c81SHong Zhang for (kcol = 0; kcol < bs; kcol++) { 1550d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 155144117c81SHong Zhang for (krow = 0; krow < bs; krow++) { 15529371c9d4SSatish Balay atmp = PetscAbsScalar(*aa); 15539371c9d4SSatish Balay aa++; 1554d0f6400bSHong Zhang row = brow + krow; /* row index */ 1555c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1556c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 155744117c81SHong Zhang } 155844117c81SHong Zhang } 1559d0f6400bSHong Zhang aj++; 1560dc354874SHong Zhang } 1561dc354874SHong Zhang } 15629566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 15633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1564dc354874SHong Zhang } 1565c2916339SPierre Jolivet 1566d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1567d71ae5a4SJacob Faibussowitsch { 1568c2916339SPierre Jolivet PetscFunctionBegin; 15699566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 15704222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 15713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1572c2916339SPierre Jolivet } 1573c2916339SPierre Jolivet 157466976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1575d71ae5a4SJacob Faibussowitsch { 1576c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1577c2916339SPierre Jolivet PetscScalar *z = c; 1578c2916339SPierre Jolivet const PetscScalar *xb; 1579c2916339SPierre Jolivet PetscScalar x1; 1580c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1581c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 15823c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 1583b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 15843c2e41e1SStefano Zampini #else 15853c2e41e1SStefano Zampini const int aconj = 0; 15863c2e41e1SStefano Zampini #endif 1587c2916339SPierre Jolivet 1588c2916339SPierre Jolivet PetscFunctionBegin; 1589c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 15909371c9d4SSatish Balay n = ii[1] - ii[0]; 15919371c9d4SSatish Balay ii++; 1592c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1593c2916339SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1594c2916339SPierre Jolivet jj = idx; 1595c2916339SPierre Jolivet vv = v; 1596c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1597c2916339SPierre Jolivet idx = jj; 1598c2916339SPierre Jolivet v = vv; 1599c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16009371c9d4SSatish Balay xb = b + (*idx); 16019371c9d4SSatish Balay x1 = xb[0 + k * bm]; 1602c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1; 16033c2e41e1SStefano Zampini if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1604c2916339SPierre Jolivet v += 1; 1605c2916339SPierre Jolivet ++idx; 1606c2916339SPierre Jolivet } 1607c2916339SPierre Jolivet } 1608c2916339SPierre Jolivet z += 1; 1609c2916339SPierre Jolivet } 16103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1611c2916339SPierre Jolivet } 1612c2916339SPierre Jolivet 161366976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1614d71ae5a4SJacob Faibussowitsch { 1615c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1616c2916339SPierre Jolivet PetscScalar *z = c; 1617c2916339SPierre Jolivet const PetscScalar *xb; 1618c2916339SPierre Jolivet PetscScalar x1, x2; 1619c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1620c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1621c2916339SPierre Jolivet 1622c2916339SPierre Jolivet PetscFunctionBegin; 1623c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16249371c9d4SSatish Balay n = ii[1] - ii[0]; 16259371c9d4SSatish Balay ii++; 1626c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1627c2916339SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1628c2916339SPierre Jolivet jj = idx; 1629c2916339SPierre Jolivet vv = v; 1630c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1631c2916339SPierre Jolivet idx = jj; 1632c2916339SPierre Jolivet v = vv; 1633c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16349371c9d4SSatish Balay xb = b + 2 * (*idx); 16359371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16369371c9d4SSatish Balay x2 = xb[1 + k * bm]; 1637c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1638c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1639c2916339SPierre Jolivet if (*idx != i) { 1640c2916339SPierre Jolivet c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1641c2916339SPierre Jolivet c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1642c2916339SPierre Jolivet } 1643c2916339SPierre Jolivet v += 4; 1644c2916339SPierre Jolivet ++idx; 1645c2916339SPierre Jolivet } 1646c2916339SPierre Jolivet } 1647c2916339SPierre Jolivet z += 2; 1648c2916339SPierre Jolivet } 16493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1650c2916339SPierre Jolivet } 1651c2916339SPierre Jolivet 165266976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1653d71ae5a4SJacob Faibussowitsch { 1654c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1655c2916339SPierre Jolivet PetscScalar *z = c; 1656c2916339SPierre Jolivet const PetscScalar *xb; 1657c2916339SPierre Jolivet PetscScalar x1, x2, x3; 1658c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1659c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1660c2916339SPierre Jolivet 1661c2916339SPierre Jolivet PetscFunctionBegin; 1662c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16639371c9d4SSatish Balay n = ii[1] - ii[0]; 16649371c9d4SSatish Balay ii++; 1665c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1666c2916339SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1667c2916339SPierre Jolivet jj = idx; 1668c2916339SPierre Jolivet vv = v; 1669c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1670c2916339SPierre Jolivet idx = jj; 1671c2916339SPierre Jolivet v = vv; 1672c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16739371c9d4SSatish Balay xb = b + 3 * (*idx); 16749371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16759371c9d4SSatish Balay x2 = xb[1 + k * bm]; 16769371c9d4SSatish Balay x3 = xb[2 + k * bm]; 1677c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1678c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1679c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1680c2916339SPierre Jolivet if (*idx != i) { 1681c2916339SPierre Jolivet c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1682c2916339SPierre Jolivet c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1683c2916339SPierre Jolivet c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1684c2916339SPierre Jolivet } 1685c2916339SPierre Jolivet v += 9; 1686c2916339SPierre Jolivet ++idx; 1687c2916339SPierre Jolivet } 1688c2916339SPierre Jolivet } 1689c2916339SPierre Jolivet z += 3; 1690c2916339SPierre Jolivet } 16913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1692c2916339SPierre Jolivet } 1693c2916339SPierre Jolivet 169466976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1695d71ae5a4SJacob Faibussowitsch { 1696c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1697c2916339SPierre Jolivet PetscScalar *z = c; 1698c2916339SPierre Jolivet const PetscScalar *xb; 1699c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4; 1700c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1701c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1702c2916339SPierre Jolivet 1703c2916339SPierre Jolivet PetscFunctionBegin; 1704c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17059371c9d4SSatish Balay n = ii[1] - ii[0]; 17069371c9d4SSatish Balay ii++; 1707c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1708c2916339SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1709c2916339SPierre Jolivet jj = idx; 1710c2916339SPierre Jolivet vv = v; 1711c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1712c2916339SPierre Jolivet idx = jj; 1713c2916339SPierre Jolivet v = vv; 1714c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17159371c9d4SSatish Balay xb = b + 4 * (*idx); 17169371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17179371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17189371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17199371c9d4SSatish Balay x4 = xb[3 + k * bm]; 1720c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1721c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1722c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1723c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1724c2916339SPierre Jolivet if (*idx != i) { 1725c2916339SPierre Jolivet c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1726c2916339SPierre Jolivet c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1727c2916339SPierre Jolivet c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1728c2916339SPierre Jolivet c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1729c2916339SPierre Jolivet } 1730c2916339SPierre Jolivet v += 16; 1731c2916339SPierre Jolivet ++idx; 1732c2916339SPierre Jolivet } 1733c2916339SPierre Jolivet } 1734c2916339SPierre Jolivet z += 4; 1735c2916339SPierre Jolivet } 17363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1737c2916339SPierre Jolivet } 1738c2916339SPierre Jolivet 173966976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1740d71ae5a4SJacob Faibussowitsch { 1741c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1742c2916339SPierre Jolivet PetscScalar *z = c; 1743c2916339SPierre Jolivet const PetscScalar *xb; 1744c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 1745c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1746c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1747c2916339SPierre Jolivet 1748c2916339SPierre Jolivet PetscFunctionBegin; 1749c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17509371c9d4SSatish Balay n = ii[1] - ii[0]; 17519371c9d4SSatish Balay ii++; 1752c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1753c2916339SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1754c2916339SPierre Jolivet jj = idx; 1755c2916339SPierre Jolivet vv = v; 1756c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1757c2916339SPierre Jolivet idx = jj; 1758c2916339SPierre Jolivet v = vv; 1759c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17609371c9d4SSatish Balay xb = b + 5 * (*idx); 17619371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17629371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17639371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17649371c9d4SSatish Balay x4 = xb[3 + k * bm]; 17659371c9d4SSatish Balay x5 = xb[4 + k * cm]; 1766c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1767c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1768c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1769c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1770c2916339SPierre Jolivet z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1771c2916339SPierre Jolivet if (*idx != i) { 1772c2916339SPierre Jolivet c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1773c2916339SPierre Jolivet c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1774c2916339SPierre Jolivet c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1775c2916339SPierre Jolivet c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1776c2916339SPierre Jolivet c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1777c2916339SPierre Jolivet } 1778c2916339SPierre Jolivet v += 25; 1779c2916339SPierre Jolivet ++idx; 1780c2916339SPierre Jolivet } 1781c2916339SPierre Jolivet } 1782c2916339SPierre Jolivet z += 5; 1783c2916339SPierre Jolivet } 17843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1785c2916339SPierre Jolivet } 1786c2916339SPierre Jolivet 1787d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1788d71ae5a4SJacob Faibussowitsch { 1789c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1790c2916339SPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1791281439baSStefano Zampini Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1792c2916339SPierre Jolivet PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1793c2916339SPierre Jolivet PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1794c2916339SPierre Jolivet PetscBLASInt bbs, bcn, bbm, bcm; 1795f4259b30SLisandro Dalcin PetscScalar *z = NULL; 1796c2916339SPierre Jolivet PetscScalar *c, *b; 1797c2916339SPierre Jolivet const MatScalar *v; 1798c2916339SPierre Jolivet const PetscInt *idx, *ii; 1799c2916339SPierre Jolivet PetscScalar _DOne = 1.0; 1800c2916339SPierre Jolivet 1801c2916339SPierre Jolivet PetscFunctionBegin; 18023ba16761SJacob Faibussowitsch if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 180308401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 180408401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 180508401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1806c2916339SPierre Jolivet b = bd->v; 18079566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(C)); 18089566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 1809c2916339SPierre Jolivet switch (bs) { 1810d71ae5a4SJacob Faibussowitsch case 1: 1811d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1812d71ae5a4SJacob Faibussowitsch break; 1813d71ae5a4SJacob Faibussowitsch case 2: 1814d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1815d71ae5a4SJacob Faibussowitsch break; 1816d71ae5a4SJacob Faibussowitsch case 3: 1817d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1818d71ae5a4SJacob Faibussowitsch break; 1819d71ae5a4SJacob Faibussowitsch case 4: 1820d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1821d71ae5a4SJacob Faibussowitsch break; 1822d71ae5a4SJacob Faibussowitsch case 5: 1823d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1824d71ae5a4SJacob Faibussowitsch break; 1825c2916339SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 18269566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bs, &bbs)); 18279566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cn, &bcn)); 18289566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bm, &bbm)); 18299566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cm, &bcm)); 1830c2916339SPierre Jolivet idx = a->j; 1831c2916339SPierre Jolivet v = a->a; 1832c2916339SPierre Jolivet mbs = a->mbs; 1833c2916339SPierre Jolivet ii = a->i; 1834c2916339SPierre Jolivet z = c; 1835c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 18369371c9d4SSatish Balay n = ii[1] - ii[0]; 18379371c9d4SSatish Balay ii++; 1838c2916339SPierre Jolivet for (j = 0; j < n; j++) { 1839792fecdfSBarry Smith if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1840792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1841c2916339SPierre Jolivet v += bs2; 1842c2916339SPierre Jolivet } 1843c2916339SPierre Jolivet z += bs; 1844c2916339SPierre Jolivet } 1845c2916339SPierre Jolivet } 18469566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 18479566063dSJacob Faibussowitsch PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 18483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1849c2916339SPierre Jolivet } 1850