149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h> 5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 6c6db04a5SJed Brown #include <petscbt.h> 7c6db04a5SJed Brown #include <petscblaslapack.h> 849b5e25fSSatish Balay 9d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 10d71ae5a4SJacob Faibussowitsch { 115eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 127bede89fSBarry Smith PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 135d0c19d7SBarry Smith const PetscInt *idx; 14db41cccfSHong Zhang PetscBT table_out, table_in; 15d94109b8SHong Zhang 16d94109b8SHong Zhang PetscFunctionBegin; 1708401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 18d94109b8SHong Zhang mbs = a->mbs; 19d94109b8SHong Zhang ai = a->i; 20d94109b8SHong Zhang aj = a->j; 21d0f46423SBarry Smith bs = A->rmap->bs; 229566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_out)); 239566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &nidx)); 249566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs, &table_in)); 25d94109b8SHong Zhang 26d94109b8SHong Zhang for (i = 0; i < is_max; i++) { /* for each is */ 27d94109b8SHong Zhang isz = 0; 289566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_out)); 29d94109b8SHong Zhang 30d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 319566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx)); 329566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n)); 33d94109b8SHong Zhang 34db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 35dbe03f88SHong Zhang bcol_max = 0; 36d94109b8SHong Zhang for (j = 0; j < n; ++j) { 37d94109b8SHong Zhang brow = idx[j] / bs; /* convert the indices into block indices */ 3808401ef6SPierre Jolivet PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 39db41cccfSHong Zhang if (!PetscBTLookupSet(table_out, brow)) { 40dbe03f88SHong Zhang nidx[isz++] = brow; 41dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 42dbe03f88SHong Zhang } 43d94109b8SHong Zhang } 449566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx)); 459566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 46d94109b8SHong Zhang 47d94109b8SHong Zhang k = 0; 48d94109b8SHong Zhang for (j = 0; j < ov; j++) { /* for each overlap */ 49db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 509566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs, table_in)); 519566063dSJacob Faibussowitsch for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 52d94109b8SHong Zhang 53d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 54d94109b8SHong Zhang for (brow = 0; brow < mbs; brow++) { 559371c9d4SSatish Balay start = ai[brow]; 569371c9d4SSatish Balay end = ai[brow + 1]; 57db41cccfSHong Zhang if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58d94109b8SHong Zhang for (l = start; l < end; l++) { 59d94109b8SHong Zhang bcol = aj[l]; 60d7b97159SHong Zhang if (!PetscBTLookupSet(table_out, bcol)) { 61d7b97159SHong Zhang nidx[isz++] = bcol; 62d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 63d7b97159SHong Zhang } 64d94109b8SHong Zhang } 65d94109b8SHong Zhang k++; 66d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67da81f932SPierre Jolivet } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */ 68d94109b8SHong Zhang for (l = start; l < end; l++) { 69d94109b8SHong Zhang bcol = aj[l]; 70dbe03f88SHong Zhang if (bcol > bcol_max) break; 71db41cccfSHong Zhang if (PetscBTLookup(table_in, bcol)) { 7226fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 73d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 74d94109b8SHong Zhang } 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } /* for each overlap */ 797bede89fSBarry Smith PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 807bede89fSBarry Smith } /* for each is */ 819566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_out)); 829566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 839566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_in)); 843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 8549b5e25fSSatish Balay } 8649b5e25fSSatish Balay 87847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 887cf5f706SPierre Jolivet Zero some ops' to avoid invalid use */ 89d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 90d71ae5a4SJacob Faibussowitsch { 9149b5e25fSSatish Balay PetscFunctionBegin; 929566063dSJacob Faibussowitsch PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 93f4259b30SLisandro Dalcin Bseq->ops->mult = NULL; 94f4259b30SLisandro Dalcin Bseq->ops->multadd = NULL; 95f4259b30SLisandro Dalcin Bseq->ops->multtranspose = NULL; 96f4259b30SLisandro Dalcin Bseq->ops->multtransposeadd = NULL; 97f4259b30SLisandro Dalcin Bseq->ops->lufactor = NULL; 98f4259b30SLisandro Dalcin Bseq->ops->choleskyfactor = NULL; 99f4259b30SLisandro Dalcin Bseq->ops->lufactorsymbolic = NULL; 100f4259b30SLisandro Dalcin Bseq->ops->choleskyfactorsymbolic = NULL; 101f4259b30SLisandro Dalcin Bseq->ops->getinertia = NULL; 1023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10349b5e25fSSatish Balay } 10449b5e25fSSatish Balay 1057dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 106*fdfbdca6SPierre Jolivet static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym) 107d71ae5a4SJacob Faibussowitsch { 108*fdfbdca6SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL; 109*fdfbdca6SPierre Jolivet Mat_SeqBAIJ *d = NULL; 110e50a8114SHong Zhang PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 111e50a8114SHong Zhang PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 112e50a8114SHong Zhang const PetscInt *irow, *icol; 113e50a8114SHong Zhang PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 114e50a8114SHong Zhang PetscInt *aj = a->j, *ai = a->i; 115e50a8114SHong Zhang MatScalar *mat_a; 116e50a8114SHong Zhang Mat C; 117e50a8114SHong Zhang PetscBool flag; 118e50a8114SHong Zhang 119e50a8114SHong Zhang PetscFunctionBegin; 120e50a8114SHong Zhang 1219566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &irow)); 1229566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &icol)); 1239566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow, &nrows)); 1249566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol, &ncols)); 125e50a8114SHong Zhang 1269566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1 + oldcols, &smap)); 127e50a8114SHong Zhang ssmap = smap; 1289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1 + nrows, &lens)); 129e50a8114SHong Zhang for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 130e50a8114SHong Zhang /* determine lens of each row */ 131e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 132e50a8114SHong Zhang kstart = ai[irow[i]]; 133e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 134e50a8114SHong Zhang lens[i] = 0; 135e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 136e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 137e50a8114SHong Zhang } 138e50a8114SHong Zhang } 139e50a8114SHong Zhang /* Create and fill new matrix */ 140e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 141*fdfbdca6SPierre Jolivet if (sym) { 142e50a8114SHong Zhang c = (Mat_SeqSBAIJ *)((*B)->data); 143e50a8114SHong Zhang 144aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 1459566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 146*fdfbdca6SPierre Jolivet PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 1479566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen, c->mbs)); 148*fdfbdca6SPierre Jolivet } else { 149*fdfbdca6SPierre Jolivet d = (Mat_SeqBAIJ *)((*B)->data); 150*fdfbdca6SPierre Jolivet 151*fdfbdca6SPierre Jolivet PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 152*fdfbdca6SPierre Jolivet PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag)); 153*fdfbdca6SPierre Jolivet PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 154*fdfbdca6SPierre Jolivet PetscCall(PetscArrayzero(d->ilen, d->mbs)); 155*fdfbdca6SPierre Jolivet } 156e50a8114SHong Zhang C = *B; 157e50a8114SHong Zhang } else { 1589566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 1599566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 160*fdfbdca6SPierre Jolivet if (sym) { 1619566063dSJacob Faibussowitsch PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 1629566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 163*fdfbdca6SPierre Jolivet } else { 164*fdfbdca6SPierre Jolivet PetscCall(MatSetType(C, MATSEQBAIJ)); 165*fdfbdca6SPierre Jolivet PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 166e50a8114SHong Zhang } 167*fdfbdca6SPierre Jolivet } 168*fdfbdca6SPierre Jolivet if (sym) c = (Mat_SeqSBAIJ *)(C->data); 169*fdfbdca6SPierre Jolivet else d = (Mat_SeqBAIJ *)(C->data); 170e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 171e50a8114SHong Zhang row = irow[i]; 172e50a8114SHong Zhang kstart = ai[row]; 173e50a8114SHong Zhang kend = kstart + a->ilen[row]; 174*fdfbdca6SPierre Jolivet if (sym) { 175e50a8114SHong Zhang mat_i = c->i[i]; 176e50a8114SHong Zhang mat_j = c->j + mat_i; 177e50a8114SHong Zhang mat_a = c->a + mat_i * bs2; 178e50a8114SHong Zhang mat_ilen = c->ilen + i; 179*fdfbdca6SPierre Jolivet } else { 180*fdfbdca6SPierre Jolivet mat_i = d->i[i]; 181*fdfbdca6SPierre Jolivet mat_j = d->j + mat_i; 182*fdfbdca6SPierre Jolivet mat_a = d->a + mat_i * bs2; 183*fdfbdca6SPierre Jolivet mat_ilen = d->ilen + i; 184*fdfbdca6SPierre Jolivet } 185e50a8114SHong Zhang for (k = kstart; k < kend; k++) { 186e50a8114SHong Zhang if ((tcol = ssmap[a->j[k]])) { 187e50a8114SHong Zhang *mat_j++ = tcol - 1; 1889566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 189e50a8114SHong Zhang mat_a += bs2; 190e50a8114SHong Zhang (*mat_ilen)++; 191e50a8114SHong Zhang } 192e50a8114SHong Zhang } 193e50a8114SHong Zhang } 194e50a8114SHong Zhang /* sort */ 195e50a8114SHong Zhang { 196e50a8114SHong Zhang MatScalar *work; 197e50a8114SHong Zhang 1989566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2, &work)); 199e50a8114SHong Zhang for (i = 0; i < nrows; i++) { 200e50a8114SHong Zhang PetscInt ilen; 201*fdfbdca6SPierre Jolivet if (sym) { 202e50a8114SHong Zhang mat_i = c->i[i]; 203e50a8114SHong Zhang mat_j = c->j + mat_i; 204e50a8114SHong Zhang mat_a = c->a + mat_i * bs2; 205e50a8114SHong Zhang ilen = c->ilen[i]; 206*fdfbdca6SPierre Jolivet } else { 207*fdfbdca6SPierre Jolivet mat_i = d->i[i]; 208*fdfbdca6SPierre Jolivet mat_j = d->j + mat_i; 209*fdfbdca6SPierre Jolivet mat_a = d->a + mat_i * bs2; 210*fdfbdca6SPierre Jolivet ilen = d->ilen[i]; 211*fdfbdca6SPierre Jolivet } 2129566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 213e50a8114SHong Zhang } 2149566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 215e50a8114SHong Zhang } 216e50a8114SHong Zhang 217e50a8114SHong Zhang /* Free work space */ 2189566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &icol)); 2199566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 2209566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 2219566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 2229566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 223e50a8114SHong Zhang 2249566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &irow)); 225e50a8114SHong Zhang *B = C; 2263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 227e50a8114SHong Zhang } 228e50a8114SHong Zhang 229d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 230d71ae5a4SJacob Faibussowitsch { 231*fdfbdca6SPierre Jolivet Mat C[2]; 232*fdfbdca6SPierre Jolivet IS is1, is2, intersect = NULL; 233*fdfbdca6SPierre Jolivet PetscInt n1, n2, ni; 234*fdfbdca6SPierre Jolivet PetscBool sym = PETSC_TRUE; 23549b5e25fSSatish Balay 23649b5e25fSSatish Balay PetscFunctionBegin; 237f9a48b90SPierre Jolivet PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 238f9a48b90SPierre Jolivet if (isrow == iscol) { 239f9a48b90SPierre Jolivet is2 = is1; 240f9a48b90SPierre Jolivet PetscCall(PetscObjectReference((PetscObject)is2)); 241*fdfbdca6SPierre Jolivet } else { 242*fdfbdca6SPierre Jolivet PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 243*fdfbdca6SPierre Jolivet PetscCall(ISIntersect(is1, is2, &intersect)); 244*fdfbdca6SPierre Jolivet PetscCall(ISGetLocalSize(intersect, &ni)); 245*fdfbdca6SPierre Jolivet PetscCall(ISDestroy(&intersect)); 246*fdfbdca6SPierre Jolivet if (ni == 0) sym = PETSC_FALSE; 247*fdfbdca6SPierre Jolivet else if (PetscDefined(USE_DEBUG)) { 248*fdfbdca6SPierre Jolivet PetscCall(ISGetLocalSize(is1, &n1)); 249*fdfbdca6SPierre Jolivet PetscCall(ISGetLocalSize(is2, &n2)); 250*fdfbdca6SPierre Jolivet PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix"); 251*fdfbdca6SPierre Jolivet } 252*fdfbdca6SPierre Jolivet } 253*fdfbdca6SPierre Jolivet if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym)); 254*fdfbdca6SPierre Jolivet else { 255*fdfbdca6SPierre Jolivet PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym)); 256*fdfbdca6SPierre Jolivet PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym)); 257*fdfbdca6SPierre Jolivet PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1)); 258*fdfbdca6SPierre Jolivet PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN)); 259*fdfbdca6SPierre Jolivet PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 260*fdfbdca6SPierre Jolivet if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 261*fdfbdca6SPierre Jolivet else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B)); 262*fdfbdca6SPierre Jolivet else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 263*fdfbdca6SPierre Jolivet PetscCall(MatDestroy(C)); 264*fdfbdca6SPierre Jolivet PetscCall(MatDestroy(C + 1)); 265*fdfbdca6SPierre Jolivet } 2669566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2679566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 268847daeccSHong Zhang 269*fdfbdca6SPierre Jolivet if (sym && isrow != iscol) { 2708f46ffcaSHong Zhang PetscBool isequal; 2719566063dSJacob Faibussowitsch PetscCall(ISEqual(isrow, iscol, &isequal)); 27248a46eb9SPierre Jolivet if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 27349b5e25fSSatish Balay } 2743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 27549b5e25fSSatish Balay } 27649b5e25fSSatish Balay 277d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 278d71ae5a4SJacob Faibussowitsch { 27913f74950SBarry Smith PetscInt i; 28049b5e25fSSatish Balay 28149b5e25fSSatish Balay PetscFunctionBegin; 282314f503fSPierre Jolivet if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B)); 283e50a8114SHong Zhang 28448a46eb9SPierre Jolivet for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 2853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28649b5e25fSSatish Balay } 28749b5e25fSSatish Balay 28849b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 289d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 290d71ae5a4SJacob Faibussowitsch { 29149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 292d9ca1df4SBarry Smith PetscScalar *z, x1, x2, zero = 0.0; 293d9ca1df4SBarry Smith const PetscScalar *x, *xb; 294d9ca1df4SBarry Smith const MatScalar *v; 295d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 296d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 29798c9bda7SSatish Balay PetscInt nonzerorow = 0; 29849b5e25fSSatish Balay 29949b5e25fSSatish Balay PetscFunctionBegin; 3009566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 3013ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 3029566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3039566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 30449b5e25fSSatish Balay 30549b5e25fSSatish Balay v = a->a; 30649b5e25fSSatish Balay xb = x; 30749b5e25fSSatish Balay 30849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 30949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3109371c9d4SSatish Balay x1 = xb[0]; 3119371c9d4SSatish Balay x2 = xb[1]; 31249b5e25fSSatish Balay ib = aj + *ai; 313831a3094SHong Zhang jmin = 0; 31498c9bda7SSatish Balay nonzerorow += (n > 0); 3157fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 31649b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 31749b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 3189371c9d4SSatish Balay v += 4; 3199371c9d4SSatish Balay jmin++; 3207fbae186SHong Zhang } 321444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 322444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 323831a3094SHong Zhang for (j = jmin; j < n; j++) { 32449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 32549b5e25fSSatish Balay cval = ib[j] * 2; 32649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 32749b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 32849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 32949b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 33049b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 33149b5e25fSSatish Balay v += 4; 33249b5e25fSSatish Balay } 3339371c9d4SSatish Balay xb += 2; 3349371c9d4SSatish Balay ai++; 33549b5e25fSSatish Balay } 33649b5e25fSSatish Balay 3379566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3389566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3399566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 3403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 34149b5e25fSSatish Balay } 34249b5e25fSSatish Balay 343d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 344d71ae5a4SJacob Faibussowitsch { 34549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 346d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, zero = 0.0; 347d9ca1df4SBarry Smith const PetscScalar *x, *xb; 348d9ca1df4SBarry Smith const MatScalar *v; 349d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 350d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 35198c9bda7SSatish Balay PetscInt nonzerorow = 0; 35249b5e25fSSatish Balay 35349b5e25fSSatish Balay PetscFunctionBegin; 3549566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 3553ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 3569566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3579566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 35849b5e25fSSatish Balay 35949b5e25fSSatish Balay v = a->a; 36049b5e25fSSatish Balay xb = x; 36149b5e25fSSatish Balay 36249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 36349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 3649371c9d4SSatish Balay x1 = xb[0]; 3659371c9d4SSatish Balay x2 = xb[1]; 3669371c9d4SSatish Balay x3 = xb[2]; 36749b5e25fSSatish Balay ib = aj + *ai; 368831a3094SHong Zhang jmin = 0; 36998c9bda7SSatish Balay nonzerorow += (n > 0); 3707fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 37149b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 37249b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 37349b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 3749371c9d4SSatish Balay v += 9; 3759371c9d4SSatish Balay jmin++; 3767fbae186SHong Zhang } 377444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 378444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 379831a3094SHong Zhang for (j = jmin; j < n; j++) { 38049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 38149b5e25fSSatish Balay cval = ib[j] * 3; 38249b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 38349b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 38449b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 38549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 38649b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 38749b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 38849b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 38949b5e25fSSatish Balay v += 9; 39049b5e25fSSatish Balay } 3919371c9d4SSatish Balay xb += 3; 3929371c9d4SSatish Balay ai++; 39349b5e25fSSatish Balay } 39449b5e25fSSatish Balay 3959566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 3969566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 3979566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 3983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 39949b5e25fSSatish Balay } 40049b5e25fSSatish Balay 401d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 402d71ae5a4SJacob Faibussowitsch { 40349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 404d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 405d9ca1df4SBarry Smith const PetscScalar *x, *xb; 406d9ca1df4SBarry Smith const MatScalar *v; 407d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 408d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 40998c9bda7SSatish Balay PetscInt nonzerorow = 0; 41049b5e25fSSatish Balay 41149b5e25fSSatish Balay PetscFunctionBegin; 4129566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 4133ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 4149566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4159566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 41649b5e25fSSatish Balay 41749b5e25fSSatish Balay v = a->a; 41849b5e25fSSatish Balay xb = x; 41949b5e25fSSatish Balay 42049b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 42149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4229371c9d4SSatish Balay x1 = xb[0]; 4239371c9d4SSatish Balay x2 = xb[1]; 4249371c9d4SSatish Balay x3 = xb[2]; 4259371c9d4SSatish Balay x4 = xb[3]; 42649b5e25fSSatish Balay ib = aj + *ai; 427831a3094SHong Zhang jmin = 0; 42898c9bda7SSatish Balay nonzerorow += (n > 0); 4297fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 43049b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 43149b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 43249b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 43349b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 4349371c9d4SSatish Balay v += 16; 4359371c9d4SSatish Balay jmin++; 4367fbae186SHong Zhang } 437444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 438444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 439831a3094SHong Zhang for (j = jmin; j < n; j++) { 44049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 44149b5e25fSSatish Balay cval = ib[j] * 4; 44249b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 44349b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 44449b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 44549b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 44649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 44749b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 44849b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 44949b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 45049b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 45149b5e25fSSatish Balay v += 16; 45249b5e25fSSatish Balay } 4539371c9d4SSatish Balay xb += 4; 4549371c9d4SSatish Balay ai++; 45549b5e25fSSatish Balay } 45649b5e25fSSatish Balay 4579566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 4589566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 4599566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 4603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 46149b5e25fSSatish Balay } 46249b5e25fSSatish Balay 463d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 464d71ae5a4SJacob Faibussowitsch { 46549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 466d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 467d9ca1df4SBarry Smith const PetscScalar *x, *xb; 468d9ca1df4SBarry Smith const MatScalar *v; 469d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 470d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 47198c9bda7SSatish Balay PetscInt nonzerorow = 0; 47249b5e25fSSatish Balay 47349b5e25fSSatish Balay PetscFunctionBegin; 4749566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 4753ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 4769566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 4779566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 47849b5e25fSSatish Balay 47949b5e25fSSatish Balay v = a->a; 48049b5e25fSSatish Balay xb = x; 48149b5e25fSSatish Balay 48249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 48349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 4849371c9d4SSatish Balay x1 = xb[0]; 4859371c9d4SSatish Balay x2 = xb[1]; 4869371c9d4SSatish Balay x3 = xb[2]; 4879371c9d4SSatish Balay x4 = xb[3]; 4889371c9d4SSatish Balay x5 = xb[4]; 48949b5e25fSSatish Balay ib = aj + *ai; 490831a3094SHong Zhang jmin = 0; 49198c9bda7SSatish Balay nonzerorow += (n > 0); 4927fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 49349b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 49449b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 49549b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 49649b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 49749b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 4989371c9d4SSatish Balay v += 25; 4999371c9d4SSatish Balay jmin++; 5007fbae186SHong Zhang } 501444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 502444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 503831a3094SHong Zhang for (j = jmin; j < n; j++) { 50449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 50549b5e25fSSatish Balay cval = ib[j] * 5; 50649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 50749b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 50849b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 50949b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 51049b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 51149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 51249b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 51349b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 51449b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 51549b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 51649b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 51749b5e25fSSatish Balay v += 25; 51849b5e25fSSatish Balay } 5199371c9d4SSatish Balay xb += 5; 5209371c9d4SSatish Balay ai++; 52149b5e25fSSatish Balay } 52249b5e25fSSatish Balay 5239566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5249566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5259566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 5263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 52749b5e25fSSatish Balay } 52849b5e25fSSatish Balay 529d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 530d71ae5a4SJacob Faibussowitsch { 53149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 532d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 533d9ca1df4SBarry Smith const PetscScalar *x, *xb; 534d9ca1df4SBarry Smith const MatScalar *v; 535d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 536d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 53798c9bda7SSatish Balay PetscInt nonzerorow = 0; 53849b5e25fSSatish Balay 53949b5e25fSSatish Balay PetscFunctionBegin; 5409566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 5413ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 5429566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5439566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 54449b5e25fSSatish Balay 54549b5e25fSSatish Balay v = a->a; 54649b5e25fSSatish Balay xb = x; 54749b5e25fSSatish Balay 54849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 54949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 5509371c9d4SSatish Balay x1 = xb[0]; 5519371c9d4SSatish Balay x2 = xb[1]; 5529371c9d4SSatish Balay x3 = xb[2]; 5539371c9d4SSatish Balay x4 = xb[3]; 5549371c9d4SSatish Balay x5 = xb[4]; 5559371c9d4SSatish Balay x6 = xb[5]; 55649b5e25fSSatish Balay ib = aj + *ai; 557831a3094SHong Zhang jmin = 0; 55898c9bda7SSatish Balay nonzerorow += (n > 0); 5597fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 56049b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 56149b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 56249b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 56349b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 56449b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 56549b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 5669371c9d4SSatish Balay v += 36; 5679371c9d4SSatish Balay jmin++; 5687fbae186SHong Zhang } 569444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 570444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 571831a3094SHong Zhang for (j = jmin; j < n; j++) { 57249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 57349b5e25fSSatish Balay cval = ib[j] * 6; 57449b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 57549b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 57649b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 57749b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 57849b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 57949b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 58049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 58149b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 58249b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 58349b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 58449b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 58549b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 58649b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 58749b5e25fSSatish Balay v += 36; 58849b5e25fSSatish Balay } 5899371c9d4SSatish Balay xb += 6; 5909371c9d4SSatish Balay ai++; 59149b5e25fSSatish Balay } 59249b5e25fSSatish Balay 5939566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5949566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 5959566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 5963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 59749b5e25fSSatish Balay } 598c2916339SPierre Jolivet 599d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 600d71ae5a4SJacob Faibussowitsch { 60149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 602d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 603d9ca1df4SBarry Smith const PetscScalar *x, *xb; 604d9ca1df4SBarry Smith const MatScalar *v; 605d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 606d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 60798c9bda7SSatish Balay PetscInt nonzerorow = 0; 60849b5e25fSSatish Balay 60949b5e25fSSatish Balay PetscFunctionBegin; 6109566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 6113ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 6129566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6139566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 61449b5e25fSSatish Balay 61549b5e25fSSatish Balay v = a->a; 61649b5e25fSSatish Balay xb = x; 61749b5e25fSSatish Balay 61849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 61949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 6209371c9d4SSatish Balay x1 = xb[0]; 6219371c9d4SSatish Balay x2 = xb[1]; 6229371c9d4SSatish Balay x3 = xb[2]; 6239371c9d4SSatish Balay x4 = xb[3]; 6249371c9d4SSatish Balay x5 = xb[4]; 6259371c9d4SSatish Balay x6 = xb[5]; 6269371c9d4SSatish Balay x7 = xb[6]; 62749b5e25fSSatish Balay ib = aj + *ai; 628831a3094SHong Zhang jmin = 0; 62998c9bda7SSatish Balay nonzerorow += (n > 0); 6307fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 63149b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 63249b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 63349b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 63449b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 63549b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 63649b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 63749b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 6389371c9d4SSatish Balay v += 49; 6399371c9d4SSatish Balay jmin++; 6407fbae186SHong Zhang } 641444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 642444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 643831a3094SHong Zhang for (j = jmin; j < n; j++) { 64449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 64549b5e25fSSatish Balay cval = ib[j] * 7; 64649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 64749b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 64849b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 64949b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 65049b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 65149b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 65249b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 65349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 65449b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 65549b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 65649b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 65749b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 65849b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 65949b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 66049b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 66149b5e25fSSatish Balay v += 49; 66249b5e25fSSatish Balay } 6639371c9d4SSatish Balay xb += 7; 6649371c9d4SSatish Balay ai++; 66549b5e25fSSatish Balay } 6669566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 6679566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 6689566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 6693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 67049b5e25fSSatish Balay } 67149b5e25fSSatish Balay 67249b5e25fSSatish Balay /* 67349b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 67449b5e25fSSatish Balay */ 675d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 676d71ae5a4SJacob Faibussowitsch { 67749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 678d9ca1df4SBarry Smith PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 679d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 680d9ca1df4SBarry Smith const MatScalar *v; 681d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 682d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 68398c9bda7SSatish Balay PetscInt nonzerorow = 0; 68449b5e25fSSatish Balay 68549b5e25fSSatish Balay PetscFunctionBegin; 6869566063dSJacob Faibussowitsch PetscCall(VecSet(zz, zero)); 6873ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 6889566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 6899566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 69059689ffbSStefano Zampini 69159689ffbSStefano Zampini x_ptr = x; 69259689ffbSStefano Zampini z_ptr = z; 69349b5e25fSSatish Balay 69449b5e25fSSatish Balay aj = a->j; 69549b5e25fSSatish Balay v = a->a; 69649b5e25fSSatish Balay ii = a->i; 69749b5e25fSSatish Balay 69848a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 69949b5e25fSSatish Balay work = a->mult_work; 70049b5e25fSSatish Balay 70149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 7029371c9d4SSatish Balay n = ii[1] - ii[0]; 7039371c9d4SSatish Balay ncols = n * bs; 7049371c9d4SSatish Balay workt = work; 7059371c9d4SSatish Balay idx = aj + ii[0]; 70698c9bda7SSatish Balay nonzerorow += (n > 0); 70749b5e25fSSatish Balay 70849b5e25fSSatish Balay /* upper triangular part */ 70949b5e25fSSatish Balay for (j = 0; j < n; j++) { 71049b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 71149b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 71249b5e25fSSatish Balay workt += bs; 71349b5e25fSSatish Balay } 71449b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 71596b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 71649b5e25fSSatish Balay 71749b5e25fSSatish Balay /* strict lower triangular part */ 718831a3094SHong Zhang idx = aj + ii[0]; 71959689ffbSStefano Zampini if (n && *idx == i) { 7209371c9d4SSatish Balay ncols -= bs; 7219371c9d4SSatish Balay v += bs2; 7229371c9d4SSatish Balay idx++; 7239371c9d4SSatish Balay n--; 724831a3094SHong Zhang } 72596b9376eSHong Zhang 72649b5e25fSSatish Balay if (ncols > 0) { 72749b5e25fSSatish Balay workt = work; 7289566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 72996b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 730831a3094SHong Zhang for (j = 0; j < n; j++) { 731831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 73249b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 73349b5e25fSSatish Balay workt += bs; 73449b5e25fSSatish Balay } 73549b5e25fSSatish Balay } 7369371c9d4SSatish Balay x += bs; 7379371c9d4SSatish Balay v += n * bs2; 7389371c9d4SSatish Balay z += bs; 7399371c9d4SSatish Balay ii++; 74049b5e25fSSatish Balay } 74149b5e25fSSatish Balay 7429566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7439566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 7449566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 7453ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 74649b5e25fSSatish Balay } 74749b5e25fSSatish Balay 748d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 749d71ae5a4SJacob Faibussowitsch { 75049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 751d9ca1df4SBarry Smith PetscScalar *z, x1; 752d9ca1df4SBarry Smith const PetscScalar *x, *xb; 753d9ca1df4SBarry Smith const MatScalar *v; 754d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 755d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 75698c9bda7SSatish Balay PetscInt nonzerorow = 0; 757eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 758b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 759eb1ec7c1SStefano Zampini #else 760eb1ec7c1SStefano Zampini const int aconj = 0; 761eb1ec7c1SStefano Zampini #endif 76249b5e25fSSatish Balay 76349b5e25fSSatish Balay PetscFunctionBegin; 7649566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 7653ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 7669566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 7679566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 76849b5e25fSSatish Balay v = a->a; 76949b5e25fSSatish Balay xb = x; 77049b5e25fSSatish Balay 77149b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 77249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 77349b5e25fSSatish Balay x1 = xb[0]; 77449b5e25fSSatish Balay ib = aj + *ai; 775831a3094SHong Zhang jmin = 0; 77698c9bda7SSatish Balay nonzerorow += (n > 0); 7773d9ade75SStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 7789371c9d4SSatish Balay z[i] += *v++ * x[*ib++]; 7799371c9d4SSatish Balay jmin++; 780831a3094SHong Zhang } 781eb1ec7c1SStefano Zampini if (aconj) { 782eb1ec7c1SStefano Zampini for (j = jmin; j < n; j++) { 783eb1ec7c1SStefano Zampini cval = *ib; 784eb1ec7c1SStefano Zampini z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 785eb1ec7c1SStefano Zampini z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 786eb1ec7c1SStefano Zampini } 787eb1ec7c1SStefano Zampini } else { 788831a3094SHong Zhang for (j = jmin; j < n; j++) { 78949b5e25fSSatish Balay cval = *ib; 79049b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 79149b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 79249b5e25fSSatish Balay } 793eb1ec7c1SStefano Zampini } 7949371c9d4SSatish Balay xb++; 7959371c9d4SSatish Balay ai++; 79649b5e25fSSatish Balay } 79749b5e25fSSatish Balay 7989566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7999566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 80049b5e25fSSatish Balay 8019566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 8023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 80349b5e25fSSatish Balay } 80449b5e25fSSatish Balay 805d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 806d71ae5a4SJacob Faibussowitsch { 80749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 808d9ca1df4SBarry Smith PetscScalar *z, x1, x2; 809d9ca1df4SBarry Smith const PetscScalar *x, *xb; 810d9ca1df4SBarry Smith const MatScalar *v; 811d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 812d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 81398c9bda7SSatish Balay PetscInt nonzerorow = 0; 81449b5e25fSSatish Balay 81549b5e25fSSatish Balay PetscFunctionBegin; 8169566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 8173ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 8189566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8199566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 82049b5e25fSSatish Balay 82149b5e25fSSatish Balay v = a->a; 82249b5e25fSSatish Balay xb = x; 82349b5e25fSSatish Balay 82449b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 82549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8269371c9d4SSatish Balay x1 = xb[0]; 8279371c9d4SSatish Balay x2 = xb[1]; 82849b5e25fSSatish Balay ib = aj + *ai; 829831a3094SHong Zhang jmin = 0; 83098c9bda7SSatish Balay nonzerorow += (n > 0); 83159689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 83249b5e25fSSatish Balay z[2 * i] += v[0] * x1 + v[2] * x2; 83349b5e25fSSatish Balay z[2 * i + 1] += v[2] * x1 + v[3] * x2; 8349371c9d4SSatish Balay v += 4; 8359371c9d4SSatish Balay jmin++; 8367fbae186SHong Zhang } 837444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 838444d8c10SJed Brown PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 839831a3094SHong Zhang for (j = jmin; j < n; j++) { 84049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 84149b5e25fSSatish Balay cval = ib[j] * 2; 84249b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2; 84349b5e25fSSatish Balay z[cval + 1] += v[2] * x1 + v[3] * x2; 84449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 84549b5e25fSSatish Balay z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 84649b5e25fSSatish Balay z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 84749b5e25fSSatish Balay v += 4; 84849b5e25fSSatish Balay } 8499371c9d4SSatish Balay xb += 2; 8509371c9d4SSatish Balay ai++; 85149b5e25fSSatish Balay } 8529566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 8539566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 85449b5e25fSSatish Balay 8559566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 8563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 85749b5e25fSSatish Balay } 85849b5e25fSSatish Balay 859d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 860d71ae5a4SJacob Faibussowitsch { 86149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 862d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3; 863d9ca1df4SBarry Smith const PetscScalar *x, *xb; 864d9ca1df4SBarry Smith const MatScalar *v; 865d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 866d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 86798c9bda7SSatish Balay PetscInt nonzerorow = 0; 86849b5e25fSSatish Balay 86949b5e25fSSatish Balay PetscFunctionBegin; 8709566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 8713ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 8729566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 8739566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 87449b5e25fSSatish Balay 87549b5e25fSSatish Balay v = a->a; 87649b5e25fSSatish Balay xb = x; 87749b5e25fSSatish Balay 87849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 87949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 8809371c9d4SSatish Balay x1 = xb[0]; 8819371c9d4SSatish Balay x2 = xb[1]; 8829371c9d4SSatish Balay x3 = xb[2]; 88349b5e25fSSatish Balay ib = aj + *ai; 884831a3094SHong Zhang jmin = 0; 88598c9bda7SSatish Balay nonzerorow += (n > 0); 88659689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 88749b5e25fSSatish Balay z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 88849b5e25fSSatish Balay z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 88949b5e25fSSatish Balay z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 8909371c9d4SSatish Balay v += 9; 8919371c9d4SSatish Balay jmin++; 8927fbae186SHong Zhang } 893444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 894444d8c10SJed Brown PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 895831a3094SHong Zhang for (j = jmin; j < n; j++) { 89649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 89749b5e25fSSatish Balay cval = ib[j] * 3; 89849b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 89949b5e25fSSatish Balay z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 90049b5e25fSSatish Balay z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 90149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 90249b5e25fSSatish Balay z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 90349b5e25fSSatish Balay z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 90449b5e25fSSatish Balay z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 90549b5e25fSSatish Balay v += 9; 90649b5e25fSSatish Balay } 9079371c9d4SSatish Balay xb += 3; 9089371c9d4SSatish Balay ai++; 90949b5e25fSSatish Balay } 91049b5e25fSSatish Balay 9119566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9129566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 91349b5e25fSSatish Balay 9149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 9153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 91649b5e25fSSatish Balay } 91749b5e25fSSatish Balay 918d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 919d71ae5a4SJacob Faibussowitsch { 92049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 921d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4; 922d9ca1df4SBarry Smith const PetscScalar *x, *xb; 923d9ca1df4SBarry Smith const MatScalar *v; 924d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 925d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 92698c9bda7SSatish Balay PetscInt nonzerorow = 0; 92749b5e25fSSatish Balay 92849b5e25fSSatish Balay PetscFunctionBegin; 9299566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 9303ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 9319566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9329566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 93349b5e25fSSatish Balay 93449b5e25fSSatish Balay v = a->a; 93549b5e25fSSatish Balay xb = x; 93649b5e25fSSatish Balay 93749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 93849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 9399371c9d4SSatish Balay x1 = xb[0]; 9409371c9d4SSatish Balay x2 = xb[1]; 9419371c9d4SSatish Balay x3 = xb[2]; 9429371c9d4SSatish Balay x4 = xb[3]; 94349b5e25fSSatish Balay ib = aj + *ai; 944831a3094SHong Zhang jmin = 0; 94598c9bda7SSatish Balay nonzerorow += (n > 0); 94659689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 94749b5e25fSSatish Balay z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 94849b5e25fSSatish Balay z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 94949b5e25fSSatish Balay z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 95049b5e25fSSatish Balay z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 9519371c9d4SSatish Balay v += 16; 9529371c9d4SSatish Balay jmin++; 9537fbae186SHong Zhang } 954444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 955444d8c10SJed Brown PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 956831a3094SHong Zhang for (j = jmin; j < n; j++) { 95749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 95849b5e25fSSatish Balay cval = ib[j] * 4; 95949b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 96049b5e25fSSatish Balay z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 96149b5e25fSSatish Balay z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 96249b5e25fSSatish Balay z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 96349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 96449b5e25fSSatish Balay z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 96549b5e25fSSatish Balay z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 96649b5e25fSSatish Balay z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 96749b5e25fSSatish Balay z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 96849b5e25fSSatish Balay v += 16; 96949b5e25fSSatish Balay } 9709371c9d4SSatish Balay xb += 4; 9719371c9d4SSatish Balay ai++; 97249b5e25fSSatish Balay } 97349b5e25fSSatish Balay 9749566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 9759566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 97649b5e25fSSatish Balay 9779566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 9783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 97949b5e25fSSatish Balay } 98049b5e25fSSatish Balay 981d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 982d71ae5a4SJacob Faibussowitsch { 98349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 984d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5; 985d9ca1df4SBarry Smith const PetscScalar *x, *xb; 986d9ca1df4SBarry Smith const MatScalar *v; 987d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 988d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 98998c9bda7SSatish Balay PetscInt nonzerorow = 0; 99049b5e25fSSatish Balay 99149b5e25fSSatish Balay PetscFunctionBegin; 9929566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 9933ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 9949566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 9959566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 99649b5e25fSSatish Balay 99749b5e25fSSatish Balay v = a->a; 99849b5e25fSSatish Balay xb = x; 99949b5e25fSSatish Balay 100049b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 100149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10029371c9d4SSatish Balay x1 = xb[0]; 10039371c9d4SSatish Balay x2 = xb[1]; 10049371c9d4SSatish Balay x3 = xb[2]; 10059371c9d4SSatish Balay x4 = xb[3]; 10069371c9d4SSatish Balay x5 = xb[4]; 100749b5e25fSSatish Balay ib = aj + *ai; 1008831a3094SHong Zhang jmin = 0; 100998c9bda7SSatish Balay nonzerorow += (n > 0); 101059689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 101149b5e25fSSatish Balay z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 101249b5e25fSSatish Balay z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 101349b5e25fSSatish Balay z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 101449b5e25fSSatish Balay z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 101549b5e25fSSatish Balay z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 10169371c9d4SSatish Balay v += 25; 10179371c9d4SSatish Balay jmin++; 10187fbae186SHong Zhang } 1019444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1020444d8c10SJed Brown PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1021831a3094SHong Zhang for (j = jmin; j < n; j++) { 102249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 102349b5e25fSSatish Balay cval = ib[j] * 5; 102449b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 102549b5e25fSSatish Balay z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 102649b5e25fSSatish Balay z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 102749b5e25fSSatish Balay z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 102849b5e25fSSatish Balay z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 102949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 103049b5e25fSSatish Balay z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 103149b5e25fSSatish Balay z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 103249b5e25fSSatish Balay z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 103349b5e25fSSatish Balay z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 103449b5e25fSSatish Balay z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 103549b5e25fSSatish Balay v += 25; 103649b5e25fSSatish Balay } 10379371c9d4SSatish Balay xb += 5; 10389371c9d4SSatish Balay ai++; 103949b5e25fSSatish Balay } 104049b5e25fSSatish Balay 10419566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 10429566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 104349b5e25fSSatish Balay 10449566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 10453ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 104649b5e25fSSatish Balay } 1047c2916339SPierre Jolivet 1048d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1049d71ae5a4SJacob Faibussowitsch { 105049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1051d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6; 1052d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1053d9ca1df4SBarry Smith const MatScalar *v; 1054d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1055d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 105698c9bda7SSatish Balay PetscInt nonzerorow = 0; 105749b5e25fSSatish Balay 105849b5e25fSSatish Balay PetscFunctionBegin; 10599566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 10603ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 10619566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 10629566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 106349b5e25fSSatish Balay 106449b5e25fSSatish Balay v = a->a; 106549b5e25fSSatish Balay xb = x; 106649b5e25fSSatish Balay 106749b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 106849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 10699371c9d4SSatish Balay x1 = xb[0]; 10709371c9d4SSatish Balay x2 = xb[1]; 10719371c9d4SSatish Balay x3 = xb[2]; 10729371c9d4SSatish Balay x4 = xb[3]; 10739371c9d4SSatish Balay x5 = xb[4]; 10749371c9d4SSatish Balay x6 = xb[5]; 107549b5e25fSSatish Balay ib = aj + *ai; 1076831a3094SHong Zhang jmin = 0; 107798c9bda7SSatish Balay nonzerorow += (n > 0); 107859689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 107949b5e25fSSatish Balay z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 108049b5e25fSSatish Balay z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 108149b5e25fSSatish Balay z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 108249b5e25fSSatish Balay z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 108349b5e25fSSatish Balay z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 108449b5e25fSSatish Balay z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 10859371c9d4SSatish Balay v += 36; 10869371c9d4SSatish Balay jmin++; 10877fbae186SHong Zhang } 1088444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1089444d8c10SJed Brown PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1090831a3094SHong Zhang for (j = jmin; j < n; j++) { 109149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 109249b5e25fSSatish Balay cval = ib[j] * 6; 109349b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 109449b5e25fSSatish Balay z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 109549b5e25fSSatish Balay z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 109649b5e25fSSatish Balay z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 109749b5e25fSSatish Balay z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 109849b5e25fSSatish Balay z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 109949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 110049b5e25fSSatish Balay z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 110149b5e25fSSatish Balay z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 110249b5e25fSSatish Balay z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 110349b5e25fSSatish Balay z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 110449b5e25fSSatish Balay z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 110549b5e25fSSatish Balay z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 110649b5e25fSSatish Balay v += 36; 110749b5e25fSSatish Balay } 11089371c9d4SSatish Balay xb += 6; 11099371c9d4SSatish Balay ai++; 111049b5e25fSSatish Balay } 111149b5e25fSSatish Balay 11129566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11139566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 111449b5e25fSSatish Balay 11159566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 11163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 111749b5e25fSSatish Balay } 111849b5e25fSSatish Balay 1119d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1120d71ae5a4SJacob Faibussowitsch { 112149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1122d9ca1df4SBarry Smith PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1123d9ca1df4SBarry Smith const PetscScalar *x, *xb; 1124d9ca1df4SBarry Smith const MatScalar *v; 1125d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1126d9ca1df4SBarry Smith const PetscInt *aj = a->j, *ai = a->i, *ib; 112798c9bda7SSatish Balay PetscInt nonzerorow = 0; 112849b5e25fSSatish Balay 112949b5e25fSSatish Balay PetscFunctionBegin; 11309566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 11313ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 11329566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 11339566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 113449b5e25fSSatish Balay 113549b5e25fSSatish Balay v = a->a; 113649b5e25fSSatish Balay xb = x; 113749b5e25fSSatish Balay 113849b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 113949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 11409371c9d4SSatish Balay x1 = xb[0]; 11419371c9d4SSatish Balay x2 = xb[1]; 11429371c9d4SSatish Balay x3 = xb[2]; 11439371c9d4SSatish Balay x4 = xb[3]; 11449371c9d4SSatish Balay x5 = xb[4]; 11459371c9d4SSatish Balay x6 = xb[5]; 11469371c9d4SSatish Balay x7 = xb[6]; 114749b5e25fSSatish Balay ib = aj + *ai; 1148831a3094SHong Zhang jmin = 0; 114998c9bda7SSatish Balay nonzerorow += (n > 0); 115059689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 115149b5e25fSSatish Balay z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 115249b5e25fSSatish Balay z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 115349b5e25fSSatish Balay z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 115449b5e25fSSatish Balay z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 115549b5e25fSSatish Balay z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 115649b5e25fSSatish Balay z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 115749b5e25fSSatish Balay z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 11589371c9d4SSatish Balay v += 49; 11599371c9d4SSatish Balay jmin++; 11607fbae186SHong Zhang } 1161444d8c10SJed Brown PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1162444d8c10SJed Brown PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1163831a3094SHong Zhang for (j = jmin; j < n; j++) { 116449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 116549b5e25fSSatish Balay cval = ib[j] * 7; 116649b5e25fSSatish Balay z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 116749b5e25fSSatish Balay z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 116849b5e25fSSatish Balay z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 116949b5e25fSSatish Balay z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 117049b5e25fSSatish Balay z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 117149b5e25fSSatish Balay z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 117249b5e25fSSatish Balay z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 117349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 117449b5e25fSSatish Balay z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 117549b5e25fSSatish Balay z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 117649b5e25fSSatish Balay z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 117749b5e25fSSatish Balay z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 117849b5e25fSSatish Balay z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 117949b5e25fSSatish Balay z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 118049b5e25fSSatish Balay z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 118149b5e25fSSatish Balay v += 49; 118249b5e25fSSatish Balay } 11839371c9d4SSatish Balay xb += 7; 11849371c9d4SSatish Balay ai++; 118549b5e25fSSatish Balay } 118649b5e25fSSatish Balay 11879566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 11889566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 118949b5e25fSSatish Balay 11909566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 11913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 119249b5e25fSSatish Balay } 119349b5e25fSSatish Balay 1194d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1195d71ae5a4SJacob Faibussowitsch { 119649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1197f4259b30SLisandro Dalcin PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1198d9ca1df4SBarry Smith const PetscScalar *x, *x_ptr, *xb; 1199d9ca1df4SBarry Smith const MatScalar *v; 1200d9ca1df4SBarry Smith PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1201d9ca1df4SBarry Smith const PetscInt *idx, *aj, *ii; 120298c9bda7SSatish Balay PetscInt nonzerorow = 0; 120349b5e25fSSatish Balay 120449b5e25fSSatish Balay PetscFunctionBegin; 12059566063dSJacob Faibussowitsch PetscCall(VecCopy(yy, zz)); 12063ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 12079371c9d4SSatish Balay PetscCall(VecGetArrayRead(xx, &x)); 12089371c9d4SSatish Balay x_ptr = x; 12099371c9d4SSatish Balay PetscCall(VecGetArray(zz, &z)); 12109371c9d4SSatish Balay z_ptr = z; 121149b5e25fSSatish Balay 121249b5e25fSSatish Balay aj = a->j; 121349b5e25fSSatish Balay v = a->a; 121449b5e25fSSatish Balay ii = a->i; 121549b5e25fSSatish Balay 121648a46eb9SPierre Jolivet if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 121749b5e25fSSatish Balay work = a->mult_work; 121849b5e25fSSatish Balay 121949b5e25fSSatish Balay for (i = 0; i < mbs; i++) { 12209371c9d4SSatish Balay n = ii[1] - ii[0]; 12219371c9d4SSatish Balay ncols = n * bs; 12229371c9d4SSatish Balay workt = work; 12239371c9d4SSatish Balay idx = aj + ii[0]; 122498c9bda7SSatish Balay nonzerorow += (n > 0); 122549b5e25fSSatish Balay 122649b5e25fSSatish Balay /* upper triangular part */ 122749b5e25fSSatish Balay for (j = 0; j < n; j++) { 122849b5e25fSSatish Balay xb = x_ptr + bs * (*idx++); 122949b5e25fSSatish Balay for (k = 0; k < bs; k++) workt[k] = xb[k]; 123049b5e25fSSatish Balay workt += bs; 123149b5e25fSSatish Balay } 123249b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 123396b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 123449b5e25fSSatish Balay 123549b5e25fSSatish Balay /* strict lower triangular part */ 1236831a3094SHong Zhang idx = aj + ii[0]; 123759689ffbSStefano Zampini if (n && *idx == i) { 12389371c9d4SSatish Balay ncols -= bs; 12399371c9d4SSatish Balay v += bs2; 12409371c9d4SSatish Balay idx++; 12419371c9d4SSatish Balay n--; 1242831a3094SHong Zhang } 124349b5e25fSSatish Balay if (ncols > 0) { 124449b5e25fSSatish Balay workt = work; 12459566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt, ncols)); 124696b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1247831a3094SHong Zhang for (j = 0; j < n; j++) { 1248831a3094SHong Zhang zb = z_ptr + bs * (*idx++); 124949b5e25fSSatish Balay for (k = 0; k < bs; k++) zb[k] += workt[k]; 125049b5e25fSSatish Balay workt += bs; 125149b5e25fSSatish Balay } 125249b5e25fSSatish Balay } 125349b5e25fSSatish Balay 12549371c9d4SSatish Balay x += bs; 12559371c9d4SSatish Balay v += n * bs2; 12569371c9d4SSatish Balay z += bs; 12579371c9d4SSatish Balay ii++; 125849b5e25fSSatish Balay } 125949b5e25fSSatish Balay 12609566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 12619566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 126249b5e25fSSatish Balay 12639566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 12643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 126549b5e25fSSatish Balay } 126649b5e25fSSatish Balay 1267d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1268d71ae5a4SJacob Faibussowitsch { 126949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1270f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1271c5df96a5SBarry Smith PetscBLASInt one = 1, totalnz; 127249b5e25fSSatish Balay 127349b5e25fSSatish Balay PetscFunctionBegin; 12749566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1275792fecdfSBarry Smith PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 12769566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 12773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 127849b5e25fSSatish Balay } 127949b5e25fSSatish Balay 1280d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1281d71ae5a4SJacob Faibussowitsch { 128249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1283d9ca1df4SBarry Smith const MatScalar *v = a->a; 128449b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1285d9ca1df4SBarry Smith PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1286d9ca1df4SBarry Smith const PetscInt *aj = a->j, *col; 128749b5e25fSSatish Balay 128849b5e25fSSatish Balay PetscFunctionBegin; 1289c40ae873SPierre Jolivet if (!a->nz) { 1290c40ae873SPierre Jolivet *norm = 0.0; 12913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1292c40ae873SPierre Jolivet } 129349b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 129449b5e25fSSatish Balay for (k = 0; k < mbs; k++) { 129559689ffbSStefano Zampini jmin = a->i[k]; 129659689ffbSStefano Zampini jmax = a->i[k + 1]; 1297831a3094SHong Zhang col = aj + jmin; 129859689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 129949b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 13009371c9d4SSatish Balay sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 13019371c9d4SSatish Balay v++; 130249b5e25fSSatish Balay } 1303831a3094SHong Zhang jmin++; 1304831a3094SHong Zhang } 1305831a3094SHong Zhang for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 130649b5e25fSSatish Balay for (i = 0; i < bs2; i++) { 13079371c9d4SSatish Balay sum_off += PetscRealPart(PetscConj(*v) * (*v)); 13089371c9d4SSatish Balay v++; 130949b5e25fSSatish Balay } 131049b5e25fSSatish Balay } 131149b5e25fSSatish Balay } 13128f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 13139566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 13140b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 13159566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 13160b8dc8d2SHong Zhang for (i = 0; i < mbs; i++) jl[i] = mbs; 13170b8dc8d2SHong Zhang il[0] = 0; 131849b5e25fSSatish Balay 131949b5e25fSSatish Balay *norm = 0.0; 132049b5e25fSSatish Balay for (k = 0; k < mbs; k++) { /* k_th block row */ 132149b5e25fSSatish Balay for (j = 0; j < bs; j++) sum[j] = 0.0; 132249b5e25fSSatish Balay /*-- col sum --*/ 132349b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1324bbea24aaSStefano Zampini /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window) 1325831a3094SHong Zhang at step k */ 132649b5e25fSSatish Balay while (i < mbs) { 132749b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 132849b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 132949b5e25fSSatish Balay for (j = 0; j < bs; j++) { 133049b5e25fSSatish Balay v = a->a + ik * bs2 + j * bs; 133149b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13329371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13339371c9d4SSatish Balay v++; 133449b5e25fSSatish Balay } 133549b5e25fSSatish Balay } 133649b5e25fSSatish Balay /* update il, jl */ 1337831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1338831a3094SHong Zhang jmax = a->i[i + 1]; 133949b5e25fSSatish Balay if (jmin < jmax) { 134049b5e25fSSatish Balay il[i] = jmin; 134149b5e25fSSatish Balay j = a->j[jmin]; 13429371c9d4SSatish Balay jl[i] = jl[j]; 13439371c9d4SSatish Balay jl[j] = i; 134449b5e25fSSatish Balay } 134549b5e25fSSatish Balay i = nexti; 134649b5e25fSSatish Balay } 134749b5e25fSSatish Balay /*-- row sum --*/ 134859689ffbSStefano Zampini jmin = a->i[k]; 134959689ffbSStefano Zampini jmax = a->i[k + 1]; 135049b5e25fSSatish Balay for (i = jmin; i < jmax; i++) { 135149b5e25fSSatish Balay for (j = 0; j < bs; j++) { 135249b5e25fSSatish Balay v = a->a + i * bs2 + j; 135349b5e25fSSatish Balay for (k1 = 0; k1 < bs; k1++) { 13549371c9d4SSatish Balay sum[j] += PetscAbsScalar(*v); 13559371c9d4SSatish Balay v += bs; 135649b5e25fSSatish Balay } 135749b5e25fSSatish Balay } 135849b5e25fSSatish Balay } 135949b5e25fSSatish Balay /* add k_th block row to il, jl */ 1360831a3094SHong Zhang col = aj + jmin; 136159689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) jmin++; 136249b5e25fSSatish Balay if (jmin < jmax) { 136349b5e25fSSatish Balay il[k] = jmin; 13649371c9d4SSatish Balay j = a->j[jmin]; 13659371c9d4SSatish Balay jl[k] = jl[j]; 13669371c9d4SSatish Balay jl[j] = k; 136749b5e25fSSatish Balay } 136849b5e25fSSatish Balay for (j = 0; j < bs; j++) { 136949b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 137049b5e25fSSatish Balay } 137149b5e25fSSatish Balay } 13729566063dSJacob Faibussowitsch PetscCall(PetscFree3(sum, il, jl)); 13739566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1374f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 13753ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 137649b5e25fSSatish Balay } 137749b5e25fSSatish Balay 1378d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1379d71ae5a4SJacob Faibussowitsch { 138049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 138149b5e25fSSatish Balay 138249b5e25fSSatish Balay PetscFunctionBegin; 138349b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1384d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1385ef511fbeSHong Zhang *flg = PETSC_FALSE; 13863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 138749b5e25fSSatish Balay } 138849b5e25fSSatish Balay 138949b5e25fSSatish Balay /* if the a->i are the same */ 13909566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 13913ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 139249b5e25fSSatish Balay 139349b5e25fSSatish Balay /* if a->j are the same */ 13949566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 13953ba16761SJacob Faibussowitsch if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 139626fbe8dcSKarl Rupp 139749b5e25fSSatish Balay /* if a->a are the same */ 13989566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 13993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 140049b5e25fSSatish Balay } 140149b5e25fSSatish Balay 1402d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1403d71ae5a4SJacob Faibussowitsch { 140449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1405d9ca1df4SBarry Smith PetscInt i, j, k, row, bs, ambs, bs2; 1406d9ca1df4SBarry Smith const PetscInt *ai, *aj; 140787828ca2SBarry Smith PetscScalar *x, zero = 0.0; 1408d9ca1df4SBarry Smith const MatScalar *aa, *aa_j; 140949b5e25fSSatish Balay 141049b5e25fSSatish Balay PetscFunctionBegin; 1411d0f46423SBarry Smith bs = A->rmap->bs; 1412aed4548fSBarry Smith PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 141382799104SHong Zhang 141449b5e25fSSatish Balay aa = a->a; 14158a0c6efdSHong Zhang ambs = a->mbs; 14168a0c6efdSHong Zhang 14178a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 14188a0c6efdSHong Zhang PetscInt *diag = a->diag; 14198a0c6efdSHong Zhang aa = a->a; 14208a0c6efdSHong Zhang ambs = a->mbs; 14219566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 14228a0c6efdSHong Zhang for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 14239566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 14243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14258a0c6efdSHong Zhang } 14268a0c6efdSHong Zhang 142749b5e25fSSatish Balay ai = a->i; 142849b5e25fSSatish Balay aj = a->j; 142949b5e25fSSatish Balay bs2 = a->bs2; 14309566063dSJacob Faibussowitsch PetscCall(VecSet(v, zero)); 14313ba16761SJacob Faibussowitsch if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 14329566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 143349b5e25fSSatish Balay for (i = 0; i < ambs; i++) { 143449b5e25fSSatish Balay j = ai[i]; 143549b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 143649b5e25fSSatish Balay row = i * bs; 143749b5e25fSSatish Balay aa_j = aa + j * bs2; 143849b5e25fSSatish Balay for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 143949b5e25fSSatish Balay } 144049b5e25fSSatish Balay } 14419566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 14423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 144349b5e25fSSatish Balay } 144449b5e25fSSatish Balay 1445d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1446d71ae5a4SJacob Faibussowitsch { 144749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1448d9ca1df4SBarry Smith PetscScalar x; 1449d9ca1df4SBarry Smith const PetscScalar *l, *li, *ri; 145049b5e25fSSatish Balay MatScalar *aa, *v; 1451fff8e43fSBarry Smith PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1452fff8e43fSBarry Smith const PetscInt *ai, *aj; 1453ace3abfcSBarry Smith PetscBool flg; 145449b5e25fSSatish Balay 145549b5e25fSSatish Balay PetscFunctionBegin; 1456b3bf805bSHong Zhang if (ll != rr) { 14579566063dSJacob Faibussowitsch PetscCall(VecEqual(ll, rr, &flg)); 145828b400f6SJacob Faibussowitsch PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1459b3bf805bSHong Zhang } 14603ba16761SJacob Faibussowitsch if (!ll) PetscFunctionReturn(PETSC_SUCCESS); 146149b5e25fSSatish Balay ai = a->i; 146249b5e25fSSatish Balay aj = a->j; 146349b5e25fSSatish Balay aa = a->a; 1464d0f46423SBarry Smith m = A->rmap->N; 1465d0f46423SBarry Smith bs = A->rmap->bs; 146649b5e25fSSatish Balay mbs = a->mbs; 146749b5e25fSSatish Balay bs2 = a->bs2; 146849b5e25fSSatish Balay 14699566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll, &l)); 14709566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll, &lm)); 147108401ef6SPierre Jolivet PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 147249b5e25fSSatish Balay for (i = 0; i < mbs; i++) { /* for each block row */ 147349b5e25fSSatish Balay M = ai[i + 1] - ai[i]; 147449b5e25fSSatish Balay li = l + i * bs; 147549b5e25fSSatish Balay v = aa + bs2 * ai[i]; 147649b5e25fSSatish Balay for (j = 0; j < M; j++) { /* for each block */ 147749b5e25fSSatish Balay ri = l + bs * aj[ai[i] + j]; 14785e90f9d9SHong Zhang for (k = 0; k < bs; k++) { 147949b5e25fSSatish Balay x = ri[k]; 148049b5e25fSSatish Balay for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 148149b5e25fSSatish Balay } 148249b5e25fSSatish Balay } 148349b5e25fSSatish Balay } 14849566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll, &l)); 14859566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 14863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 148749b5e25fSSatish Balay } 148849b5e25fSSatish Balay 1489d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1490d71ae5a4SJacob Faibussowitsch { 149149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 149249b5e25fSSatish Balay 149349b5e25fSSatish Balay PetscFunctionBegin; 149449b5e25fSSatish Balay info->block_size = a->bs2; 1495ceed8ce5SJed Brown info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 14966c6c5352SBarry Smith info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 14973966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 149849b5e25fSSatish Balay info->assemblies = A->num_ass; 14998e58a170SBarry Smith info->mallocs = A->info.mallocs; 15004dfa11a4SJacob Faibussowitsch info->memory = 0; /* REVIEW ME */ 1501d5f3da31SBarry Smith if (A->factortype) { 150249b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 150349b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 150449b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 150549b5e25fSSatish Balay } else { 150649b5e25fSSatish Balay info->fill_ratio_given = 0; 150749b5e25fSSatish Balay info->fill_ratio_needed = 0; 150849b5e25fSSatish Balay info->factor_mallocs = 0; 150949b5e25fSSatish Balay } 15103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 151149b5e25fSSatish Balay } 151249b5e25fSSatish Balay 1513d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1514d71ae5a4SJacob Faibussowitsch { 151549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 151649b5e25fSSatish Balay 151749b5e25fSSatish Balay PetscFunctionBegin; 15189566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 15193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 152049b5e25fSSatish Balay } 1521dc354874SHong Zhang 1522d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1523d71ae5a4SJacob Faibussowitsch { 1524dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1525d9ca1df4SBarry Smith PetscInt i, j, n, row, col, bs, mbs; 1526d9ca1df4SBarry Smith const PetscInt *ai, *aj; 1527c3fca9a7SHong Zhang PetscReal atmp; 1528d9ca1df4SBarry Smith const MatScalar *aa; 1529985db425SBarry Smith PetscScalar *x; 153013f74950SBarry Smith PetscInt ncols, brow, bcol, krow, kcol; 1531dc354874SHong Zhang 1532dc354874SHong Zhang PetscFunctionBegin; 153328b400f6SJacob Faibussowitsch PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 153428b400f6SJacob Faibussowitsch PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1535d0f46423SBarry Smith bs = A->rmap->bs; 1536dc354874SHong Zhang aa = a->a; 1537dc354874SHong Zhang ai = a->i; 1538dc354874SHong Zhang aj = a->j; 153944117c81SHong Zhang mbs = a->mbs; 1540dc354874SHong Zhang 15419566063dSJacob Faibussowitsch PetscCall(VecSet(v, 0.0)); 15429566063dSJacob Faibussowitsch PetscCall(VecGetArray(v, &x)); 15439566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v, &n)); 154408401ef6SPierre Jolivet PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 154544117c81SHong Zhang for (i = 0; i < mbs; i++) { 15469371c9d4SSatish Balay ncols = ai[1] - ai[0]; 15479371c9d4SSatish Balay ai++; 1548d0f6400bSHong Zhang brow = bs * i; 154944117c81SHong Zhang for (j = 0; j < ncols; j++) { 1550d0f6400bSHong Zhang bcol = bs * (*aj); 155144117c81SHong Zhang for (kcol = 0; kcol < bs; kcol++) { 1552d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 155344117c81SHong Zhang for (krow = 0; krow < bs; krow++) { 15549371c9d4SSatish Balay atmp = PetscAbsScalar(*aa); 15559371c9d4SSatish Balay aa++; 1556d0f6400bSHong Zhang row = brow + krow; /* row index */ 1557c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1558c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 155944117c81SHong Zhang } 156044117c81SHong Zhang } 1561d0f6400bSHong Zhang aj++; 1562dc354874SHong Zhang } 1563dc354874SHong Zhang } 15649566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v, &x)); 15653ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1566dc354874SHong Zhang } 1567c2916339SPierre Jolivet 1568d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1569d71ae5a4SJacob Faibussowitsch { 1570c2916339SPierre Jolivet PetscFunctionBegin; 15719566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 15724222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 15733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1574c2916339SPierre Jolivet } 1575c2916339SPierre Jolivet 157666976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1577d71ae5a4SJacob Faibussowitsch { 1578c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1579c2916339SPierre Jolivet PetscScalar *z = c; 1580c2916339SPierre Jolivet const PetscScalar *xb; 1581c2916339SPierre Jolivet PetscScalar x1; 1582c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1583c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 15843c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 1585b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 15863c2e41e1SStefano Zampini #else 15873c2e41e1SStefano Zampini const int aconj = 0; 15883c2e41e1SStefano Zampini #endif 1589c2916339SPierre Jolivet 1590c2916339SPierre Jolivet PetscFunctionBegin; 1591c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 15929371c9d4SSatish Balay n = ii[1] - ii[0]; 15939371c9d4SSatish Balay ii++; 1594c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1595c2916339SPierre Jolivet PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1596c2916339SPierre Jolivet jj = idx; 1597c2916339SPierre Jolivet vv = v; 1598c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1599c2916339SPierre Jolivet idx = jj; 1600c2916339SPierre Jolivet v = vv; 1601c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16029371c9d4SSatish Balay xb = b + (*idx); 16039371c9d4SSatish Balay x1 = xb[0 + k * bm]; 1604c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1; 16053c2e41e1SStefano Zampini if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1606c2916339SPierre Jolivet v += 1; 1607c2916339SPierre Jolivet ++idx; 1608c2916339SPierre Jolivet } 1609c2916339SPierre Jolivet } 1610c2916339SPierre Jolivet z += 1; 1611c2916339SPierre Jolivet } 16123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1613c2916339SPierre Jolivet } 1614c2916339SPierre Jolivet 161566976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1616d71ae5a4SJacob Faibussowitsch { 1617c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1618c2916339SPierre Jolivet PetscScalar *z = c; 1619c2916339SPierre Jolivet const PetscScalar *xb; 1620c2916339SPierre Jolivet PetscScalar x1, x2; 1621c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1622c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1623c2916339SPierre Jolivet 1624c2916339SPierre Jolivet PetscFunctionBegin; 1625c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16269371c9d4SSatish Balay n = ii[1] - ii[0]; 16279371c9d4SSatish Balay ii++; 1628c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1629c2916339SPierre Jolivet PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1630c2916339SPierre Jolivet jj = idx; 1631c2916339SPierre Jolivet vv = v; 1632c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1633c2916339SPierre Jolivet idx = jj; 1634c2916339SPierre Jolivet v = vv; 1635c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16369371c9d4SSatish Balay xb = b + 2 * (*idx); 16379371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16389371c9d4SSatish Balay x2 = xb[1 + k * bm]; 1639c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1640c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1641c2916339SPierre Jolivet if (*idx != i) { 1642c2916339SPierre Jolivet c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1643c2916339SPierre Jolivet c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1644c2916339SPierre Jolivet } 1645c2916339SPierre Jolivet v += 4; 1646c2916339SPierre Jolivet ++idx; 1647c2916339SPierre Jolivet } 1648c2916339SPierre Jolivet } 1649c2916339SPierre Jolivet z += 2; 1650c2916339SPierre Jolivet } 16513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1652c2916339SPierre Jolivet } 1653c2916339SPierre Jolivet 165466976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1655d71ae5a4SJacob Faibussowitsch { 1656c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1657c2916339SPierre Jolivet PetscScalar *z = c; 1658c2916339SPierre Jolivet const PetscScalar *xb; 1659c2916339SPierre Jolivet PetscScalar x1, x2, x3; 1660c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1661c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1662c2916339SPierre Jolivet 1663c2916339SPierre Jolivet PetscFunctionBegin; 1664c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 16659371c9d4SSatish Balay n = ii[1] - ii[0]; 16669371c9d4SSatish Balay ii++; 1667c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1668c2916339SPierre Jolivet PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1669c2916339SPierre Jolivet jj = idx; 1670c2916339SPierre Jolivet vv = v; 1671c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1672c2916339SPierre Jolivet idx = jj; 1673c2916339SPierre Jolivet v = vv; 1674c2916339SPierre Jolivet for (j = 0; j < n; j++) { 16759371c9d4SSatish Balay xb = b + 3 * (*idx); 16769371c9d4SSatish Balay x1 = xb[0 + k * bm]; 16779371c9d4SSatish Balay x2 = xb[1 + k * bm]; 16789371c9d4SSatish Balay x3 = xb[2 + k * bm]; 1679c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1680c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1681c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1682c2916339SPierre Jolivet if (*idx != i) { 1683c2916339SPierre Jolivet c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1684c2916339SPierre Jolivet c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1685c2916339SPierre Jolivet c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1686c2916339SPierre Jolivet } 1687c2916339SPierre Jolivet v += 9; 1688c2916339SPierre Jolivet ++idx; 1689c2916339SPierre Jolivet } 1690c2916339SPierre Jolivet } 1691c2916339SPierre Jolivet z += 3; 1692c2916339SPierre Jolivet } 16933ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1694c2916339SPierre Jolivet } 1695c2916339SPierre Jolivet 169666976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1697d71ae5a4SJacob Faibussowitsch { 1698c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1699c2916339SPierre Jolivet PetscScalar *z = c; 1700c2916339SPierre Jolivet const PetscScalar *xb; 1701c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4; 1702c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1703c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1704c2916339SPierre Jolivet 1705c2916339SPierre Jolivet PetscFunctionBegin; 1706c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17079371c9d4SSatish Balay n = ii[1] - ii[0]; 17089371c9d4SSatish Balay ii++; 1709c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1710c2916339SPierre Jolivet PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1711c2916339SPierre Jolivet jj = idx; 1712c2916339SPierre Jolivet vv = v; 1713c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1714c2916339SPierre Jolivet idx = jj; 1715c2916339SPierre Jolivet v = vv; 1716c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17179371c9d4SSatish Balay xb = b + 4 * (*idx); 17189371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17199371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17209371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17219371c9d4SSatish Balay x4 = xb[3 + k * bm]; 1722c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1723c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1724c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1725c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1726c2916339SPierre Jolivet if (*idx != i) { 1727c2916339SPierre Jolivet c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1728c2916339SPierre Jolivet c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1729c2916339SPierre Jolivet c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1730c2916339SPierre Jolivet c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1731c2916339SPierre Jolivet } 1732c2916339SPierre Jolivet v += 16; 1733c2916339SPierre Jolivet ++idx; 1734c2916339SPierre Jolivet } 1735c2916339SPierre Jolivet } 1736c2916339SPierre Jolivet z += 4; 1737c2916339SPierre Jolivet } 17383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1739c2916339SPierre Jolivet } 1740c2916339SPierre Jolivet 174166976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1742d71ae5a4SJacob Faibussowitsch { 1743c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1744c2916339SPierre Jolivet PetscScalar *z = c; 1745c2916339SPierre Jolivet const PetscScalar *xb; 1746c2916339SPierre Jolivet PetscScalar x1, x2, x3, x4, x5; 1747c2916339SPierre Jolivet const MatScalar *v = a->a, *vv; 1748c2916339SPierre Jolivet PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1749c2916339SPierre Jolivet 1750c2916339SPierre Jolivet PetscFunctionBegin; 1751c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 17529371c9d4SSatish Balay n = ii[1] - ii[0]; 17539371c9d4SSatish Balay ii++; 1754c2916339SPierre Jolivet PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1755c2916339SPierre Jolivet PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1756c2916339SPierre Jolivet jj = idx; 1757c2916339SPierre Jolivet vv = v; 1758c2916339SPierre Jolivet for (k = 0; k < cn; k++) { 1759c2916339SPierre Jolivet idx = jj; 1760c2916339SPierre Jolivet v = vv; 1761c2916339SPierre Jolivet for (j = 0; j < n; j++) { 17629371c9d4SSatish Balay xb = b + 5 * (*idx); 17639371c9d4SSatish Balay x1 = xb[0 + k * bm]; 17649371c9d4SSatish Balay x2 = xb[1 + k * bm]; 17659371c9d4SSatish Balay x3 = xb[2 + k * bm]; 17669371c9d4SSatish Balay x4 = xb[3 + k * bm]; 17679371c9d4SSatish Balay x5 = xb[4 + k * cm]; 1768c2916339SPierre Jolivet z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1769c2916339SPierre Jolivet z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1770c2916339SPierre Jolivet z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1771c2916339SPierre Jolivet z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1772c2916339SPierre Jolivet z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1773c2916339SPierre Jolivet if (*idx != i) { 1774c2916339SPierre Jolivet c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1775c2916339SPierre Jolivet c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1776c2916339SPierre Jolivet c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1777c2916339SPierre Jolivet c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1778c2916339SPierre Jolivet c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1779c2916339SPierre Jolivet } 1780c2916339SPierre Jolivet v += 25; 1781c2916339SPierre Jolivet ++idx; 1782c2916339SPierre Jolivet } 1783c2916339SPierre Jolivet } 1784c2916339SPierre Jolivet z += 5; 1785c2916339SPierre Jolivet } 17863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1787c2916339SPierre Jolivet } 1788c2916339SPierre Jolivet 1789d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1790d71ae5a4SJacob Faibussowitsch { 1791c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1792c2916339SPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1793281439baSStefano Zampini Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1794c2916339SPierre Jolivet PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1795c2916339SPierre Jolivet PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1796c2916339SPierre Jolivet PetscBLASInt bbs, bcn, bbm, bcm; 1797f4259b30SLisandro Dalcin PetscScalar *z = NULL; 1798c2916339SPierre Jolivet PetscScalar *c, *b; 1799c2916339SPierre Jolivet const MatScalar *v; 1800c2916339SPierre Jolivet const PetscInt *idx, *ii; 1801c2916339SPierre Jolivet PetscScalar _DOne = 1.0; 1802c2916339SPierre Jolivet 1803c2916339SPierre Jolivet PetscFunctionBegin; 18043ba16761SJacob Faibussowitsch if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 180508401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 180608401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 180708401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1808c2916339SPierre Jolivet b = bd->v; 18099566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(C)); 18109566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 1811c2916339SPierre Jolivet switch (bs) { 1812d71ae5a4SJacob Faibussowitsch case 1: 1813d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1814d71ae5a4SJacob Faibussowitsch break; 1815d71ae5a4SJacob Faibussowitsch case 2: 1816d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1817d71ae5a4SJacob Faibussowitsch break; 1818d71ae5a4SJacob Faibussowitsch case 3: 1819d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1820d71ae5a4SJacob Faibussowitsch break; 1821d71ae5a4SJacob Faibussowitsch case 4: 1822d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1823d71ae5a4SJacob Faibussowitsch break; 1824d71ae5a4SJacob Faibussowitsch case 5: 1825d71ae5a4SJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1826d71ae5a4SJacob Faibussowitsch break; 1827c2916339SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 18289566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bs, &bbs)); 18299566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cn, &bcn)); 18309566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bm, &bbm)); 18319566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cm, &bcm)); 1832c2916339SPierre Jolivet idx = a->j; 1833c2916339SPierre Jolivet v = a->a; 1834c2916339SPierre Jolivet mbs = a->mbs; 1835c2916339SPierre Jolivet ii = a->i; 1836c2916339SPierre Jolivet z = c; 1837c2916339SPierre Jolivet for (i = 0; i < mbs; i++) { 18389371c9d4SSatish Balay n = ii[1] - ii[0]; 18399371c9d4SSatish Balay ii++; 1840c2916339SPierre Jolivet for (j = 0; j < n; j++) { 1841792fecdfSBarry Smith if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1842792fecdfSBarry Smith PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1843c2916339SPierre Jolivet v += bs2; 1844c2916339SPierre Jolivet } 1845c2916339SPierre Jolivet z += bs; 1846c2916339SPierre Jolivet } 1847c2916339SPierre Jolivet } 18489566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 18499566063dSJacob Faibussowitsch PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 18503ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1851c2916339SPierre Jolivet } 1852