1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <../src/mat/impls/sbaij/seq/sbaij.h> 4 #include <petsc/private/kernels/blockinvert.h> 5 #include <petscbt.h> 6 #include <petscblaslapack.h> 7 8 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 9 { 10 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 11 PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 12 const PetscInt *idx; 13 PetscBT table_out, table_in; 14 15 PetscFunctionBegin; 16 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 17 mbs = a->mbs; 18 ai = a->i; 19 aj = a->j; 20 bs = A->rmap->bs; 21 PetscCall(PetscBTCreate(mbs, &table_out)); 22 PetscCall(PetscMalloc1(mbs + 1, &nidx)); 23 PetscCall(PetscBTCreate(mbs, &table_in)); 24 25 for (i = 0; i < is_max; i++) { /* for each is */ 26 isz = 0; 27 PetscCall(PetscBTMemzero(mbs, table_out)); 28 29 /* Extract the indices, assume there can be duplicate entries */ 30 PetscCall(ISGetIndices(is[i], &idx)); 31 PetscCall(ISGetLocalSize(is[i], &n)); 32 33 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 34 bcol_max = 0; 35 for (j = 0; j < n; ++j) { 36 brow = idx[j] / bs; /* convert the indices into block indices */ 37 PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 38 if (!PetscBTLookupSet(table_out, brow)) { 39 nidx[isz++] = brow; 40 if (bcol_max < brow) bcol_max = brow; 41 } 42 } 43 PetscCall(ISRestoreIndices(is[i], &idx)); 44 PetscCall(ISDestroy(&is[i])); 45 46 k = 0; 47 for (j = 0; j < ov; j++) { /* for each overlap */ 48 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 49 PetscCall(PetscBTMemzero(mbs, table_in)); 50 for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 51 52 n = isz; /* length of the updated is[i] */ 53 for (brow = 0; brow < mbs; brow++) { 54 start = ai[brow]; 55 end = ai[brow + 1]; 56 if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 57 for (l = start; l < end; l++) { 58 bcol = aj[l]; 59 if (!PetscBTLookupSet(table_out, bcol)) { 60 nidx[isz++] = bcol; 61 if (bcol_max < bcol) bcol_max = bcol; 62 } 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l < end; l++) { 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table_in, bcol)) { 71 if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 79 } /* for each is */ 80 PetscCall(PetscBTDestroy(&table_out)); 81 PetscCall(PetscFree(nidx)); 82 PetscCall(PetscBTDestroy(&table_in)); 83 PetscFunctionReturn(PETSC_SUCCESS); 84 } 85 86 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 87 Zero some ops' to avoid invalid use */ 88 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 89 { 90 PetscFunctionBegin; 91 PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 92 Bseq->ops->mult = NULL; 93 Bseq->ops->multadd = NULL; 94 Bseq->ops->multtranspose = NULL; 95 Bseq->ops->multtransposeadd = NULL; 96 Bseq->ops->lufactor = NULL; 97 Bseq->ops->choleskyfactor = NULL; 98 Bseq->ops->lufactorsymbolic = NULL; 99 Bseq->ops->choleskyfactorsymbolic = NULL; 100 Bseq->ops->getinertia = NULL; 101 PetscFunctionReturn(PETSC_SUCCESS); 102 } 103 104 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 105 static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym) 106 { 107 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL; 108 Mat_SeqBAIJ *d = NULL; 109 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 110 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 111 const PetscInt *irow, *icol; 112 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 113 PetscInt *aj = a->j, *ai = a->i; 114 MatScalar *mat_a; 115 Mat C; 116 PetscBool flag; 117 118 PetscFunctionBegin; 119 PetscCall(ISGetIndices(isrow, &irow)); 120 PetscCall(ISGetIndices(iscol, &icol)); 121 PetscCall(ISGetLocalSize(isrow, &nrows)); 122 PetscCall(ISGetLocalSize(iscol, &ncols)); 123 124 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 125 ssmap = smap; 126 PetscCall(PetscMalloc1(1 + nrows, &lens)); 127 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 128 /* determine lens of each row */ 129 for (i = 0; i < nrows; i++) { 130 kstart = ai[irow[i]]; 131 kend = kstart + a->ilen[irow[i]]; 132 lens[i] = 0; 133 for (k = kstart; k < kend; k++) { 134 if (ssmap[aj[k]]) lens[i]++; 135 } 136 } 137 /* Create and fill new matrix */ 138 if (scall == MAT_REUSE_MATRIX) { 139 if (sym) { 140 c = (Mat_SeqSBAIJ *)((*B)->data); 141 142 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 143 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 144 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 145 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 146 } else { 147 d = (Mat_SeqBAIJ *)((*B)->data); 148 149 PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 150 PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag)); 151 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 152 PetscCall(PetscArrayzero(d->ilen, d->mbs)); 153 } 154 C = *B; 155 } else { 156 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 157 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 158 if (sym) { 159 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 160 PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 161 } else { 162 PetscCall(MatSetType(C, MATSEQBAIJ)); 163 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 164 } 165 } 166 if (sym) c = (Mat_SeqSBAIJ *)C->data; 167 else d = (Mat_SeqBAIJ *)C->data; 168 for (i = 0; i < nrows; i++) { 169 row = irow[i]; 170 kstart = ai[row]; 171 kend = kstart + a->ilen[row]; 172 if (sym) { 173 mat_i = c->i[i]; 174 mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 175 mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 176 mat_ilen = c->ilen + i; 177 } else { 178 mat_i = d->i[i]; 179 mat_j = PetscSafePointerPlusOffset(d->j, mat_i); 180 mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2); 181 mat_ilen = d->ilen + i; 182 } 183 for (k = kstart; k < kend; k++) { 184 if ((tcol = ssmap[a->j[k]])) { 185 *mat_j++ = tcol - 1; 186 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 187 mat_a += bs2; 188 (*mat_ilen)++; 189 } 190 } 191 } 192 /* sort */ 193 { 194 MatScalar *work; 195 196 PetscCall(PetscMalloc1(bs2, &work)); 197 for (i = 0; i < nrows; i++) { 198 PetscInt ilen; 199 if (sym) { 200 mat_i = c->i[i]; 201 mat_j = PetscSafePointerPlusOffset(c->j, mat_i); 202 mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); 203 ilen = c->ilen[i]; 204 } else { 205 mat_i = d->i[i]; 206 mat_j = PetscSafePointerPlusOffset(d->j, mat_i); 207 mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2); 208 ilen = d->ilen[i]; 209 } 210 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 211 } 212 PetscCall(PetscFree(work)); 213 } 214 215 /* Free work space */ 216 PetscCall(ISRestoreIndices(iscol, &icol)); 217 PetscCall(PetscFree(smap)); 218 PetscCall(PetscFree(lens)); 219 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 220 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 221 222 PetscCall(ISRestoreIndices(isrow, &irow)); 223 *B = C; 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 228 { 229 Mat C[2]; 230 IS is1, is2, intersect = NULL; 231 PetscInt n1, n2, ni; 232 PetscBool sym = PETSC_TRUE; 233 234 PetscFunctionBegin; 235 PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 236 if (isrow == iscol) { 237 is2 = is1; 238 PetscCall(PetscObjectReference((PetscObject)is2)); 239 } else { 240 PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 241 PetscCall(ISIntersect(is1, is2, &intersect)); 242 PetscCall(ISGetLocalSize(intersect, &ni)); 243 PetscCall(ISDestroy(&intersect)); 244 if (ni == 0) sym = PETSC_FALSE; 245 else if (PetscDefined(USE_DEBUG)) { 246 PetscCall(ISGetLocalSize(is1, &n1)); 247 PetscCall(ISGetLocalSize(is2, &n2)); 248 PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix"); 249 } 250 } 251 if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym)); 252 else { 253 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym)); 254 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym)); 255 PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1)); 256 PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN)); 257 PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 258 if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 259 else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B)); 260 else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 261 PetscCall(MatDestroy(C)); 262 PetscCall(MatDestroy(C + 1)); 263 } 264 PetscCall(ISDestroy(&is1)); 265 PetscCall(ISDestroy(&is2)); 266 267 if (sym && isrow != iscol) { 268 PetscBool isequal; 269 PetscCall(ISEqual(isrow, iscol, &isequal)); 270 if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 271 } 272 PetscFunctionReturn(PETSC_SUCCESS); 273 } 274 275 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 276 { 277 PetscInt i; 278 279 PetscFunctionBegin; 280 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B)); 281 282 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 283 PetscFunctionReturn(PETSC_SUCCESS); 284 } 285 286 /* Should check that shapes of vectors and matrices match */ 287 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 288 { 289 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 290 PetscScalar *z, x1, x2, zero = 0.0; 291 const PetscScalar *x, *xb; 292 const MatScalar *v; 293 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 294 const PetscInt *aj = a->j, *ai = a->i, *ib; 295 PetscInt nonzerorow = 0; 296 297 PetscFunctionBegin; 298 PetscCall(VecSet(zz, zero)); 299 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 300 PetscCall(VecGetArrayRead(xx, &x)); 301 PetscCall(VecGetArray(zz, &z)); 302 303 v = a->a; 304 xb = x; 305 306 for (i = 0; i < mbs; i++) { 307 n = ai[1] - ai[0]; /* length of i_th block row of A */ 308 x1 = xb[0]; 309 x2 = xb[1]; 310 ib = aj + *ai; 311 jmin = 0; 312 nonzerorow += (n > 0); 313 if (*ib == i) { /* (diag of A)*x */ 314 z[2 * i] += v[0] * x1 + v[2] * x2; 315 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 316 v += 4; 317 jmin++; 318 } 319 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 320 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 321 for (j = jmin; j < n; j++) { 322 /* (strict lower triangular part of A)*x */ 323 cval = ib[j] * 2; 324 z[cval] += v[0] * x1 + v[1] * x2; 325 z[cval + 1] += v[2] * x1 + v[3] * x2; 326 /* (strict upper triangular part of A)*x */ 327 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 328 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 329 v += 4; 330 } 331 xb += 2; 332 ai++; 333 } 334 335 PetscCall(VecRestoreArrayRead(xx, &x)); 336 PetscCall(VecRestoreArray(zz, &z)); 337 PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 338 PetscFunctionReturn(PETSC_SUCCESS); 339 } 340 341 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 342 { 343 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 344 PetscScalar *z, x1, x2, x3, zero = 0.0; 345 const PetscScalar *x, *xb; 346 const MatScalar *v; 347 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 348 const PetscInt *aj = a->j, *ai = a->i, *ib; 349 PetscInt nonzerorow = 0; 350 351 PetscFunctionBegin; 352 PetscCall(VecSet(zz, zero)); 353 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 354 PetscCall(VecGetArrayRead(xx, &x)); 355 PetscCall(VecGetArray(zz, &z)); 356 357 v = a->a; 358 xb = x; 359 360 for (i = 0; i < mbs; i++) { 361 n = ai[1] - ai[0]; /* length of i_th block row of A */ 362 x1 = xb[0]; 363 x2 = xb[1]; 364 x3 = xb[2]; 365 ib = aj + *ai; 366 jmin = 0; 367 nonzerorow += (n > 0); 368 if (*ib == i) { /* (diag of A)*x */ 369 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 370 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 371 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 372 v += 9; 373 jmin++; 374 } 375 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 376 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 377 for (j = jmin; j < n; j++) { 378 /* (strict lower triangular part of A)*x */ 379 cval = ib[j] * 3; 380 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 381 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 382 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 383 /* (strict upper triangular part of A)*x */ 384 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 385 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 386 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 387 v += 9; 388 } 389 xb += 3; 390 ai++; 391 } 392 393 PetscCall(VecRestoreArrayRead(xx, &x)); 394 PetscCall(VecRestoreArray(zz, &z)); 395 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 396 PetscFunctionReturn(PETSC_SUCCESS); 397 } 398 399 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 400 { 401 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 402 PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 403 const PetscScalar *x, *xb; 404 const MatScalar *v; 405 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 406 const PetscInt *aj = a->j, *ai = a->i, *ib; 407 PetscInt nonzerorow = 0; 408 409 PetscFunctionBegin; 410 PetscCall(VecSet(zz, zero)); 411 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 412 PetscCall(VecGetArrayRead(xx, &x)); 413 PetscCall(VecGetArray(zz, &z)); 414 415 v = a->a; 416 xb = x; 417 418 for (i = 0; i < mbs; i++) { 419 n = ai[1] - ai[0]; /* length of i_th block row of A */ 420 x1 = xb[0]; 421 x2 = xb[1]; 422 x3 = xb[2]; 423 x4 = xb[3]; 424 ib = aj + *ai; 425 jmin = 0; 426 nonzerorow += (n > 0); 427 if (*ib == i) { /* (diag of A)*x */ 428 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 429 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 430 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 431 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 432 v += 16; 433 jmin++; 434 } 435 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 436 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 437 for (j = jmin; j < n; j++) { 438 /* (strict lower triangular part of A)*x */ 439 cval = ib[j] * 4; 440 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 441 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 442 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 443 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 444 /* (strict upper triangular part of A)*x */ 445 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 446 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 447 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 448 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 449 v += 16; 450 } 451 xb += 4; 452 ai++; 453 } 454 455 PetscCall(VecRestoreArrayRead(xx, &x)); 456 PetscCall(VecRestoreArray(zz, &z)); 457 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 458 PetscFunctionReturn(PETSC_SUCCESS); 459 } 460 461 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 462 { 463 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 464 PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 465 const PetscScalar *x, *xb; 466 const MatScalar *v; 467 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 468 const PetscInt *aj = a->j, *ai = a->i, *ib; 469 PetscInt nonzerorow = 0; 470 471 PetscFunctionBegin; 472 PetscCall(VecSet(zz, zero)); 473 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 474 PetscCall(VecGetArrayRead(xx, &x)); 475 PetscCall(VecGetArray(zz, &z)); 476 477 v = a->a; 478 xb = x; 479 480 for (i = 0; i < mbs; i++) { 481 n = ai[1] - ai[0]; /* length of i_th block row of A */ 482 x1 = xb[0]; 483 x2 = xb[1]; 484 x3 = xb[2]; 485 x4 = xb[3]; 486 x5 = xb[4]; 487 ib = aj + *ai; 488 jmin = 0; 489 nonzerorow += (n > 0); 490 if (*ib == i) { /* (diag of A)*x */ 491 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 492 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 493 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 494 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 495 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 496 v += 25; 497 jmin++; 498 } 499 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 500 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 501 for (j = jmin; j < n; j++) { 502 /* (strict lower triangular part of A)*x */ 503 cval = ib[j] * 5; 504 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 505 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 506 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 507 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 508 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 509 /* (strict upper triangular part of A)*x */ 510 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 511 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 512 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 513 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 514 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 515 v += 25; 516 } 517 xb += 5; 518 ai++; 519 } 520 521 PetscCall(VecRestoreArrayRead(xx, &x)); 522 PetscCall(VecRestoreArray(zz, &z)); 523 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 524 PetscFunctionReturn(PETSC_SUCCESS); 525 } 526 527 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 528 { 529 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 530 PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 531 const PetscScalar *x, *xb; 532 const MatScalar *v; 533 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 534 const PetscInt *aj = a->j, *ai = a->i, *ib; 535 PetscInt nonzerorow = 0; 536 537 PetscFunctionBegin; 538 PetscCall(VecSet(zz, zero)); 539 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 540 PetscCall(VecGetArrayRead(xx, &x)); 541 PetscCall(VecGetArray(zz, &z)); 542 543 v = a->a; 544 xb = x; 545 546 for (i = 0; i < mbs; i++) { 547 n = ai[1] - ai[0]; /* length of i_th block row of A */ 548 x1 = xb[0]; 549 x2 = xb[1]; 550 x3 = xb[2]; 551 x4 = xb[3]; 552 x5 = xb[4]; 553 x6 = xb[5]; 554 ib = aj + *ai; 555 jmin = 0; 556 nonzerorow += (n > 0); 557 if (*ib == i) { /* (diag of A)*x */ 558 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 559 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 560 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 561 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 562 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 563 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 564 v += 36; 565 jmin++; 566 } 567 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 568 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 569 for (j = jmin; j < n; j++) { 570 /* (strict lower triangular part of A)*x */ 571 cval = ib[j] * 6; 572 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 573 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 574 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 575 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 576 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 577 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 578 /* (strict upper triangular part of A)*x */ 579 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 580 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 581 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 582 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 583 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 584 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 585 v += 36; 586 } 587 xb += 6; 588 ai++; 589 } 590 591 PetscCall(VecRestoreArrayRead(xx, &x)); 592 PetscCall(VecRestoreArray(zz, &z)); 593 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 594 PetscFunctionReturn(PETSC_SUCCESS); 595 } 596 597 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 598 { 599 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 600 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 601 const PetscScalar *x, *xb; 602 const MatScalar *v; 603 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 604 const PetscInt *aj = a->j, *ai = a->i, *ib; 605 PetscInt nonzerorow = 0; 606 607 PetscFunctionBegin; 608 PetscCall(VecSet(zz, zero)); 609 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 610 PetscCall(VecGetArrayRead(xx, &x)); 611 PetscCall(VecGetArray(zz, &z)); 612 613 v = a->a; 614 xb = x; 615 616 for (i = 0; i < mbs; i++) { 617 n = ai[1] - ai[0]; /* length of i_th block row of A */ 618 x1 = xb[0]; 619 x2 = xb[1]; 620 x3 = xb[2]; 621 x4 = xb[3]; 622 x5 = xb[4]; 623 x6 = xb[5]; 624 x7 = xb[6]; 625 ib = aj + *ai; 626 jmin = 0; 627 nonzerorow += (n > 0); 628 if (*ib == i) { /* (diag of A)*x */ 629 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 630 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 631 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 632 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 633 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 634 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 635 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 636 v += 49; 637 jmin++; 638 } 639 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 640 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 641 for (j = jmin; j < n; j++) { 642 /* (strict lower triangular part of A)*x */ 643 cval = ib[j] * 7; 644 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 645 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 646 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 647 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 648 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 649 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 650 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 651 /* (strict upper triangular part of A)*x */ 652 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 653 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 654 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 655 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 656 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 657 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 658 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 659 v += 49; 660 } 661 xb += 7; 662 ai++; 663 } 664 PetscCall(VecRestoreArrayRead(xx, &x)); 665 PetscCall(VecRestoreArray(zz, &z)); 666 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 667 PetscFunctionReturn(PETSC_SUCCESS); 668 } 669 670 /* 671 This will not work with MatScalar == float because it calls the BLAS 672 */ 673 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 674 { 675 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 676 PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 677 const PetscScalar *x, *x_ptr, *xb; 678 const MatScalar *v; 679 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 680 const PetscInt *idx, *aj, *ii; 681 PetscInt nonzerorow = 0; 682 683 PetscFunctionBegin; 684 PetscCall(VecSet(zz, zero)); 685 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 686 PetscCall(VecGetArrayRead(xx, &x)); 687 PetscCall(VecGetArray(zz, &z)); 688 689 x_ptr = x; 690 z_ptr = z; 691 692 aj = a->j; 693 v = a->a; 694 ii = a->i; 695 696 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 697 work = a->mult_work; 698 699 for (i = 0; i < mbs; i++) { 700 n = ii[1] - ii[0]; 701 ncols = n * bs; 702 workt = work; 703 idx = aj + ii[0]; 704 nonzerorow += (n > 0); 705 706 /* upper triangular part */ 707 for (j = 0; j < n; j++) { 708 xb = x_ptr + bs * (*idx++); 709 for (k = 0; k < bs; k++) workt[k] = xb[k]; 710 workt += bs; 711 } 712 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 713 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 714 715 /* strict lower triangular part */ 716 idx = aj + ii[0]; 717 if (n && *idx == i) { 718 ncols -= bs; 719 v += bs2; 720 idx++; 721 n--; 722 } 723 724 if (ncols > 0) { 725 workt = work; 726 PetscCall(PetscArrayzero(workt, ncols)); 727 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 728 for (j = 0; j < n; j++) { 729 zb = z_ptr + bs * (*idx++); 730 for (k = 0; k < bs; k++) zb[k] += workt[k]; 731 workt += bs; 732 } 733 } 734 x += bs; 735 v += n * bs2; 736 z += bs; 737 ii++; 738 } 739 740 PetscCall(VecRestoreArrayRead(xx, &x)); 741 PetscCall(VecRestoreArray(zz, &z)); 742 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 743 PetscFunctionReturn(PETSC_SUCCESS); 744 } 745 746 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 747 { 748 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 749 PetscScalar *z, x1; 750 const PetscScalar *x, *xb; 751 const MatScalar *v; 752 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 753 const PetscInt *aj = a->j, *ai = a->i, *ib; 754 PetscInt nonzerorow = 0; 755 const int aconj = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0; 756 757 PetscFunctionBegin; 758 PetscCall(VecCopy(yy, zz)); 759 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 760 PetscCall(VecGetArrayRead(xx, &x)); 761 PetscCall(VecGetArray(zz, &z)); 762 v = a->a; 763 xb = x; 764 765 for (i = 0; i < mbs; i++) { 766 n = ai[1] - ai[0]; /* length of i_th row of A */ 767 x1 = xb[0]; 768 ib = aj + *ai; 769 jmin = 0; 770 nonzerorow += (n > 0); 771 if (n && *ib == i) { /* (diag of A)*x */ 772 z[i] += *v++ * x[*ib++]; 773 jmin++; 774 } 775 if (aconj) { 776 for (j = jmin; j < n; j++) { 777 cval = *ib; 778 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 779 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 780 } 781 } else { 782 for (j = jmin; j < n; j++) { 783 cval = *ib; 784 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 785 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 786 } 787 } 788 xb++; 789 ai++; 790 } 791 792 PetscCall(VecRestoreArrayRead(xx, &x)); 793 PetscCall(VecRestoreArray(zz, &z)); 794 795 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 796 PetscFunctionReturn(PETSC_SUCCESS); 797 } 798 799 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 800 { 801 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 802 PetscScalar *z, x1, x2; 803 const PetscScalar *x, *xb; 804 const MatScalar *v; 805 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 806 const PetscInt *aj = a->j, *ai = a->i, *ib; 807 PetscInt nonzerorow = 0; 808 809 PetscFunctionBegin; 810 PetscCall(VecCopy(yy, zz)); 811 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 812 PetscCall(VecGetArrayRead(xx, &x)); 813 PetscCall(VecGetArray(zz, &z)); 814 815 v = a->a; 816 xb = x; 817 818 for (i = 0; i < mbs; i++) { 819 n = ai[1] - ai[0]; /* length of i_th block row of A */ 820 x1 = xb[0]; 821 x2 = xb[1]; 822 ib = aj + *ai; 823 jmin = 0; 824 nonzerorow += (n > 0); 825 if (n && *ib == i) { /* (diag of A)*x */ 826 z[2 * i] += v[0] * x1 + v[2] * x2; 827 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 828 v += 4; 829 jmin++; 830 } 831 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 832 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 833 for (j = jmin; j < n; j++) { 834 /* (strict lower triangular part of A)*x */ 835 cval = ib[j] * 2; 836 z[cval] += v[0] * x1 + v[1] * x2; 837 z[cval + 1] += v[2] * x1 + v[3] * x2; 838 /* (strict upper triangular part of A)*x */ 839 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 840 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 841 v += 4; 842 } 843 xb += 2; 844 ai++; 845 } 846 PetscCall(VecRestoreArrayRead(xx, &x)); 847 PetscCall(VecRestoreArray(zz, &z)); 848 849 PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 850 PetscFunctionReturn(PETSC_SUCCESS); 851 } 852 853 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 854 { 855 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 856 PetscScalar *z, x1, x2, x3; 857 const PetscScalar *x, *xb; 858 const MatScalar *v; 859 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 860 const PetscInt *aj = a->j, *ai = a->i, *ib; 861 PetscInt nonzerorow = 0; 862 863 PetscFunctionBegin; 864 PetscCall(VecCopy(yy, zz)); 865 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 866 PetscCall(VecGetArrayRead(xx, &x)); 867 PetscCall(VecGetArray(zz, &z)); 868 869 v = a->a; 870 xb = x; 871 872 for (i = 0; i < mbs; i++) { 873 n = ai[1] - ai[0]; /* length of i_th block row of A */ 874 x1 = xb[0]; 875 x2 = xb[1]; 876 x3 = xb[2]; 877 ib = aj + *ai; 878 jmin = 0; 879 nonzerorow += (n > 0); 880 if (n && *ib == i) { /* (diag of A)*x */ 881 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 882 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 883 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 884 v += 9; 885 jmin++; 886 } 887 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 888 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 889 for (j = jmin; j < n; j++) { 890 /* (strict lower triangular part of A)*x */ 891 cval = ib[j] * 3; 892 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 893 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 894 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 895 /* (strict upper triangular part of A)*x */ 896 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 897 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 898 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 899 v += 9; 900 } 901 xb += 3; 902 ai++; 903 } 904 905 PetscCall(VecRestoreArrayRead(xx, &x)); 906 PetscCall(VecRestoreArray(zz, &z)); 907 908 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 909 PetscFunctionReturn(PETSC_SUCCESS); 910 } 911 912 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 913 { 914 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 915 PetscScalar *z, x1, x2, x3, x4; 916 const PetscScalar *x, *xb; 917 const MatScalar *v; 918 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 919 const PetscInt *aj = a->j, *ai = a->i, *ib; 920 PetscInt nonzerorow = 0; 921 922 PetscFunctionBegin; 923 PetscCall(VecCopy(yy, zz)); 924 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 925 PetscCall(VecGetArrayRead(xx, &x)); 926 PetscCall(VecGetArray(zz, &z)); 927 928 v = a->a; 929 xb = x; 930 931 for (i = 0; i < mbs; i++) { 932 n = ai[1] - ai[0]; /* length of i_th block row of A */ 933 x1 = xb[0]; 934 x2 = xb[1]; 935 x3 = xb[2]; 936 x4 = xb[3]; 937 ib = aj + *ai; 938 jmin = 0; 939 nonzerorow += (n > 0); 940 if (n && *ib == i) { /* (diag of A)*x */ 941 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 942 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 943 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 944 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 945 v += 16; 946 jmin++; 947 } 948 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 949 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 950 for (j = jmin; j < n; j++) { 951 /* (strict lower triangular part of A)*x */ 952 cval = ib[j] * 4; 953 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 954 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 955 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 956 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 957 /* (strict upper triangular part of A)*x */ 958 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 959 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 960 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 961 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 962 v += 16; 963 } 964 xb += 4; 965 ai++; 966 } 967 968 PetscCall(VecRestoreArrayRead(xx, &x)); 969 PetscCall(VecRestoreArray(zz, &z)); 970 971 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 972 PetscFunctionReturn(PETSC_SUCCESS); 973 } 974 975 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 976 { 977 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 978 PetscScalar *z, x1, x2, x3, x4, x5; 979 const PetscScalar *x, *xb; 980 const MatScalar *v; 981 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 982 const PetscInt *aj = a->j, *ai = a->i, *ib; 983 PetscInt nonzerorow = 0; 984 985 PetscFunctionBegin; 986 PetscCall(VecCopy(yy, zz)); 987 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 988 PetscCall(VecGetArrayRead(xx, &x)); 989 PetscCall(VecGetArray(zz, &z)); 990 991 v = a->a; 992 xb = x; 993 994 for (i = 0; i < mbs; i++) { 995 n = ai[1] - ai[0]; /* length of i_th block row of A */ 996 x1 = xb[0]; 997 x2 = xb[1]; 998 x3 = xb[2]; 999 x4 = xb[3]; 1000 x5 = xb[4]; 1001 ib = aj + *ai; 1002 jmin = 0; 1003 nonzerorow += (n > 0); 1004 if (n && *ib == i) { /* (diag of A)*x */ 1005 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1006 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1007 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1008 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 1009 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1010 v += 25; 1011 jmin++; 1012 } 1013 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1014 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1015 for (j = jmin; j < n; j++) { 1016 /* (strict lower triangular part of A)*x */ 1017 cval = ib[j] * 5; 1018 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 1019 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 1020 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 1021 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 1022 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1023 /* (strict upper triangular part of A)*x */ 1024 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 1025 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 1026 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 1027 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 1028 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 1029 v += 25; 1030 } 1031 xb += 5; 1032 ai++; 1033 } 1034 1035 PetscCall(VecRestoreArrayRead(xx, &x)); 1036 PetscCall(VecRestoreArray(zz, &z)); 1037 1038 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 1039 PetscFunctionReturn(PETSC_SUCCESS); 1040 } 1041 1042 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1043 { 1044 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1045 PetscScalar *z, x1, x2, x3, x4, x5, x6; 1046 const PetscScalar *x, *xb; 1047 const MatScalar *v; 1048 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1049 const PetscInt *aj = a->j, *ai = a->i, *ib; 1050 PetscInt nonzerorow = 0; 1051 1052 PetscFunctionBegin; 1053 PetscCall(VecCopy(yy, zz)); 1054 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1055 PetscCall(VecGetArrayRead(xx, &x)); 1056 PetscCall(VecGetArray(zz, &z)); 1057 1058 v = a->a; 1059 xb = x; 1060 1061 for (i = 0; i < mbs; i++) { 1062 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1063 x1 = xb[0]; 1064 x2 = xb[1]; 1065 x3 = xb[2]; 1066 x4 = xb[3]; 1067 x5 = xb[4]; 1068 x6 = xb[5]; 1069 ib = aj + *ai; 1070 jmin = 0; 1071 nonzerorow += (n > 0); 1072 if (n && *ib == i) { /* (diag of A)*x */ 1073 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 1074 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 1075 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 1076 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 1077 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 1078 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1079 v += 36; 1080 jmin++; 1081 } 1082 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1083 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1084 for (j = jmin; j < n; j++) { 1085 /* (strict lower triangular part of A)*x */ 1086 cval = ib[j] * 6; 1087 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 1088 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 1089 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 1090 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 1091 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 1092 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1093 /* (strict upper triangular part of A)*x */ 1094 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 1095 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 1096 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 1097 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 1098 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 1099 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 1100 v += 36; 1101 } 1102 xb += 6; 1103 ai++; 1104 } 1105 1106 PetscCall(VecRestoreArrayRead(xx, &x)); 1107 PetscCall(VecRestoreArray(zz, &z)); 1108 1109 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 1110 PetscFunctionReturn(PETSC_SUCCESS); 1111 } 1112 1113 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1114 { 1115 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1116 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1117 const PetscScalar *x, *xb; 1118 const MatScalar *v; 1119 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1120 const PetscInt *aj = a->j, *ai = a->i, *ib; 1121 PetscInt nonzerorow = 0; 1122 1123 PetscFunctionBegin; 1124 PetscCall(VecCopy(yy, zz)); 1125 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1126 PetscCall(VecGetArrayRead(xx, &x)); 1127 PetscCall(VecGetArray(zz, &z)); 1128 1129 v = a->a; 1130 xb = x; 1131 1132 for (i = 0; i < mbs; i++) { 1133 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1134 x1 = xb[0]; 1135 x2 = xb[1]; 1136 x3 = xb[2]; 1137 x4 = xb[3]; 1138 x5 = xb[4]; 1139 x6 = xb[5]; 1140 x7 = xb[6]; 1141 ib = aj + *ai; 1142 jmin = 0; 1143 nonzerorow += (n > 0); 1144 if (n && *ib == i) { /* (diag of A)*x */ 1145 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 1146 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 1147 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 1148 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 1149 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 1150 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 1151 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1152 v += 49; 1153 jmin++; 1154 } 1155 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1156 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1157 for (j = jmin; j < n; j++) { 1158 /* (strict lower triangular part of A)*x */ 1159 cval = ib[j] * 7; 1160 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 1161 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 1162 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 1163 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 1164 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 1165 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 1166 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1167 /* (strict upper triangular part of A)*x */ 1168 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 1169 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 1170 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 1171 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 1172 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 1173 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 1174 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 1175 v += 49; 1176 } 1177 xb += 7; 1178 ai++; 1179 } 1180 1181 PetscCall(VecRestoreArrayRead(xx, &x)); 1182 PetscCall(VecRestoreArray(zz, &z)); 1183 1184 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 1185 PetscFunctionReturn(PETSC_SUCCESS); 1186 } 1187 1188 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1189 { 1190 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1191 PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1192 const PetscScalar *x, *x_ptr, *xb; 1193 const MatScalar *v; 1194 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1195 const PetscInt *idx, *aj, *ii; 1196 PetscInt nonzerorow = 0; 1197 1198 PetscFunctionBegin; 1199 PetscCall(VecCopy(yy, zz)); 1200 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1201 PetscCall(VecGetArrayRead(xx, &x)); 1202 x_ptr = x; 1203 PetscCall(VecGetArray(zz, &z)); 1204 z_ptr = z; 1205 1206 aj = a->j; 1207 v = a->a; 1208 ii = a->i; 1209 1210 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 1211 work = a->mult_work; 1212 1213 for (i = 0; i < mbs; i++) { 1214 n = ii[1] - ii[0]; 1215 ncols = n * bs; 1216 workt = work; 1217 idx = aj + ii[0]; 1218 nonzerorow += (n > 0); 1219 1220 /* upper triangular part */ 1221 for (j = 0; j < n; j++) { 1222 xb = x_ptr + bs * (*idx++); 1223 for (k = 0; k < bs; k++) workt[k] = xb[k]; 1224 workt += bs; 1225 } 1226 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1227 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 1228 1229 /* strict lower triangular part */ 1230 idx = aj + ii[0]; 1231 if (n && *idx == i) { 1232 ncols -= bs; 1233 v += bs2; 1234 idx++; 1235 n--; 1236 } 1237 if (ncols > 0) { 1238 workt = work; 1239 PetscCall(PetscArrayzero(workt, ncols)); 1240 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1241 for (j = 0; j < n; j++) { 1242 zb = z_ptr + bs * (*idx++); 1243 for (k = 0; k < bs; k++) zb[k] += workt[k]; 1244 workt += bs; 1245 } 1246 } 1247 1248 x += bs; 1249 v += n * bs2; 1250 z += bs; 1251 ii++; 1252 } 1253 1254 PetscCall(VecRestoreArrayRead(xx, &x)); 1255 PetscCall(VecRestoreArray(zz, &z)); 1256 1257 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 1258 PetscFunctionReturn(PETSC_SUCCESS); 1259 } 1260 1261 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1262 { 1263 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1264 PetscScalar oalpha = alpha; 1265 PetscBLASInt one = 1, totalnz; 1266 1267 PetscFunctionBegin; 1268 PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1269 PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 1270 PetscCall(PetscLogFlops(totalnz)); 1271 PetscFunctionReturn(PETSC_SUCCESS); 1272 } 1273 1274 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1275 { 1276 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1277 const MatScalar *v = a->a; 1278 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1279 PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1280 const PetscInt *aj = a->j, *col; 1281 1282 PetscFunctionBegin; 1283 if (!a->nz) { 1284 *norm = 0.0; 1285 PetscFunctionReturn(PETSC_SUCCESS); 1286 } 1287 if (type == NORM_FROBENIUS) { 1288 for (k = 0; k < mbs; k++) { 1289 jmin = a->i[k]; 1290 jmax = a->i[k + 1]; 1291 col = aj + jmin; 1292 if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 1293 for (i = 0; i < bs2; i++) { 1294 sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 1295 v++; 1296 } 1297 jmin++; 1298 } 1299 for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 1300 for (i = 0; i < bs2; i++) { 1301 sum_off += PetscRealPart(PetscConj(*v) * (*v)); 1302 v++; 1303 } 1304 } 1305 } 1306 *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 1307 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 1308 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1309 PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 1310 for (i = 0; i < mbs; i++) jl[i] = mbs; 1311 il[0] = 0; 1312 1313 *norm = 0.0; 1314 for (k = 0; k < mbs; k++) { /* k_th block row */ 1315 for (j = 0; j < bs; j++) sum[j] = 0.0; 1316 /*-- col sum --*/ 1317 i = jl[k]; /* first |A(i,k)| to be added */ 1318 /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window) 1319 at step k */ 1320 while (i < mbs) { 1321 nexti = jl[i]; /* next block row to be added */ 1322 ik = il[i]; /* block index of A(i,k) in the array a */ 1323 for (j = 0; j < bs; j++) { 1324 v = a->a + ik * bs2 + j * bs; 1325 for (k1 = 0; k1 < bs; k1++) { 1326 sum[j] += PetscAbsScalar(*v); 1327 v++; 1328 } 1329 } 1330 /* update il, jl */ 1331 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1332 jmax = a->i[i + 1]; 1333 if (jmin < jmax) { 1334 il[i] = jmin; 1335 j = a->j[jmin]; 1336 jl[i] = jl[j]; 1337 jl[j] = i; 1338 } 1339 i = nexti; 1340 } 1341 /*-- row sum --*/ 1342 jmin = a->i[k]; 1343 jmax = a->i[k + 1]; 1344 for (i = jmin; i < jmax; i++) { 1345 for (j = 0; j < bs; j++) { 1346 v = a->a + i * bs2 + j; 1347 for (k1 = 0; k1 < bs; k1++) { 1348 sum[j] += PetscAbsScalar(*v); 1349 v += bs; 1350 } 1351 } 1352 } 1353 /* add k_th block row to il, jl */ 1354 col = aj + jmin; 1355 if (jmax - jmin > 0 && *col == k) jmin++; 1356 if (jmin < jmax) { 1357 il[k] = jmin; 1358 j = a->j[jmin]; 1359 jl[k] = jl[j]; 1360 jl[j] = k; 1361 } 1362 for (j = 0; j < bs; j++) { 1363 if (sum[j] > *norm) *norm = sum[j]; 1364 } 1365 } 1366 PetscCall(PetscFree3(sum, il, jl)); 1367 PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1368 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 1369 PetscFunctionReturn(PETSC_SUCCESS); 1370 } 1371 1372 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1373 { 1374 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 1375 1376 PetscFunctionBegin; 1377 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1378 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1379 *flg = PETSC_FALSE; 1380 PetscFunctionReturn(PETSC_SUCCESS); 1381 } 1382 1383 /* if the a->i are the same */ 1384 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 1385 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1386 1387 /* if a->j are the same */ 1388 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 1389 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1390 1391 /* if a->a are the same */ 1392 PetscCall(PetscArraycmp(a->a, b->a, a->nz * A->rmap->bs * A->rmap->bs, flg)); 1393 PetscFunctionReturn(PETSC_SUCCESS); 1394 } 1395 1396 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1397 { 1398 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1399 PetscInt n; 1400 const PetscInt bs = A->rmap->bs, ambs = a->mbs, bs2 = a->bs2; 1401 PetscScalar *x; 1402 const MatScalar *aa = a->a, *aa_j; 1403 const PetscInt *ai = a->i, *adiag; 1404 PetscBool diagDense; 1405 1406 PetscFunctionBegin; 1407 PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 1408 PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, &adiag, &diagDense)); 1409 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1410 PetscCall(VecGetArrayWrite(v, &x)); 1411 for (PetscInt i = 0; i < ambs; i++) x[i] = 1.0 / aa[adiag[i]]; 1412 PetscCall(VecRestoreArrayWrite(v, &x)); 1413 PetscFunctionReturn(PETSC_SUCCESS); 1414 } 1415 1416 PetscCall(VecGetLocalSize(v, &n)); 1417 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1418 PetscCall(VecGetArrayWrite(v, &x)); 1419 1420 if (diagDense) { 1421 for (PetscInt i = 0, row = 0; i < ambs; i++) { 1422 aa_j = aa + adiag[i] * bs2; 1423 for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k]; 1424 } 1425 } else { 1426 for (PetscInt i = 0, row = 0; i < ambs; i++) { 1427 const PetscInt j = adiag[i]; 1428 1429 if (j != ai[i + 1]) { 1430 aa_j = aa + j * bs2; 1431 for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k]; 1432 } else { 1433 for (PetscInt k = 0; k < bs; k++) x[row++] = 0.0; 1434 } 1435 } 1436 } 1437 PetscCall(VecRestoreArrayWrite(v, &x)); 1438 PetscFunctionReturn(PETSC_SUCCESS); 1439 } 1440 1441 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1442 { 1443 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1444 PetscScalar x; 1445 const PetscScalar *l, *li, *ri; 1446 MatScalar *aa, *v; 1447 PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1448 const PetscInt *ai, *aj; 1449 1450 PetscFunctionBegin; 1451 if (!ll) PetscFunctionReturn(PETSC_SUCCESS); 1452 ai = a->i; 1453 aj = a->j; 1454 aa = a->a; 1455 m = A->rmap->N; 1456 bs = A->rmap->bs; 1457 mbs = a->mbs; 1458 bs2 = a->bs2; 1459 1460 PetscCall(VecGetArrayRead(ll, &l)); 1461 PetscCall(VecGetLocalSize(ll, &lm)); 1462 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1463 for (i = 0; i < mbs; i++) { /* for each block row */ 1464 M = ai[i + 1] - ai[i]; 1465 li = l + i * bs; 1466 v = aa + bs2 * ai[i]; 1467 for (j = 0; j < M; j++) { /* for each block */ 1468 ri = l + bs * aj[ai[i] + j]; 1469 for (k = 0; k < bs; k++) { 1470 x = ri[k]; 1471 for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 1472 } 1473 } 1474 } 1475 PetscCall(VecRestoreArrayRead(ll, &l)); 1476 PetscCall(PetscLogFlops(2.0 * a->nz)); 1477 PetscFunctionReturn(PETSC_SUCCESS); 1478 } 1479 1480 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1481 { 1482 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1483 1484 PetscFunctionBegin; 1485 info->block_size = a->bs2; 1486 info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 1487 info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 1488 info->nz_unneeded = info->nz_allocated - info->nz_used; 1489 info->assemblies = A->num_ass; 1490 info->mallocs = A->info.mallocs; 1491 info->memory = 0; /* REVIEW ME */ 1492 if (A->factortype) { 1493 info->fill_ratio_given = A->info.fill_ratio_given; 1494 info->fill_ratio_needed = A->info.fill_ratio_needed; 1495 info->factor_mallocs = A->info.factor_mallocs; 1496 } else { 1497 info->fill_ratio_given = 0; 1498 info->fill_ratio_needed = 0; 1499 info->factor_mallocs = 0; 1500 } 1501 PetscFunctionReturn(PETSC_SUCCESS); 1502 } 1503 1504 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1505 { 1506 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1507 1508 PetscFunctionBegin; 1509 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 1510 PetscFunctionReturn(PETSC_SUCCESS); 1511 } 1512 1513 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1514 { 1515 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1516 PetscInt i, j, n, row, col, bs, mbs; 1517 const PetscInt *ai, *aj; 1518 PetscReal atmp; 1519 const MatScalar *aa; 1520 PetscScalar *x; 1521 PetscInt ncols, brow, bcol, krow, kcol; 1522 1523 PetscFunctionBegin; 1524 PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 1525 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1526 bs = A->rmap->bs; 1527 aa = a->a; 1528 ai = a->i; 1529 aj = a->j; 1530 mbs = a->mbs; 1531 1532 PetscCall(VecSet(v, 0.0)); 1533 PetscCall(VecGetArray(v, &x)); 1534 PetscCall(VecGetLocalSize(v, &n)); 1535 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1536 for (i = 0; i < mbs; i++) { 1537 ncols = ai[1] - ai[0]; 1538 ai++; 1539 brow = bs * i; 1540 for (j = 0; j < ncols; j++) { 1541 bcol = bs * (*aj); 1542 for (kcol = 0; kcol < bs; kcol++) { 1543 col = bcol + kcol; /* col index */ 1544 for (krow = 0; krow < bs; krow++) { 1545 atmp = PetscAbsScalar(*aa); 1546 aa++; 1547 row = brow + krow; /* row index */ 1548 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1549 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1550 } 1551 } 1552 aj++; 1553 } 1554 } 1555 PetscCall(VecRestoreArray(v, &x)); 1556 PetscFunctionReturn(PETSC_SUCCESS); 1557 } 1558 1559 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1560 { 1561 PetscFunctionBegin; 1562 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1563 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1564 PetscFunctionReturn(PETSC_SUCCESS); 1565 } 1566 1567 static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1568 { 1569 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1570 PetscScalar *z = c; 1571 const PetscScalar *xb; 1572 PetscScalar x1; 1573 const MatScalar *v = a->a, *vv; 1574 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1575 const int aconj = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0; 1576 1577 PetscFunctionBegin; 1578 for (i = 0; i < mbs; i++) { 1579 n = ii[1] - ii[0]; 1580 ii++; 1581 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1582 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1583 jj = idx; 1584 vv = v; 1585 for (k = 0; k < cn; k++) { 1586 idx = jj; 1587 v = vv; 1588 for (j = 0; j < n; j++) { 1589 xb = b + (*idx); 1590 x1 = xb[0 + k * bm]; 1591 z[0 + k * cm] += v[0] * x1; 1592 if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1593 v += 1; 1594 ++idx; 1595 } 1596 } 1597 z += 1; 1598 } 1599 PetscFunctionReturn(PETSC_SUCCESS); 1600 } 1601 1602 static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1603 { 1604 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1605 PetscScalar *z = c; 1606 const PetscScalar *xb; 1607 PetscScalar x1, x2; 1608 const MatScalar *v = a->a, *vv; 1609 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1610 1611 PetscFunctionBegin; 1612 for (i = 0; i < mbs; i++) { 1613 n = ii[1] - ii[0]; 1614 ii++; 1615 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1616 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1617 jj = idx; 1618 vv = v; 1619 for (k = 0; k < cn; k++) { 1620 idx = jj; 1621 v = vv; 1622 for (j = 0; j < n; j++) { 1623 xb = b + 2 * (*idx); 1624 x1 = xb[0 + k * bm]; 1625 x2 = xb[1 + k * bm]; 1626 z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1627 z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1628 if (*idx != i) { 1629 c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1630 c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1631 } 1632 v += 4; 1633 ++idx; 1634 } 1635 } 1636 z += 2; 1637 } 1638 PetscFunctionReturn(PETSC_SUCCESS); 1639 } 1640 1641 static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1642 { 1643 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1644 PetscScalar *z = c; 1645 const PetscScalar *xb; 1646 PetscScalar x1, x2, x3; 1647 const MatScalar *v = a->a, *vv; 1648 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1649 1650 PetscFunctionBegin; 1651 for (i = 0; i < mbs; i++) { 1652 n = ii[1] - ii[0]; 1653 ii++; 1654 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1655 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1656 jj = idx; 1657 vv = v; 1658 for (k = 0; k < cn; k++) { 1659 idx = jj; 1660 v = vv; 1661 for (j = 0; j < n; j++) { 1662 xb = b + 3 * (*idx); 1663 x1 = xb[0 + k * bm]; 1664 x2 = xb[1 + k * bm]; 1665 x3 = xb[2 + k * bm]; 1666 z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1667 z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1668 z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1669 if (*idx != i) { 1670 c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1671 c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1672 c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1673 } 1674 v += 9; 1675 ++idx; 1676 } 1677 } 1678 z += 3; 1679 } 1680 PetscFunctionReturn(PETSC_SUCCESS); 1681 } 1682 1683 static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1684 { 1685 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1686 PetscScalar *z = c; 1687 const PetscScalar *xb; 1688 PetscScalar x1, x2, x3, x4; 1689 const MatScalar *v = a->a, *vv; 1690 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1691 1692 PetscFunctionBegin; 1693 for (i = 0; i < mbs; i++) { 1694 n = ii[1] - ii[0]; 1695 ii++; 1696 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1697 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1698 jj = idx; 1699 vv = v; 1700 for (k = 0; k < cn; k++) { 1701 idx = jj; 1702 v = vv; 1703 for (j = 0; j < n; j++) { 1704 xb = b + 4 * (*idx); 1705 x1 = xb[0 + k * bm]; 1706 x2 = xb[1 + k * bm]; 1707 x3 = xb[2 + k * bm]; 1708 x4 = xb[3 + k * bm]; 1709 z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1710 z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1711 z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1712 z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1713 if (*idx != i) { 1714 c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1715 c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1716 c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1717 c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1718 } 1719 v += 16; 1720 ++idx; 1721 } 1722 } 1723 z += 4; 1724 } 1725 PetscFunctionReturn(PETSC_SUCCESS); 1726 } 1727 1728 static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1729 { 1730 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1731 PetscScalar *z = c; 1732 const PetscScalar *xb; 1733 PetscScalar x1, x2, x3, x4, x5; 1734 const MatScalar *v = a->a, *vv; 1735 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1736 1737 PetscFunctionBegin; 1738 for (i = 0; i < mbs; i++) { 1739 n = ii[1] - ii[0]; 1740 ii++; 1741 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1742 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1743 jj = idx; 1744 vv = v; 1745 for (k = 0; k < cn; k++) { 1746 idx = jj; 1747 v = vv; 1748 for (j = 0; j < n; j++) { 1749 xb = b + 5 * (*idx); 1750 x1 = xb[0 + k * bm]; 1751 x2 = xb[1 + k * bm]; 1752 x3 = xb[2 + k * bm]; 1753 x4 = xb[3 + k * bm]; 1754 x5 = xb[4 + k * cm]; 1755 z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1756 z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1757 z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1758 z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1759 z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1760 if (*idx != i) { 1761 c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1762 c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1763 c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1764 c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1765 c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1766 } 1767 v += 25; 1768 ++idx; 1769 } 1770 } 1771 z += 5; 1772 } 1773 PetscFunctionReturn(PETSC_SUCCESS); 1774 } 1775 1776 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1777 { 1778 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1779 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1780 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1781 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1782 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1783 PetscBLASInt bbs, bcn, bbm, bcm; 1784 PetscScalar *z = NULL; 1785 PetscScalar *c, *b; 1786 const MatScalar *v; 1787 const PetscInt *idx, *ii; 1788 PetscScalar _DOne = 1.0; 1789 1790 PetscFunctionBegin; 1791 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1792 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1793 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1794 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1795 b = bd->v; 1796 PetscCall(MatZeroEntries(C)); 1797 PetscCall(MatDenseGetArray(C, &c)); 1798 switch (bs) { 1799 case 1: 1800 PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1801 break; 1802 case 2: 1803 PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1804 break; 1805 case 3: 1806 PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1807 break; 1808 case 4: 1809 PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1810 break; 1811 case 5: 1812 PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1813 break; 1814 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1815 PetscCall(PetscBLASIntCast(bs, &bbs)); 1816 PetscCall(PetscBLASIntCast(cn, &bcn)); 1817 PetscCall(PetscBLASIntCast(bm, &bbm)); 1818 PetscCall(PetscBLASIntCast(cm, &bcm)); 1819 idx = a->j; 1820 v = a->a; 1821 mbs = a->mbs; 1822 ii = a->i; 1823 z = c; 1824 for (i = 0; i < mbs; i++) { 1825 n = ii[1] - ii[0]; 1826 ii++; 1827 for (j = 0; j < n; j++) { 1828 if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1829 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1830 v += bs2; 1831 } 1832 z += bs; 1833 } 1834 } 1835 PetscCall(MatDenseRestoreArray(C, &c)); 1836 PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1837 PetscFunctionReturn(PETSC_SUCCESS); 1838 } 1839