1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <../src/mat/impls/sbaij/seq/sbaij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 #include <petscbt.h> 7 #include <petscblaslapack.h> 8 9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 10 { 11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 12 PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 13 const PetscInt *idx; 14 PetscBT table_out, table_in; 15 16 PetscFunctionBegin; 17 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 18 mbs = a->mbs; 19 ai = a->i; 20 aj = a->j; 21 bs = A->rmap->bs; 22 PetscCall(PetscBTCreate(mbs, &table_out)); 23 PetscCall(PetscMalloc1(mbs + 1, &nidx)); 24 PetscCall(PetscBTCreate(mbs, &table_in)); 25 26 for (i = 0; i < is_max; i++) { /* for each is */ 27 isz = 0; 28 PetscCall(PetscBTMemzero(mbs, table_out)); 29 30 /* Extract the indices, assume there can be duplicate entries */ 31 PetscCall(ISGetIndices(is[i], &idx)); 32 PetscCall(ISGetLocalSize(is[i], &n)); 33 34 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 35 bcol_max = 0; 36 for (j = 0; j < n; ++j) { 37 brow = idx[j] / bs; /* convert the indices into block indices */ 38 PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 39 if (!PetscBTLookupSet(table_out, brow)) { 40 nidx[isz++] = brow; 41 if (bcol_max < brow) bcol_max = brow; 42 } 43 } 44 PetscCall(ISRestoreIndices(is[i], &idx)); 45 PetscCall(ISDestroy(&is[i])); 46 47 k = 0; 48 for (j = 0; j < ov; j++) { /* for each overlap */ 49 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 50 PetscCall(PetscBTMemzero(mbs, table_in)); 51 for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 52 53 n = isz; /* length of the updated is[i] */ 54 for (brow = 0; brow < mbs; brow++) { 55 start = ai[brow]; 56 end = ai[brow + 1]; 57 if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58 for (l = start; l < end; l++) { 59 bcol = aj[l]; 60 if (!PetscBTLookupSet(table_out, bcol)) { 61 nidx[isz++] = bcol; 62 if (bcol_max < bcol) bcol_max = bcol; 63 } 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l < end; l++) { 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table_in, bcol)) { 72 if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 80 } /* for each is */ 81 PetscCall(PetscBTDestroy(&table_out)); 82 PetscCall(PetscFree(nidx)); 83 PetscCall(PetscBTDestroy(&table_in)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 88 Zero some ops' to avoid invalid use */ 89 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 90 { 91 PetscFunctionBegin; 92 PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 93 Bseq->ops->mult = NULL; 94 Bseq->ops->multadd = NULL; 95 Bseq->ops->multtranspose = NULL; 96 Bseq->ops->multtransposeadd = NULL; 97 Bseq->ops->lufactor = NULL; 98 Bseq->ops->choleskyfactor = NULL; 99 Bseq->ops->lufactorsymbolic = NULL; 100 Bseq->ops->choleskyfactorsymbolic = NULL; 101 Bseq->ops->getinertia = NULL; 102 PetscFunctionReturn(PETSC_SUCCESS); 103 } 104 105 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 106 static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym) 107 { 108 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL; 109 Mat_SeqBAIJ *d = NULL; 110 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 111 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 112 const PetscInt *irow, *icol; 113 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 114 PetscInt *aj = a->j, *ai = a->i; 115 MatScalar *mat_a; 116 Mat C; 117 PetscBool flag; 118 119 PetscFunctionBegin; 120 121 PetscCall(ISGetIndices(isrow, &irow)); 122 PetscCall(ISGetIndices(iscol, &icol)); 123 PetscCall(ISGetLocalSize(isrow, &nrows)); 124 PetscCall(ISGetLocalSize(iscol, &ncols)); 125 126 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 127 ssmap = smap; 128 PetscCall(PetscMalloc1(1 + nrows, &lens)); 129 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 130 /* determine lens of each row */ 131 for (i = 0; i < nrows; i++) { 132 kstart = ai[irow[i]]; 133 kend = kstart + a->ilen[irow[i]]; 134 lens[i] = 0; 135 for (k = kstart; k < kend; k++) { 136 if (ssmap[aj[k]]) lens[i]++; 137 } 138 } 139 /* Create and fill new matrix */ 140 if (scall == MAT_REUSE_MATRIX) { 141 if (sym) { 142 c = (Mat_SeqSBAIJ *)((*B)->data); 143 144 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 145 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 146 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 147 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 148 } else { 149 d = (Mat_SeqBAIJ *)((*B)->data); 150 151 PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 152 PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag)); 153 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); 154 PetscCall(PetscArrayzero(d->ilen, d->mbs)); 155 } 156 C = *B; 157 } else { 158 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 159 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 160 if (sym) { 161 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 162 PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 163 } else { 164 PetscCall(MatSetType(C, MATSEQBAIJ)); 165 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); 166 } 167 } 168 if (sym) c = (Mat_SeqSBAIJ *)(C->data); 169 else d = (Mat_SeqBAIJ *)(C->data); 170 for (i = 0; i < nrows; i++) { 171 row = irow[i]; 172 kstart = ai[row]; 173 kend = kstart + a->ilen[row]; 174 if (sym) { 175 mat_i = c->i[i]; 176 mat_j = c->j + mat_i; 177 mat_a = c->a + mat_i * bs2; 178 mat_ilen = c->ilen + i; 179 } else { 180 mat_i = d->i[i]; 181 mat_j = d->j + mat_i; 182 mat_a = d->a + mat_i * bs2; 183 mat_ilen = d->ilen + i; 184 } 185 for (k = kstart; k < kend; k++) { 186 if ((tcol = ssmap[a->j[k]])) { 187 *mat_j++ = tcol - 1; 188 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 189 mat_a += bs2; 190 (*mat_ilen)++; 191 } 192 } 193 } 194 /* sort */ 195 { 196 MatScalar *work; 197 198 PetscCall(PetscMalloc1(bs2, &work)); 199 for (i = 0; i < nrows; i++) { 200 PetscInt ilen; 201 if (sym) { 202 mat_i = c->i[i]; 203 mat_j = c->j + mat_i; 204 mat_a = c->a + mat_i * bs2; 205 ilen = c->ilen[i]; 206 } else { 207 mat_i = d->i[i]; 208 mat_j = d->j + mat_i; 209 mat_a = d->a + mat_i * bs2; 210 ilen = d->ilen[i]; 211 } 212 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 213 } 214 PetscCall(PetscFree(work)); 215 } 216 217 /* Free work space */ 218 PetscCall(ISRestoreIndices(iscol, &icol)); 219 PetscCall(PetscFree(smap)); 220 PetscCall(PetscFree(lens)); 221 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 222 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 223 224 PetscCall(ISRestoreIndices(isrow, &irow)); 225 *B = C; 226 PetscFunctionReturn(PETSC_SUCCESS); 227 } 228 229 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 230 { 231 Mat C[2]; 232 IS is1, is2, intersect = NULL; 233 PetscInt n1, n2, ni; 234 PetscBool sym = PETSC_TRUE; 235 236 PetscFunctionBegin; 237 PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); 238 if (isrow == iscol) { 239 is2 = is1; 240 PetscCall(PetscObjectReference((PetscObject)is2)); 241 } else { 242 PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); 243 PetscCall(ISIntersect(is1, is2, &intersect)); 244 PetscCall(ISGetLocalSize(intersect, &ni)); 245 PetscCall(ISDestroy(&intersect)); 246 if (ni == 0) sym = PETSC_FALSE; 247 else if (PetscDefined(USE_DEBUG)) { 248 PetscCall(ISGetLocalSize(is1, &n1)); 249 PetscCall(ISGetLocalSize(is2, &n2)); 250 PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix"); 251 } 252 } 253 if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym)); 254 else { 255 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym)); 256 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym)); 257 PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1)); 258 PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN)); 259 PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 260 if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 261 else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B)); 262 else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN)); 263 PetscCall(MatDestroy(C)); 264 PetscCall(MatDestroy(C + 1)); 265 } 266 PetscCall(ISDestroy(&is1)); 267 PetscCall(ISDestroy(&is2)); 268 269 if (sym && isrow != iscol) { 270 PetscBool isequal; 271 PetscCall(ISEqual(isrow, iscol, &isequal)); 272 if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 273 } 274 PetscFunctionReturn(PETSC_SUCCESS); 275 } 276 277 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 278 { 279 PetscInt i; 280 281 PetscFunctionBegin; 282 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B)); 283 284 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 285 PetscFunctionReturn(PETSC_SUCCESS); 286 } 287 288 /* Should check that shapes of vectors and matrices match */ 289 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 290 { 291 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 292 PetscScalar *z, x1, x2, zero = 0.0; 293 const PetscScalar *x, *xb; 294 const MatScalar *v; 295 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 296 const PetscInt *aj = a->j, *ai = a->i, *ib; 297 PetscInt nonzerorow = 0; 298 299 PetscFunctionBegin; 300 PetscCall(VecSet(zz, zero)); 301 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 302 PetscCall(VecGetArrayRead(xx, &x)); 303 PetscCall(VecGetArray(zz, &z)); 304 305 v = a->a; 306 xb = x; 307 308 for (i = 0; i < mbs; i++) { 309 n = ai[1] - ai[0]; /* length of i_th block row of A */ 310 x1 = xb[0]; 311 x2 = xb[1]; 312 ib = aj + *ai; 313 jmin = 0; 314 nonzerorow += (n > 0); 315 if (*ib == i) { /* (diag of A)*x */ 316 z[2 * i] += v[0] * x1 + v[2] * x2; 317 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 318 v += 4; 319 jmin++; 320 } 321 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 322 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 323 for (j = jmin; j < n; j++) { 324 /* (strict lower triangular part of A)*x */ 325 cval = ib[j] * 2; 326 z[cval] += v[0] * x1 + v[1] * x2; 327 z[cval + 1] += v[2] * x1 + v[3] * x2; 328 /* (strict upper triangular part of A)*x */ 329 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 330 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 331 v += 4; 332 } 333 xb += 2; 334 ai++; 335 } 336 337 PetscCall(VecRestoreArrayRead(xx, &x)); 338 PetscCall(VecRestoreArray(zz, &z)); 339 PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 340 PetscFunctionReturn(PETSC_SUCCESS); 341 } 342 343 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 344 { 345 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 346 PetscScalar *z, x1, x2, x3, zero = 0.0; 347 const PetscScalar *x, *xb; 348 const MatScalar *v; 349 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 350 const PetscInt *aj = a->j, *ai = a->i, *ib; 351 PetscInt nonzerorow = 0; 352 353 PetscFunctionBegin; 354 PetscCall(VecSet(zz, zero)); 355 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 356 PetscCall(VecGetArrayRead(xx, &x)); 357 PetscCall(VecGetArray(zz, &z)); 358 359 v = a->a; 360 xb = x; 361 362 for (i = 0; i < mbs; i++) { 363 n = ai[1] - ai[0]; /* length of i_th block row of A */ 364 x1 = xb[0]; 365 x2 = xb[1]; 366 x3 = xb[2]; 367 ib = aj + *ai; 368 jmin = 0; 369 nonzerorow += (n > 0); 370 if (*ib == i) { /* (diag of A)*x */ 371 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 372 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 373 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 374 v += 9; 375 jmin++; 376 } 377 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 378 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 379 for (j = jmin; j < n; j++) { 380 /* (strict lower triangular part of A)*x */ 381 cval = ib[j] * 3; 382 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 383 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 384 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 385 /* (strict upper triangular part of A)*x */ 386 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 387 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 388 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 389 v += 9; 390 } 391 xb += 3; 392 ai++; 393 } 394 395 PetscCall(VecRestoreArrayRead(xx, &x)); 396 PetscCall(VecRestoreArray(zz, &z)); 397 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 398 PetscFunctionReturn(PETSC_SUCCESS); 399 } 400 401 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 402 { 403 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 404 PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 405 const PetscScalar *x, *xb; 406 const MatScalar *v; 407 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 408 const PetscInt *aj = a->j, *ai = a->i, *ib; 409 PetscInt nonzerorow = 0; 410 411 PetscFunctionBegin; 412 PetscCall(VecSet(zz, zero)); 413 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 414 PetscCall(VecGetArrayRead(xx, &x)); 415 PetscCall(VecGetArray(zz, &z)); 416 417 v = a->a; 418 xb = x; 419 420 for (i = 0; i < mbs; i++) { 421 n = ai[1] - ai[0]; /* length of i_th block row of A */ 422 x1 = xb[0]; 423 x2 = xb[1]; 424 x3 = xb[2]; 425 x4 = xb[3]; 426 ib = aj + *ai; 427 jmin = 0; 428 nonzerorow += (n > 0); 429 if (*ib == i) { /* (diag of A)*x */ 430 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 431 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 432 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 433 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 434 v += 16; 435 jmin++; 436 } 437 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 438 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 439 for (j = jmin; j < n; j++) { 440 /* (strict lower triangular part of A)*x */ 441 cval = ib[j] * 4; 442 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 443 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 444 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 445 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 446 /* (strict upper triangular part of A)*x */ 447 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 448 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 449 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 450 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 451 v += 16; 452 } 453 xb += 4; 454 ai++; 455 } 456 457 PetscCall(VecRestoreArrayRead(xx, &x)); 458 PetscCall(VecRestoreArray(zz, &z)); 459 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 460 PetscFunctionReturn(PETSC_SUCCESS); 461 } 462 463 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 464 { 465 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 466 PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 467 const PetscScalar *x, *xb; 468 const MatScalar *v; 469 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 470 const PetscInt *aj = a->j, *ai = a->i, *ib; 471 PetscInt nonzerorow = 0; 472 473 PetscFunctionBegin; 474 PetscCall(VecSet(zz, zero)); 475 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 476 PetscCall(VecGetArrayRead(xx, &x)); 477 PetscCall(VecGetArray(zz, &z)); 478 479 v = a->a; 480 xb = x; 481 482 for (i = 0; i < mbs; i++) { 483 n = ai[1] - ai[0]; /* length of i_th block row of A */ 484 x1 = xb[0]; 485 x2 = xb[1]; 486 x3 = xb[2]; 487 x4 = xb[3]; 488 x5 = xb[4]; 489 ib = aj + *ai; 490 jmin = 0; 491 nonzerorow += (n > 0); 492 if (*ib == i) { /* (diag of A)*x */ 493 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 494 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 495 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 496 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 497 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 498 v += 25; 499 jmin++; 500 } 501 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 502 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 503 for (j = jmin; j < n; j++) { 504 /* (strict lower triangular part of A)*x */ 505 cval = ib[j] * 5; 506 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 507 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 508 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 509 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 510 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 511 /* (strict upper triangular part of A)*x */ 512 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 513 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 514 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 515 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 516 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 517 v += 25; 518 } 519 xb += 5; 520 ai++; 521 } 522 523 PetscCall(VecRestoreArrayRead(xx, &x)); 524 PetscCall(VecRestoreArray(zz, &z)); 525 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 526 PetscFunctionReturn(PETSC_SUCCESS); 527 } 528 529 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 530 { 531 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 532 PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 533 const PetscScalar *x, *xb; 534 const MatScalar *v; 535 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 536 const PetscInt *aj = a->j, *ai = a->i, *ib; 537 PetscInt nonzerorow = 0; 538 539 PetscFunctionBegin; 540 PetscCall(VecSet(zz, zero)); 541 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 542 PetscCall(VecGetArrayRead(xx, &x)); 543 PetscCall(VecGetArray(zz, &z)); 544 545 v = a->a; 546 xb = x; 547 548 for (i = 0; i < mbs; i++) { 549 n = ai[1] - ai[0]; /* length of i_th block row of A */ 550 x1 = xb[0]; 551 x2 = xb[1]; 552 x3 = xb[2]; 553 x4 = xb[3]; 554 x5 = xb[4]; 555 x6 = xb[5]; 556 ib = aj + *ai; 557 jmin = 0; 558 nonzerorow += (n > 0); 559 if (*ib == i) { /* (diag of A)*x */ 560 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 561 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 562 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 563 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 564 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 565 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 566 v += 36; 567 jmin++; 568 } 569 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 570 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 571 for (j = jmin; j < n; j++) { 572 /* (strict lower triangular part of A)*x */ 573 cval = ib[j] * 6; 574 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 575 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 576 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 577 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 578 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 579 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 580 /* (strict upper triangular part of A)*x */ 581 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 582 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 583 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 584 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 585 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 586 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 587 v += 36; 588 } 589 xb += 6; 590 ai++; 591 } 592 593 PetscCall(VecRestoreArrayRead(xx, &x)); 594 PetscCall(VecRestoreArray(zz, &z)); 595 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 596 PetscFunctionReturn(PETSC_SUCCESS); 597 } 598 599 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 600 { 601 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 602 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 603 const PetscScalar *x, *xb; 604 const MatScalar *v; 605 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 606 const PetscInt *aj = a->j, *ai = a->i, *ib; 607 PetscInt nonzerorow = 0; 608 609 PetscFunctionBegin; 610 PetscCall(VecSet(zz, zero)); 611 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 612 PetscCall(VecGetArrayRead(xx, &x)); 613 PetscCall(VecGetArray(zz, &z)); 614 615 v = a->a; 616 xb = x; 617 618 for (i = 0; i < mbs; i++) { 619 n = ai[1] - ai[0]; /* length of i_th block row of A */ 620 x1 = xb[0]; 621 x2 = xb[1]; 622 x3 = xb[2]; 623 x4 = xb[3]; 624 x5 = xb[4]; 625 x6 = xb[5]; 626 x7 = xb[6]; 627 ib = aj + *ai; 628 jmin = 0; 629 nonzerorow += (n > 0); 630 if (*ib == i) { /* (diag of A)*x */ 631 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 632 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 633 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 634 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 635 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 636 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 637 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 638 v += 49; 639 jmin++; 640 } 641 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 642 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 643 for (j = jmin; j < n; j++) { 644 /* (strict lower triangular part of A)*x */ 645 cval = ib[j] * 7; 646 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 647 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 648 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 649 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 650 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 651 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 652 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 653 /* (strict upper triangular part of A)*x */ 654 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 655 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 656 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 657 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 658 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 659 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 660 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 661 v += 49; 662 } 663 xb += 7; 664 ai++; 665 } 666 PetscCall(VecRestoreArrayRead(xx, &x)); 667 PetscCall(VecRestoreArray(zz, &z)); 668 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 669 PetscFunctionReturn(PETSC_SUCCESS); 670 } 671 672 /* 673 This will not work with MatScalar == float because it calls the BLAS 674 */ 675 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 676 { 677 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 678 PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 679 const PetscScalar *x, *x_ptr, *xb; 680 const MatScalar *v; 681 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 682 const PetscInt *idx, *aj, *ii; 683 PetscInt nonzerorow = 0; 684 685 PetscFunctionBegin; 686 PetscCall(VecSet(zz, zero)); 687 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 688 PetscCall(VecGetArrayRead(xx, &x)); 689 PetscCall(VecGetArray(zz, &z)); 690 691 x_ptr = x; 692 z_ptr = z; 693 694 aj = a->j; 695 v = a->a; 696 ii = a->i; 697 698 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 699 work = a->mult_work; 700 701 for (i = 0; i < mbs; i++) { 702 n = ii[1] - ii[0]; 703 ncols = n * bs; 704 workt = work; 705 idx = aj + ii[0]; 706 nonzerorow += (n > 0); 707 708 /* upper triangular part */ 709 for (j = 0; j < n; j++) { 710 xb = x_ptr + bs * (*idx++); 711 for (k = 0; k < bs; k++) workt[k] = xb[k]; 712 workt += bs; 713 } 714 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 715 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 716 717 /* strict lower triangular part */ 718 idx = aj + ii[0]; 719 if (n && *idx == i) { 720 ncols -= bs; 721 v += bs2; 722 idx++; 723 n--; 724 } 725 726 if (ncols > 0) { 727 workt = work; 728 PetscCall(PetscArrayzero(workt, ncols)); 729 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 730 for (j = 0; j < n; j++) { 731 zb = z_ptr + bs * (*idx++); 732 for (k = 0; k < bs; k++) zb[k] += workt[k]; 733 workt += bs; 734 } 735 } 736 x += bs; 737 v += n * bs2; 738 z += bs; 739 ii++; 740 } 741 742 PetscCall(VecRestoreArrayRead(xx, &x)); 743 PetscCall(VecRestoreArray(zz, &z)); 744 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 745 PetscFunctionReturn(PETSC_SUCCESS); 746 } 747 748 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 749 { 750 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 751 PetscScalar *z, x1; 752 const PetscScalar *x, *xb; 753 const MatScalar *v; 754 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 755 const PetscInt *aj = a->j, *ai = a->i, *ib; 756 PetscInt nonzerorow = 0; 757 #if defined(PETSC_USE_COMPLEX) 758 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 759 #else 760 const int aconj = 0; 761 #endif 762 763 PetscFunctionBegin; 764 PetscCall(VecCopy(yy, zz)); 765 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 766 PetscCall(VecGetArrayRead(xx, &x)); 767 PetscCall(VecGetArray(zz, &z)); 768 v = a->a; 769 xb = x; 770 771 for (i = 0; i < mbs; i++) { 772 n = ai[1] - ai[0]; /* length of i_th row of A */ 773 x1 = xb[0]; 774 ib = aj + *ai; 775 jmin = 0; 776 nonzerorow += (n > 0); 777 if (n && *ib == i) { /* (diag of A)*x */ 778 z[i] += *v++ * x[*ib++]; 779 jmin++; 780 } 781 if (aconj) { 782 for (j = jmin; j < n; j++) { 783 cval = *ib; 784 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 785 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 786 } 787 } else { 788 for (j = jmin; j < n; j++) { 789 cval = *ib; 790 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 791 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 792 } 793 } 794 xb++; 795 ai++; 796 } 797 798 PetscCall(VecRestoreArrayRead(xx, &x)); 799 PetscCall(VecRestoreArray(zz, &z)); 800 801 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 802 PetscFunctionReturn(PETSC_SUCCESS); 803 } 804 805 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 806 { 807 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 808 PetscScalar *z, x1, x2; 809 const PetscScalar *x, *xb; 810 const MatScalar *v; 811 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 812 const PetscInt *aj = a->j, *ai = a->i, *ib; 813 PetscInt nonzerorow = 0; 814 815 PetscFunctionBegin; 816 PetscCall(VecCopy(yy, zz)); 817 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 818 PetscCall(VecGetArrayRead(xx, &x)); 819 PetscCall(VecGetArray(zz, &z)); 820 821 v = a->a; 822 xb = x; 823 824 for (i = 0; i < mbs; i++) { 825 n = ai[1] - ai[0]; /* length of i_th block row of A */ 826 x1 = xb[0]; 827 x2 = xb[1]; 828 ib = aj + *ai; 829 jmin = 0; 830 nonzerorow += (n > 0); 831 if (n && *ib == i) { /* (diag of A)*x */ 832 z[2 * i] += v[0] * x1 + v[2] * x2; 833 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 834 v += 4; 835 jmin++; 836 } 837 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 838 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 839 for (j = jmin; j < n; j++) { 840 /* (strict lower triangular part of A)*x */ 841 cval = ib[j] * 2; 842 z[cval] += v[0] * x1 + v[1] * x2; 843 z[cval + 1] += v[2] * x1 + v[3] * x2; 844 /* (strict upper triangular part of A)*x */ 845 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 846 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 847 v += 4; 848 } 849 xb += 2; 850 ai++; 851 } 852 PetscCall(VecRestoreArrayRead(xx, &x)); 853 PetscCall(VecRestoreArray(zz, &z)); 854 855 PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 856 PetscFunctionReturn(PETSC_SUCCESS); 857 } 858 859 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 860 { 861 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 862 PetscScalar *z, x1, x2, x3; 863 const PetscScalar *x, *xb; 864 const MatScalar *v; 865 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 866 const PetscInt *aj = a->j, *ai = a->i, *ib; 867 PetscInt nonzerorow = 0; 868 869 PetscFunctionBegin; 870 PetscCall(VecCopy(yy, zz)); 871 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 872 PetscCall(VecGetArrayRead(xx, &x)); 873 PetscCall(VecGetArray(zz, &z)); 874 875 v = a->a; 876 xb = x; 877 878 for (i = 0; i < mbs; i++) { 879 n = ai[1] - ai[0]; /* length of i_th block row of A */ 880 x1 = xb[0]; 881 x2 = xb[1]; 882 x3 = xb[2]; 883 ib = aj + *ai; 884 jmin = 0; 885 nonzerorow += (n > 0); 886 if (n && *ib == i) { /* (diag of A)*x */ 887 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 888 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 889 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 890 v += 9; 891 jmin++; 892 } 893 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 894 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 895 for (j = jmin; j < n; j++) { 896 /* (strict lower triangular part of A)*x */ 897 cval = ib[j] * 3; 898 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 899 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 900 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 901 /* (strict upper triangular part of A)*x */ 902 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 903 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 904 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 905 v += 9; 906 } 907 xb += 3; 908 ai++; 909 } 910 911 PetscCall(VecRestoreArrayRead(xx, &x)); 912 PetscCall(VecRestoreArray(zz, &z)); 913 914 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 915 PetscFunctionReturn(PETSC_SUCCESS); 916 } 917 918 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 919 { 920 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 921 PetscScalar *z, x1, x2, x3, x4; 922 const PetscScalar *x, *xb; 923 const MatScalar *v; 924 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 925 const PetscInt *aj = a->j, *ai = a->i, *ib; 926 PetscInt nonzerorow = 0; 927 928 PetscFunctionBegin; 929 PetscCall(VecCopy(yy, zz)); 930 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 931 PetscCall(VecGetArrayRead(xx, &x)); 932 PetscCall(VecGetArray(zz, &z)); 933 934 v = a->a; 935 xb = x; 936 937 for (i = 0; i < mbs; i++) { 938 n = ai[1] - ai[0]; /* length of i_th block row of A */ 939 x1 = xb[0]; 940 x2 = xb[1]; 941 x3 = xb[2]; 942 x4 = xb[3]; 943 ib = aj + *ai; 944 jmin = 0; 945 nonzerorow += (n > 0); 946 if (n && *ib == i) { /* (diag of A)*x */ 947 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 948 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 949 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 950 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 951 v += 16; 952 jmin++; 953 } 954 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 955 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 956 for (j = jmin; j < n; j++) { 957 /* (strict lower triangular part of A)*x */ 958 cval = ib[j] * 4; 959 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 960 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 961 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 962 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 963 /* (strict upper triangular part of A)*x */ 964 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 965 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 966 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 967 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 968 v += 16; 969 } 970 xb += 4; 971 ai++; 972 } 973 974 PetscCall(VecRestoreArrayRead(xx, &x)); 975 PetscCall(VecRestoreArray(zz, &z)); 976 977 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 978 PetscFunctionReturn(PETSC_SUCCESS); 979 } 980 981 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 982 { 983 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 984 PetscScalar *z, x1, x2, x3, x4, x5; 985 const PetscScalar *x, *xb; 986 const MatScalar *v; 987 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 988 const PetscInt *aj = a->j, *ai = a->i, *ib; 989 PetscInt nonzerorow = 0; 990 991 PetscFunctionBegin; 992 PetscCall(VecCopy(yy, zz)); 993 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 994 PetscCall(VecGetArrayRead(xx, &x)); 995 PetscCall(VecGetArray(zz, &z)); 996 997 v = a->a; 998 xb = x; 999 1000 for (i = 0; i < mbs; i++) { 1001 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1002 x1 = xb[0]; 1003 x2 = xb[1]; 1004 x3 = xb[2]; 1005 x4 = xb[3]; 1006 x5 = xb[4]; 1007 ib = aj + *ai; 1008 jmin = 0; 1009 nonzerorow += (n > 0); 1010 if (n && *ib == i) { /* (diag of A)*x */ 1011 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1012 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1013 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1014 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 1015 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1016 v += 25; 1017 jmin++; 1018 } 1019 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1020 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1021 for (j = jmin; j < n; j++) { 1022 /* (strict lower triangular part of A)*x */ 1023 cval = ib[j] * 5; 1024 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 1025 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 1026 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 1027 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 1028 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1029 /* (strict upper triangular part of A)*x */ 1030 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 1031 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 1032 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 1033 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 1034 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 1035 v += 25; 1036 } 1037 xb += 5; 1038 ai++; 1039 } 1040 1041 PetscCall(VecRestoreArrayRead(xx, &x)); 1042 PetscCall(VecRestoreArray(zz, &z)); 1043 1044 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 1045 PetscFunctionReturn(PETSC_SUCCESS); 1046 } 1047 1048 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1049 { 1050 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1051 PetscScalar *z, x1, x2, x3, x4, x5, x6; 1052 const PetscScalar *x, *xb; 1053 const MatScalar *v; 1054 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1055 const PetscInt *aj = a->j, *ai = a->i, *ib; 1056 PetscInt nonzerorow = 0; 1057 1058 PetscFunctionBegin; 1059 PetscCall(VecCopy(yy, zz)); 1060 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1061 PetscCall(VecGetArrayRead(xx, &x)); 1062 PetscCall(VecGetArray(zz, &z)); 1063 1064 v = a->a; 1065 xb = x; 1066 1067 for (i = 0; i < mbs; i++) { 1068 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1069 x1 = xb[0]; 1070 x2 = xb[1]; 1071 x3 = xb[2]; 1072 x4 = xb[3]; 1073 x5 = xb[4]; 1074 x6 = xb[5]; 1075 ib = aj + *ai; 1076 jmin = 0; 1077 nonzerorow += (n > 0); 1078 if (n && *ib == i) { /* (diag of A)*x */ 1079 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 1080 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 1081 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 1082 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 1083 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 1084 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1085 v += 36; 1086 jmin++; 1087 } 1088 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1089 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1090 for (j = jmin; j < n; j++) { 1091 /* (strict lower triangular part of A)*x */ 1092 cval = ib[j] * 6; 1093 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 1094 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 1095 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 1096 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 1097 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 1098 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1099 /* (strict upper triangular part of A)*x */ 1100 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 1101 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 1102 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 1103 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 1104 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 1105 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 1106 v += 36; 1107 } 1108 xb += 6; 1109 ai++; 1110 } 1111 1112 PetscCall(VecRestoreArrayRead(xx, &x)); 1113 PetscCall(VecRestoreArray(zz, &z)); 1114 1115 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 1116 PetscFunctionReturn(PETSC_SUCCESS); 1117 } 1118 1119 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1120 { 1121 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1122 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1123 const PetscScalar *x, *xb; 1124 const MatScalar *v; 1125 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1126 const PetscInt *aj = a->j, *ai = a->i, *ib; 1127 PetscInt nonzerorow = 0; 1128 1129 PetscFunctionBegin; 1130 PetscCall(VecCopy(yy, zz)); 1131 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1132 PetscCall(VecGetArrayRead(xx, &x)); 1133 PetscCall(VecGetArray(zz, &z)); 1134 1135 v = a->a; 1136 xb = x; 1137 1138 for (i = 0; i < mbs; i++) { 1139 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1140 x1 = xb[0]; 1141 x2 = xb[1]; 1142 x3 = xb[2]; 1143 x4 = xb[3]; 1144 x5 = xb[4]; 1145 x6 = xb[5]; 1146 x7 = xb[6]; 1147 ib = aj + *ai; 1148 jmin = 0; 1149 nonzerorow += (n > 0); 1150 if (n && *ib == i) { /* (diag of A)*x */ 1151 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 1152 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 1153 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 1154 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 1155 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 1156 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 1157 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1158 v += 49; 1159 jmin++; 1160 } 1161 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1162 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1163 for (j = jmin; j < n; j++) { 1164 /* (strict lower triangular part of A)*x */ 1165 cval = ib[j] * 7; 1166 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 1167 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 1168 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 1169 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 1170 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 1171 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 1172 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1173 /* (strict upper triangular part of A)*x */ 1174 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 1175 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 1176 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 1177 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 1178 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 1179 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 1180 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 1181 v += 49; 1182 } 1183 xb += 7; 1184 ai++; 1185 } 1186 1187 PetscCall(VecRestoreArrayRead(xx, &x)); 1188 PetscCall(VecRestoreArray(zz, &z)); 1189 1190 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 1191 PetscFunctionReturn(PETSC_SUCCESS); 1192 } 1193 1194 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1195 { 1196 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1197 PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1198 const PetscScalar *x, *x_ptr, *xb; 1199 const MatScalar *v; 1200 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1201 const PetscInt *idx, *aj, *ii; 1202 PetscInt nonzerorow = 0; 1203 1204 PetscFunctionBegin; 1205 PetscCall(VecCopy(yy, zz)); 1206 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1207 PetscCall(VecGetArrayRead(xx, &x)); 1208 x_ptr = x; 1209 PetscCall(VecGetArray(zz, &z)); 1210 z_ptr = z; 1211 1212 aj = a->j; 1213 v = a->a; 1214 ii = a->i; 1215 1216 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 1217 work = a->mult_work; 1218 1219 for (i = 0; i < mbs; i++) { 1220 n = ii[1] - ii[0]; 1221 ncols = n * bs; 1222 workt = work; 1223 idx = aj + ii[0]; 1224 nonzerorow += (n > 0); 1225 1226 /* upper triangular part */ 1227 for (j = 0; j < n; j++) { 1228 xb = x_ptr + bs * (*idx++); 1229 for (k = 0; k < bs; k++) workt[k] = xb[k]; 1230 workt += bs; 1231 } 1232 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1233 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 1234 1235 /* strict lower triangular part */ 1236 idx = aj + ii[0]; 1237 if (n && *idx == i) { 1238 ncols -= bs; 1239 v += bs2; 1240 idx++; 1241 n--; 1242 } 1243 if (ncols > 0) { 1244 workt = work; 1245 PetscCall(PetscArrayzero(workt, ncols)); 1246 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1247 for (j = 0; j < n; j++) { 1248 zb = z_ptr + bs * (*idx++); 1249 for (k = 0; k < bs; k++) zb[k] += workt[k]; 1250 workt += bs; 1251 } 1252 } 1253 1254 x += bs; 1255 v += n * bs2; 1256 z += bs; 1257 ii++; 1258 } 1259 1260 PetscCall(VecRestoreArrayRead(xx, &x)); 1261 PetscCall(VecRestoreArray(zz, &z)); 1262 1263 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 1264 PetscFunctionReturn(PETSC_SUCCESS); 1265 } 1266 1267 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1268 { 1269 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1270 PetscScalar oalpha = alpha; 1271 PetscBLASInt one = 1, totalnz; 1272 1273 PetscFunctionBegin; 1274 PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1275 PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 1276 PetscCall(PetscLogFlops(totalnz)); 1277 PetscFunctionReturn(PETSC_SUCCESS); 1278 } 1279 1280 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1281 { 1282 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1283 const MatScalar *v = a->a; 1284 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1285 PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1286 const PetscInt *aj = a->j, *col; 1287 1288 PetscFunctionBegin; 1289 if (!a->nz) { 1290 *norm = 0.0; 1291 PetscFunctionReturn(PETSC_SUCCESS); 1292 } 1293 if (type == NORM_FROBENIUS) { 1294 for (k = 0; k < mbs; k++) { 1295 jmin = a->i[k]; 1296 jmax = a->i[k + 1]; 1297 col = aj + jmin; 1298 if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 1299 for (i = 0; i < bs2; i++) { 1300 sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 1301 v++; 1302 } 1303 jmin++; 1304 } 1305 for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 1306 for (i = 0; i < bs2; i++) { 1307 sum_off += PetscRealPart(PetscConj(*v) * (*v)); 1308 v++; 1309 } 1310 } 1311 } 1312 *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 1313 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 1314 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1315 PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 1316 for (i = 0; i < mbs; i++) jl[i] = mbs; 1317 il[0] = 0; 1318 1319 *norm = 0.0; 1320 for (k = 0; k < mbs; k++) { /* k_th block row */ 1321 for (j = 0; j < bs; j++) sum[j] = 0.0; 1322 /*-- col sum --*/ 1323 i = jl[k]; /* first |A(i,k)| to be added */ 1324 /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window) 1325 at step k */ 1326 while (i < mbs) { 1327 nexti = jl[i]; /* next block row to be added */ 1328 ik = il[i]; /* block index of A(i,k) in the array a */ 1329 for (j = 0; j < bs; j++) { 1330 v = a->a + ik * bs2 + j * bs; 1331 for (k1 = 0; k1 < bs; k1++) { 1332 sum[j] += PetscAbsScalar(*v); 1333 v++; 1334 } 1335 } 1336 /* update il, jl */ 1337 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1338 jmax = a->i[i + 1]; 1339 if (jmin < jmax) { 1340 il[i] = jmin; 1341 j = a->j[jmin]; 1342 jl[i] = jl[j]; 1343 jl[j] = i; 1344 } 1345 i = nexti; 1346 } 1347 /*-- row sum --*/ 1348 jmin = a->i[k]; 1349 jmax = a->i[k + 1]; 1350 for (i = jmin; i < jmax; i++) { 1351 for (j = 0; j < bs; j++) { 1352 v = a->a + i * bs2 + j; 1353 for (k1 = 0; k1 < bs; k1++) { 1354 sum[j] += PetscAbsScalar(*v); 1355 v += bs; 1356 } 1357 } 1358 } 1359 /* add k_th block row to il, jl */ 1360 col = aj + jmin; 1361 if (jmax - jmin > 0 && *col == k) jmin++; 1362 if (jmin < jmax) { 1363 il[k] = jmin; 1364 j = a->j[jmin]; 1365 jl[k] = jl[j]; 1366 jl[j] = k; 1367 } 1368 for (j = 0; j < bs; j++) { 1369 if (sum[j] > *norm) *norm = sum[j]; 1370 } 1371 } 1372 PetscCall(PetscFree3(sum, il, jl)); 1373 PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1374 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 1375 PetscFunctionReturn(PETSC_SUCCESS); 1376 } 1377 1378 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1379 { 1380 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 1381 1382 PetscFunctionBegin; 1383 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1384 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1385 *flg = PETSC_FALSE; 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 /* if the a->i are the same */ 1390 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 1391 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1392 1393 /* if a->j are the same */ 1394 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 1395 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1396 1397 /* if a->a are the same */ 1398 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 1399 PetscFunctionReturn(PETSC_SUCCESS); 1400 } 1401 1402 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1403 { 1404 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1405 PetscInt i, j, k, row, bs, ambs, bs2; 1406 const PetscInt *ai, *aj; 1407 PetscScalar *x, zero = 0.0; 1408 const MatScalar *aa, *aa_j; 1409 1410 PetscFunctionBegin; 1411 bs = A->rmap->bs; 1412 PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 1413 1414 aa = a->a; 1415 ambs = a->mbs; 1416 1417 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1418 PetscInt *diag = a->diag; 1419 aa = a->a; 1420 ambs = a->mbs; 1421 PetscCall(VecGetArray(v, &x)); 1422 for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 1423 PetscCall(VecRestoreArray(v, &x)); 1424 PetscFunctionReturn(PETSC_SUCCESS); 1425 } 1426 1427 ai = a->i; 1428 aj = a->j; 1429 bs2 = a->bs2; 1430 PetscCall(VecSet(v, zero)); 1431 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1432 PetscCall(VecGetArray(v, &x)); 1433 for (i = 0; i < ambs; i++) { 1434 j = ai[i]; 1435 if (aj[j] == i) { /* if this is a diagonal element */ 1436 row = i * bs; 1437 aa_j = aa + j * bs2; 1438 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 1439 } 1440 } 1441 PetscCall(VecRestoreArray(v, &x)); 1442 PetscFunctionReturn(PETSC_SUCCESS); 1443 } 1444 1445 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1446 { 1447 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1448 PetscScalar x; 1449 const PetscScalar *l, *li, *ri; 1450 MatScalar *aa, *v; 1451 PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1452 const PetscInt *ai, *aj; 1453 PetscBool flg; 1454 1455 PetscFunctionBegin; 1456 if (ll != rr) { 1457 PetscCall(VecEqual(ll, rr, &flg)); 1458 PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1459 } 1460 if (!ll) PetscFunctionReturn(PETSC_SUCCESS); 1461 ai = a->i; 1462 aj = a->j; 1463 aa = a->a; 1464 m = A->rmap->N; 1465 bs = A->rmap->bs; 1466 mbs = a->mbs; 1467 bs2 = a->bs2; 1468 1469 PetscCall(VecGetArrayRead(ll, &l)); 1470 PetscCall(VecGetLocalSize(ll, &lm)); 1471 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1472 for (i = 0; i < mbs; i++) { /* for each block row */ 1473 M = ai[i + 1] - ai[i]; 1474 li = l + i * bs; 1475 v = aa + bs2 * ai[i]; 1476 for (j = 0; j < M; j++) { /* for each block */ 1477 ri = l + bs * aj[ai[i] + j]; 1478 for (k = 0; k < bs; k++) { 1479 x = ri[k]; 1480 for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 1481 } 1482 } 1483 } 1484 PetscCall(VecRestoreArrayRead(ll, &l)); 1485 PetscCall(PetscLogFlops(2.0 * a->nz)); 1486 PetscFunctionReturn(PETSC_SUCCESS); 1487 } 1488 1489 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1490 { 1491 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1492 1493 PetscFunctionBegin; 1494 info->block_size = a->bs2; 1495 info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 1496 info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 1497 info->nz_unneeded = info->nz_allocated - info->nz_used; 1498 info->assemblies = A->num_ass; 1499 info->mallocs = A->info.mallocs; 1500 info->memory = 0; /* REVIEW ME */ 1501 if (A->factortype) { 1502 info->fill_ratio_given = A->info.fill_ratio_given; 1503 info->fill_ratio_needed = A->info.fill_ratio_needed; 1504 info->factor_mallocs = A->info.factor_mallocs; 1505 } else { 1506 info->fill_ratio_given = 0; 1507 info->fill_ratio_needed = 0; 1508 info->factor_mallocs = 0; 1509 } 1510 PetscFunctionReturn(PETSC_SUCCESS); 1511 } 1512 1513 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1514 { 1515 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1516 1517 PetscFunctionBegin; 1518 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 1519 PetscFunctionReturn(PETSC_SUCCESS); 1520 } 1521 1522 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1523 { 1524 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1525 PetscInt i, j, n, row, col, bs, mbs; 1526 const PetscInt *ai, *aj; 1527 PetscReal atmp; 1528 const MatScalar *aa; 1529 PetscScalar *x; 1530 PetscInt ncols, brow, bcol, krow, kcol; 1531 1532 PetscFunctionBegin; 1533 PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 1534 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1535 bs = A->rmap->bs; 1536 aa = a->a; 1537 ai = a->i; 1538 aj = a->j; 1539 mbs = a->mbs; 1540 1541 PetscCall(VecSet(v, 0.0)); 1542 PetscCall(VecGetArray(v, &x)); 1543 PetscCall(VecGetLocalSize(v, &n)); 1544 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1545 for (i = 0; i < mbs; i++) { 1546 ncols = ai[1] - ai[0]; 1547 ai++; 1548 brow = bs * i; 1549 for (j = 0; j < ncols; j++) { 1550 bcol = bs * (*aj); 1551 for (kcol = 0; kcol < bs; kcol++) { 1552 col = bcol + kcol; /* col index */ 1553 for (krow = 0; krow < bs; krow++) { 1554 atmp = PetscAbsScalar(*aa); 1555 aa++; 1556 row = brow + krow; /* row index */ 1557 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1558 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1559 } 1560 } 1561 aj++; 1562 } 1563 } 1564 PetscCall(VecRestoreArray(v, &x)); 1565 PetscFunctionReturn(PETSC_SUCCESS); 1566 } 1567 1568 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1569 { 1570 PetscFunctionBegin; 1571 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1572 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1573 PetscFunctionReturn(PETSC_SUCCESS); 1574 } 1575 1576 static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1577 { 1578 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1579 PetscScalar *z = c; 1580 const PetscScalar *xb; 1581 PetscScalar x1; 1582 const MatScalar *v = a->a, *vv; 1583 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1584 #if defined(PETSC_USE_COMPLEX) 1585 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 1586 #else 1587 const int aconj = 0; 1588 #endif 1589 1590 PetscFunctionBegin; 1591 for (i = 0; i < mbs; i++) { 1592 n = ii[1] - ii[0]; 1593 ii++; 1594 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1595 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1596 jj = idx; 1597 vv = v; 1598 for (k = 0; k < cn; k++) { 1599 idx = jj; 1600 v = vv; 1601 for (j = 0; j < n; j++) { 1602 xb = b + (*idx); 1603 x1 = xb[0 + k * bm]; 1604 z[0 + k * cm] += v[0] * x1; 1605 if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1606 v += 1; 1607 ++idx; 1608 } 1609 } 1610 z += 1; 1611 } 1612 PetscFunctionReturn(PETSC_SUCCESS); 1613 } 1614 1615 static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1616 { 1617 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1618 PetscScalar *z = c; 1619 const PetscScalar *xb; 1620 PetscScalar x1, x2; 1621 const MatScalar *v = a->a, *vv; 1622 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1623 1624 PetscFunctionBegin; 1625 for (i = 0; i < mbs; i++) { 1626 n = ii[1] - ii[0]; 1627 ii++; 1628 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1629 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1630 jj = idx; 1631 vv = v; 1632 for (k = 0; k < cn; k++) { 1633 idx = jj; 1634 v = vv; 1635 for (j = 0; j < n; j++) { 1636 xb = b + 2 * (*idx); 1637 x1 = xb[0 + k * bm]; 1638 x2 = xb[1 + k * bm]; 1639 z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1640 z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1641 if (*idx != i) { 1642 c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1643 c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1644 } 1645 v += 4; 1646 ++idx; 1647 } 1648 } 1649 z += 2; 1650 } 1651 PetscFunctionReturn(PETSC_SUCCESS); 1652 } 1653 1654 static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1655 { 1656 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1657 PetscScalar *z = c; 1658 const PetscScalar *xb; 1659 PetscScalar x1, x2, x3; 1660 const MatScalar *v = a->a, *vv; 1661 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1662 1663 PetscFunctionBegin; 1664 for (i = 0; i < mbs; i++) { 1665 n = ii[1] - ii[0]; 1666 ii++; 1667 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1668 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1669 jj = idx; 1670 vv = v; 1671 for (k = 0; k < cn; k++) { 1672 idx = jj; 1673 v = vv; 1674 for (j = 0; j < n; j++) { 1675 xb = b + 3 * (*idx); 1676 x1 = xb[0 + k * bm]; 1677 x2 = xb[1 + k * bm]; 1678 x3 = xb[2 + k * bm]; 1679 z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1680 z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1681 z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1682 if (*idx != i) { 1683 c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1684 c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1685 c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1686 } 1687 v += 9; 1688 ++idx; 1689 } 1690 } 1691 z += 3; 1692 } 1693 PetscFunctionReturn(PETSC_SUCCESS); 1694 } 1695 1696 static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1697 { 1698 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1699 PetscScalar *z = c; 1700 const PetscScalar *xb; 1701 PetscScalar x1, x2, x3, x4; 1702 const MatScalar *v = a->a, *vv; 1703 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1704 1705 PetscFunctionBegin; 1706 for (i = 0; i < mbs; i++) { 1707 n = ii[1] - ii[0]; 1708 ii++; 1709 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1710 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1711 jj = idx; 1712 vv = v; 1713 for (k = 0; k < cn; k++) { 1714 idx = jj; 1715 v = vv; 1716 for (j = 0; j < n; j++) { 1717 xb = b + 4 * (*idx); 1718 x1 = xb[0 + k * bm]; 1719 x2 = xb[1 + k * bm]; 1720 x3 = xb[2 + k * bm]; 1721 x4 = xb[3 + k * bm]; 1722 z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1723 z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1724 z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1725 z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1726 if (*idx != i) { 1727 c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1728 c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1729 c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1730 c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1731 } 1732 v += 16; 1733 ++idx; 1734 } 1735 } 1736 z += 4; 1737 } 1738 PetscFunctionReturn(PETSC_SUCCESS); 1739 } 1740 1741 static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1742 { 1743 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1744 PetscScalar *z = c; 1745 const PetscScalar *xb; 1746 PetscScalar x1, x2, x3, x4, x5; 1747 const MatScalar *v = a->a, *vv; 1748 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1749 1750 PetscFunctionBegin; 1751 for (i = 0; i < mbs; i++) { 1752 n = ii[1] - ii[0]; 1753 ii++; 1754 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1755 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1756 jj = idx; 1757 vv = v; 1758 for (k = 0; k < cn; k++) { 1759 idx = jj; 1760 v = vv; 1761 for (j = 0; j < n; j++) { 1762 xb = b + 5 * (*idx); 1763 x1 = xb[0 + k * bm]; 1764 x2 = xb[1 + k * bm]; 1765 x3 = xb[2 + k * bm]; 1766 x4 = xb[3 + k * bm]; 1767 x5 = xb[4 + k * cm]; 1768 z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1769 z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1770 z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1771 z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1772 z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1773 if (*idx != i) { 1774 c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1775 c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1776 c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1777 c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1778 c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1779 } 1780 v += 25; 1781 ++idx; 1782 } 1783 } 1784 z += 5; 1785 } 1786 PetscFunctionReturn(PETSC_SUCCESS); 1787 } 1788 1789 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1790 { 1791 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1792 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1793 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1794 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1795 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1796 PetscBLASInt bbs, bcn, bbm, bcm; 1797 PetscScalar *z = NULL; 1798 PetscScalar *c, *b; 1799 const MatScalar *v; 1800 const PetscInt *idx, *ii; 1801 PetscScalar _DOne = 1.0; 1802 1803 PetscFunctionBegin; 1804 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1805 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1806 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1807 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1808 b = bd->v; 1809 PetscCall(MatZeroEntries(C)); 1810 PetscCall(MatDenseGetArray(C, &c)); 1811 switch (bs) { 1812 case 1: 1813 PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1814 break; 1815 case 2: 1816 PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1817 break; 1818 case 3: 1819 PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1820 break; 1821 case 4: 1822 PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1823 break; 1824 case 5: 1825 PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1826 break; 1827 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1828 PetscCall(PetscBLASIntCast(bs, &bbs)); 1829 PetscCall(PetscBLASIntCast(cn, &bcn)); 1830 PetscCall(PetscBLASIntCast(bm, &bbm)); 1831 PetscCall(PetscBLASIntCast(cm, &bcm)); 1832 idx = a->j; 1833 v = a->a; 1834 mbs = a->mbs; 1835 ii = a->i; 1836 z = c; 1837 for (i = 0; i < mbs; i++) { 1838 n = ii[1] - ii[0]; 1839 ii++; 1840 for (j = 0; j < n; j++) { 1841 if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1842 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1843 v += bs2; 1844 } 1845 z += bs; 1846 } 1847 } 1848 PetscCall(MatDenseRestoreArray(C, &c)); 1849 PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1850 PetscFunctionReturn(PETSC_SUCCESS); 1851 } 1852