1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <../src/mat/impls/sbaij/seq/sbaij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 #include <petscbt.h> 7 #include <petscblaslapack.h> 8 9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) { 10 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 11 PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs, *nidx2; 12 const PetscInt *idx; 13 PetscBT table_out, table_in; 14 15 PetscFunctionBegin; 16 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 17 mbs = a->mbs; 18 ai = a->i; 19 aj = a->j; 20 bs = A->rmap->bs; 21 PetscCall(PetscBTCreate(mbs, &table_out)); 22 PetscCall(PetscMalloc1(mbs + 1, &nidx)); 23 PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2)); 24 PetscCall(PetscBTCreate(mbs, &table_in)); 25 26 for (i = 0; i < is_max; i++) { /* for each is */ 27 isz = 0; 28 PetscCall(PetscBTMemzero(mbs, table_out)); 29 30 /* Extract the indices, assume there can be duplicate entries */ 31 PetscCall(ISGetIndices(is[i], &idx)); 32 PetscCall(ISGetLocalSize(is[i], &n)); 33 34 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 35 bcol_max = 0; 36 for (j = 0; j < n; ++j) { 37 brow = idx[j] / bs; /* convert the indices into block indices */ 38 PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 39 if (!PetscBTLookupSet(table_out, brow)) { 40 nidx[isz++] = brow; 41 if (bcol_max < brow) bcol_max = brow; 42 } 43 } 44 PetscCall(ISRestoreIndices(is[i], &idx)); 45 PetscCall(ISDestroy(&is[i])); 46 47 k = 0; 48 for (j = 0; j < ov; j++) { /* for each overlap */ 49 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 50 PetscCall(PetscBTMemzero(mbs, table_in)); 51 for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 52 53 n = isz; /* length of the updated is[i] */ 54 for (brow = 0; brow < mbs; brow++) { 55 start = ai[brow]; 56 end = ai[brow + 1]; 57 if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58 for (l = start; l < end; l++) { 59 bcol = aj[l]; 60 if (!PetscBTLookupSet(table_out, bcol)) { 61 nidx[isz++] = bcol; 62 if (bcol_max < bcol) bcol_max = bcol; 63 } 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l < end; l++) { 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table_in, bcol)) { 72 if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 80 /* expand the Index Set */ 81 for (j = 0; j < isz; j++) { 82 for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k; 83 } 84 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i)); 85 } 86 PetscCall(PetscBTDestroy(&table_out)); 87 PetscCall(PetscFree(nidx)); 88 PetscCall(PetscFree(nidx2)); 89 PetscCall(PetscBTDestroy(&table_in)); 90 PetscFunctionReturn(0); 91 } 92 93 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 94 Zero some ops' to avoid invalid usse */ 95 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) { 96 PetscFunctionBegin; 97 PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 98 Bseq->ops->mult = NULL; 99 Bseq->ops->multadd = NULL; 100 Bseq->ops->multtranspose = NULL; 101 Bseq->ops->multtransposeadd = NULL; 102 Bseq->ops->lufactor = NULL; 103 Bseq->ops->choleskyfactor = NULL; 104 Bseq->ops->lufactorsymbolic = NULL; 105 Bseq->ops->choleskyfactorsymbolic = NULL; 106 Bseq->ops->getinertia = NULL; 107 PetscFunctionReturn(0); 108 } 109 110 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 111 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { 112 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c; 113 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 114 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 115 const PetscInt *irow, *icol; 116 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 117 PetscInt *aj = a->j, *ai = a->i; 118 MatScalar *mat_a; 119 Mat C; 120 PetscBool flag; 121 122 PetscFunctionBegin; 123 124 PetscCall(ISGetIndices(isrow, &irow)); 125 PetscCall(ISGetIndices(iscol, &icol)); 126 PetscCall(ISGetLocalSize(isrow, &nrows)); 127 PetscCall(ISGetLocalSize(iscol, &ncols)); 128 129 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 130 ssmap = smap; 131 PetscCall(PetscMalloc1(1 + nrows, &lens)); 132 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 133 /* determine lens of each row */ 134 for (i = 0; i < nrows; i++) { 135 kstart = ai[irow[i]]; 136 kend = kstart + a->ilen[irow[i]]; 137 lens[i] = 0; 138 for (k = kstart; k < kend; k++) { 139 if (ssmap[aj[k]]) lens[i]++; 140 } 141 } 142 /* Create and fill new matrix */ 143 if (scall == MAT_REUSE_MATRIX) { 144 c = (Mat_SeqSBAIJ *)((*B)->data); 145 146 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 147 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 148 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 149 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 150 C = *B; 151 } else { 152 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 153 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 154 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 155 PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 156 } 157 c = (Mat_SeqSBAIJ *)(C->data); 158 for (i = 0; i < nrows; i++) { 159 row = irow[i]; 160 kstart = ai[row]; 161 kend = kstart + a->ilen[row]; 162 mat_i = c->i[i]; 163 mat_j = c->j + mat_i; 164 mat_a = c->a + mat_i * bs2; 165 mat_ilen = c->ilen + i; 166 for (k = kstart; k < kend; k++) { 167 if ((tcol = ssmap[a->j[k]])) { 168 *mat_j++ = tcol - 1; 169 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 170 mat_a += bs2; 171 (*mat_ilen)++; 172 } 173 } 174 } 175 /* sort */ 176 { 177 MatScalar *work; 178 179 PetscCall(PetscMalloc1(bs2, &work)); 180 for (i = 0; i < nrows; i++) { 181 PetscInt ilen; 182 mat_i = c->i[i]; 183 mat_j = c->j + mat_i; 184 mat_a = c->a + mat_i * bs2; 185 ilen = c->ilen[i]; 186 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 187 } 188 PetscCall(PetscFree(work)); 189 } 190 191 /* Free work space */ 192 PetscCall(ISRestoreIndices(iscol, &icol)); 193 PetscCall(PetscFree(smap)); 194 PetscCall(PetscFree(lens)); 195 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 196 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 197 198 PetscCall(ISRestoreIndices(isrow, &irow)); 199 *B = C; 200 PetscFunctionReturn(0); 201 } 202 203 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { 204 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 205 IS is1, is2; 206 PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs; 207 const PetscInt *irow, *icol; 208 209 PetscFunctionBegin; 210 PetscCall(ISGetIndices(isrow, &irow)); 211 PetscCall(ISGetIndices(iscol, &icol)); 212 PetscCall(ISGetLocalSize(isrow, &nrows)); 213 PetscCall(ISGetLocalSize(iscol, &ncols)); 214 215 /* Verify if the indices corespond to each element in a block 216 and form the IS with compressed IS */ 217 maxmnbs = PetscMax(a->mbs, a->nbs); 218 PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 219 PetscCall(PetscArrayzero(vary, a->mbs)); 220 for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 221 for (i = 0; i < a->mbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); } 222 count = 0; 223 for (i = 0; i < nrows; i++) { 224 PetscInt j = irow[i] / bs; 225 if ((vary[j]--) == bs) iary[count++] = j; 226 } 227 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 228 229 PetscCall(PetscArrayzero(vary, a->nbs)); 230 for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 231 for (i = 0; i < a->nbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); } 232 count = 0; 233 for (i = 0; i < ncols; i++) { 234 PetscInt j = icol[i] / bs; 235 if ((vary[j]--) == bs) iary[count++] = j; 236 } 237 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 238 PetscCall(ISRestoreIndices(isrow, &irow)); 239 PetscCall(ISRestoreIndices(iscol, &icol)); 240 PetscCall(PetscFree2(vary, iary)); 241 242 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B)); 243 PetscCall(ISDestroy(&is1)); 244 PetscCall(ISDestroy(&is2)); 245 246 if (isrow != iscol) { 247 PetscBool isequal; 248 PetscCall(ISEqual(isrow, iscol, &isequal)); 249 if (!isequal) { PetscCall(MatSeqSBAIJZeroOps_Private(*B)); } 250 } 251 PetscFunctionReturn(0); 252 } 253 254 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) { 255 PetscInt i; 256 257 PetscFunctionBegin; 258 if (scall == MAT_INITIAL_MATRIX) { PetscCall(PetscCalloc1(n + 1, B)); } 259 260 for (i = 0; i < n; i++) { PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); } 261 PetscFunctionReturn(0); 262 } 263 264 /* -------------------------------------------------------*/ 265 /* Should check that shapes of vectors and matrices match */ 266 /* -------------------------------------------------------*/ 267 268 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) { 269 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 270 PetscScalar *z, x1, x2, zero = 0.0; 271 const PetscScalar *x, *xb; 272 const MatScalar *v; 273 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 274 const PetscInt *aj = a->j, *ai = a->i, *ib; 275 PetscInt nonzerorow = 0; 276 277 PetscFunctionBegin; 278 PetscCall(VecSet(zz, zero)); 279 if (!a->nz) PetscFunctionReturn(0); 280 PetscCall(VecGetArrayRead(xx, &x)); 281 PetscCall(VecGetArray(zz, &z)); 282 283 v = a->a; 284 xb = x; 285 286 for (i = 0; i < mbs; i++) { 287 n = ai[1] - ai[0]; /* length of i_th block row of A */ 288 x1 = xb[0]; 289 x2 = xb[1]; 290 ib = aj + *ai; 291 jmin = 0; 292 nonzerorow += (n > 0); 293 if (*ib == i) { /* (diag of A)*x */ 294 z[2 * i] += v[0] * x1 + v[2] * x2; 295 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 296 v += 4; 297 jmin++; 298 } 299 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 300 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 301 for (j = jmin; j < n; j++) { 302 /* (strict lower triangular part of A)*x */ 303 cval = ib[j] * 2; 304 z[cval] += v[0] * x1 + v[1] * x2; 305 z[cval + 1] += v[2] * x1 + v[3] * x2; 306 /* (strict upper triangular part of A)*x */ 307 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 308 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 309 v += 4; 310 } 311 xb += 2; 312 ai++; 313 } 314 315 PetscCall(VecRestoreArrayRead(xx, &x)); 316 PetscCall(VecRestoreArray(zz, &z)); 317 PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 318 PetscFunctionReturn(0); 319 } 320 321 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) { 322 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 323 PetscScalar *z, x1, x2, x3, zero = 0.0; 324 const PetscScalar *x, *xb; 325 const MatScalar *v; 326 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 327 const PetscInt *aj = a->j, *ai = a->i, *ib; 328 PetscInt nonzerorow = 0; 329 330 PetscFunctionBegin; 331 PetscCall(VecSet(zz, zero)); 332 if (!a->nz) PetscFunctionReturn(0); 333 PetscCall(VecGetArrayRead(xx, &x)); 334 PetscCall(VecGetArray(zz, &z)); 335 336 v = a->a; 337 xb = x; 338 339 for (i = 0; i < mbs; i++) { 340 n = ai[1] - ai[0]; /* length of i_th block row of A */ 341 x1 = xb[0]; 342 x2 = xb[1]; 343 x3 = xb[2]; 344 ib = aj + *ai; 345 jmin = 0; 346 nonzerorow += (n > 0); 347 if (*ib == i) { /* (diag of A)*x */ 348 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 349 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 350 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 351 v += 9; 352 jmin++; 353 } 354 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 355 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 356 for (j = jmin; j < n; j++) { 357 /* (strict lower triangular part of A)*x */ 358 cval = ib[j] * 3; 359 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 360 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 361 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 362 /* (strict upper triangular part of A)*x */ 363 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 364 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 365 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 366 v += 9; 367 } 368 xb += 3; 369 ai++; 370 } 371 372 PetscCall(VecRestoreArrayRead(xx, &x)); 373 PetscCall(VecRestoreArray(zz, &z)); 374 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 375 PetscFunctionReturn(0); 376 } 377 378 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) { 379 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 380 PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 381 const PetscScalar *x, *xb; 382 const MatScalar *v; 383 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 384 const PetscInt *aj = a->j, *ai = a->i, *ib; 385 PetscInt nonzerorow = 0; 386 387 PetscFunctionBegin; 388 PetscCall(VecSet(zz, zero)); 389 if (!a->nz) PetscFunctionReturn(0); 390 PetscCall(VecGetArrayRead(xx, &x)); 391 PetscCall(VecGetArray(zz, &z)); 392 393 v = a->a; 394 xb = x; 395 396 for (i = 0; i < mbs; i++) { 397 n = ai[1] - ai[0]; /* length of i_th block row of A */ 398 x1 = xb[0]; 399 x2 = xb[1]; 400 x3 = xb[2]; 401 x4 = xb[3]; 402 ib = aj + *ai; 403 jmin = 0; 404 nonzerorow += (n > 0); 405 if (*ib == i) { /* (diag of A)*x */ 406 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 407 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 408 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 409 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 410 v += 16; 411 jmin++; 412 } 413 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 414 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 415 for (j = jmin; j < n; j++) { 416 /* (strict lower triangular part of A)*x */ 417 cval = ib[j] * 4; 418 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 419 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 420 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 421 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 422 /* (strict upper triangular part of A)*x */ 423 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 424 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 425 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 426 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 427 v += 16; 428 } 429 xb += 4; 430 ai++; 431 } 432 433 PetscCall(VecRestoreArrayRead(xx, &x)); 434 PetscCall(VecRestoreArray(zz, &z)); 435 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 436 PetscFunctionReturn(0); 437 } 438 439 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) { 440 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 441 PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 442 const PetscScalar *x, *xb; 443 const MatScalar *v; 444 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 445 const PetscInt *aj = a->j, *ai = a->i, *ib; 446 PetscInt nonzerorow = 0; 447 448 PetscFunctionBegin; 449 PetscCall(VecSet(zz, zero)); 450 if (!a->nz) PetscFunctionReturn(0); 451 PetscCall(VecGetArrayRead(xx, &x)); 452 PetscCall(VecGetArray(zz, &z)); 453 454 v = a->a; 455 xb = x; 456 457 for (i = 0; i < mbs; i++) { 458 n = ai[1] - ai[0]; /* length of i_th block row of A */ 459 x1 = xb[0]; 460 x2 = xb[1]; 461 x3 = xb[2]; 462 x4 = xb[3]; 463 x5 = xb[4]; 464 ib = aj + *ai; 465 jmin = 0; 466 nonzerorow += (n > 0); 467 if (*ib == i) { /* (diag of A)*x */ 468 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 469 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 470 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 471 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 472 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 473 v += 25; 474 jmin++; 475 } 476 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 477 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 478 for (j = jmin; j < n; j++) { 479 /* (strict lower triangular part of A)*x */ 480 cval = ib[j] * 5; 481 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 482 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 483 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 484 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 485 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 486 /* (strict upper triangular part of A)*x */ 487 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 488 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 489 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 490 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 491 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 492 v += 25; 493 } 494 xb += 5; 495 ai++; 496 } 497 498 PetscCall(VecRestoreArrayRead(xx, &x)); 499 PetscCall(VecRestoreArray(zz, &z)); 500 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 501 PetscFunctionReturn(0); 502 } 503 504 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) { 505 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 506 PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 507 const PetscScalar *x, *xb; 508 const MatScalar *v; 509 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 510 const PetscInt *aj = a->j, *ai = a->i, *ib; 511 PetscInt nonzerorow = 0; 512 513 PetscFunctionBegin; 514 PetscCall(VecSet(zz, zero)); 515 if (!a->nz) PetscFunctionReturn(0); 516 PetscCall(VecGetArrayRead(xx, &x)); 517 PetscCall(VecGetArray(zz, &z)); 518 519 v = a->a; 520 xb = x; 521 522 for (i = 0; i < mbs; i++) { 523 n = ai[1] - ai[0]; /* length of i_th block row of A */ 524 x1 = xb[0]; 525 x2 = xb[1]; 526 x3 = xb[2]; 527 x4 = xb[3]; 528 x5 = xb[4]; 529 x6 = xb[5]; 530 ib = aj + *ai; 531 jmin = 0; 532 nonzerorow += (n > 0); 533 if (*ib == i) { /* (diag of A)*x */ 534 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 535 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 536 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 537 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 538 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 539 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 540 v += 36; 541 jmin++; 542 } 543 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 544 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 545 for (j = jmin; j < n; j++) { 546 /* (strict lower triangular part of A)*x */ 547 cval = ib[j] * 6; 548 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 549 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 550 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 551 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 552 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 553 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 554 /* (strict upper triangular part of A)*x */ 555 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 556 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 557 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 558 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 559 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 560 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 561 v += 36; 562 } 563 xb += 6; 564 ai++; 565 } 566 567 PetscCall(VecRestoreArrayRead(xx, &x)); 568 PetscCall(VecRestoreArray(zz, &z)); 569 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 570 PetscFunctionReturn(0); 571 } 572 573 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) { 574 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 575 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 576 const PetscScalar *x, *xb; 577 const MatScalar *v; 578 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 579 const PetscInt *aj = a->j, *ai = a->i, *ib; 580 PetscInt nonzerorow = 0; 581 582 PetscFunctionBegin; 583 PetscCall(VecSet(zz, zero)); 584 if (!a->nz) PetscFunctionReturn(0); 585 PetscCall(VecGetArrayRead(xx, &x)); 586 PetscCall(VecGetArray(zz, &z)); 587 588 v = a->a; 589 xb = x; 590 591 for (i = 0; i < mbs; i++) { 592 n = ai[1] - ai[0]; /* length of i_th block row of A */ 593 x1 = xb[0]; 594 x2 = xb[1]; 595 x3 = xb[2]; 596 x4 = xb[3]; 597 x5 = xb[4]; 598 x6 = xb[5]; 599 x7 = xb[6]; 600 ib = aj + *ai; 601 jmin = 0; 602 nonzerorow += (n > 0); 603 if (*ib == i) { /* (diag of A)*x */ 604 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 605 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 606 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 607 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 608 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 609 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 610 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 611 v += 49; 612 jmin++; 613 } 614 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 615 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 616 for (j = jmin; j < n; j++) { 617 /* (strict lower triangular part of A)*x */ 618 cval = ib[j] * 7; 619 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 620 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 621 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 622 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 623 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 624 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 625 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 626 /* (strict upper triangular part of A)*x */ 627 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 628 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 629 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 630 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 631 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 632 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 633 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 634 v += 49; 635 } 636 xb += 7; 637 ai++; 638 } 639 PetscCall(VecRestoreArrayRead(xx, &x)); 640 PetscCall(VecRestoreArray(zz, &z)); 641 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 642 PetscFunctionReturn(0); 643 } 644 645 /* 646 This will not work with MatScalar == float because it calls the BLAS 647 */ 648 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) { 649 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 650 PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 651 const PetscScalar *x, *x_ptr, *xb; 652 const MatScalar *v; 653 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 654 const PetscInt *idx, *aj, *ii; 655 PetscInt nonzerorow = 0; 656 657 PetscFunctionBegin; 658 PetscCall(VecSet(zz, zero)); 659 if (!a->nz) PetscFunctionReturn(0); 660 PetscCall(VecGetArrayRead(xx, &x)); 661 PetscCall(VecGetArray(zz, &z)); 662 663 x_ptr = x; 664 z_ptr = z; 665 666 aj = a->j; 667 v = a->a; 668 ii = a->i; 669 670 if (!a->mult_work) { PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); } 671 work = a->mult_work; 672 673 for (i = 0; i < mbs; i++) { 674 n = ii[1] - ii[0]; 675 ncols = n * bs; 676 workt = work; 677 idx = aj + ii[0]; 678 nonzerorow += (n > 0); 679 680 /* upper triangular part */ 681 for (j = 0; j < n; j++) { 682 xb = x_ptr + bs * (*idx++); 683 for (k = 0; k < bs; k++) workt[k] = xb[k]; 684 workt += bs; 685 } 686 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 687 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 688 689 /* strict lower triangular part */ 690 idx = aj + ii[0]; 691 if (n && *idx == i) { 692 ncols -= bs; 693 v += bs2; 694 idx++; 695 n--; 696 } 697 698 if (ncols > 0) { 699 workt = work; 700 PetscCall(PetscArrayzero(workt, ncols)); 701 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 702 for (j = 0; j < n; j++) { 703 zb = z_ptr + bs * (*idx++); 704 for (k = 0; k < bs; k++) zb[k] += workt[k]; 705 workt += bs; 706 } 707 } 708 x += bs; 709 v += n * bs2; 710 z += bs; 711 ii++; 712 } 713 714 PetscCall(VecRestoreArrayRead(xx, &x)); 715 PetscCall(VecRestoreArray(zz, &z)); 716 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 717 PetscFunctionReturn(0); 718 } 719 720 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) { 721 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 722 PetscScalar *z, x1; 723 const PetscScalar *x, *xb; 724 const MatScalar *v; 725 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 726 const PetscInt *aj = a->j, *ai = a->i, *ib; 727 PetscInt nonzerorow = 0; 728 #if defined(PETSC_USE_COMPLEX) 729 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 730 #else 731 const int aconj = 0; 732 #endif 733 734 PetscFunctionBegin; 735 PetscCall(VecCopy(yy, zz)); 736 if (!a->nz) PetscFunctionReturn(0); 737 PetscCall(VecGetArrayRead(xx, &x)); 738 PetscCall(VecGetArray(zz, &z)); 739 v = a->a; 740 xb = x; 741 742 for (i = 0; i < mbs; i++) { 743 n = ai[1] - ai[0]; /* length of i_th row of A */ 744 x1 = xb[0]; 745 ib = aj + *ai; 746 jmin = 0; 747 nonzerorow += (n > 0); 748 if (n && *ib == i) { /* (diag of A)*x */ 749 z[i] += *v++ * x[*ib++]; 750 jmin++; 751 } 752 if (aconj) { 753 for (j = jmin; j < n; j++) { 754 cval = *ib; 755 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 756 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 757 } 758 } else { 759 for (j = jmin; j < n; j++) { 760 cval = *ib; 761 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 762 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 763 } 764 } 765 xb++; 766 ai++; 767 } 768 769 PetscCall(VecRestoreArrayRead(xx, &x)); 770 PetscCall(VecRestoreArray(zz, &z)); 771 772 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 773 PetscFunctionReturn(0); 774 } 775 776 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) { 777 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 778 PetscScalar *z, x1, x2; 779 const PetscScalar *x, *xb; 780 const MatScalar *v; 781 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 782 const PetscInt *aj = a->j, *ai = a->i, *ib; 783 PetscInt nonzerorow = 0; 784 785 PetscFunctionBegin; 786 PetscCall(VecCopy(yy, zz)); 787 if (!a->nz) PetscFunctionReturn(0); 788 PetscCall(VecGetArrayRead(xx, &x)); 789 PetscCall(VecGetArray(zz, &z)); 790 791 v = a->a; 792 xb = x; 793 794 for (i = 0; i < mbs; i++) { 795 n = ai[1] - ai[0]; /* length of i_th block row of A */ 796 x1 = xb[0]; 797 x2 = xb[1]; 798 ib = aj + *ai; 799 jmin = 0; 800 nonzerorow += (n > 0); 801 if (n && *ib == i) { /* (diag of A)*x */ 802 z[2 * i] += v[0] * x1 + v[2] * x2; 803 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 804 v += 4; 805 jmin++; 806 } 807 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 808 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 809 for (j = jmin; j < n; j++) { 810 /* (strict lower triangular part of A)*x */ 811 cval = ib[j] * 2; 812 z[cval] += v[0] * x1 + v[1] * x2; 813 z[cval + 1] += v[2] * x1 + v[3] * x2; 814 /* (strict upper triangular part of A)*x */ 815 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 816 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 817 v += 4; 818 } 819 xb += 2; 820 ai++; 821 } 822 PetscCall(VecRestoreArrayRead(xx, &x)); 823 PetscCall(VecRestoreArray(zz, &z)); 824 825 PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 826 PetscFunctionReturn(0); 827 } 828 829 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) { 830 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 831 PetscScalar *z, x1, x2, x3; 832 const PetscScalar *x, *xb; 833 const MatScalar *v; 834 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 835 const PetscInt *aj = a->j, *ai = a->i, *ib; 836 PetscInt nonzerorow = 0; 837 838 PetscFunctionBegin; 839 PetscCall(VecCopy(yy, zz)); 840 if (!a->nz) PetscFunctionReturn(0); 841 PetscCall(VecGetArrayRead(xx, &x)); 842 PetscCall(VecGetArray(zz, &z)); 843 844 v = a->a; 845 xb = x; 846 847 for (i = 0; i < mbs; i++) { 848 n = ai[1] - ai[0]; /* length of i_th block row of A */ 849 x1 = xb[0]; 850 x2 = xb[1]; 851 x3 = xb[2]; 852 ib = aj + *ai; 853 jmin = 0; 854 nonzerorow += (n > 0); 855 if (n && *ib == i) { /* (diag of A)*x */ 856 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 857 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 858 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 859 v += 9; 860 jmin++; 861 } 862 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 863 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 864 for (j = jmin; j < n; j++) { 865 /* (strict lower triangular part of A)*x */ 866 cval = ib[j] * 3; 867 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 868 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 869 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 870 /* (strict upper triangular part of A)*x */ 871 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 872 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 873 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 874 v += 9; 875 } 876 xb += 3; 877 ai++; 878 } 879 880 PetscCall(VecRestoreArrayRead(xx, &x)); 881 PetscCall(VecRestoreArray(zz, &z)); 882 883 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 884 PetscFunctionReturn(0); 885 } 886 887 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) { 888 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 889 PetscScalar *z, x1, x2, x3, x4; 890 const PetscScalar *x, *xb; 891 const MatScalar *v; 892 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 893 const PetscInt *aj = a->j, *ai = a->i, *ib; 894 PetscInt nonzerorow = 0; 895 896 PetscFunctionBegin; 897 PetscCall(VecCopy(yy, zz)); 898 if (!a->nz) PetscFunctionReturn(0); 899 PetscCall(VecGetArrayRead(xx, &x)); 900 PetscCall(VecGetArray(zz, &z)); 901 902 v = a->a; 903 xb = x; 904 905 for (i = 0; i < mbs; i++) { 906 n = ai[1] - ai[0]; /* length of i_th block row of A */ 907 x1 = xb[0]; 908 x2 = xb[1]; 909 x3 = xb[2]; 910 x4 = xb[3]; 911 ib = aj + *ai; 912 jmin = 0; 913 nonzerorow += (n > 0); 914 if (n && *ib == i) { /* (diag of A)*x */ 915 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 916 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 917 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 918 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 919 v += 16; 920 jmin++; 921 } 922 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 923 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 924 for (j = jmin; j < n; j++) { 925 /* (strict lower triangular part of A)*x */ 926 cval = ib[j] * 4; 927 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 928 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 929 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 930 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 931 /* (strict upper triangular part of A)*x */ 932 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 933 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 934 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 935 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 936 v += 16; 937 } 938 xb += 4; 939 ai++; 940 } 941 942 PetscCall(VecRestoreArrayRead(xx, &x)); 943 PetscCall(VecRestoreArray(zz, &z)); 944 945 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 946 PetscFunctionReturn(0); 947 } 948 949 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) { 950 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 951 PetscScalar *z, x1, x2, x3, x4, x5; 952 const PetscScalar *x, *xb; 953 const MatScalar *v; 954 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 955 const PetscInt *aj = a->j, *ai = a->i, *ib; 956 PetscInt nonzerorow = 0; 957 958 PetscFunctionBegin; 959 PetscCall(VecCopy(yy, zz)); 960 if (!a->nz) PetscFunctionReturn(0); 961 PetscCall(VecGetArrayRead(xx, &x)); 962 PetscCall(VecGetArray(zz, &z)); 963 964 v = a->a; 965 xb = x; 966 967 for (i = 0; i < mbs; i++) { 968 n = ai[1] - ai[0]; /* length of i_th block row of A */ 969 x1 = xb[0]; 970 x2 = xb[1]; 971 x3 = xb[2]; 972 x4 = xb[3]; 973 x5 = xb[4]; 974 ib = aj + *ai; 975 jmin = 0; 976 nonzerorow += (n > 0); 977 if (n && *ib == i) { /* (diag of A)*x */ 978 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 979 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 980 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 981 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 982 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 983 v += 25; 984 jmin++; 985 } 986 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 987 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 988 for (j = jmin; j < n; j++) { 989 /* (strict lower triangular part of A)*x */ 990 cval = ib[j] * 5; 991 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 992 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 993 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 994 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 995 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 996 /* (strict upper triangular part of A)*x */ 997 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 998 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 999 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 1000 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 1001 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 1002 v += 25; 1003 } 1004 xb += 5; 1005 ai++; 1006 } 1007 1008 PetscCall(VecRestoreArrayRead(xx, &x)); 1009 PetscCall(VecRestoreArray(zz, &z)); 1010 1011 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 1012 PetscFunctionReturn(0); 1013 } 1014 1015 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) { 1016 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1017 PetscScalar *z, x1, x2, x3, x4, x5, x6; 1018 const PetscScalar *x, *xb; 1019 const MatScalar *v; 1020 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1021 const PetscInt *aj = a->j, *ai = a->i, *ib; 1022 PetscInt nonzerorow = 0; 1023 1024 PetscFunctionBegin; 1025 PetscCall(VecCopy(yy, zz)); 1026 if (!a->nz) PetscFunctionReturn(0); 1027 PetscCall(VecGetArrayRead(xx, &x)); 1028 PetscCall(VecGetArray(zz, &z)); 1029 1030 v = a->a; 1031 xb = x; 1032 1033 for (i = 0; i < mbs; i++) { 1034 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1035 x1 = xb[0]; 1036 x2 = xb[1]; 1037 x3 = xb[2]; 1038 x4 = xb[3]; 1039 x5 = xb[4]; 1040 x6 = xb[5]; 1041 ib = aj + *ai; 1042 jmin = 0; 1043 nonzerorow += (n > 0); 1044 if (n && *ib == i) { /* (diag of A)*x */ 1045 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 1046 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 1047 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 1048 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 1049 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 1050 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1051 v += 36; 1052 jmin++; 1053 } 1054 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1055 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1056 for (j = jmin; j < n; j++) { 1057 /* (strict lower triangular part of A)*x */ 1058 cval = ib[j] * 6; 1059 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 1060 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 1061 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 1062 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 1063 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 1064 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1065 /* (strict upper triangular part of A)*x */ 1066 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 1067 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 1068 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 1069 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 1070 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 1071 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 1072 v += 36; 1073 } 1074 xb += 6; 1075 ai++; 1076 } 1077 1078 PetscCall(VecRestoreArrayRead(xx, &x)); 1079 PetscCall(VecRestoreArray(zz, &z)); 1080 1081 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 1082 PetscFunctionReturn(0); 1083 } 1084 1085 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) { 1086 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1087 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1088 const PetscScalar *x, *xb; 1089 const MatScalar *v; 1090 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1091 const PetscInt *aj = a->j, *ai = a->i, *ib; 1092 PetscInt nonzerorow = 0; 1093 1094 PetscFunctionBegin; 1095 PetscCall(VecCopy(yy, zz)); 1096 if (!a->nz) PetscFunctionReturn(0); 1097 PetscCall(VecGetArrayRead(xx, &x)); 1098 PetscCall(VecGetArray(zz, &z)); 1099 1100 v = a->a; 1101 xb = x; 1102 1103 for (i = 0; i < mbs; i++) { 1104 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1105 x1 = xb[0]; 1106 x2 = xb[1]; 1107 x3 = xb[2]; 1108 x4 = xb[3]; 1109 x5 = xb[4]; 1110 x6 = xb[5]; 1111 x7 = xb[6]; 1112 ib = aj + *ai; 1113 jmin = 0; 1114 nonzerorow += (n > 0); 1115 if (n && *ib == i) { /* (diag of A)*x */ 1116 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 1117 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 1118 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 1119 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 1120 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 1121 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 1122 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1123 v += 49; 1124 jmin++; 1125 } 1126 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1127 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1128 for (j = jmin; j < n; j++) { 1129 /* (strict lower triangular part of A)*x */ 1130 cval = ib[j] * 7; 1131 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 1132 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 1133 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 1134 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 1135 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 1136 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 1137 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1138 /* (strict upper triangular part of A)*x */ 1139 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 1140 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 1141 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 1142 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 1143 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 1144 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 1145 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 1146 v += 49; 1147 } 1148 xb += 7; 1149 ai++; 1150 } 1151 1152 PetscCall(VecRestoreArrayRead(xx, &x)); 1153 PetscCall(VecRestoreArray(zz, &z)); 1154 1155 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 1156 PetscFunctionReturn(0); 1157 } 1158 1159 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) { 1160 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1161 PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1162 const PetscScalar *x, *x_ptr, *xb; 1163 const MatScalar *v; 1164 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1165 const PetscInt *idx, *aj, *ii; 1166 PetscInt nonzerorow = 0; 1167 1168 PetscFunctionBegin; 1169 PetscCall(VecCopy(yy, zz)); 1170 if (!a->nz) PetscFunctionReturn(0); 1171 PetscCall(VecGetArrayRead(xx, &x)); 1172 x_ptr = x; 1173 PetscCall(VecGetArray(zz, &z)); 1174 z_ptr = z; 1175 1176 aj = a->j; 1177 v = a->a; 1178 ii = a->i; 1179 1180 if (!a->mult_work) { PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); } 1181 work = a->mult_work; 1182 1183 for (i = 0; i < mbs; i++) { 1184 n = ii[1] - ii[0]; 1185 ncols = n * bs; 1186 workt = work; 1187 idx = aj + ii[0]; 1188 nonzerorow += (n > 0); 1189 1190 /* upper triangular part */ 1191 for (j = 0; j < n; j++) { 1192 xb = x_ptr + bs * (*idx++); 1193 for (k = 0; k < bs; k++) workt[k] = xb[k]; 1194 workt += bs; 1195 } 1196 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1197 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 1198 1199 /* strict lower triangular part */ 1200 idx = aj + ii[0]; 1201 if (n && *idx == i) { 1202 ncols -= bs; 1203 v += bs2; 1204 idx++; 1205 n--; 1206 } 1207 if (ncols > 0) { 1208 workt = work; 1209 PetscCall(PetscArrayzero(workt, ncols)); 1210 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1211 for (j = 0; j < n; j++) { 1212 zb = z_ptr + bs * (*idx++); 1213 for (k = 0; k < bs; k++) zb[k] += workt[k]; 1214 workt += bs; 1215 } 1216 } 1217 1218 x += bs; 1219 v += n * bs2; 1220 z += bs; 1221 ii++; 1222 } 1223 1224 PetscCall(VecRestoreArrayRead(xx, &x)); 1225 PetscCall(VecRestoreArray(zz, &z)); 1226 1227 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 1228 PetscFunctionReturn(0); 1229 } 1230 1231 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) { 1232 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1233 PetscScalar oalpha = alpha; 1234 PetscBLASInt one = 1, totalnz; 1235 1236 PetscFunctionBegin; 1237 PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1238 PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 1239 PetscCall(PetscLogFlops(totalnz)); 1240 PetscFunctionReturn(0); 1241 } 1242 1243 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) { 1244 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1245 const MatScalar *v = a->a; 1246 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1247 PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1248 const PetscInt *aj = a->j, *col; 1249 1250 PetscFunctionBegin; 1251 if (!a->nz) { 1252 *norm = 0.0; 1253 PetscFunctionReturn(0); 1254 } 1255 if (type == NORM_FROBENIUS) { 1256 for (k = 0; k < mbs; k++) { 1257 jmin = a->i[k]; 1258 jmax = a->i[k + 1]; 1259 col = aj + jmin; 1260 if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 1261 for (i = 0; i < bs2; i++) { 1262 sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 1263 v++; 1264 } 1265 jmin++; 1266 } 1267 for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 1268 for (i = 0; i < bs2; i++) { 1269 sum_off += PetscRealPart(PetscConj(*v) * (*v)); 1270 v++; 1271 } 1272 } 1273 } 1274 *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 1275 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 1276 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1277 PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 1278 for (i = 0; i < mbs; i++) jl[i] = mbs; 1279 il[0] = 0; 1280 1281 *norm = 0.0; 1282 for (k = 0; k < mbs; k++) { /* k_th block row */ 1283 for (j = 0; j < bs; j++) sum[j] = 0.0; 1284 /*-- col sum --*/ 1285 i = jl[k]; /* first |A(i,k)| to be added */ 1286 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1287 at step k */ 1288 while (i < mbs) { 1289 nexti = jl[i]; /* next block row to be added */ 1290 ik = il[i]; /* block index of A(i,k) in the array a */ 1291 for (j = 0; j < bs; j++) { 1292 v = a->a + ik * bs2 + j * bs; 1293 for (k1 = 0; k1 < bs; k1++) { 1294 sum[j] += PetscAbsScalar(*v); 1295 v++; 1296 } 1297 } 1298 /* update il, jl */ 1299 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1300 jmax = a->i[i + 1]; 1301 if (jmin < jmax) { 1302 il[i] = jmin; 1303 j = a->j[jmin]; 1304 jl[i] = jl[j]; 1305 jl[j] = i; 1306 } 1307 i = nexti; 1308 } 1309 /*-- row sum --*/ 1310 jmin = a->i[k]; 1311 jmax = a->i[k + 1]; 1312 for (i = jmin; i < jmax; i++) { 1313 for (j = 0; j < bs; j++) { 1314 v = a->a + i * bs2 + j; 1315 for (k1 = 0; k1 < bs; k1++) { 1316 sum[j] += PetscAbsScalar(*v); 1317 v += bs; 1318 } 1319 } 1320 } 1321 /* add k_th block row to il, jl */ 1322 col = aj + jmin; 1323 if (jmax - jmin > 0 && *col == k) jmin++; 1324 if (jmin < jmax) { 1325 il[k] = jmin; 1326 j = a->j[jmin]; 1327 jl[k] = jl[j]; 1328 jl[j] = k; 1329 } 1330 for (j = 0; j < bs; j++) { 1331 if (sum[j] > *norm) *norm = sum[j]; 1332 } 1333 } 1334 PetscCall(PetscFree3(sum, il, jl)); 1335 PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1336 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 1337 PetscFunctionReturn(0); 1338 } 1339 1340 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) { 1341 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 1342 1343 PetscFunctionBegin; 1344 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1345 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1346 *flg = PETSC_FALSE; 1347 PetscFunctionReturn(0); 1348 } 1349 1350 /* if the a->i are the same */ 1351 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 1352 if (!*flg) PetscFunctionReturn(0); 1353 1354 /* if a->j are the same */ 1355 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 1356 if (!*flg) PetscFunctionReturn(0); 1357 1358 /* if a->a are the same */ 1359 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 1360 PetscFunctionReturn(0); 1361 } 1362 1363 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) { 1364 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1365 PetscInt i, j, k, row, bs, ambs, bs2; 1366 const PetscInt *ai, *aj; 1367 PetscScalar *x, zero = 0.0; 1368 const MatScalar *aa, *aa_j; 1369 1370 PetscFunctionBegin; 1371 bs = A->rmap->bs; 1372 PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 1373 1374 aa = a->a; 1375 ambs = a->mbs; 1376 1377 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1378 PetscInt *diag = a->diag; 1379 aa = a->a; 1380 ambs = a->mbs; 1381 PetscCall(VecGetArray(v, &x)); 1382 for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 1383 PetscCall(VecRestoreArray(v, &x)); 1384 PetscFunctionReturn(0); 1385 } 1386 1387 ai = a->i; 1388 aj = a->j; 1389 bs2 = a->bs2; 1390 PetscCall(VecSet(v, zero)); 1391 if (!a->nz) PetscFunctionReturn(0); 1392 PetscCall(VecGetArray(v, &x)); 1393 for (i = 0; i < ambs; i++) { 1394 j = ai[i]; 1395 if (aj[j] == i) { /* if this is a diagonal element */ 1396 row = i * bs; 1397 aa_j = aa + j * bs2; 1398 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 1399 } 1400 } 1401 PetscCall(VecRestoreArray(v, &x)); 1402 PetscFunctionReturn(0); 1403 } 1404 1405 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) { 1406 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1407 PetscScalar x; 1408 const PetscScalar *l, *li, *ri; 1409 MatScalar *aa, *v; 1410 PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1411 const PetscInt *ai, *aj; 1412 PetscBool flg; 1413 1414 PetscFunctionBegin; 1415 if (ll != rr) { 1416 PetscCall(VecEqual(ll, rr, &flg)); 1417 PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1418 } 1419 if (!ll) PetscFunctionReturn(0); 1420 ai = a->i; 1421 aj = a->j; 1422 aa = a->a; 1423 m = A->rmap->N; 1424 bs = A->rmap->bs; 1425 mbs = a->mbs; 1426 bs2 = a->bs2; 1427 1428 PetscCall(VecGetArrayRead(ll, &l)); 1429 PetscCall(VecGetLocalSize(ll, &lm)); 1430 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1431 for (i = 0; i < mbs; i++) { /* for each block row */ 1432 M = ai[i + 1] - ai[i]; 1433 li = l + i * bs; 1434 v = aa + bs2 * ai[i]; 1435 for (j = 0; j < M; j++) { /* for each block */ 1436 ri = l + bs * aj[ai[i] + j]; 1437 for (k = 0; k < bs; k++) { 1438 x = ri[k]; 1439 for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 1440 } 1441 } 1442 } 1443 PetscCall(VecRestoreArrayRead(ll, &l)); 1444 PetscCall(PetscLogFlops(2.0 * a->nz)); 1445 PetscFunctionReturn(0); 1446 } 1447 1448 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) { 1449 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1450 1451 PetscFunctionBegin; 1452 info->block_size = a->bs2; 1453 info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 1454 info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 1455 info->nz_unneeded = info->nz_allocated - info->nz_used; 1456 info->assemblies = A->num_ass; 1457 info->mallocs = A->info.mallocs; 1458 info->memory = ((PetscObject)A)->mem; 1459 if (A->factortype) { 1460 info->fill_ratio_given = A->info.fill_ratio_given; 1461 info->fill_ratio_needed = A->info.fill_ratio_needed; 1462 info->factor_mallocs = A->info.factor_mallocs; 1463 } else { 1464 info->fill_ratio_given = 0; 1465 info->fill_ratio_needed = 0; 1466 info->factor_mallocs = 0; 1467 } 1468 PetscFunctionReturn(0); 1469 } 1470 1471 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) { 1472 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1473 1474 PetscFunctionBegin; 1475 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 1476 PetscFunctionReturn(0); 1477 } 1478 1479 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) { 1480 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1481 PetscInt i, j, n, row, col, bs, mbs; 1482 const PetscInt *ai, *aj; 1483 PetscReal atmp; 1484 const MatScalar *aa; 1485 PetscScalar *x; 1486 PetscInt ncols, brow, bcol, krow, kcol; 1487 1488 PetscFunctionBegin; 1489 PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 1490 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1491 bs = A->rmap->bs; 1492 aa = a->a; 1493 ai = a->i; 1494 aj = a->j; 1495 mbs = a->mbs; 1496 1497 PetscCall(VecSet(v, 0.0)); 1498 PetscCall(VecGetArray(v, &x)); 1499 PetscCall(VecGetLocalSize(v, &n)); 1500 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1501 for (i = 0; i < mbs; i++) { 1502 ncols = ai[1] - ai[0]; 1503 ai++; 1504 brow = bs * i; 1505 for (j = 0; j < ncols; j++) { 1506 bcol = bs * (*aj); 1507 for (kcol = 0; kcol < bs; kcol++) { 1508 col = bcol + kcol; /* col index */ 1509 for (krow = 0; krow < bs; krow++) { 1510 atmp = PetscAbsScalar(*aa); 1511 aa++; 1512 row = brow + krow; /* row index */ 1513 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1514 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1515 } 1516 } 1517 aj++; 1518 } 1519 } 1520 PetscCall(VecRestoreArray(v, &x)); 1521 PetscFunctionReturn(0); 1522 } 1523 1524 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) { 1525 PetscFunctionBegin; 1526 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1527 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1528 PetscFunctionReturn(0); 1529 } 1530 1531 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 1532 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1533 PetscScalar *z = c; 1534 const PetscScalar *xb; 1535 PetscScalar x1; 1536 const MatScalar *v = a->a, *vv; 1537 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1538 #if defined(PETSC_USE_COMPLEX) 1539 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 1540 #else 1541 const int aconj = 0; 1542 #endif 1543 1544 PetscFunctionBegin; 1545 for (i = 0; i < mbs; i++) { 1546 n = ii[1] - ii[0]; 1547 ii++; 1548 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1549 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1550 jj = idx; 1551 vv = v; 1552 for (k = 0; k < cn; k++) { 1553 idx = jj; 1554 v = vv; 1555 for (j = 0; j < n; j++) { 1556 xb = b + (*idx); 1557 x1 = xb[0 + k * bm]; 1558 z[0 + k * cm] += v[0] * x1; 1559 if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1560 v += 1; 1561 ++idx; 1562 } 1563 } 1564 z += 1; 1565 } 1566 PetscFunctionReturn(0); 1567 } 1568 1569 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 1570 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1571 PetscScalar *z = c; 1572 const PetscScalar *xb; 1573 PetscScalar x1, x2; 1574 const MatScalar *v = a->a, *vv; 1575 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1576 1577 PetscFunctionBegin; 1578 for (i = 0; i < mbs; i++) { 1579 n = ii[1] - ii[0]; 1580 ii++; 1581 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1582 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1583 jj = idx; 1584 vv = v; 1585 for (k = 0; k < cn; k++) { 1586 idx = jj; 1587 v = vv; 1588 for (j = 0; j < n; j++) { 1589 xb = b + 2 * (*idx); 1590 x1 = xb[0 + k * bm]; 1591 x2 = xb[1 + k * bm]; 1592 z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1593 z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1594 if (*idx != i) { 1595 c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1596 c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1597 } 1598 v += 4; 1599 ++idx; 1600 } 1601 } 1602 z += 2; 1603 } 1604 PetscFunctionReturn(0); 1605 } 1606 1607 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 1608 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1609 PetscScalar *z = c; 1610 const PetscScalar *xb; 1611 PetscScalar x1, x2, x3; 1612 const MatScalar *v = a->a, *vv; 1613 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1614 1615 PetscFunctionBegin; 1616 for (i = 0; i < mbs; i++) { 1617 n = ii[1] - ii[0]; 1618 ii++; 1619 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1620 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1621 jj = idx; 1622 vv = v; 1623 for (k = 0; k < cn; k++) { 1624 idx = jj; 1625 v = vv; 1626 for (j = 0; j < n; j++) { 1627 xb = b + 3 * (*idx); 1628 x1 = xb[0 + k * bm]; 1629 x2 = xb[1 + k * bm]; 1630 x3 = xb[2 + k * bm]; 1631 z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1632 z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1633 z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1634 if (*idx != i) { 1635 c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1636 c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1637 c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1638 } 1639 v += 9; 1640 ++idx; 1641 } 1642 } 1643 z += 3; 1644 } 1645 PetscFunctionReturn(0); 1646 } 1647 1648 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 1649 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1650 PetscScalar *z = c; 1651 const PetscScalar *xb; 1652 PetscScalar x1, x2, x3, x4; 1653 const MatScalar *v = a->a, *vv; 1654 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1655 1656 PetscFunctionBegin; 1657 for (i = 0; i < mbs; i++) { 1658 n = ii[1] - ii[0]; 1659 ii++; 1660 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1661 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1662 jj = idx; 1663 vv = v; 1664 for (k = 0; k < cn; k++) { 1665 idx = jj; 1666 v = vv; 1667 for (j = 0; j < n; j++) { 1668 xb = b + 4 * (*idx); 1669 x1 = xb[0 + k * bm]; 1670 x2 = xb[1 + k * bm]; 1671 x3 = xb[2 + k * bm]; 1672 x4 = xb[3 + k * bm]; 1673 z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1674 z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1675 z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1676 z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1677 if (*idx != i) { 1678 c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1679 c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1680 c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1681 c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1682 } 1683 v += 16; 1684 ++idx; 1685 } 1686 } 1687 z += 4; 1688 } 1689 PetscFunctionReturn(0); 1690 } 1691 1692 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { 1693 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1694 PetscScalar *z = c; 1695 const PetscScalar *xb; 1696 PetscScalar x1, x2, x3, x4, x5; 1697 const MatScalar *v = a->a, *vv; 1698 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1699 1700 PetscFunctionBegin; 1701 for (i = 0; i < mbs; i++) { 1702 n = ii[1] - ii[0]; 1703 ii++; 1704 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1705 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1706 jj = idx; 1707 vv = v; 1708 for (k = 0; k < cn; k++) { 1709 idx = jj; 1710 v = vv; 1711 for (j = 0; j < n; j++) { 1712 xb = b + 5 * (*idx); 1713 x1 = xb[0 + k * bm]; 1714 x2 = xb[1 + k * bm]; 1715 x3 = xb[2 + k * bm]; 1716 x4 = xb[3 + k * bm]; 1717 x5 = xb[4 + k * cm]; 1718 z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1719 z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1720 z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1721 z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1722 z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1723 if (*idx != i) { 1724 c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1725 c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1726 c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1727 c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1728 c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1729 } 1730 v += 25; 1731 ++idx; 1732 } 1733 } 1734 z += 5; 1735 } 1736 PetscFunctionReturn(0); 1737 } 1738 1739 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) { 1740 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1741 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1742 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1743 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1744 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1745 PetscBLASInt bbs, bcn, bbm, bcm; 1746 PetscScalar *z = NULL; 1747 PetscScalar *c, *b; 1748 const MatScalar *v; 1749 const PetscInt *idx, *ii; 1750 PetscScalar _DOne = 1.0; 1751 1752 PetscFunctionBegin; 1753 if (!cm || !cn) PetscFunctionReturn(0); 1754 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1755 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1756 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1757 b = bd->v; 1758 PetscCall(MatZeroEntries(C)); 1759 PetscCall(MatDenseGetArray(C, &c)); 1760 switch (bs) { 1761 case 1: PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); break; 1762 case 2: PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); break; 1763 case 3: PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); break; 1764 case 4: PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); break; 1765 case 5: PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); break; 1766 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1767 PetscCall(PetscBLASIntCast(bs, &bbs)); 1768 PetscCall(PetscBLASIntCast(cn, &bcn)); 1769 PetscCall(PetscBLASIntCast(bm, &bbm)); 1770 PetscCall(PetscBLASIntCast(cm, &bcm)); 1771 idx = a->j; 1772 v = a->a; 1773 mbs = a->mbs; 1774 ii = a->i; 1775 z = c; 1776 for (i = 0; i < mbs; i++) { 1777 n = ii[1] - ii[0]; 1778 ii++; 1779 for (j = 0; j < n; j++) { 1780 if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1781 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1782 v += bs2; 1783 } 1784 z += bs; 1785 } 1786 } 1787 PetscCall(MatDenseRestoreArray(C, &c)); 1788 PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1789 PetscFunctionReturn(0); 1790 } 1791