1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <../src/mat/impls/sbaij/seq/sbaij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 #include <petscbt.h> 7 #include <petscblaslapack.h> 8 9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 10 { 11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 12 PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs, *nidx2; 13 const PetscInt *idx; 14 PetscBT table_out, table_in; 15 16 PetscFunctionBegin; 17 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 18 mbs = a->mbs; 19 ai = a->i; 20 aj = a->j; 21 bs = A->rmap->bs; 22 PetscCall(PetscBTCreate(mbs, &table_out)); 23 PetscCall(PetscMalloc1(mbs + 1, &nidx)); 24 PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2)); 25 PetscCall(PetscBTCreate(mbs, &table_in)); 26 27 for (i = 0; i < is_max; i++) { /* for each is */ 28 isz = 0; 29 PetscCall(PetscBTMemzero(mbs, table_out)); 30 31 /* Extract the indices, assume there can be duplicate entries */ 32 PetscCall(ISGetIndices(is[i], &idx)); 33 PetscCall(ISGetLocalSize(is[i], &n)); 34 35 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 36 bcol_max = 0; 37 for (j = 0; j < n; ++j) { 38 brow = idx[j] / bs; /* convert the indices into block indices */ 39 PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 40 if (!PetscBTLookupSet(table_out, brow)) { 41 nidx[isz++] = brow; 42 if (bcol_max < brow) bcol_max = brow; 43 } 44 } 45 PetscCall(ISRestoreIndices(is[i], &idx)); 46 PetscCall(ISDestroy(&is[i])); 47 48 k = 0; 49 for (j = 0; j < ov; j++) { /* for each overlap */ 50 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 51 PetscCall(PetscBTMemzero(mbs, table_in)); 52 for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 53 54 n = isz; /* length of the updated is[i] */ 55 for (brow = 0; brow < mbs; brow++) { 56 start = ai[brow]; 57 end = ai[brow + 1]; 58 if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 59 for (l = start; l < end; l++) { 60 bcol = aj[l]; 61 if (!PetscBTLookupSet(table_out, bcol)) { 62 nidx[isz++] = bcol; 63 if (bcol_max < bcol) bcol_max = bcol; 64 } 65 } 66 k++; 67 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 68 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 69 for (l = start; l < end; l++) { 70 bcol = aj[l]; 71 if (bcol > bcol_max) break; 72 if (PetscBTLookup(table_in, bcol)) { 73 if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 74 break; /* for l = start; l<end ; l++) */ 75 } 76 } 77 } 78 } 79 } /* for each overlap */ 80 81 /* expand the Index Set */ 82 for (j = 0; j < isz; j++) { 83 for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k; 84 } 85 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i)); 86 } 87 PetscCall(PetscBTDestroy(&table_out)); 88 PetscCall(PetscFree(nidx)); 89 PetscCall(PetscFree(nidx2)); 90 PetscCall(PetscBTDestroy(&table_in)); 91 PetscFunctionReturn(0); 92 } 93 94 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 95 Zero some ops' to avoid invalid usse */ 96 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 97 { 98 PetscFunctionBegin; 99 PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 100 Bseq->ops->mult = NULL; 101 Bseq->ops->multadd = NULL; 102 Bseq->ops->multtranspose = NULL; 103 Bseq->ops->multtransposeadd = NULL; 104 Bseq->ops->lufactor = NULL; 105 Bseq->ops->choleskyfactor = NULL; 106 Bseq->ops->lufactorsymbolic = NULL; 107 Bseq->ops->choleskyfactorsymbolic = NULL; 108 Bseq->ops->getinertia = NULL; 109 PetscFunctionReturn(0); 110 } 111 112 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 113 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 114 { 115 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c; 116 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 117 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 118 const PetscInt *irow, *icol; 119 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 120 PetscInt *aj = a->j, *ai = a->i; 121 MatScalar *mat_a; 122 Mat C; 123 PetscBool flag; 124 125 PetscFunctionBegin; 126 127 PetscCall(ISGetIndices(isrow, &irow)); 128 PetscCall(ISGetIndices(iscol, &icol)); 129 PetscCall(ISGetLocalSize(isrow, &nrows)); 130 PetscCall(ISGetLocalSize(iscol, &ncols)); 131 132 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 133 ssmap = smap; 134 PetscCall(PetscMalloc1(1 + nrows, &lens)); 135 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 136 /* determine lens of each row */ 137 for (i = 0; i < nrows; i++) { 138 kstart = ai[irow[i]]; 139 kend = kstart + a->ilen[irow[i]]; 140 lens[i] = 0; 141 for (k = kstart; k < kend; k++) { 142 if (ssmap[aj[k]]) lens[i]++; 143 } 144 } 145 /* Create and fill new matrix */ 146 if (scall == MAT_REUSE_MATRIX) { 147 c = (Mat_SeqSBAIJ *)((*B)->data); 148 149 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 150 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 151 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 152 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 153 C = *B; 154 } else { 155 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 156 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 157 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 158 PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 159 } 160 c = (Mat_SeqSBAIJ *)(C->data); 161 for (i = 0; i < nrows; i++) { 162 row = irow[i]; 163 kstart = ai[row]; 164 kend = kstart + a->ilen[row]; 165 mat_i = c->i[i]; 166 mat_j = c->j + mat_i; 167 mat_a = c->a + mat_i * bs2; 168 mat_ilen = c->ilen + i; 169 for (k = kstart; k < kend; k++) { 170 if ((tcol = ssmap[a->j[k]])) { 171 *mat_j++ = tcol - 1; 172 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 173 mat_a += bs2; 174 (*mat_ilen)++; 175 } 176 } 177 } 178 /* sort */ 179 { 180 MatScalar *work; 181 182 PetscCall(PetscMalloc1(bs2, &work)); 183 for (i = 0; i < nrows; i++) { 184 PetscInt ilen; 185 mat_i = c->i[i]; 186 mat_j = c->j + mat_i; 187 mat_a = c->a + mat_i * bs2; 188 ilen = c->ilen[i]; 189 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 190 } 191 PetscCall(PetscFree(work)); 192 } 193 194 /* Free work space */ 195 PetscCall(ISRestoreIndices(iscol, &icol)); 196 PetscCall(PetscFree(smap)); 197 PetscCall(PetscFree(lens)); 198 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 199 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 200 201 PetscCall(ISRestoreIndices(isrow, &irow)); 202 *B = C; 203 PetscFunctionReturn(0); 204 } 205 206 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 207 { 208 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 209 IS is1, is2; 210 PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs; 211 const PetscInt *irow, *icol; 212 213 PetscFunctionBegin; 214 PetscCall(ISGetIndices(isrow, &irow)); 215 PetscCall(ISGetIndices(iscol, &icol)); 216 PetscCall(ISGetLocalSize(isrow, &nrows)); 217 PetscCall(ISGetLocalSize(iscol, &ncols)); 218 219 /* Verify if the indices corespond to each element in a block 220 and form the IS with compressed IS */ 221 maxmnbs = PetscMax(a->mbs, a->nbs); 222 PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 223 PetscCall(PetscArrayzero(vary, a->mbs)); 224 for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 225 for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 226 count = 0; 227 for (i = 0; i < nrows; i++) { 228 PetscInt j = irow[i] / bs; 229 if ((vary[j]--) == bs) iary[count++] = j; 230 } 231 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 232 233 PetscCall(PetscArrayzero(vary, a->nbs)); 234 for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 235 for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 236 count = 0; 237 for (i = 0; i < ncols; i++) { 238 PetscInt j = icol[i] / bs; 239 if ((vary[j]--) == bs) iary[count++] = j; 240 } 241 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 242 PetscCall(ISRestoreIndices(isrow, &irow)); 243 PetscCall(ISRestoreIndices(iscol, &icol)); 244 PetscCall(PetscFree2(vary, iary)); 245 246 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B)); 247 PetscCall(ISDestroy(&is1)); 248 PetscCall(ISDestroy(&is2)); 249 250 if (isrow != iscol) { 251 PetscBool isequal; 252 PetscCall(ISEqual(isrow, iscol, &isequal)); 253 if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 254 } 255 PetscFunctionReturn(0); 256 } 257 258 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 259 { 260 PetscInt i; 261 262 PetscFunctionBegin; 263 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 264 265 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 266 PetscFunctionReturn(0); 267 } 268 269 /* -------------------------------------------------------*/ 270 /* Should check that shapes of vectors and matrices match */ 271 /* -------------------------------------------------------*/ 272 273 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 274 { 275 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 276 PetscScalar *z, x1, x2, zero = 0.0; 277 const PetscScalar *x, *xb; 278 const MatScalar *v; 279 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 280 const PetscInt *aj = a->j, *ai = a->i, *ib; 281 PetscInt nonzerorow = 0; 282 283 PetscFunctionBegin; 284 PetscCall(VecSet(zz, zero)); 285 if (!a->nz) PetscFunctionReturn(0); 286 PetscCall(VecGetArrayRead(xx, &x)); 287 PetscCall(VecGetArray(zz, &z)); 288 289 v = a->a; 290 xb = x; 291 292 for (i = 0; i < mbs; i++) { 293 n = ai[1] - ai[0]; /* length of i_th block row of A */ 294 x1 = xb[0]; 295 x2 = xb[1]; 296 ib = aj + *ai; 297 jmin = 0; 298 nonzerorow += (n > 0); 299 if (*ib == i) { /* (diag of A)*x */ 300 z[2 * i] += v[0] * x1 + v[2] * x2; 301 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 302 v += 4; 303 jmin++; 304 } 305 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 306 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 307 for (j = jmin; j < n; j++) { 308 /* (strict lower triangular part of A)*x */ 309 cval = ib[j] * 2; 310 z[cval] += v[0] * x1 + v[1] * x2; 311 z[cval + 1] += v[2] * x1 + v[3] * x2; 312 /* (strict upper triangular part of A)*x */ 313 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 314 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 315 v += 4; 316 } 317 xb += 2; 318 ai++; 319 } 320 321 PetscCall(VecRestoreArrayRead(xx, &x)); 322 PetscCall(VecRestoreArray(zz, &z)); 323 PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 324 PetscFunctionReturn(0); 325 } 326 327 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 328 { 329 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 330 PetscScalar *z, x1, x2, x3, zero = 0.0; 331 const PetscScalar *x, *xb; 332 const MatScalar *v; 333 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 334 const PetscInt *aj = a->j, *ai = a->i, *ib; 335 PetscInt nonzerorow = 0; 336 337 PetscFunctionBegin; 338 PetscCall(VecSet(zz, zero)); 339 if (!a->nz) PetscFunctionReturn(0); 340 PetscCall(VecGetArrayRead(xx, &x)); 341 PetscCall(VecGetArray(zz, &z)); 342 343 v = a->a; 344 xb = x; 345 346 for (i = 0; i < mbs; i++) { 347 n = ai[1] - ai[0]; /* length of i_th block row of A */ 348 x1 = xb[0]; 349 x2 = xb[1]; 350 x3 = xb[2]; 351 ib = aj + *ai; 352 jmin = 0; 353 nonzerorow += (n > 0); 354 if (*ib == i) { /* (diag of A)*x */ 355 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 356 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 357 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 358 v += 9; 359 jmin++; 360 } 361 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 362 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 363 for (j = jmin; j < n; j++) { 364 /* (strict lower triangular part of A)*x */ 365 cval = ib[j] * 3; 366 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 367 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 368 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 369 /* (strict upper triangular part of A)*x */ 370 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 371 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 372 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 373 v += 9; 374 } 375 xb += 3; 376 ai++; 377 } 378 379 PetscCall(VecRestoreArrayRead(xx, &x)); 380 PetscCall(VecRestoreArray(zz, &z)); 381 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 382 PetscFunctionReturn(0); 383 } 384 385 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 386 { 387 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 388 PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 389 const PetscScalar *x, *xb; 390 const MatScalar *v; 391 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 392 const PetscInt *aj = a->j, *ai = a->i, *ib; 393 PetscInt nonzerorow = 0; 394 395 PetscFunctionBegin; 396 PetscCall(VecSet(zz, zero)); 397 if (!a->nz) PetscFunctionReturn(0); 398 PetscCall(VecGetArrayRead(xx, &x)); 399 PetscCall(VecGetArray(zz, &z)); 400 401 v = a->a; 402 xb = x; 403 404 for (i = 0; i < mbs; i++) { 405 n = ai[1] - ai[0]; /* length of i_th block row of A */ 406 x1 = xb[0]; 407 x2 = xb[1]; 408 x3 = xb[2]; 409 x4 = xb[3]; 410 ib = aj + *ai; 411 jmin = 0; 412 nonzerorow += (n > 0); 413 if (*ib == i) { /* (diag of A)*x */ 414 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 415 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 416 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 417 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 418 v += 16; 419 jmin++; 420 } 421 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 422 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 423 for (j = jmin; j < n; j++) { 424 /* (strict lower triangular part of A)*x */ 425 cval = ib[j] * 4; 426 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 427 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 428 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 429 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 430 /* (strict upper triangular part of A)*x */ 431 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 432 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 433 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 434 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 435 v += 16; 436 } 437 xb += 4; 438 ai++; 439 } 440 441 PetscCall(VecRestoreArrayRead(xx, &x)); 442 PetscCall(VecRestoreArray(zz, &z)); 443 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 444 PetscFunctionReturn(0); 445 } 446 447 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 448 { 449 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 450 PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 451 const PetscScalar *x, *xb; 452 const MatScalar *v; 453 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 454 const PetscInt *aj = a->j, *ai = a->i, *ib; 455 PetscInt nonzerorow = 0; 456 457 PetscFunctionBegin; 458 PetscCall(VecSet(zz, zero)); 459 if (!a->nz) PetscFunctionReturn(0); 460 PetscCall(VecGetArrayRead(xx, &x)); 461 PetscCall(VecGetArray(zz, &z)); 462 463 v = a->a; 464 xb = x; 465 466 for (i = 0; i < mbs; i++) { 467 n = ai[1] - ai[0]; /* length of i_th block row of A */ 468 x1 = xb[0]; 469 x2 = xb[1]; 470 x3 = xb[2]; 471 x4 = xb[3]; 472 x5 = xb[4]; 473 ib = aj + *ai; 474 jmin = 0; 475 nonzerorow += (n > 0); 476 if (*ib == i) { /* (diag of A)*x */ 477 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 478 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 479 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 480 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 481 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 482 v += 25; 483 jmin++; 484 } 485 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 486 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 487 for (j = jmin; j < n; j++) { 488 /* (strict lower triangular part of A)*x */ 489 cval = ib[j] * 5; 490 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 491 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 492 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 493 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 494 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 495 /* (strict upper triangular part of A)*x */ 496 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 497 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 498 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 499 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 500 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 501 v += 25; 502 } 503 xb += 5; 504 ai++; 505 } 506 507 PetscCall(VecRestoreArrayRead(xx, &x)); 508 PetscCall(VecRestoreArray(zz, &z)); 509 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 510 PetscFunctionReturn(0); 511 } 512 513 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 514 { 515 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 516 PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 517 const PetscScalar *x, *xb; 518 const MatScalar *v; 519 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 520 const PetscInt *aj = a->j, *ai = a->i, *ib; 521 PetscInt nonzerorow = 0; 522 523 PetscFunctionBegin; 524 PetscCall(VecSet(zz, zero)); 525 if (!a->nz) PetscFunctionReturn(0); 526 PetscCall(VecGetArrayRead(xx, &x)); 527 PetscCall(VecGetArray(zz, &z)); 528 529 v = a->a; 530 xb = x; 531 532 for (i = 0; i < mbs; i++) { 533 n = ai[1] - ai[0]; /* length of i_th block row of A */ 534 x1 = xb[0]; 535 x2 = xb[1]; 536 x3 = xb[2]; 537 x4 = xb[3]; 538 x5 = xb[4]; 539 x6 = xb[5]; 540 ib = aj + *ai; 541 jmin = 0; 542 nonzerorow += (n > 0); 543 if (*ib == i) { /* (diag of A)*x */ 544 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 545 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 546 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 547 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 548 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 549 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 550 v += 36; 551 jmin++; 552 } 553 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 554 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 555 for (j = jmin; j < n; j++) { 556 /* (strict lower triangular part of A)*x */ 557 cval = ib[j] * 6; 558 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 559 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 560 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 561 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 562 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 563 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 564 /* (strict upper triangular part of A)*x */ 565 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 566 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 567 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 568 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 569 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 570 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 571 v += 36; 572 } 573 xb += 6; 574 ai++; 575 } 576 577 PetscCall(VecRestoreArrayRead(xx, &x)); 578 PetscCall(VecRestoreArray(zz, &z)); 579 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 580 PetscFunctionReturn(0); 581 } 582 583 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 584 { 585 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 586 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 587 const PetscScalar *x, *xb; 588 const MatScalar *v; 589 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 590 const PetscInt *aj = a->j, *ai = a->i, *ib; 591 PetscInt nonzerorow = 0; 592 593 PetscFunctionBegin; 594 PetscCall(VecSet(zz, zero)); 595 if (!a->nz) PetscFunctionReturn(0); 596 PetscCall(VecGetArrayRead(xx, &x)); 597 PetscCall(VecGetArray(zz, &z)); 598 599 v = a->a; 600 xb = x; 601 602 for (i = 0; i < mbs; i++) { 603 n = ai[1] - ai[0]; /* length of i_th block row of A */ 604 x1 = xb[0]; 605 x2 = xb[1]; 606 x3 = xb[2]; 607 x4 = xb[3]; 608 x5 = xb[4]; 609 x6 = xb[5]; 610 x7 = xb[6]; 611 ib = aj + *ai; 612 jmin = 0; 613 nonzerorow += (n > 0); 614 if (*ib == i) { /* (diag of A)*x */ 615 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 616 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 617 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 618 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 619 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 620 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 621 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 622 v += 49; 623 jmin++; 624 } 625 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 626 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 627 for (j = jmin; j < n; j++) { 628 /* (strict lower triangular part of A)*x */ 629 cval = ib[j] * 7; 630 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 631 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 632 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 633 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 634 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 635 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 636 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 637 /* (strict upper triangular part of A)*x */ 638 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 639 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 640 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 641 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 642 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 643 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 644 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 645 v += 49; 646 } 647 xb += 7; 648 ai++; 649 } 650 PetscCall(VecRestoreArrayRead(xx, &x)); 651 PetscCall(VecRestoreArray(zz, &z)); 652 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 653 PetscFunctionReturn(0); 654 } 655 656 /* 657 This will not work with MatScalar == float because it calls the BLAS 658 */ 659 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 660 { 661 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 662 PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 663 const PetscScalar *x, *x_ptr, *xb; 664 const MatScalar *v; 665 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 666 const PetscInt *idx, *aj, *ii; 667 PetscInt nonzerorow = 0; 668 669 PetscFunctionBegin; 670 PetscCall(VecSet(zz, zero)); 671 if (!a->nz) PetscFunctionReturn(0); 672 PetscCall(VecGetArrayRead(xx, &x)); 673 PetscCall(VecGetArray(zz, &z)); 674 675 x_ptr = x; 676 z_ptr = z; 677 678 aj = a->j; 679 v = a->a; 680 ii = a->i; 681 682 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 683 work = a->mult_work; 684 685 for (i = 0; i < mbs; i++) { 686 n = ii[1] - ii[0]; 687 ncols = n * bs; 688 workt = work; 689 idx = aj + ii[0]; 690 nonzerorow += (n > 0); 691 692 /* upper triangular part */ 693 for (j = 0; j < n; j++) { 694 xb = x_ptr + bs * (*idx++); 695 for (k = 0; k < bs; k++) workt[k] = xb[k]; 696 workt += bs; 697 } 698 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 699 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 700 701 /* strict lower triangular part */ 702 idx = aj + ii[0]; 703 if (n && *idx == i) { 704 ncols -= bs; 705 v += bs2; 706 idx++; 707 n--; 708 } 709 710 if (ncols > 0) { 711 workt = work; 712 PetscCall(PetscArrayzero(workt, ncols)); 713 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 714 for (j = 0; j < n; j++) { 715 zb = z_ptr + bs * (*idx++); 716 for (k = 0; k < bs; k++) zb[k] += workt[k]; 717 workt += bs; 718 } 719 } 720 x += bs; 721 v += n * bs2; 722 z += bs; 723 ii++; 724 } 725 726 PetscCall(VecRestoreArrayRead(xx, &x)); 727 PetscCall(VecRestoreArray(zz, &z)); 728 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 729 PetscFunctionReturn(0); 730 } 731 732 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 733 { 734 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 735 PetscScalar *z, x1; 736 const PetscScalar *x, *xb; 737 const MatScalar *v; 738 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 739 const PetscInt *aj = a->j, *ai = a->i, *ib; 740 PetscInt nonzerorow = 0; 741 #if defined(PETSC_USE_COMPLEX) 742 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 743 #else 744 const int aconj = 0; 745 #endif 746 747 PetscFunctionBegin; 748 PetscCall(VecCopy(yy, zz)); 749 if (!a->nz) PetscFunctionReturn(0); 750 PetscCall(VecGetArrayRead(xx, &x)); 751 PetscCall(VecGetArray(zz, &z)); 752 v = a->a; 753 xb = x; 754 755 for (i = 0; i < mbs; i++) { 756 n = ai[1] - ai[0]; /* length of i_th row of A */ 757 x1 = xb[0]; 758 ib = aj + *ai; 759 jmin = 0; 760 nonzerorow += (n > 0); 761 if (n && *ib == i) { /* (diag of A)*x */ 762 z[i] += *v++ * x[*ib++]; 763 jmin++; 764 } 765 if (aconj) { 766 for (j = jmin; j < n; j++) { 767 cval = *ib; 768 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 769 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 770 } 771 } else { 772 for (j = jmin; j < n; j++) { 773 cval = *ib; 774 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 775 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 776 } 777 } 778 xb++; 779 ai++; 780 } 781 782 PetscCall(VecRestoreArrayRead(xx, &x)); 783 PetscCall(VecRestoreArray(zz, &z)); 784 785 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 786 PetscFunctionReturn(0); 787 } 788 789 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 790 { 791 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 792 PetscScalar *z, x1, x2; 793 const PetscScalar *x, *xb; 794 const MatScalar *v; 795 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 796 const PetscInt *aj = a->j, *ai = a->i, *ib; 797 PetscInt nonzerorow = 0; 798 799 PetscFunctionBegin; 800 PetscCall(VecCopy(yy, zz)); 801 if (!a->nz) PetscFunctionReturn(0); 802 PetscCall(VecGetArrayRead(xx, &x)); 803 PetscCall(VecGetArray(zz, &z)); 804 805 v = a->a; 806 xb = x; 807 808 for (i = 0; i < mbs; i++) { 809 n = ai[1] - ai[0]; /* length of i_th block row of A */ 810 x1 = xb[0]; 811 x2 = xb[1]; 812 ib = aj + *ai; 813 jmin = 0; 814 nonzerorow += (n > 0); 815 if (n && *ib == i) { /* (diag of A)*x */ 816 z[2 * i] += v[0] * x1 + v[2] * x2; 817 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 818 v += 4; 819 jmin++; 820 } 821 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 822 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 823 for (j = jmin; j < n; j++) { 824 /* (strict lower triangular part of A)*x */ 825 cval = ib[j] * 2; 826 z[cval] += v[0] * x1 + v[1] * x2; 827 z[cval + 1] += v[2] * x1 + v[3] * x2; 828 /* (strict upper triangular part of A)*x */ 829 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 830 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 831 v += 4; 832 } 833 xb += 2; 834 ai++; 835 } 836 PetscCall(VecRestoreArrayRead(xx, &x)); 837 PetscCall(VecRestoreArray(zz, &z)); 838 839 PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 840 PetscFunctionReturn(0); 841 } 842 843 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 844 { 845 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 846 PetscScalar *z, x1, x2, x3; 847 const PetscScalar *x, *xb; 848 const MatScalar *v; 849 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 850 const PetscInt *aj = a->j, *ai = a->i, *ib; 851 PetscInt nonzerorow = 0; 852 853 PetscFunctionBegin; 854 PetscCall(VecCopy(yy, zz)); 855 if (!a->nz) PetscFunctionReturn(0); 856 PetscCall(VecGetArrayRead(xx, &x)); 857 PetscCall(VecGetArray(zz, &z)); 858 859 v = a->a; 860 xb = x; 861 862 for (i = 0; i < mbs; i++) { 863 n = ai[1] - ai[0]; /* length of i_th block row of A */ 864 x1 = xb[0]; 865 x2 = xb[1]; 866 x3 = xb[2]; 867 ib = aj + *ai; 868 jmin = 0; 869 nonzerorow += (n > 0); 870 if (n && *ib == i) { /* (diag of A)*x */ 871 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 872 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 873 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 874 v += 9; 875 jmin++; 876 } 877 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 878 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 879 for (j = jmin; j < n; j++) { 880 /* (strict lower triangular part of A)*x */ 881 cval = ib[j] * 3; 882 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 883 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 884 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 885 /* (strict upper triangular part of A)*x */ 886 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 887 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 888 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 889 v += 9; 890 } 891 xb += 3; 892 ai++; 893 } 894 895 PetscCall(VecRestoreArrayRead(xx, &x)); 896 PetscCall(VecRestoreArray(zz, &z)); 897 898 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 899 PetscFunctionReturn(0); 900 } 901 902 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 903 { 904 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 905 PetscScalar *z, x1, x2, x3, x4; 906 const PetscScalar *x, *xb; 907 const MatScalar *v; 908 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 909 const PetscInt *aj = a->j, *ai = a->i, *ib; 910 PetscInt nonzerorow = 0; 911 912 PetscFunctionBegin; 913 PetscCall(VecCopy(yy, zz)); 914 if (!a->nz) PetscFunctionReturn(0); 915 PetscCall(VecGetArrayRead(xx, &x)); 916 PetscCall(VecGetArray(zz, &z)); 917 918 v = a->a; 919 xb = x; 920 921 for (i = 0; i < mbs; i++) { 922 n = ai[1] - ai[0]; /* length of i_th block row of A */ 923 x1 = xb[0]; 924 x2 = xb[1]; 925 x3 = xb[2]; 926 x4 = xb[3]; 927 ib = aj + *ai; 928 jmin = 0; 929 nonzerorow += (n > 0); 930 if (n && *ib == i) { /* (diag of A)*x */ 931 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 932 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 933 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 934 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 935 v += 16; 936 jmin++; 937 } 938 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 939 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 940 for (j = jmin; j < n; j++) { 941 /* (strict lower triangular part of A)*x */ 942 cval = ib[j] * 4; 943 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 944 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 945 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 946 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 947 /* (strict upper triangular part of A)*x */ 948 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 949 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 950 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 951 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 952 v += 16; 953 } 954 xb += 4; 955 ai++; 956 } 957 958 PetscCall(VecRestoreArrayRead(xx, &x)); 959 PetscCall(VecRestoreArray(zz, &z)); 960 961 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 962 PetscFunctionReturn(0); 963 } 964 965 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 966 { 967 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 968 PetscScalar *z, x1, x2, x3, x4, x5; 969 const PetscScalar *x, *xb; 970 const MatScalar *v; 971 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 972 const PetscInt *aj = a->j, *ai = a->i, *ib; 973 PetscInt nonzerorow = 0; 974 975 PetscFunctionBegin; 976 PetscCall(VecCopy(yy, zz)); 977 if (!a->nz) PetscFunctionReturn(0); 978 PetscCall(VecGetArrayRead(xx, &x)); 979 PetscCall(VecGetArray(zz, &z)); 980 981 v = a->a; 982 xb = x; 983 984 for (i = 0; i < mbs; i++) { 985 n = ai[1] - ai[0]; /* length of i_th block row of A */ 986 x1 = xb[0]; 987 x2 = xb[1]; 988 x3 = xb[2]; 989 x4 = xb[3]; 990 x5 = xb[4]; 991 ib = aj + *ai; 992 jmin = 0; 993 nonzerorow += (n > 0); 994 if (n && *ib == i) { /* (diag of A)*x */ 995 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 996 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 997 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 998 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 999 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1000 v += 25; 1001 jmin++; 1002 } 1003 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1004 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1005 for (j = jmin; j < n; j++) { 1006 /* (strict lower triangular part of A)*x */ 1007 cval = ib[j] * 5; 1008 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 1009 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 1010 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 1011 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 1012 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1013 /* (strict upper triangular part of A)*x */ 1014 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 1015 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 1016 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 1017 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 1018 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 1019 v += 25; 1020 } 1021 xb += 5; 1022 ai++; 1023 } 1024 1025 PetscCall(VecRestoreArrayRead(xx, &x)); 1026 PetscCall(VecRestoreArray(zz, &z)); 1027 1028 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 1029 PetscFunctionReturn(0); 1030 } 1031 1032 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1033 { 1034 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1035 PetscScalar *z, x1, x2, x3, x4, x5, x6; 1036 const PetscScalar *x, *xb; 1037 const MatScalar *v; 1038 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1039 const PetscInt *aj = a->j, *ai = a->i, *ib; 1040 PetscInt nonzerorow = 0; 1041 1042 PetscFunctionBegin; 1043 PetscCall(VecCopy(yy, zz)); 1044 if (!a->nz) PetscFunctionReturn(0); 1045 PetscCall(VecGetArrayRead(xx, &x)); 1046 PetscCall(VecGetArray(zz, &z)); 1047 1048 v = a->a; 1049 xb = x; 1050 1051 for (i = 0; i < mbs; i++) { 1052 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1053 x1 = xb[0]; 1054 x2 = xb[1]; 1055 x3 = xb[2]; 1056 x4 = xb[3]; 1057 x5 = xb[4]; 1058 x6 = xb[5]; 1059 ib = aj + *ai; 1060 jmin = 0; 1061 nonzerorow += (n > 0); 1062 if (n && *ib == i) { /* (diag of A)*x */ 1063 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 1064 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 1065 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 1066 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 1067 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 1068 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1069 v += 36; 1070 jmin++; 1071 } 1072 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1073 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1074 for (j = jmin; j < n; j++) { 1075 /* (strict lower triangular part of A)*x */ 1076 cval = ib[j] * 6; 1077 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 1078 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 1079 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 1080 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 1081 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 1082 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1083 /* (strict upper triangular part of A)*x */ 1084 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 1085 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 1086 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 1087 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 1088 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 1089 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 1090 v += 36; 1091 } 1092 xb += 6; 1093 ai++; 1094 } 1095 1096 PetscCall(VecRestoreArrayRead(xx, &x)); 1097 PetscCall(VecRestoreArray(zz, &z)); 1098 1099 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 1100 PetscFunctionReturn(0); 1101 } 1102 1103 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1104 { 1105 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1106 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1107 const PetscScalar *x, *xb; 1108 const MatScalar *v; 1109 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1110 const PetscInt *aj = a->j, *ai = a->i, *ib; 1111 PetscInt nonzerorow = 0; 1112 1113 PetscFunctionBegin; 1114 PetscCall(VecCopy(yy, zz)); 1115 if (!a->nz) PetscFunctionReturn(0); 1116 PetscCall(VecGetArrayRead(xx, &x)); 1117 PetscCall(VecGetArray(zz, &z)); 1118 1119 v = a->a; 1120 xb = x; 1121 1122 for (i = 0; i < mbs; i++) { 1123 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1124 x1 = xb[0]; 1125 x2 = xb[1]; 1126 x3 = xb[2]; 1127 x4 = xb[3]; 1128 x5 = xb[4]; 1129 x6 = xb[5]; 1130 x7 = xb[6]; 1131 ib = aj + *ai; 1132 jmin = 0; 1133 nonzerorow += (n > 0); 1134 if (n && *ib == i) { /* (diag of A)*x */ 1135 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 1136 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 1137 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 1138 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 1139 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 1140 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 1141 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1142 v += 49; 1143 jmin++; 1144 } 1145 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1146 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1147 for (j = jmin; j < n; j++) { 1148 /* (strict lower triangular part of A)*x */ 1149 cval = ib[j] * 7; 1150 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 1151 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 1152 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 1153 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 1154 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 1155 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 1156 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1157 /* (strict upper triangular part of A)*x */ 1158 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 1159 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 1160 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 1161 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 1162 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 1163 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 1164 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 1165 v += 49; 1166 } 1167 xb += 7; 1168 ai++; 1169 } 1170 1171 PetscCall(VecRestoreArrayRead(xx, &x)); 1172 PetscCall(VecRestoreArray(zz, &z)); 1173 1174 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 1175 PetscFunctionReturn(0); 1176 } 1177 1178 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1179 { 1180 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1181 PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1182 const PetscScalar *x, *x_ptr, *xb; 1183 const MatScalar *v; 1184 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1185 const PetscInt *idx, *aj, *ii; 1186 PetscInt nonzerorow = 0; 1187 1188 PetscFunctionBegin; 1189 PetscCall(VecCopy(yy, zz)); 1190 if (!a->nz) PetscFunctionReturn(0); 1191 PetscCall(VecGetArrayRead(xx, &x)); 1192 x_ptr = x; 1193 PetscCall(VecGetArray(zz, &z)); 1194 z_ptr = z; 1195 1196 aj = a->j; 1197 v = a->a; 1198 ii = a->i; 1199 1200 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 1201 work = a->mult_work; 1202 1203 for (i = 0; i < mbs; i++) { 1204 n = ii[1] - ii[0]; 1205 ncols = n * bs; 1206 workt = work; 1207 idx = aj + ii[0]; 1208 nonzerorow += (n > 0); 1209 1210 /* upper triangular part */ 1211 for (j = 0; j < n; j++) { 1212 xb = x_ptr + bs * (*idx++); 1213 for (k = 0; k < bs; k++) workt[k] = xb[k]; 1214 workt += bs; 1215 } 1216 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1217 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 1218 1219 /* strict lower triangular part */ 1220 idx = aj + ii[0]; 1221 if (n && *idx == i) { 1222 ncols -= bs; 1223 v += bs2; 1224 idx++; 1225 n--; 1226 } 1227 if (ncols > 0) { 1228 workt = work; 1229 PetscCall(PetscArrayzero(workt, ncols)); 1230 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1231 for (j = 0; j < n; j++) { 1232 zb = z_ptr + bs * (*idx++); 1233 for (k = 0; k < bs; k++) zb[k] += workt[k]; 1234 workt += bs; 1235 } 1236 } 1237 1238 x += bs; 1239 v += n * bs2; 1240 z += bs; 1241 ii++; 1242 } 1243 1244 PetscCall(VecRestoreArrayRead(xx, &x)); 1245 PetscCall(VecRestoreArray(zz, &z)); 1246 1247 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 1248 PetscFunctionReturn(0); 1249 } 1250 1251 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1252 { 1253 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1254 PetscScalar oalpha = alpha; 1255 PetscBLASInt one = 1, totalnz; 1256 1257 PetscFunctionBegin; 1258 PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1259 PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 1260 PetscCall(PetscLogFlops(totalnz)); 1261 PetscFunctionReturn(0); 1262 } 1263 1264 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1265 { 1266 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1267 const MatScalar *v = a->a; 1268 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1269 PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1270 const PetscInt *aj = a->j, *col; 1271 1272 PetscFunctionBegin; 1273 if (!a->nz) { 1274 *norm = 0.0; 1275 PetscFunctionReturn(0); 1276 } 1277 if (type == NORM_FROBENIUS) { 1278 for (k = 0; k < mbs; k++) { 1279 jmin = a->i[k]; 1280 jmax = a->i[k + 1]; 1281 col = aj + jmin; 1282 if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 1283 for (i = 0; i < bs2; i++) { 1284 sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 1285 v++; 1286 } 1287 jmin++; 1288 } 1289 for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 1290 for (i = 0; i < bs2; i++) { 1291 sum_off += PetscRealPart(PetscConj(*v) * (*v)); 1292 v++; 1293 } 1294 } 1295 } 1296 *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 1297 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 1298 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1299 PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 1300 for (i = 0; i < mbs; i++) jl[i] = mbs; 1301 il[0] = 0; 1302 1303 *norm = 0.0; 1304 for (k = 0; k < mbs; k++) { /* k_th block row */ 1305 for (j = 0; j < bs; j++) sum[j] = 0.0; 1306 /*-- col sum --*/ 1307 i = jl[k]; /* first |A(i,k)| to be added */ 1308 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1309 at step k */ 1310 while (i < mbs) { 1311 nexti = jl[i]; /* next block row to be added */ 1312 ik = il[i]; /* block index of A(i,k) in the array a */ 1313 for (j = 0; j < bs; j++) { 1314 v = a->a + ik * bs2 + j * bs; 1315 for (k1 = 0; k1 < bs; k1++) { 1316 sum[j] += PetscAbsScalar(*v); 1317 v++; 1318 } 1319 } 1320 /* update il, jl */ 1321 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1322 jmax = a->i[i + 1]; 1323 if (jmin < jmax) { 1324 il[i] = jmin; 1325 j = a->j[jmin]; 1326 jl[i] = jl[j]; 1327 jl[j] = i; 1328 } 1329 i = nexti; 1330 } 1331 /*-- row sum --*/ 1332 jmin = a->i[k]; 1333 jmax = a->i[k + 1]; 1334 for (i = jmin; i < jmax; i++) { 1335 for (j = 0; j < bs; j++) { 1336 v = a->a + i * bs2 + j; 1337 for (k1 = 0; k1 < bs; k1++) { 1338 sum[j] += PetscAbsScalar(*v); 1339 v += bs; 1340 } 1341 } 1342 } 1343 /* add k_th block row to il, jl */ 1344 col = aj + jmin; 1345 if (jmax - jmin > 0 && *col == k) jmin++; 1346 if (jmin < jmax) { 1347 il[k] = jmin; 1348 j = a->j[jmin]; 1349 jl[k] = jl[j]; 1350 jl[j] = k; 1351 } 1352 for (j = 0; j < bs; j++) { 1353 if (sum[j] > *norm) *norm = sum[j]; 1354 } 1355 } 1356 PetscCall(PetscFree3(sum, il, jl)); 1357 PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1358 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 1359 PetscFunctionReturn(0); 1360 } 1361 1362 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1363 { 1364 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 1365 1366 PetscFunctionBegin; 1367 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1368 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1369 *flg = PETSC_FALSE; 1370 PetscFunctionReturn(0); 1371 } 1372 1373 /* if the a->i are the same */ 1374 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 1375 if (!*flg) PetscFunctionReturn(0); 1376 1377 /* if a->j are the same */ 1378 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 1379 if (!*flg) PetscFunctionReturn(0); 1380 1381 /* if a->a are the same */ 1382 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 1383 PetscFunctionReturn(0); 1384 } 1385 1386 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1387 { 1388 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1389 PetscInt i, j, k, row, bs, ambs, bs2; 1390 const PetscInt *ai, *aj; 1391 PetscScalar *x, zero = 0.0; 1392 const MatScalar *aa, *aa_j; 1393 1394 PetscFunctionBegin; 1395 bs = A->rmap->bs; 1396 PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 1397 1398 aa = a->a; 1399 ambs = a->mbs; 1400 1401 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1402 PetscInt *diag = a->diag; 1403 aa = a->a; 1404 ambs = a->mbs; 1405 PetscCall(VecGetArray(v, &x)); 1406 for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 1407 PetscCall(VecRestoreArray(v, &x)); 1408 PetscFunctionReturn(0); 1409 } 1410 1411 ai = a->i; 1412 aj = a->j; 1413 bs2 = a->bs2; 1414 PetscCall(VecSet(v, zero)); 1415 if (!a->nz) PetscFunctionReturn(0); 1416 PetscCall(VecGetArray(v, &x)); 1417 for (i = 0; i < ambs; i++) { 1418 j = ai[i]; 1419 if (aj[j] == i) { /* if this is a diagonal element */ 1420 row = i * bs; 1421 aa_j = aa + j * bs2; 1422 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 1423 } 1424 } 1425 PetscCall(VecRestoreArray(v, &x)); 1426 PetscFunctionReturn(0); 1427 } 1428 1429 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1430 { 1431 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1432 PetscScalar x; 1433 const PetscScalar *l, *li, *ri; 1434 MatScalar *aa, *v; 1435 PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1436 const PetscInt *ai, *aj; 1437 PetscBool flg; 1438 1439 PetscFunctionBegin; 1440 if (ll != rr) { 1441 PetscCall(VecEqual(ll, rr, &flg)); 1442 PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1443 } 1444 if (!ll) PetscFunctionReturn(0); 1445 ai = a->i; 1446 aj = a->j; 1447 aa = a->a; 1448 m = A->rmap->N; 1449 bs = A->rmap->bs; 1450 mbs = a->mbs; 1451 bs2 = a->bs2; 1452 1453 PetscCall(VecGetArrayRead(ll, &l)); 1454 PetscCall(VecGetLocalSize(ll, &lm)); 1455 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1456 for (i = 0; i < mbs; i++) { /* for each block row */ 1457 M = ai[i + 1] - ai[i]; 1458 li = l + i * bs; 1459 v = aa + bs2 * ai[i]; 1460 for (j = 0; j < M; j++) { /* for each block */ 1461 ri = l + bs * aj[ai[i] + j]; 1462 for (k = 0; k < bs; k++) { 1463 x = ri[k]; 1464 for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 1465 } 1466 } 1467 } 1468 PetscCall(VecRestoreArrayRead(ll, &l)); 1469 PetscCall(PetscLogFlops(2.0 * a->nz)); 1470 PetscFunctionReturn(0); 1471 } 1472 1473 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1474 { 1475 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1476 1477 PetscFunctionBegin; 1478 info->block_size = a->bs2; 1479 info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 1480 info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 1481 info->nz_unneeded = info->nz_allocated - info->nz_used; 1482 info->assemblies = A->num_ass; 1483 info->mallocs = A->info.mallocs; 1484 info->memory = 0; /* REVIEW ME */ 1485 if (A->factortype) { 1486 info->fill_ratio_given = A->info.fill_ratio_given; 1487 info->fill_ratio_needed = A->info.fill_ratio_needed; 1488 info->factor_mallocs = A->info.factor_mallocs; 1489 } else { 1490 info->fill_ratio_given = 0; 1491 info->fill_ratio_needed = 0; 1492 info->factor_mallocs = 0; 1493 } 1494 PetscFunctionReturn(0); 1495 } 1496 1497 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1498 { 1499 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1500 1501 PetscFunctionBegin; 1502 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 1503 PetscFunctionReturn(0); 1504 } 1505 1506 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1507 { 1508 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1509 PetscInt i, j, n, row, col, bs, mbs; 1510 const PetscInt *ai, *aj; 1511 PetscReal atmp; 1512 const MatScalar *aa; 1513 PetscScalar *x; 1514 PetscInt ncols, brow, bcol, krow, kcol; 1515 1516 PetscFunctionBegin; 1517 PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 1518 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1519 bs = A->rmap->bs; 1520 aa = a->a; 1521 ai = a->i; 1522 aj = a->j; 1523 mbs = a->mbs; 1524 1525 PetscCall(VecSet(v, 0.0)); 1526 PetscCall(VecGetArray(v, &x)); 1527 PetscCall(VecGetLocalSize(v, &n)); 1528 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1529 for (i = 0; i < mbs; i++) { 1530 ncols = ai[1] - ai[0]; 1531 ai++; 1532 brow = bs * i; 1533 for (j = 0; j < ncols; j++) { 1534 bcol = bs * (*aj); 1535 for (kcol = 0; kcol < bs; kcol++) { 1536 col = bcol + kcol; /* col index */ 1537 for (krow = 0; krow < bs; krow++) { 1538 atmp = PetscAbsScalar(*aa); 1539 aa++; 1540 row = brow + krow; /* row index */ 1541 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1542 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1543 } 1544 } 1545 aj++; 1546 } 1547 } 1548 PetscCall(VecRestoreArray(v, &x)); 1549 PetscFunctionReturn(0); 1550 } 1551 1552 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1553 { 1554 PetscFunctionBegin; 1555 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1556 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1557 PetscFunctionReturn(0); 1558 } 1559 1560 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1561 { 1562 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1563 PetscScalar *z = c; 1564 const PetscScalar *xb; 1565 PetscScalar x1; 1566 const MatScalar *v = a->a, *vv; 1567 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1568 #if defined(PETSC_USE_COMPLEX) 1569 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 1570 #else 1571 const int aconj = 0; 1572 #endif 1573 1574 PetscFunctionBegin; 1575 for (i = 0; i < mbs; i++) { 1576 n = ii[1] - ii[0]; 1577 ii++; 1578 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1579 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1580 jj = idx; 1581 vv = v; 1582 for (k = 0; k < cn; k++) { 1583 idx = jj; 1584 v = vv; 1585 for (j = 0; j < n; j++) { 1586 xb = b + (*idx); 1587 x1 = xb[0 + k * bm]; 1588 z[0 + k * cm] += v[0] * x1; 1589 if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1590 v += 1; 1591 ++idx; 1592 } 1593 } 1594 z += 1; 1595 } 1596 PetscFunctionReturn(0); 1597 } 1598 1599 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1600 { 1601 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1602 PetscScalar *z = c; 1603 const PetscScalar *xb; 1604 PetscScalar x1, x2; 1605 const MatScalar *v = a->a, *vv; 1606 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1607 1608 PetscFunctionBegin; 1609 for (i = 0; i < mbs; i++) { 1610 n = ii[1] - ii[0]; 1611 ii++; 1612 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1613 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1614 jj = idx; 1615 vv = v; 1616 for (k = 0; k < cn; k++) { 1617 idx = jj; 1618 v = vv; 1619 for (j = 0; j < n; j++) { 1620 xb = b + 2 * (*idx); 1621 x1 = xb[0 + k * bm]; 1622 x2 = xb[1 + k * bm]; 1623 z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1624 z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1625 if (*idx != i) { 1626 c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1627 c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1628 } 1629 v += 4; 1630 ++idx; 1631 } 1632 } 1633 z += 2; 1634 } 1635 PetscFunctionReturn(0); 1636 } 1637 1638 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1639 { 1640 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1641 PetscScalar *z = c; 1642 const PetscScalar *xb; 1643 PetscScalar x1, x2, x3; 1644 const MatScalar *v = a->a, *vv; 1645 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1646 1647 PetscFunctionBegin; 1648 for (i = 0; i < mbs; i++) { 1649 n = ii[1] - ii[0]; 1650 ii++; 1651 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1652 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1653 jj = idx; 1654 vv = v; 1655 for (k = 0; k < cn; k++) { 1656 idx = jj; 1657 v = vv; 1658 for (j = 0; j < n; j++) { 1659 xb = b + 3 * (*idx); 1660 x1 = xb[0 + k * bm]; 1661 x2 = xb[1 + k * bm]; 1662 x3 = xb[2 + k * bm]; 1663 z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1664 z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1665 z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1666 if (*idx != i) { 1667 c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1668 c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1669 c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1670 } 1671 v += 9; 1672 ++idx; 1673 } 1674 } 1675 z += 3; 1676 } 1677 PetscFunctionReturn(0); 1678 } 1679 1680 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1681 { 1682 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1683 PetscScalar *z = c; 1684 const PetscScalar *xb; 1685 PetscScalar x1, x2, x3, x4; 1686 const MatScalar *v = a->a, *vv; 1687 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1688 1689 PetscFunctionBegin; 1690 for (i = 0; i < mbs; i++) { 1691 n = ii[1] - ii[0]; 1692 ii++; 1693 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1694 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1695 jj = idx; 1696 vv = v; 1697 for (k = 0; k < cn; k++) { 1698 idx = jj; 1699 v = vv; 1700 for (j = 0; j < n; j++) { 1701 xb = b + 4 * (*idx); 1702 x1 = xb[0 + k * bm]; 1703 x2 = xb[1 + k * bm]; 1704 x3 = xb[2 + k * bm]; 1705 x4 = xb[3 + k * bm]; 1706 z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1707 z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1708 z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1709 z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1710 if (*idx != i) { 1711 c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1712 c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1713 c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1714 c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1715 } 1716 v += 16; 1717 ++idx; 1718 } 1719 } 1720 z += 4; 1721 } 1722 PetscFunctionReturn(0); 1723 } 1724 1725 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1726 { 1727 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1728 PetscScalar *z = c; 1729 const PetscScalar *xb; 1730 PetscScalar x1, x2, x3, x4, x5; 1731 const MatScalar *v = a->a, *vv; 1732 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1733 1734 PetscFunctionBegin; 1735 for (i = 0; i < mbs; i++) { 1736 n = ii[1] - ii[0]; 1737 ii++; 1738 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1739 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1740 jj = idx; 1741 vv = v; 1742 for (k = 0; k < cn; k++) { 1743 idx = jj; 1744 v = vv; 1745 for (j = 0; j < n; j++) { 1746 xb = b + 5 * (*idx); 1747 x1 = xb[0 + k * bm]; 1748 x2 = xb[1 + k * bm]; 1749 x3 = xb[2 + k * bm]; 1750 x4 = xb[3 + k * bm]; 1751 x5 = xb[4 + k * cm]; 1752 z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1753 z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1754 z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1755 z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1756 z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1757 if (*idx != i) { 1758 c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1759 c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1760 c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1761 c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1762 c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1763 } 1764 v += 25; 1765 ++idx; 1766 } 1767 } 1768 z += 5; 1769 } 1770 PetscFunctionReturn(0); 1771 } 1772 1773 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1774 { 1775 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1776 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1777 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1778 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1779 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1780 PetscBLASInt bbs, bcn, bbm, bcm; 1781 PetscScalar *z = NULL; 1782 PetscScalar *c, *b; 1783 const MatScalar *v; 1784 const PetscInt *idx, *ii; 1785 PetscScalar _DOne = 1.0; 1786 1787 PetscFunctionBegin; 1788 if (!cm || !cn) PetscFunctionReturn(0); 1789 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1790 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1791 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1792 b = bd->v; 1793 PetscCall(MatZeroEntries(C)); 1794 PetscCall(MatDenseGetArray(C, &c)); 1795 switch (bs) { 1796 case 1: 1797 PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1798 break; 1799 case 2: 1800 PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1801 break; 1802 case 3: 1803 PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1804 break; 1805 case 4: 1806 PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1807 break; 1808 case 5: 1809 PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1810 break; 1811 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1812 PetscCall(PetscBLASIntCast(bs, &bbs)); 1813 PetscCall(PetscBLASIntCast(cn, &bcn)); 1814 PetscCall(PetscBLASIntCast(bm, &bbm)); 1815 PetscCall(PetscBLASIntCast(cm, &bcm)); 1816 idx = a->j; 1817 v = a->a; 1818 mbs = a->mbs; 1819 ii = a->i; 1820 z = c; 1821 for (i = 0; i < mbs; i++) { 1822 n = ii[1] - ii[0]; 1823 ii++; 1824 for (j = 0; j < n; j++) { 1825 if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1826 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1827 v += bs2; 1828 } 1829 z += bs; 1830 } 1831 } 1832 PetscCall(MatDenseRestoreArray(C, &c)); 1833 PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1834 PetscFunctionReturn(0); 1835 } 1836