1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <../src/mat/impls/sbaij/seq/sbaij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 #include <petscbt.h> 7 #include <petscblaslapack.h> 8 9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) 10 { 11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 12 PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs; 13 const PetscInt *idx; 14 PetscBT table_out, table_in; 15 16 PetscFunctionBegin; 17 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 18 mbs = a->mbs; 19 ai = a->i; 20 aj = a->j; 21 bs = A->rmap->bs; 22 PetscCall(PetscBTCreate(mbs, &table_out)); 23 PetscCall(PetscMalloc1(mbs + 1, &nidx)); 24 PetscCall(PetscBTCreate(mbs, &table_in)); 25 26 for (i = 0; i < is_max; i++) { /* for each is */ 27 isz = 0; 28 PetscCall(PetscBTMemzero(mbs, table_out)); 29 30 /* Extract the indices, assume there can be duplicate entries */ 31 PetscCall(ISGetIndices(is[i], &idx)); 32 PetscCall(ISGetLocalSize(is[i], &n)); 33 34 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 35 bcol_max = 0; 36 for (j = 0; j < n; ++j) { 37 brow = idx[j] / bs; /* convert the indices into block indices */ 38 PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); 39 if (!PetscBTLookupSet(table_out, brow)) { 40 nidx[isz++] = brow; 41 if (bcol_max < brow) bcol_max = brow; 42 } 43 } 44 PetscCall(ISRestoreIndices(is[i], &idx)); 45 PetscCall(ISDestroy(&is[i])); 46 47 k = 0; 48 for (j = 0; j < ov; j++) { /* for each overlap */ 49 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 50 PetscCall(PetscBTMemzero(mbs, table_in)); 51 for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l])); 52 53 n = isz; /* length of the updated is[i] */ 54 for (brow = 0; brow < mbs; brow++) { 55 start = ai[brow]; 56 end = ai[brow + 1]; 57 if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58 for (l = start; l < end; l++) { 59 bcol = aj[l]; 60 if (!PetscBTLookupSet(table_out, bcol)) { 61 nidx[isz++] = bcol; 62 if (bcol_max < bcol) bcol_max = bcol; 63 } 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l < end; l++) { 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table_in, bcol)) { 72 if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow; 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); 80 } /* for each is */ 81 PetscCall(PetscBTDestroy(&table_out)); 82 PetscCall(PetscFree(nidx)); 83 PetscCall(PetscBTDestroy(&table_in)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 88 Zero some ops' to avoid invalid usse */ 89 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 90 { 91 PetscFunctionBegin; 92 PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE)); 93 Bseq->ops->mult = NULL; 94 Bseq->ops->multadd = NULL; 95 Bseq->ops->multtranspose = NULL; 96 Bseq->ops->multtransposeadd = NULL; 97 Bseq->ops->lufactor = NULL; 98 Bseq->ops->choleskyfactor = NULL; 99 Bseq->ops->lufactorsymbolic = NULL; 100 Bseq->ops->choleskyfactorsymbolic = NULL; 101 Bseq->ops->getinertia = NULL; 102 PetscFunctionReturn(PETSC_SUCCESS); 103 } 104 105 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 106 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 107 { 108 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c; 109 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; 110 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; 111 const PetscInt *irow, *icol; 112 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; 113 PetscInt *aj = a->j, *ai = a->i; 114 MatScalar *mat_a; 115 Mat C; 116 PetscBool flag; 117 118 PetscFunctionBegin; 119 120 PetscCall(ISGetIndices(isrow, &irow)); 121 PetscCall(ISGetIndices(iscol, &icol)); 122 PetscCall(ISGetLocalSize(isrow, &nrows)); 123 PetscCall(ISGetLocalSize(iscol, &ncols)); 124 125 PetscCall(PetscCalloc1(1 + oldcols, &smap)); 126 ssmap = smap; 127 PetscCall(PetscMalloc1(1 + nrows, &lens)); 128 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; 129 /* determine lens of each row */ 130 for (i = 0; i < nrows; i++) { 131 kstart = ai[irow[i]]; 132 kend = kstart + a->ilen[irow[i]]; 133 lens[i] = 0; 134 for (k = kstart; k < kend; k++) { 135 if (ssmap[aj[k]]) lens[i]++; 136 } 137 } 138 /* Create and fill new matrix */ 139 if (scall == MAT_REUSE_MATRIX) { 140 c = (Mat_SeqSBAIJ *)((*B)->data); 141 142 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); 143 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); 144 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros"); 145 PetscCall(PetscArrayzero(c->ilen, c->mbs)); 146 C = *B; 147 } else { 148 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); 149 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); 150 PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); 151 PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens)); 152 } 153 c = (Mat_SeqSBAIJ *)(C->data); 154 for (i = 0; i < nrows; i++) { 155 row = irow[i]; 156 kstart = ai[row]; 157 kend = kstart + a->ilen[row]; 158 mat_i = c->i[i]; 159 mat_j = c->j + mat_i; 160 mat_a = c->a + mat_i * bs2; 161 mat_ilen = c->ilen + i; 162 for (k = kstart; k < kend; k++) { 163 if ((tcol = ssmap[a->j[k]])) { 164 *mat_j++ = tcol - 1; 165 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); 166 mat_a += bs2; 167 (*mat_ilen)++; 168 } 169 } 170 } 171 /* sort */ 172 { 173 MatScalar *work; 174 175 PetscCall(PetscMalloc1(bs2, &work)); 176 for (i = 0; i < nrows; i++) { 177 PetscInt ilen; 178 mat_i = c->i[i]; 179 mat_j = c->j + mat_i; 180 mat_a = c->a + mat_i * bs2; 181 ilen = c->ilen[i]; 182 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); 183 } 184 PetscCall(PetscFree(work)); 185 } 186 187 /* Free work space */ 188 PetscCall(ISRestoreIndices(iscol, &icol)); 189 PetscCall(PetscFree(smap)); 190 PetscCall(PetscFree(lens)); 191 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 192 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 193 194 PetscCall(ISRestoreIndices(isrow, &irow)); 195 *B = C; 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) 200 { 201 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 202 IS is1, is2; 203 PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs; 204 const PetscInt *irow, *icol; 205 206 PetscFunctionBegin; 207 PetscCall(ISGetIndices(isrow, &irow)); 208 PetscCall(ISGetIndices(iscol, &icol)); 209 PetscCall(ISGetLocalSize(isrow, &nrows)); 210 PetscCall(ISGetLocalSize(iscol, &ncols)); 211 212 /* Verify if the indices correspond to each element in a block 213 and form the IS with compressed IS */ 214 maxmnbs = PetscMax(a->mbs, a->nbs); 215 PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary)); 216 PetscCall(PetscArrayzero(vary, a->mbs)); 217 for (i = 0; i < nrows; i++) vary[irow[i] / bs]++; 218 for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); 219 count = 0; 220 for (i = 0; i < nrows; i++) { 221 PetscInt j = irow[i] / bs; 222 if ((vary[j]--) == bs) iary[count++] = j; 223 } 224 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1)); 225 226 PetscCall(PetscArrayzero(vary, a->nbs)); 227 for (i = 0; i < ncols; i++) vary[icol[i] / bs]++; 228 for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); 229 count = 0; 230 for (i = 0; i < ncols; i++) { 231 PetscInt j = icol[i] / bs; 232 if ((vary[j]--) == bs) iary[count++] = j; 233 } 234 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2)); 235 PetscCall(ISRestoreIndices(isrow, &irow)); 236 PetscCall(ISRestoreIndices(iscol, &icol)); 237 PetscCall(PetscFree2(vary, iary)); 238 239 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B)); 240 PetscCall(ISDestroy(&is1)); 241 PetscCall(ISDestroy(&is2)); 242 243 if (isrow != iscol) { 244 PetscBool isequal; 245 PetscCall(ISEqual(isrow, iscol, &isequal)); 246 if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 247 } 248 PetscFunctionReturn(PETSC_SUCCESS); 249 } 250 251 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) 252 { 253 PetscInt i; 254 255 PetscFunctionBegin; 256 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); 257 258 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); 259 PetscFunctionReturn(PETSC_SUCCESS); 260 } 261 262 /* Should check that shapes of vectors and matrices match */ 263 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz) 264 { 265 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 266 PetscScalar *z, x1, x2, zero = 0.0; 267 const PetscScalar *x, *xb; 268 const MatScalar *v; 269 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 270 const PetscInt *aj = a->j, *ai = a->i, *ib; 271 PetscInt nonzerorow = 0; 272 273 PetscFunctionBegin; 274 PetscCall(VecSet(zz, zero)); 275 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 276 PetscCall(VecGetArrayRead(xx, &x)); 277 PetscCall(VecGetArray(zz, &z)); 278 279 v = a->a; 280 xb = x; 281 282 for (i = 0; i < mbs; i++) { 283 n = ai[1] - ai[0]; /* length of i_th block row of A */ 284 x1 = xb[0]; 285 x2 = xb[1]; 286 ib = aj + *ai; 287 jmin = 0; 288 nonzerorow += (n > 0); 289 if (*ib == i) { /* (diag of A)*x */ 290 z[2 * i] += v[0] * x1 + v[2] * x2; 291 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 292 v += 4; 293 jmin++; 294 } 295 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 296 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 297 for (j = jmin; j < n; j++) { 298 /* (strict lower triangular part of A)*x */ 299 cval = ib[j] * 2; 300 z[cval] += v[0] * x1 + v[1] * x2; 301 z[cval + 1] += v[2] * x1 + v[3] * x2; 302 /* (strict upper triangular part of A)*x */ 303 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 304 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 305 v += 4; 306 } 307 xb += 2; 308 ai++; 309 } 310 311 PetscCall(VecRestoreArrayRead(xx, &x)); 312 PetscCall(VecRestoreArray(zz, &z)); 313 PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 314 PetscFunctionReturn(PETSC_SUCCESS); 315 } 316 317 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz) 318 { 319 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 320 PetscScalar *z, x1, x2, x3, zero = 0.0; 321 const PetscScalar *x, *xb; 322 const MatScalar *v; 323 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 324 const PetscInt *aj = a->j, *ai = a->i, *ib; 325 PetscInt nonzerorow = 0; 326 327 PetscFunctionBegin; 328 PetscCall(VecSet(zz, zero)); 329 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 330 PetscCall(VecGetArrayRead(xx, &x)); 331 PetscCall(VecGetArray(zz, &z)); 332 333 v = a->a; 334 xb = x; 335 336 for (i = 0; i < mbs; i++) { 337 n = ai[1] - ai[0]; /* length of i_th block row of A */ 338 x1 = xb[0]; 339 x2 = xb[1]; 340 x3 = xb[2]; 341 ib = aj + *ai; 342 jmin = 0; 343 nonzerorow += (n > 0); 344 if (*ib == i) { /* (diag of A)*x */ 345 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 346 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 347 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 348 v += 9; 349 jmin++; 350 } 351 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 352 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 353 for (j = jmin; j < n; j++) { 354 /* (strict lower triangular part of A)*x */ 355 cval = ib[j] * 3; 356 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 357 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 358 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 359 /* (strict upper triangular part of A)*x */ 360 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 361 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 362 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 363 v += 9; 364 } 365 xb += 3; 366 ai++; 367 } 368 369 PetscCall(VecRestoreArrayRead(xx, &x)); 370 PetscCall(VecRestoreArray(zz, &z)); 371 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 372 PetscFunctionReturn(PETSC_SUCCESS); 373 } 374 375 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz) 376 { 377 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 378 PetscScalar *z, x1, x2, x3, x4, zero = 0.0; 379 const PetscScalar *x, *xb; 380 const MatScalar *v; 381 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 382 const PetscInt *aj = a->j, *ai = a->i, *ib; 383 PetscInt nonzerorow = 0; 384 385 PetscFunctionBegin; 386 PetscCall(VecSet(zz, zero)); 387 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 388 PetscCall(VecGetArrayRead(xx, &x)); 389 PetscCall(VecGetArray(zz, &z)); 390 391 v = a->a; 392 xb = x; 393 394 for (i = 0; i < mbs; i++) { 395 n = ai[1] - ai[0]; /* length of i_th block row of A */ 396 x1 = xb[0]; 397 x2 = xb[1]; 398 x3 = xb[2]; 399 x4 = xb[3]; 400 ib = aj + *ai; 401 jmin = 0; 402 nonzerorow += (n > 0); 403 if (*ib == i) { /* (diag of A)*x */ 404 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 405 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 406 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 407 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 408 v += 16; 409 jmin++; 410 } 411 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 412 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 413 for (j = jmin; j < n; j++) { 414 /* (strict lower triangular part of A)*x */ 415 cval = ib[j] * 4; 416 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 417 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 418 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 419 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 420 /* (strict upper triangular part of A)*x */ 421 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 422 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 423 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 424 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 425 v += 16; 426 } 427 xb += 4; 428 ai++; 429 } 430 431 PetscCall(VecRestoreArrayRead(xx, &x)); 432 PetscCall(VecRestoreArray(zz, &z)); 433 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 434 PetscFunctionReturn(PETSC_SUCCESS); 435 } 436 437 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz) 438 { 439 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 440 PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0; 441 const PetscScalar *x, *xb; 442 const MatScalar *v; 443 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 444 const PetscInt *aj = a->j, *ai = a->i, *ib; 445 PetscInt nonzerorow = 0; 446 447 PetscFunctionBegin; 448 PetscCall(VecSet(zz, zero)); 449 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 450 PetscCall(VecGetArrayRead(xx, &x)); 451 PetscCall(VecGetArray(zz, &z)); 452 453 v = a->a; 454 xb = x; 455 456 for (i = 0; i < mbs; i++) { 457 n = ai[1] - ai[0]; /* length of i_th block row of A */ 458 x1 = xb[0]; 459 x2 = xb[1]; 460 x3 = xb[2]; 461 x4 = xb[3]; 462 x5 = xb[4]; 463 ib = aj + *ai; 464 jmin = 0; 465 nonzerorow += (n > 0); 466 if (*ib == i) { /* (diag of A)*x */ 467 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 468 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 469 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 470 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 471 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 472 v += 25; 473 jmin++; 474 } 475 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 476 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 477 for (j = jmin; j < n; j++) { 478 /* (strict lower triangular part of A)*x */ 479 cval = ib[j] * 5; 480 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 481 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 482 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 483 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 484 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 485 /* (strict upper triangular part of A)*x */ 486 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 487 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 488 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 489 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 490 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 491 v += 25; 492 } 493 xb += 5; 494 ai++; 495 } 496 497 PetscCall(VecRestoreArrayRead(xx, &x)); 498 PetscCall(VecRestoreArray(zz, &z)); 499 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 500 PetscFunctionReturn(PETSC_SUCCESS); 501 } 502 503 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz) 504 { 505 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 506 PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0; 507 const PetscScalar *x, *xb; 508 const MatScalar *v; 509 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 510 const PetscInt *aj = a->j, *ai = a->i, *ib; 511 PetscInt nonzerorow = 0; 512 513 PetscFunctionBegin; 514 PetscCall(VecSet(zz, zero)); 515 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 516 PetscCall(VecGetArrayRead(xx, &x)); 517 PetscCall(VecGetArray(zz, &z)); 518 519 v = a->a; 520 xb = x; 521 522 for (i = 0; i < mbs; i++) { 523 n = ai[1] - ai[0]; /* length of i_th block row of A */ 524 x1 = xb[0]; 525 x2 = xb[1]; 526 x3 = xb[2]; 527 x4 = xb[3]; 528 x5 = xb[4]; 529 x6 = xb[5]; 530 ib = aj + *ai; 531 jmin = 0; 532 nonzerorow += (n > 0); 533 if (*ib == i) { /* (diag of A)*x */ 534 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 535 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 536 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 537 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 538 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 539 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 540 v += 36; 541 jmin++; 542 } 543 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 544 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 545 for (j = jmin; j < n; j++) { 546 /* (strict lower triangular part of A)*x */ 547 cval = ib[j] * 6; 548 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 549 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 550 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 551 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 552 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 553 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 554 /* (strict upper triangular part of A)*x */ 555 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 556 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 557 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 558 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 559 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 560 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 561 v += 36; 562 } 563 xb += 6; 564 ai++; 565 } 566 567 PetscCall(VecRestoreArrayRead(xx, &x)); 568 PetscCall(VecRestoreArray(zz, &z)); 569 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 570 PetscFunctionReturn(PETSC_SUCCESS); 571 } 572 573 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz) 574 { 575 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 576 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0; 577 const PetscScalar *x, *xb; 578 const MatScalar *v; 579 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 580 const PetscInt *aj = a->j, *ai = a->i, *ib; 581 PetscInt nonzerorow = 0; 582 583 PetscFunctionBegin; 584 PetscCall(VecSet(zz, zero)); 585 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 586 PetscCall(VecGetArrayRead(xx, &x)); 587 PetscCall(VecGetArray(zz, &z)); 588 589 v = a->a; 590 xb = x; 591 592 for (i = 0; i < mbs; i++) { 593 n = ai[1] - ai[0]; /* length of i_th block row of A */ 594 x1 = xb[0]; 595 x2 = xb[1]; 596 x3 = xb[2]; 597 x4 = xb[3]; 598 x5 = xb[4]; 599 x6 = xb[5]; 600 x7 = xb[6]; 601 ib = aj + *ai; 602 jmin = 0; 603 nonzerorow += (n > 0); 604 if (*ib == i) { /* (diag of A)*x */ 605 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 606 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 607 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 608 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 609 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 610 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 611 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 612 v += 49; 613 jmin++; 614 } 615 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 616 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 617 for (j = jmin; j < n; j++) { 618 /* (strict lower triangular part of A)*x */ 619 cval = ib[j] * 7; 620 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 621 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 622 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 623 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 624 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 625 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 626 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 627 /* (strict upper triangular part of A)*x */ 628 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 629 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 630 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 631 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 632 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 633 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 634 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 635 v += 49; 636 } 637 xb += 7; 638 ai++; 639 } 640 PetscCall(VecRestoreArrayRead(xx, &x)); 641 PetscCall(VecRestoreArray(zz, &z)); 642 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow)); 643 PetscFunctionReturn(PETSC_SUCCESS); 644 } 645 646 /* 647 This will not work with MatScalar == float because it calls the BLAS 648 */ 649 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz) 650 { 651 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 652 PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0; 653 const PetscScalar *x, *x_ptr, *xb; 654 const MatScalar *v; 655 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 656 const PetscInt *idx, *aj, *ii; 657 PetscInt nonzerorow = 0; 658 659 PetscFunctionBegin; 660 PetscCall(VecSet(zz, zero)); 661 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 662 PetscCall(VecGetArrayRead(xx, &x)); 663 PetscCall(VecGetArray(zz, &z)); 664 665 x_ptr = x; 666 z_ptr = z; 667 668 aj = a->j; 669 v = a->a; 670 ii = a->i; 671 672 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work)); 673 work = a->mult_work; 674 675 for (i = 0; i < mbs; i++) { 676 n = ii[1] - ii[0]; 677 ncols = n * bs; 678 workt = work; 679 idx = aj + ii[0]; 680 nonzerorow += (n > 0); 681 682 /* upper triangular part */ 683 for (j = 0; j < n; j++) { 684 xb = x_ptr + bs * (*idx++); 685 for (k = 0; k < bs; k++) workt[k] = xb[k]; 686 workt += bs; 687 } 688 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 689 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 690 691 /* strict lower triangular part */ 692 idx = aj + ii[0]; 693 if (n && *idx == i) { 694 ncols -= bs; 695 v += bs2; 696 idx++; 697 n--; 698 } 699 700 if (ncols > 0) { 701 workt = work; 702 PetscCall(PetscArrayzero(workt, ncols)); 703 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 704 for (j = 0; j < n; j++) { 705 zb = z_ptr + bs * (*idx++); 706 for (k = 0; k < bs; k++) zb[k] += workt[k]; 707 workt += bs; 708 } 709 } 710 x += bs; 711 v += n * bs2; 712 z += bs; 713 ii++; 714 } 715 716 PetscCall(VecRestoreArrayRead(xx, &x)); 717 PetscCall(VecRestoreArray(zz, &z)); 718 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow)); 719 PetscFunctionReturn(PETSC_SUCCESS); 720 } 721 722 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) 723 { 724 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 725 PetscScalar *z, x1; 726 const PetscScalar *x, *xb; 727 const MatScalar *v; 728 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 729 const PetscInt *aj = a->j, *ai = a->i, *ib; 730 PetscInt nonzerorow = 0; 731 #if defined(PETSC_USE_COMPLEX) 732 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 733 #else 734 const int aconj = 0; 735 #endif 736 737 PetscFunctionBegin; 738 PetscCall(VecCopy(yy, zz)); 739 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 740 PetscCall(VecGetArrayRead(xx, &x)); 741 PetscCall(VecGetArray(zz, &z)); 742 v = a->a; 743 xb = x; 744 745 for (i = 0; i < mbs; i++) { 746 n = ai[1] - ai[0]; /* length of i_th row of A */ 747 x1 = xb[0]; 748 ib = aj + *ai; 749 jmin = 0; 750 nonzerorow += (n > 0); 751 if (n && *ib == i) { /* (diag of A)*x */ 752 z[i] += *v++ * x[*ib++]; 753 jmin++; 754 } 755 if (aconj) { 756 for (j = jmin; j < n; j++) { 757 cval = *ib; 758 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 759 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 760 } 761 } else { 762 for (j = jmin; j < n; j++) { 763 cval = *ib; 764 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 765 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 766 } 767 } 768 xb++; 769 ai++; 770 } 771 772 PetscCall(VecRestoreArrayRead(xx, &x)); 773 PetscCall(VecRestoreArray(zz, &z)); 774 775 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 776 PetscFunctionReturn(PETSC_SUCCESS); 777 } 778 779 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) 780 { 781 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 782 PetscScalar *z, x1, x2; 783 const PetscScalar *x, *xb; 784 const MatScalar *v; 785 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 786 const PetscInt *aj = a->j, *ai = a->i, *ib; 787 PetscInt nonzerorow = 0; 788 789 PetscFunctionBegin; 790 PetscCall(VecCopy(yy, zz)); 791 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 792 PetscCall(VecGetArrayRead(xx, &x)); 793 PetscCall(VecGetArray(zz, &z)); 794 795 v = a->a; 796 xb = x; 797 798 for (i = 0; i < mbs; i++) { 799 n = ai[1] - ai[0]; /* length of i_th block row of A */ 800 x1 = xb[0]; 801 x2 = xb[1]; 802 ib = aj + *ai; 803 jmin = 0; 804 nonzerorow += (n > 0); 805 if (n && *ib == i) { /* (diag of A)*x */ 806 z[2 * i] += v[0] * x1 + v[2] * x2; 807 z[2 * i + 1] += v[2] * x1 + v[3] * x2; 808 v += 4; 809 jmin++; 810 } 811 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 812 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 813 for (j = jmin; j < n; j++) { 814 /* (strict lower triangular part of A)*x */ 815 cval = ib[j] * 2; 816 z[cval] += v[0] * x1 + v[1] * x2; 817 z[cval + 1] += v[2] * x1 + v[3] * x2; 818 /* (strict upper triangular part of A)*x */ 819 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1]; 820 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1]; 821 v += 4; 822 } 823 xb += 2; 824 ai++; 825 } 826 PetscCall(VecRestoreArrayRead(xx, &x)); 827 PetscCall(VecRestoreArray(zz, &z)); 828 829 PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow))); 830 PetscFunctionReturn(PETSC_SUCCESS); 831 } 832 833 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) 834 { 835 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 836 PetscScalar *z, x1, x2, x3; 837 const PetscScalar *x, *xb; 838 const MatScalar *v; 839 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 840 const PetscInt *aj = a->j, *ai = a->i, *ib; 841 PetscInt nonzerorow = 0; 842 843 PetscFunctionBegin; 844 PetscCall(VecCopy(yy, zz)); 845 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 846 PetscCall(VecGetArrayRead(xx, &x)); 847 PetscCall(VecGetArray(zz, &z)); 848 849 v = a->a; 850 xb = x; 851 852 for (i = 0; i < mbs; i++) { 853 n = ai[1] - ai[0]; /* length of i_th block row of A */ 854 x1 = xb[0]; 855 x2 = xb[1]; 856 x3 = xb[2]; 857 ib = aj + *ai; 858 jmin = 0; 859 nonzerorow += (n > 0); 860 if (n && *ib == i) { /* (diag of A)*x */ 861 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3; 862 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3; 863 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 864 v += 9; 865 jmin++; 866 } 867 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 868 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 869 for (j = jmin; j < n; j++) { 870 /* (strict lower triangular part of A)*x */ 871 cval = ib[j] * 3; 872 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3; 873 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3; 874 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3; 875 /* (strict upper triangular part of A)*x */ 876 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2]; 877 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2]; 878 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2]; 879 v += 9; 880 } 881 xb += 3; 882 ai++; 883 } 884 885 PetscCall(VecRestoreArrayRead(xx, &x)); 886 PetscCall(VecRestoreArray(zz, &z)); 887 888 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow))); 889 PetscFunctionReturn(PETSC_SUCCESS); 890 } 891 892 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) 893 { 894 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 895 PetscScalar *z, x1, x2, x3, x4; 896 const PetscScalar *x, *xb; 897 const MatScalar *v; 898 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 899 const PetscInt *aj = a->j, *ai = a->i, *ib; 900 PetscInt nonzerorow = 0; 901 902 PetscFunctionBegin; 903 PetscCall(VecCopy(yy, zz)); 904 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 905 PetscCall(VecGetArrayRead(xx, &x)); 906 PetscCall(VecGetArray(zz, &z)); 907 908 v = a->a; 909 xb = x; 910 911 for (i = 0; i < mbs; i++) { 912 n = ai[1] - ai[0]; /* length of i_th block row of A */ 913 x1 = xb[0]; 914 x2 = xb[1]; 915 x3 = xb[2]; 916 x4 = xb[3]; 917 ib = aj + *ai; 918 jmin = 0; 919 nonzerorow += (n > 0); 920 if (n && *ib == i) { /* (diag of A)*x */ 921 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 922 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 923 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4; 924 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 925 v += 16; 926 jmin++; 927 } 928 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 929 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 930 for (j = jmin; j < n; j++) { 931 /* (strict lower triangular part of A)*x */ 932 cval = ib[j] * 4; 933 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 934 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 935 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 936 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 937 /* (strict upper triangular part of A)*x */ 938 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3]; 939 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3]; 940 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3]; 941 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3]; 942 v += 16; 943 } 944 xb += 4; 945 ai++; 946 } 947 948 PetscCall(VecRestoreArrayRead(xx, &x)); 949 PetscCall(VecRestoreArray(zz, &z)); 950 951 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow))); 952 PetscFunctionReturn(PETSC_SUCCESS); 953 } 954 955 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) 956 { 957 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 958 PetscScalar *z, x1, x2, x3, x4, x5; 959 const PetscScalar *x, *xb; 960 const MatScalar *v; 961 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 962 const PetscInt *aj = a->j, *ai = a->i, *ib; 963 PetscInt nonzerorow = 0; 964 965 PetscFunctionBegin; 966 PetscCall(VecCopy(yy, zz)); 967 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 968 PetscCall(VecGetArrayRead(xx, &x)); 969 PetscCall(VecGetArray(zz, &z)); 970 971 v = a->a; 972 xb = x; 973 974 for (i = 0; i < mbs; i++) { 975 n = ai[1] - ai[0]; /* length of i_th block row of A */ 976 x1 = xb[0]; 977 x2 = xb[1]; 978 x3 = xb[2]; 979 x4 = xb[3]; 980 x5 = xb[4]; 981 ib = aj + *ai; 982 jmin = 0; 983 nonzerorow += (n > 0); 984 if (n && *ib == i) { /* (diag of A)*x */ 985 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 986 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 987 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 988 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5; 989 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 990 v += 25; 991 jmin++; 992 } 993 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 994 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 995 for (j = jmin; j < n; j++) { 996 /* (strict lower triangular part of A)*x */ 997 cval = ib[j] * 5; 998 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; 999 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; 1000 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; 1001 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; 1002 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; 1003 /* (strict upper triangular part of A)*x */ 1004 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4]; 1005 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4]; 1006 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4]; 1007 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4]; 1008 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4]; 1009 v += 25; 1010 } 1011 xb += 5; 1012 ai++; 1013 } 1014 1015 PetscCall(VecRestoreArrayRead(xx, &x)); 1016 PetscCall(VecRestoreArray(zz, &z)); 1017 1018 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow))); 1019 PetscFunctionReturn(PETSC_SUCCESS); 1020 } 1021 1022 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) 1023 { 1024 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1025 PetscScalar *z, x1, x2, x3, x4, x5, x6; 1026 const PetscScalar *x, *xb; 1027 const MatScalar *v; 1028 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1029 const PetscInt *aj = a->j, *ai = a->i, *ib; 1030 PetscInt nonzerorow = 0; 1031 1032 PetscFunctionBegin; 1033 PetscCall(VecCopy(yy, zz)); 1034 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1035 PetscCall(VecGetArrayRead(xx, &x)); 1036 PetscCall(VecGetArray(zz, &z)); 1037 1038 v = a->a; 1039 xb = x; 1040 1041 for (i = 0; i < mbs; i++) { 1042 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1043 x1 = xb[0]; 1044 x2 = xb[1]; 1045 x3 = xb[2]; 1046 x4 = xb[3]; 1047 x5 = xb[4]; 1048 x6 = xb[5]; 1049 ib = aj + *ai; 1050 jmin = 0; 1051 nonzerorow += (n > 0); 1052 if (n && *ib == i) { /* (diag of A)*x */ 1053 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 1054 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 1055 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 1056 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 1057 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6; 1058 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1059 v += 36; 1060 jmin++; 1061 } 1062 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1063 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1064 for (j = jmin; j < n; j++) { 1065 /* (strict lower triangular part of A)*x */ 1066 cval = ib[j] * 6; 1067 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6; 1068 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6; 1069 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6; 1070 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6; 1071 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6; 1072 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6; 1073 /* (strict upper triangular part of A)*x */ 1074 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5]; 1075 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5]; 1076 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5]; 1077 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5]; 1078 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5]; 1079 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5]; 1080 v += 36; 1081 } 1082 xb += 6; 1083 ai++; 1084 } 1085 1086 PetscCall(VecRestoreArrayRead(xx, &x)); 1087 PetscCall(VecRestoreArray(zz, &z)); 1088 1089 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow))); 1090 PetscFunctionReturn(PETSC_SUCCESS); 1091 } 1092 1093 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) 1094 { 1095 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1096 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7; 1097 const PetscScalar *x, *xb; 1098 const MatScalar *v; 1099 PetscInt mbs = a->mbs, i, n, cval, j, jmin; 1100 const PetscInt *aj = a->j, *ai = a->i, *ib; 1101 PetscInt nonzerorow = 0; 1102 1103 PetscFunctionBegin; 1104 PetscCall(VecCopy(yy, zz)); 1105 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1106 PetscCall(VecGetArrayRead(xx, &x)); 1107 PetscCall(VecGetArray(zz, &z)); 1108 1109 v = a->a; 1110 xb = x; 1111 1112 for (i = 0; i < mbs; i++) { 1113 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1114 x1 = xb[0]; 1115 x2 = xb[1]; 1116 x3 = xb[2]; 1117 x4 = xb[3]; 1118 x5 = xb[4]; 1119 x6 = xb[5]; 1120 x7 = xb[6]; 1121 ib = aj + *ai; 1122 jmin = 0; 1123 nonzerorow += (n > 0); 1124 if (n && *ib == i) { /* (diag of A)*x */ 1125 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 1126 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 1127 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 1128 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 1129 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 1130 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7; 1131 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1132 v += 49; 1133 jmin++; 1134 } 1135 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1136 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1137 for (j = jmin; j < n; j++) { 1138 /* (strict lower triangular part of A)*x */ 1139 cval = ib[j] * 7; 1140 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7; 1141 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7; 1142 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7; 1143 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7; 1144 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7; 1145 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7; 1146 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7; 1147 /* (strict upper triangular part of A)*x */ 1148 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6]; 1149 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6]; 1150 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6]; 1151 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6]; 1152 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6]; 1153 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6]; 1154 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6]; 1155 v += 49; 1156 } 1157 xb += 7; 1158 ai++; 1159 } 1160 1161 PetscCall(VecRestoreArrayRead(xx, &x)); 1162 PetscCall(VecRestoreArray(zz, &z)); 1163 1164 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow))); 1165 PetscFunctionReturn(PETSC_SUCCESS); 1166 } 1167 1168 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) 1169 { 1170 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1171 PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt; 1172 const PetscScalar *x, *x_ptr, *xb; 1173 const MatScalar *v; 1174 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k; 1175 const PetscInt *idx, *aj, *ii; 1176 PetscInt nonzerorow = 0; 1177 1178 PetscFunctionBegin; 1179 PetscCall(VecCopy(yy, zz)); 1180 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1181 PetscCall(VecGetArrayRead(xx, &x)); 1182 x_ptr = x; 1183 PetscCall(VecGetArray(zz, &z)); 1184 z_ptr = z; 1185 1186 aj = a->j; 1187 v = a->a; 1188 ii = a->i; 1189 1190 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work)); 1191 work = a->mult_work; 1192 1193 for (i = 0; i < mbs; i++) { 1194 n = ii[1] - ii[0]; 1195 ncols = n * bs; 1196 workt = work; 1197 idx = aj + ii[0]; 1198 nonzerorow += (n > 0); 1199 1200 /* upper triangular part */ 1201 for (j = 0; j < n; j++) { 1202 xb = x_ptr + bs * (*idx++); 1203 for (k = 0; k < bs; k++) workt[k] = xb[k]; 1204 workt += bs; 1205 } 1206 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1207 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); 1208 1209 /* strict lower triangular part */ 1210 idx = aj + ii[0]; 1211 if (n && *idx == i) { 1212 ncols -= bs; 1213 v += bs2; 1214 idx++; 1215 n--; 1216 } 1217 if (ncols > 0) { 1218 workt = work; 1219 PetscCall(PetscArrayzero(workt, ncols)); 1220 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt); 1221 for (j = 0; j < n; j++) { 1222 zb = z_ptr + bs * (*idx++); 1223 for (k = 0; k < bs; k++) zb[k] += workt[k]; 1224 workt += bs; 1225 } 1226 } 1227 1228 x += bs; 1229 v += n * bs2; 1230 z += bs; 1231 ii++; 1232 } 1233 1234 PetscCall(VecRestoreArrayRead(xx, &x)); 1235 PetscCall(VecRestoreArray(zz, &z)); 1236 1237 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow))); 1238 PetscFunctionReturn(PETSC_SUCCESS); 1239 } 1240 1241 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha) 1242 { 1243 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data; 1244 PetscScalar oalpha = alpha; 1245 PetscBLASInt one = 1, totalnz; 1246 1247 PetscFunctionBegin; 1248 PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz)); 1249 PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one)); 1250 PetscCall(PetscLogFlops(totalnz)); 1251 PetscFunctionReturn(PETSC_SUCCESS); 1252 } 1253 1254 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm) 1255 { 1256 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1257 const MatScalar *v = a->a; 1258 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1259 PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il; 1260 const PetscInt *aj = a->j, *col; 1261 1262 PetscFunctionBegin; 1263 if (!a->nz) { 1264 *norm = 0.0; 1265 PetscFunctionReturn(PETSC_SUCCESS); 1266 } 1267 if (type == NORM_FROBENIUS) { 1268 for (k = 0; k < mbs; k++) { 1269 jmin = a->i[k]; 1270 jmax = a->i[k + 1]; 1271 col = aj + jmin; 1272 if (jmax - jmin > 0 && *col == k) { /* diagonal block */ 1273 for (i = 0; i < bs2; i++) { 1274 sum_diag += PetscRealPart(PetscConj(*v) * (*v)); 1275 v++; 1276 } 1277 jmin++; 1278 } 1279 for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */ 1280 for (i = 0; i < bs2; i++) { 1281 sum_off += PetscRealPart(PetscConj(*v) * (*v)); 1282 v++; 1283 } 1284 } 1285 } 1286 *norm = PetscSqrtReal(sum_diag + 2 * sum_off); 1287 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz)); 1288 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1289 PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl)); 1290 for (i = 0; i < mbs; i++) jl[i] = mbs; 1291 il[0] = 0; 1292 1293 *norm = 0.0; 1294 for (k = 0; k < mbs; k++) { /* k_th block row */ 1295 for (j = 0; j < bs; j++) sum[j] = 0.0; 1296 /*-- col sum --*/ 1297 i = jl[k]; /* first |A(i,k)| to be added */ 1298 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1299 at step k */ 1300 while (i < mbs) { 1301 nexti = jl[i]; /* next block row to be added */ 1302 ik = il[i]; /* block index of A(i,k) in the array a */ 1303 for (j = 0; j < bs; j++) { 1304 v = a->a + ik * bs2 + j * bs; 1305 for (k1 = 0; k1 < bs; k1++) { 1306 sum[j] += PetscAbsScalar(*v); 1307 v++; 1308 } 1309 } 1310 /* update il, jl */ 1311 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1312 jmax = a->i[i + 1]; 1313 if (jmin < jmax) { 1314 il[i] = jmin; 1315 j = a->j[jmin]; 1316 jl[i] = jl[j]; 1317 jl[j] = i; 1318 } 1319 i = nexti; 1320 } 1321 /*-- row sum --*/ 1322 jmin = a->i[k]; 1323 jmax = a->i[k + 1]; 1324 for (i = jmin; i < jmax; i++) { 1325 for (j = 0; j < bs; j++) { 1326 v = a->a + i * bs2 + j; 1327 for (k1 = 0; k1 < bs; k1++) { 1328 sum[j] += PetscAbsScalar(*v); 1329 v += bs; 1330 } 1331 } 1332 } 1333 /* add k_th block row to il, jl */ 1334 col = aj + jmin; 1335 if (jmax - jmin > 0 && *col == k) jmin++; 1336 if (jmin < jmax) { 1337 il[k] = jmin; 1338 j = a->j[jmin]; 1339 jl[k] = jl[j]; 1340 jl[j] = k; 1341 } 1342 for (j = 0; j < bs; j++) { 1343 if (sum[j] > *norm) *norm = sum[j]; 1344 } 1345 } 1346 PetscCall(PetscFree3(sum, il, jl)); 1347 PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0))); 1348 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); 1349 PetscFunctionReturn(PETSC_SUCCESS); 1350 } 1351 1352 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg) 1353 { 1354 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data; 1355 1356 PetscFunctionBegin; 1357 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1358 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) { 1359 *flg = PETSC_FALSE; 1360 PetscFunctionReturn(PETSC_SUCCESS); 1361 } 1362 1363 /* if the a->i are the same */ 1364 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg)); 1365 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1366 1367 /* if a->j are the same */ 1368 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg)); 1369 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 1370 1371 /* if a->a are the same */ 1372 PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg)); 1373 PetscFunctionReturn(PETSC_SUCCESS); 1374 } 1375 1376 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v) 1377 { 1378 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1379 PetscInt i, j, k, row, bs, ambs, bs2; 1380 const PetscInt *ai, *aj; 1381 PetscScalar *x, zero = 0.0; 1382 const MatScalar *aa, *aa_j; 1383 1384 PetscFunctionBegin; 1385 bs = A->rmap->bs; 1386 PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1"); 1387 1388 aa = a->a; 1389 ambs = a->mbs; 1390 1391 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1392 PetscInt *diag = a->diag; 1393 aa = a->a; 1394 ambs = a->mbs; 1395 PetscCall(VecGetArray(v, &x)); 1396 for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]]; 1397 PetscCall(VecRestoreArray(v, &x)); 1398 PetscFunctionReturn(PETSC_SUCCESS); 1399 } 1400 1401 ai = a->i; 1402 aj = a->j; 1403 bs2 = a->bs2; 1404 PetscCall(VecSet(v, zero)); 1405 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS); 1406 PetscCall(VecGetArray(v, &x)); 1407 for (i = 0; i < ambs; i++) { 1408 j = ai[i]; 1409 if (aj[j] == i) { /* if this is a diagonal element */ 1410 row = i * bs; 1411 aa_j = aa + j * bs2; 1412 for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k]; 1413 } 1414 } 1415 PetscCall(VecRestoreArray(v, &x)); 1416 PetscFunctionReturn(PETSC_SUCCESS); 1417 } 1418 1419 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr) 1420 { 1421 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1422 PetscScalar x; 1423 const PetscScalar *l, *li, *ri; 1424 MatScalar *aa, *v; 1425 PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2; 1426 const PetscInt *ai, *aj; 1427 PetscBool flg; 1428 1429 PetscFunctionBegin; 1430 if (ll != rr) { 1431 PetscCall(VecEqual(ll, rr, &flg)); 1432 PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same"); 1433 } 1434 if (!ll) PetscFunctionReturn(PETSC_SUCCESS); 1435 ai = a->i; 1436 aj = a->j; 1437 aa = a->a; 1438 m = A->rmap->N; 1439 bs = A->rmap->bs; 1440 mbs = a->mbs; 1441 bs2 = a->bs2; 1442 1443 PetscCall(VecGetArrayRead(ll, &l)); 1444 PetscCall(VecGetLocalSize(ll, &lm)); 1445 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1446 for (i = 0; i < mbs; i++) { /* for each block row */ 1447 M = ai[i + 1] - ai[i]; 1448 li = l + i * bs; 1449 v = aa + bs2 * ai[i]; 1450 for (j = 0; j < M; j++) { /* for each block */ 1451 ri = l + bs * aj[ai[i] + j]; 1452 for (k = 0; k < bs; k++) { 1453 x = ri[k]; 1454 for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x; 1455 } 1456 } 1457 } 1458 PetscCall(VecRestoreArrayRead(ll, &l)); 1459 PetscCall(PetscLogFlops(2.0 * a->nz)); 1460 PetscFunctionReturn(PETSC_SUCCESS); 1461 } 1462 1463 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info) 1464 { 1465 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1466 1467 PetscFunctionBegin; 1468 info->block_size = a->bs2; 1469 info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */ 1470 info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */ 1471 info->nz_unneeded = info->nz_allocated - info->nz_used; 1472 info->assemblies = A->num_ass; 1473 info->mallocs = A->info.mallocs; 1474 info->memory = 0; /* REVIEW ME */ 1475 if (A->factortype) { 1476 info->fill_ratio_given = A->info.fill_ratio_given; 1477 info->fill_ratio_needed = A->info.fill_ratio_needed; 1478 info->factor_mallocs = A->info.factor_mallocs; 1479 } else { 1480 info->fill_ratio_given = 0; 1481 info->fill_ratio_needed = 0; 1482 info->factor_mallocs = 0; 1483 } 1484 PetscFunctionReturn(PETSC_SUCCESS); 1485 } 1486 1487 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1488 { 1489 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1490 1491 PetscFunctionBegin; 1492 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); 1493 PetscFunctionReturn(PETSC_SUCCESS); 1494 } 1495 1496 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[]) 1497 { 1498 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1499 PetscInt i, j, n, row, col, bs, mbs; 1500 const PetscInt *ai, *aj; 1501 PetscReal atmp; 1502 const MatScalar *aa; 1503 PetscScalar *x; 1504 PetscInt ncols, brow, bcol, krow, kcol; 1505 1506 PetscFunctionBegin; 1507 PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov"); 1508 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1509 bs = A->rmap->bs; 1510 aa = a->a; 1511 ai = a->i; 1512 aj = a->j; 1513 mbs = a->mbs; 1514 1515 PetscCall(VecSet(v, 0.0)); 1516 PetscCall(VecGetArray(v, &x)); 1517 PetscCall(VecGetLocalSize(v, &n)); 1518 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1519 for (i = 0; i < mbs; i++) { 1520 ncols = ai[1] - ai[0]; 1521 ai++; 1522 brow = bs * i; 1523 for (j = 0; j < ncols; j++) { 1524 bcol = bs * (*aj); 1525 for (kcol = 0; kcol < bs; kcol++) { 1526 col = bcol + kcol; /* col index */ 1527 for (krow = 0; krow < bs; krow++) { 1528 atmp = PetscAbsScalar(*aa); 1529 aa++; 1530 row = brow + krow; /* row index */ 1531 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1532 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1533 } 1534 } 1535 aj++; 1536 } 1537 } 1538 PetscCall(VecRestoreArray(v, &x)); 1539 PetscFunctionReturn(PETSC_SUCCESS); 1540 } 1541 1542 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1543 { 1544 PetscFunctionBegin; 1545 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1546 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1547 PetscFunctionReturn(PETSC_SUCCESS); 1548 } 1549 1550 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1551 { 1552 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1553 PetscScalar *z = c; 1554 const PetscScalar *xb; 1555 PetscScalar x1; 1556 const MatScalar *v = a->a, *vv; 1557 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1558 #if defined(PETSC_USE_COMPLEX) 1559 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 1560 #else 1561 const int aconj = 0; 1562 #endif 1563 1564 PetscFunctionBegin; 1565 for (i = 0; i < mbs; i++) { 1566 n = ii[1] - ii[0]; 1567 ii++; 1568 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1569 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1570 jj = idx; 1571 vv = v; 1572 for (k = 0; k < cn; k++) { 1573 idx = jj; 1574 v = vv; 1575 for (j = 0; j < n; j++) { 1576 xb = b + (*idx); 1577 x1 = xb[0 + k * bm]; 1578 z[0 + k * cm] += v[0] * x1; 1579 if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm]; 1580 v += 1; 1581 ++idx; 1582 } 1583 } 1584 z += 1; 1585 } 1586 PetscFunctionReturn(PETSC_SUCCESS); 1587 } 1588 1589 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1590 { 1591 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1592 PetscScalar *z = c; 1593 const PetscScalar *xb; 1594 PetscScalar x1, x2; 1595 const MatScalar *v = a->a, *vv; 1596 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1597 1598 PetscFunctionBegin; 1599 for (i = 0; i < mbs; i++) { 1600 n = ii[1] - ii[0]; 1601 ii++; 1602 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1603 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1604 jj = idx; 1605 vv = v; 1606 for (k = 0; k < cn; k++) { 1607 idx = jj; 1608 v = vv; 1609 for (j = 0; j < n; j++) { 1610 xb = b + 2 * (*idx); 1611 x1 = xb[0 + k * bm]; 1612 x2 = xb[1 + k * bm]; 1613 z[0 + k * cm] += v[0] * x1 + v[2] * x2; 1614 z[1 + k * cm] += v[1] * x1 + v[3] * x2; 1615 if (*idx != i) { 1616 c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm]; 1617 c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm]; 1618 } 1619 v += 4; 1620 ++idx; 1621 } 1622 } 1623 z += 2; 1624 } 1625 PetscFunctionReturn(PETSC_SUCCESS); 1626 } 1627 1628 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1629 { 1630 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1631 PetscScalar *z = c; 1632 const PetscScalar *xb; 1633 PetscScalar x1, x2, x3; 1634 const MatScalar *v = a->a, *vv; 1635 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1636 1637 PetscFunctionBegin; 1638 for (i = 0; i < mbs; i++) { 1639 n = ii[1] - ii[0]; 1640 ii++; 1641 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1642 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1643 jj = idx; 1644 vv = v; 1645 for (k = 0; k < cn; k++) { 1646 idx = jj; 1647 v = vv; 1648 for (j = 0; j < n; j++) { 1649 xb = b + 3 * (*idx); 1650 x1 = xb[0 + k * bm]; 1651 x2 = xb[1 + k * bm]; 1652 x3 = xb[2 + k * bm]; 1653 z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3; 1654 z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3; 1655 z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3; 1656 if (*idx != i) { 1657 c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm]; 1658 c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm]; 1659 c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm]; 1660 } 1661 v += 9; 1662 ++idx; 1663 } 1664 } 1665 z += 3; 1666 } 1667 PetscFunctionReturn(PETSC_SUCCESS); 1668 } 1669 1670 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1671 { 1672 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1673 PetscScalar *z = c; 1674 const PetscScalar *xb; 1675 PetscScalar x1, x2, x3, x4; 1676 const MatScalar *v = a->a, *vv; 1677 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1678 1679 PetscFunctionBegin; 1680 for (i = 0; i < mbs; i++) { 1681 n = ii[1] - ii[0]; 1682 ii++; 1683 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1684 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1685 jj = idx; 1686 vv = v; 1687 for (k = 0; k < cn; k++) { 1688 idx = jj; 1689 v = vv; 1690 for (j = 0; j < n; j++) { 1691 xb = b + 4 * (*idx); 1692 x1 = xb[0 + k * bm]; 1693 x2 = xb[1 + k * bm]; 1694 x3 = xb[2 + k * bm]; 1695 x4 = xb[3 + k * bm]; 1696 z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 1697 z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 1698 z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 1699 z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 1700 if (*idx != i) { 1701 c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm]; 1702 c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm]; 1703 c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm]; 1704 c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm]; 1705 } 1706 v += 16; 1707 ++idx; 1708 } 1709 } 1710 z += 4; 1711 } 1712 PetscFunctionReturn(PETSC_SUCCESS); 1713 } 1714 1715 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) 1716 { 1717 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1718 PetscScalar *z = c; 1719 const PetscScalar *xb; 1720 PetscScalar x1, x2, x3, x4, x5; 1721 const MatScalar *v = a->a, *vv; 1722 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k; 1723 1724 PetscFunctionBegin; 1725 for (i = 0; i < mbs; i++) { 1726 n = ii[1] - ii[0]; 1727 ii++; 1728 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1729 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1730 jj = idx; 1731 vv = v; 1732 for (k = 0; k < cn; k++) { 1733 idx = jj; 1734 v = vv; 1735 for (j = 0; j < n; j++) { 1736 xb = b + 5 * (*idx); 1737 x1 = xb[0 + k * bm]; 1738 x2 = xb[1 + k * bm]; 1739 x3 = xb[2 + k * bm]; 1740 x4 = xb[3 + k * bm]; 1741 x5 = xb[4 + k * cm]; 1742 z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 1743 z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 1744 z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 1745 z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 1746 z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 1747 if (*idx != i) { 1748 c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm]; 1749 c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm]; 1750 c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm]; 1751 c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm]; 1752 c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm]; 1753 } 1754 v += 25; 1755 ++idx; 1756 } 1757 } 1758 z += 5; 1759 } 1760 PetscFunctionReturn(PETSC_SUCCESS); 1761 } 1762 1763 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C) 1764 { 1765 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1766 Mat_SeqDense *bd = (Mat_SeqDense *)B->data; 1767 Mat_SeqDense *cd = (Mat_SeqDense *)C->data; 1768 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; 1769 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; 1770 PetscBLASInt bbs, bcn, bbm, bcm; 1771 PetscScalar *z = NULL; 1772 PetscScalar *c, *b; 1773 const MatScalar *v; 1774 const PetscInt *idx, *ii; 1775 PetscScalar _DOne = 1.0; 1776 1777 PetscFunctionBegin; 1778 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1779 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1780 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1781 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1782 b = bd->v; 1783 PetscCall(MatZeroEntries(C)); 1784 PetscCall(MatDenseGetArray(C, &c)); 1785 switch (bs) { 1786 case 1: 1787 PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1788 break; 1789 case 2: 1790 PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1791 break; 1792 case 3: 1793 PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1794 break; 1795 case 4: 1796 PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1797 break; 1798 case 5: 1799 PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1800 break; 1801 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1802 PetscCall(PetscBLASIntCast(bs, &bbs)); 1803 PetscCall(PetscBLASIntCast(cn, &bcn)); 1804 PetscCall(PetscBLASIntCast(bm, &bbm)); 1805 PetscCall(PetscBLASIntCast(cm, &bcm)); 1806 idx = a->j; 1807 v = a->a; 1808 mbs = a->mbs; 1809 ii = a->i; 1810 z = c; 1811 for (i = 0; i < mbs; i++) { 1812 n = ii[1] - ii[0]; 1813 ii++; 1814 for (j = 0; j < n; j++) { 1815 if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm)); 1816 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); 1817 v += bs2; 1818 } 1819 z += bs; 1820 } 1821 } 1822 PetscCall(MatDenseRestoreArray(C, &c)); 1823 PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn)); 1824 PetscFunctionReturn(PETSC_SUCCESS); 1825 } 1826