1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <petsc/private/kernels/blockinvert.h> 9 10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 13 IS isrow = a->row; 14 PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 15 const PetscInt *r; 16 PetscInt nz, *vj, k, idx, k1; 17 PetscInt bs = A->rmap->bs, bs2 = a->bs2; 18 const MatScalar *aa = a->a, *v, *diag; 19 PetscScalar *x, *xk, *xj, *xk_tmp, *t; 20 const PetscScalar *b; 21 22 PetscFunctionBegin; 23 PetscCall(VecGetArrayRead(bb, &b)); 24 PetscCall(VecGetArray(xx, &x)); 25 t = a->solve_work; 26 PetscCall(ISGetIndices(isrow, &r)); 27 PetscCall(PetscMalloc1(bs, &xk_tmp)); 28 29 /* solve U^T * D * y = b by forward substitution */ 30 xk = t; 31 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 32 idx = bs * r[k]; 33 for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1]; 34 } 35 for (k = 0; k < mbs; k++) { 36 v = aa + bs2 * ai[k]; 37 xk = t + k * bs; /* Dk*xk = k-th block of x */ 38 PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */ 39 nz = ai[k + 1] - ai[k]; 40 vj = aj + ai[k]; 41 xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */ 42 while (nz--) { 43 /* x(:) += U(k,:)^T*(Dk*xk) */ 44 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */ 45 vj++; 46 xj = t + (*vj) * bs; 47 v += bs2; 48 } 49 /* xk = inv(Dk)*(Dk*xk) */ 50 diag = aa + k * bs2; /* ptr to inv(Dk) */ 51 PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */ 52 } 53 54 /* solve U*x = y by back substitution */ 55 for (k = mbs - 1; k >= 0; k--) { 56 v = aa + bs2 * ai[k]; 57 xk = t + k * bs; /* xk */ 58 nz = ai[k + 1] - ai[k]; 59 vj = aj + ai[k]; 60 xj = t + (*vj) * bs; 61 while (nz--) { 62 /* xk += U(k,:)*x(:) */ 63 PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */ 64 vj++; 65 v += bs2; 66 xj = t + (*vj) * bs; 67 } 68 idx = bs * r[k]; 69 for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++; 70 } 71 72 PetscCall(PetscFree(xk_tmp)); 73 PetscCall(ISRestoreIndices(isrow, &r)); 74 PetscCall(VecRestoreArrayRead(bb, &b)); 75 PetscCall(VecRestoreArray(xx, &x)); 76 PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs)); 77 PetscFunctionReturn(PETSC_SUCCESS); 78 } 79 80 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 81 { 82 PetscFunctionBegin; 83 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet"); 84 } 85 86 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 87 { 88 PetscFunctionBegin; 89 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented"); 90 } 91 92 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x) 93 { 94 PetscInt nz, k; 95 const PetscInt *vj, bs2 = bs * bs; 96 const MatScalar *v, *diag; 97 PetscScalar *xk, *xj, *xk_tmp; 98 99 PetscFunctionBegin; 100 PetscCall(PetscMalloc1(bs, &xk_tmp)); 101 for (k = 0; k < mbs; k++) { 102 v = aa + bs2 * ai[k]; 103 xk = x + k * bs; /* Dk*xk = k-th block of x */ 104 PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */ 105 nz = ai[k + 1] - ai[k]; 106 vj = aj + ai[k]; 107 xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */ 108 while (nz--) { 109 /* x(:) += U(k,:)^T*(Dk*xk) */ 110 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */ 111 vj++; 112 xj = x + (size_t)(*vj) * bs; 113 v += bs2; 114 } 115 /* xk = inv(Dk)*(Dk*xk) */ 116 diag = aa + k * bs2; /* ptr to inv(Dk) */ 117 PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */ 118 } 119 PetscCall(PetscFree(xk_tmp)); 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x) 124 { 125 PetscInt nz, k; 126 const PetscInt *vj, bs2 = bs * bs; 127 const MatScalar *v; 128 PetscScalar *xk, *xj; 129 130 PetscFunctionBegin; 131 for (k = mbs - 1; k >= 0; k--) { 132 v = aa + bs2 * ai[k]; 133 xk = x + k * bs; /* xk */ 134 nz = ai[k + 1] - ai[k]; 135 vj = aj + ai[k]; 136 xj = x + (size_t)(*vj) * bs; 137 while (nz--) { 138 /* xk += U(k,:)*x(:) */ 139 PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */ 140 vj++; 141 v += bs2; 142 xj = x + (size_t)(*vj) * bs; 143 } 144 } 145 PetscFunctionReturn(PETSC_SUCCESS); 146 } 147 148 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 149 { 150 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 151 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 152 PetscInt bs = A->rmap->bs; 153 const MatScalar *aa = a->a; 154 PetscScalar *x; 155 const PetscScalar *b; 156 157 PetscFunctionBegin; 158 PetscCall(VecGetArrayRead(bb, &b)); 159 PetscCall(VecGetArray(xx, &x)); 160 161 /* solve U^T * D * y = b by forward substitution */ 162 PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */ 163 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 164 165 /* solve U*x = y by back substitution */ 166 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 167 168 PetscCall(VecRestoreArrayRead(bb, &b)); 169 PetscCall(VecRestoreArray(xx, &x)); 170 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs)); 171 PetscFunctionReturn(PETSC_SUCCESS); 172 } 173 174 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 175 { 176 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 177 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 178 PetscInt bs = A->rmap->bs; 179 const MatScalar *aa = a->a; 180 const PetscScalar *b; 181 PetscScalar *x; 182 183 PetscFunctionBegin; 184 PetscCall(VecGetArrayRead(bb, &b)); 185 PetscCall(VecGetArray(xx, &x)); 186 PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */ 187 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 188 PetscCall(VecRestoreArrayRead(bb, &b)); 189 PetscCall(VecRestoreArray(xx, &x)); 190 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs)); 191 PetscFunctionReturn(PETSC_SUCCESS); 192 } 193 194 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 195 { 196 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 197 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 198 PetscInt bs = A->rmap->bs; 199 const MatScalar *aa = a->a; 200 const PetscScalar *b; 201 PetscScalar *x; 202 203 PetscFunctionBegin; 204 PetscCall(VecGetArrayRead(bb, &b)); 205 PetscCall(VecGetArray(xx, &x)); 206 PetscCall(PetscArraycpy(x, b, bs * mbs)); 207 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 208 PetscCall(VecRestoreArrayRead(bb, &b)); 209 PetscCall(VecRestoreArray(xx, &x)); 210 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 211 PetscFunctionReturn(PETSC_SUCCESS); 212 } 213 214 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx) 215 { 216 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 217 IS isrow = a->row; 218 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj; 219 PetscInt nz, k, idx; 220 const MatScalar *aa = a->a, *v, *d; 221 PetscScalar *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp; 222 const PetscScalar *b; 223 224 PetscFunctionBegin; 225 PetscCall(VecGetArrayRead(bb, &b)); 226 PetscCall(VecGetArray(xx, &x)); 227 t = a->solve_work; 228 PetscCall(ISGetIndices(isrow, &r)); 229 230 /* solve U^T * D * y = b by forward substitution */ 231 tp = t; 232 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 233 idx = 7 * r[k]; 234 tp[0] = b[idx]; 235 tp[1] = b[idx + 1]; 236 tp[2] = b[idx + 2]; 237 tp[3] = b[idx + 3]; 238 tp[4] = b[idx + 4]; 239 tp[5] = b[idx + 5]; 240 tp[6] = b[idx + 6]; 241 tp += 7; 242 } 243 244 for (k = 0; k < mbs; k++) { 245 v = aa + 49 * ai[k]; 246 vj = aj + ai[k]; 247 tp = t + k * 7; 248 x0 = tp[0]; 249 x1 = tp[1]; 250 x2 = tp[2]; 251 x3 = tp[3]; 252 x4 = tp[4]; 253 x5 = tp[5]; 254 x6 = tp[6]; 255 nz = ai[k + 1] - ai[k]; 256 tp = t + (*vj) * 7; 257 while (nz--) { 258 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6; 259 tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6; 260 tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6; 261 tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6; 262 tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6; 263 tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6; 264 tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6; 265 vj++; 266 tp = t + (*vj) * 7; 267 v += 49; 268 } 269 270 /* xk = inv(Dk)*(Dk*xk) */ 271 d = aa + k * 49; /* ptr to inv(Dk) */ 272 tp = t + k * 7; 273 tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6; 274 tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6; 275 tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6; 276 tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6; 277 tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6; 278 tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6; 279 tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6; 280 } 281 282 /* solve U*x = y by back substitution */ 283 for (k = mbs - 1; k >= 0; k--) { 284 v = aa + 49 * ai[k]; 285 vj = aj + ai[k]; 286 tp = t + k * 7; 287 x0 = tp[0]; 288 x1 = tp[1]; 289 x2 = tp[2]; 290 x3 = tp[3]; 291 x4 = tp[4]; 292 x5 = tp[5]; 293 x6 = tp[6]; /* xk */ 294 nz = ai[k + 1] - ai[k]; 295 296 tp = t + (*vj) * 7; 297 while (nz--) { 298 /* xk += U(k,:)*x(:) */ 299 x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6]; 300 x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6]; 301 x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6]; 302 x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6]; 303 x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6]; 304 x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6]; 305 x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6]; 306 vj++; 307 tp = t + (*vj) * 7; 308 v += 49; 309 } 310 tp = t + k * 7; 311 tp[0] = x0; 312 tp[1] = x1; 313 tp[2] = x2; 314 tp[3] = x3; 315 tp[4] = x4; 316 tp[5] = x5; 317 tp[6] = x6; 318 idx = 7 * r[k]; 319 x[idx] = x0; 320 x[idx + 1] = x1; 321 x[idx + 2] = x2; 322 x[idx + 3] = x3; 323 x[idx + 4] = x4; 324 x[idx + 5] = x5; 325 x[idx + 6] = x6; 326 } 327 328 PetscCall(ISRestoreIndices(isrow, &r)); 329 PetscCall(VecRestoreArrayRead(bb, &b)); 330 PetscCall(VecRestoreArray(xx, &x)); 331 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 332 PetscFunctionReturn(PETSC_SUCCESS); 333 } 334 335 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 336 { 337 const MatScalar *v, *d; 338 PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6; 339 PetscInt nz, k; 340 const PetscInt *vj; 341 342 PetscFunctionBegin; 343 for (k = 0; k < mbs; k++) { 344 v = aa + 49 * ai[k]; 345 xp = x + k * 7; 346 x0 = xp[0]; 347 x1 = xp[1]; 348 x2 = xp[2]; 349 x3 = xp[3]; 350 x4 = xp[4]; 351 x5 = xp[5]; 352 x6 = xp[6]; /* Dk*xk = k-th block of x */ 353 nz = ai[k + 1] - ai[k]; 354 vj = aj + ai[k]; 355 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 356 PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 357 while (nz--) { 358 xp = x + (*vj) * 7; 359 /* x(:) += U(k,:)^T*(Dk*xk) */ 360 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6; 361 xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6; 362 xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6; 363 xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6; 364 xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6; 365 xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6; 366 xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6; 367 vj++; 368 v += 49; 369 } 370 /* xk = inv(Dk)*(Dk*xk) */ 371 d = aa + k * 49; /* ptr to inv(Dk) */ 372 xp = x + k * 7; 373 xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6; 374 xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6; 375 xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6; 376 xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6; 377 xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6; 378 xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6; 379 xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6; 380 } 381 PetscFunctionReturn(PETSC_SUCCESS); 382 } 383 384 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 385 { 386 const MatScalar *v; 387 PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6; 388 PetscInt nz, k; 389 const PetscInt *vj; 390 391 PetscFunctionBegin; 392 for (k = mbs - 1; k >= 0; k--) { 393 v = aa + 49 * ai[k]; 394 xp = x + k * 7; 395 x0 = xp[0]; 396 x1 = xp[1]; 397 x2 = xp[2]; 398 x3 = xp[3]; 399 x4 = xp[4]; 400 x5 = xp[5]; 401 x6 = xp[6]; /* xk */ 402 nz = ai[k + 1] - ai[k]; 403 vj = aj + ai[k]; 404 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 405 PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 406 while (nz--) { 407 xp = x + (*vj) * 7; 408 /* xk += U(k,:)*x(:) */ 409 x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6]; 410 x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6]; 411 x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6]; 412 x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6]; 413 x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6]; 414 x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6]; 415 x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6]; 416 vj++; 417 v += 49; 418 } 419 xp = x + k * 7; 420 xp[0] = x0; 421 xp[1] = x1; 422 xp[2] = x2; 423 xp[3] = x3; 424 xp[4] = x4; 425 xp[5] = x5; 426 xp[6] = x6; 427 } 428 PetscFunctionReturn(PETSC_SUCCESS); 429 } 430 431 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 432 { 433 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 434 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 435 const MatScalar *aa = a->a; 436 PetscScalar *x; 437 const PetscScalar *b; 438 439 PetscFunctionBegin; 440 PetscCall(VecGetArrayRead(bb, &b)); 441 PetscCall(VecGetArray(xx, &x)); 442 443 /* solve U^T * D * y = b by forward substitution */ 444 PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */ 445 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 446 447 /* solve U*x = y by back substitution */ 448 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 449 450 PetscCall(VecRestoreArrayRead(bb, &b)); 451 PetscCall(VecRestoreArray(xx, &x)); 452 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 453 PetscFunctionReturn(PETSC_SUCCESS); 454 } 455 456 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 457 { 458 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 459 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 460 const MatScalar *aa = a->a; 461 PetscScalar *x; 462 const PetscScalar *b; 463 464 PetscFunctionBegin; 465 PetscCall(VecGetArrayRead(bb, &b)); 466 PetscCall(VecGetArray(xx, &x)); 467 PetscCall(PetscArraycpy(x, b, 7 * mbs)); 468 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 469 PetscCall(VecRestoreArrayRead(bb, &b)); 470 PetscCall(VecRestoreArray(xx, &x)); 471 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 472 PetscFunctionReturn(PETSC_SUCCESS); 473 } 474 475 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 476 { 477 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 478 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 479 const MatScalar *aa = a->a; 480 PetscScalar *x; 481 const PetscScalar *b; 482 483 PetscFunctionBegin; 484 PetscCall(VecGetArrayRead(bb, &b)); 485 PetscCall(VecGetArray(xx, &x)); 486 PetscCall(PetscArraycpy(x, b, 7 * mbs)); 487 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 488 PetscCall(VecRestoreArrayRead(bb, &b)); 489 PetscCall(VecRestoreArray(xx, &x)); 490 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 491 PetscFunctionReturn(PETSC_SUCCESS); 492 } 493 494 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx) 495 { 496 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 497 IS isrow = a->row; 498 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj; 499 PetscInt nz, k, idx; 500 const MatScalar *aa = a->a, *v, *d; 501 PetscScalar *x, x0, x1, x2, x3, x4, x5, *t, *tp; 502 const PetscScalar *b; 503 504 PetscFunctionBegin; 505 PetscCall(VecGetArrayRead(bb, &b)); 506 PetscCall(VecGetArray(xx, &x)); 507 t = a->solve_work; 508 PetscCall(ISGetIndices(isrow, &r)); 509 510 /* solve U^T * D * y = b by forward substitution */ 511 tp = t; 512 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 513 idx = 6 * r[k]; 514 tp[0] = b[idx]; 515 tp[1] = b[idx + 1]; 516 tp[2] = b[idx + 2]; 517 tp[3] = b[idx + 3]; 518 tp[4] = b[idx + 4]; 519 tp[5] = b[idx + 5]; 520 tp += 6; 521 } 522 523 for (k = 0; k < mbs; k++) { 524 v = aa + 36 * ai[k]; 525 vj = aj + ai[k]; 526 tp = t + k * 6; 527 x0 = tp[0]; 528 x1 = tp[1]; 529 x2 = tp[2]; 530 x3 = tp[3]; 531 x4 = tp[4]; 532 x5 = tp[5]; 533 nz = ai[k + 1] - ai[k]; 534 tp = t + (*vj) * 6; 535 while (nz--) { 536 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5; 537 tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5; 538 tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5; 539 tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5; 540 tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5; 541 tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5; 542 vj++; 543 tp = t + (*vj) * 6; 544 v += 36; 545 } 546 547 /* xk = inv(Dk)*(Dk*xk) */ 548 d = aa + k * 36; /* ptr to inv(Dk) */ 549 tp = t + k * 6; 550 tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5; 551 tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5; 552 tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5; 553 tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5; 554 tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5; 555 tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5; 556 } 557 558 /* solve U*x = y by back substitution */ 559 for (k = mbs - 1; k >= 0; k--) { 560 v = aa + 36 * ai[k]; 561 vj = aj + ai[k]; 562 tp = t + k * 6; 563 x0 = tp[0]; 564 x1 = tp[1]; 565 x2 = tp[2]; 566 x3 = tp[3]; 567 x4 = tp[4]; 568 x5 = tp[5]; /* xk */ 569 nz = ai[k + 1] - ai[k]; 570 571 tp = t + (*vj) * 6; 572 while (nz--) { 573 /* xk += U(k,:)*x(:) */ 574 x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5]; 575 x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5]; 576 x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5]; 577 x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5]; 578 x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5]; 579 x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5]; 580 vj++; 581 tp = t + (*vj) * 6; 582 v += 36; 583 } 584 tp = t + k * 6; 585 tp[0] = x0; 586 tp[1] = x1; 587 tp[2] = x2; 588 tp[3] = x3; 589 tp[4] = x4; 590 tp[5] = x5; 591 idx = 6 * r[k]; 592 x[idx] = x0; 593 x[idx + 1] = x1; 594 x[idx + 2] = x2; 595 x[idx + 3] = x3; 596 x[idx + 4] = x4; 597 x[idx + 5] = x5; 598 } 599 600 PetscCall(ISRestoreIndices(isrow, &r)); 601 PetscCall(VecRestoreArrayRead(bb, &b)); 602 PetscCall(VecRestoreArray(xx, &x)); 603 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 604 PetscFunctionReturn(PETSC_SUCCESS); 605 } 606 607 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 608 { 609 const MatScalar *v, *d; 610 PetscScalar *xp, x0, x1, x2, x3, x4, x5; 611 PetscInt nz, k; 612 const PetscInt *vj; 613 614 PetscFunctionBegin; 615 for (k = 0; k < mbs; k++) { 616 v = aa + 36 * ai[k]; 617 xp = x + k * 6; 618 x0 = xp[0]; 619 x1 = xp[1]; 620 x2 = xp[2]; 621 x3 = xp[3]; 622 x4 = xp[4]; 623 x5 = xp[5]; /* Dk*xk = k-th block of x */ 624 nz = ai[k + 1] - ai[k]; 625 vj = aj + ai[k]; 626 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 627 PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 628 while (nz--) { 629 xp = x + (*vj) * 6; 630 /* x(:) += U(k,:)^T*(Dk*xk) */ 631 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5; 632 xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5; 633 xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5; 634 xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5; 635 xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5; 636 xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5; 637 vj++; 638 v += 36; 639 } 640 /* xk = inv(Dk)*(Dk*xk) */ 641 d = aa + k * 36; /* ptr to inv(Dk) */ 642 xp = x + k * 6; 643 xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5; 644 xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5; 645 xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5; 646 xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5; 647 xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5; 648 xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5; 649 } 650 PetscFunctionReturn(PETSC_SUCCESS); 651 } 652 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 653 { 654 const MatScalar *v; 655 PetscScalar *xp, x0, x1, x2, x3, x4, x5; 656 PetscInt nz, k; 657 const PetscInt *vj; 658 659 PetscFunctionBegin; 660 for (k = mbs - 1; k >= 0; k--) { 661 v = aa + 36 * ai[k]; 662 xp = x + k * 6; 663 x0 = xp[0]; 664 x1 = xp[1]; 665 x2 = xp[2]; 666 x3 = xp[3]; 667 x4 = xp[4]; 668 x5 = xp[5]; /* xk */ 669 nz = ai[k + 1] - ai[k]; 670 vj = aj + ai[k]; 671 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 672 PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 673 while (nz--) { 674 xp = x + (*vj) * 6; 675 /* xk += U(k,:)*x(:) */ 676 x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5]; 677 x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5]; 678 x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5]; 679 x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5]; 680 x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5]; 681 x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5]; 682 vj++; 683 v += 36; 684 } 685 xp = x + k * 6; 686 xp[0] = x0; 687 xp[1] = x1; 688 xp[2] = x2; 689 xp[3] = x3; 690 xp[4] = x4; 691 xp[5] = x5; 692 } 693 PetscFunctionReturn(PETSC_SUCCESS); 694 } 695 696 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 697 { 698 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 699 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 700 const MatScalar *aa = a->a; 701 PetscScalar *x; 702 const PetscScalar *b; 703 704 PetscFunctionBegin; 705 PetscCall(VecGetArrayRead(bb, &b)); 706 PetscCall(VecGetArray(xx, &x)); 707 708 /* solve U^T * D * y = b by forward substitution */ 709 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 710 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 711 712 /* solve U*x = y by back substitution */ 713 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 714 715 PetscCall(VecRestoreArrayRead(bb, &b)); 716 PetscCall(VecRestoreArray(xx, &x)); 717 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 718 PetscFunctionReturn(PETSC_SUCCESS); 719 } 720 721 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 722 { 723 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 724 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 725 const MatScalar *aa = a->a; 726 PetscScalar *x; 727 const PetscScalar *b; 728 729 PetscFunctionBegin; 730 PetscCall(VecGetArrayRead(bb, &b)); 731 PetscCall(VecGetArray(xx, &x)); 732 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 733 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 734 PetscCall(VecRestoreArrayRead(bb, &b)); 735 PetscCall(VecRestoreArray(xx, &x)); 736 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 737 PetscFunctionReturn(PETSC_SUCCESS); 738 } 739 740 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 741 { 742 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 743 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 744 const MatScalar *aa = a->a; 745 PetscScalar *x; 746 const PetscScalar *b; 747 748 PetscFunctionBegin; 749 PetscCall(VecGetArrayRead(bb, &b)); 750 PetscCall(VecGetArray(xx, &x)); 751 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 752 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 753 PetscCall(VecRestoreArrayRead(bb, &b)); 754 PetscCall(VecRestoreArray(xx, &x)); 755 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 756 PetscFunctionReturn(PETSC_SUCCESS); 757 } 758 759 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx) 760 { 761 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 762 IS isrow = a->row; 763 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 764 const PetscInt *r, *vj; 765 PetscInt nz, k, idx; 766 const MatScalar *aa = a->a, *v, *diag; 767 PetscScalar *x, x0, x1, x2, x3, x4, *t, *tp; 768 const PetscScalar *b; 769 770 PetscFunctionBegin; 771 PetscCall(VecGetArrayRead(bb, &b)); 772 PetscCall(VecGetArray(xx, &x)); 773 t = a->solve_work; 774 PetscCall(ISGetIndices(isrow, &r)); 775 776 /* solve U^T * D * y = b by forward substitution */ 777 tp = t; 778 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 779 idx = 5 * r[k]; 780 tp[0] = b[idx]; 781 tp[1] = b[idx + 1]; 782 tp[2] = b[idx + 2]; 783 tp[3] = b[idx + 3]; 784 tp[4] = b[idx + 4]; 785 tp += 5; 786 } 787 788 for (k = 0; k < mbs; k++) { 789 v = aa + 25 * ai[k]; 790 vj = aj + ai[k]; 791 tp = t + k * 5; 792 x0 = tp[0]; 793 x1 = tp[1]; 794 x2 = tp[2]; 795 x3 = tp[3]; 796 x4 = tp[4]; 797 nz = ai[k + 1] - ai[k]; 798 799 tp = t + (*vj) * 5; 800 while (nz--) { 801 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4; 802 tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4; 803 tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4; 804 tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4; 805 tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4; 806 vj++; 807 tp = t + (*vj) * 5; 808 v += 25; 809 } 810 811 /* xk = inv(Dk)*(Dk*xk) */ 812 diag = aa + k * 25; /* ptr to inv(Dk) */ 813 tp = t + k * 5; 814 tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4; 815 tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4; 816 tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4; 817 tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4; 818 tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4; 819 } 820 821 /* solve U*x = y by back substitution */ 822 for (k = mbs - 1; k >= 0; k--) { 823 v = aa + 25 * ai[k]; 824 vj = aj + ai[k]; 825 tp = t + k * 5; 826 x0 = tp[0]; 827 x1 = tp[1]; 828 x2 = tp[2]; 829 x3 = tp[3]; 830 x4 = tp[4]; /* xk */ 831 nz = ai[k + 1] - ai[k]; 832 833 tp = t + (*vj) * 5; 834 while (nz--) { 835 /* xk += U(k,:)*x(:) */ 836 x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4]; 837 x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4]; 838 x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4]; 839 x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4]; 840 x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4]; 841 vj++; 842 tp = t + (*vj) * 5; 843 v += 25; 844 } 845 tp = t + k * 5; 846 tp[0] = x0; 847 tp[1] = x1; 848 tp[2] = x2; 849 tp[3] = x3; 850 tp[4] = x4; 851 idx = 5 * r[k]; 852 x[idx] = x0; 853 x[idx + 1] = x1; 854 x[idx + 2] = x2; 855 x[idx + 3] = x3; 856 x[idx + 4] = x4; 857 } 858 859 PetscCall(ISRestoreIndices(isrow, &r)); 860 PetscCall(VecRestoreArrayRead(bb, &b)); 861 PetscCall(VecRestoreArray(xx, &x)); 862 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 863 PetscFunctionReturn(PETSC_SUCCESS); 864 } 865 866 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 867 { 868 const MatScalar *v, *diag; 869 PetscScalar *xp, x0, x1, x2, x3, x4; 870 PetscInt nz, k; 871 const PetscInt *vj; 872 873 PetscFunctionBegin; 874 for (k = 0; k < mbs; k++) { 875 v = aa + 25 * ai[k]; 876 xp = x + k * 5; 877 x0 = xp[0]; 878 x1 = xp[1]; 879 x2 = xp[2]; 880 x3 = xp[3]; 881 x4 = xp[4]; /* Dk*xk = k-th block of x */ 882 nz = ai[k + 1] - ai[k]; 883 vj = aj + ai[k]; 884 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 885 PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 886 while (nz--) { 887 xp = x + (*vj) * 5; 888 /* x(:) += U(k,:)^T*(Dk*xk) */ 889 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4; 890 xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4; 891 xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4; 892 xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4; 893 xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4; 894 vj++; 895 v += 25; 896 } 897 /* xk = inv(Dk)*(Dk*xk) */ 898 diag = aa + k * 25; /* ptr to inv(Dk) */ 899 xp = x + k * 5; 900 xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4; 901 xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4; 902 xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4; 903 xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4; 904 xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4; 905 } 906 PetscFunctionReturn(PETSC_SUCCESS); 907 } 908 909 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 910 { 911 const MatScalar *v; 912 PetscScalar *xp, x0, x1, x2, x3, x4; 913 PetscInt nz, k; 914 const PetscInt *vj; 915 916 PetscFunctionBegin; 917 for (k = mbs - 1; k >= 0; k--) { 918 v = aa + 25 * ai[k]; 919 xp = x + k * 5; 920 x0 = xp[0]; 921 x1 = xp[1]; 922 x2 = xp[2]; 923 x3 = xp[3]; 924 x4 = xp[4]; /* xk */ 925 nz = ai[k + 1] - ai[k]; 926 vj = aj + ai[k]; 927 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 928 PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 929 while (nz--) { 930 xp = x + (*vj) * 5; 931 /* xk += U(k,:)*x(:) */ 932 x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4]; 933 x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4]; 934 x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4]; 935 x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4]; 936 x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4]; 937 vj++; 938 v += 25; 939 } 940 xp = x + k * 5; 941 xp[0] = x0; 942 xp[1] = x1; 943 xp[2] = x2; 944 xp[3] = x3; 945 xp[4] = x4; 946 } 947 PetscFunctionReturn(PETSC_SUCCESS); 948 } 949 950 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 951 { 952 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 953 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 954 const MatScalar *aa = a->a; 955 PetscScalar *x; 956 const PetscScalar *b; 957 958 PetscFunctionBegin; 959 PetscCall(VecGetArrayRead(bb, &b)); 960 PetscCall(VecGetArray(xx, &x)); 961 962 /* solve U^T * D * y = b by forward substitution */ 963 PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */ 964 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 965 966 /* solve U*x = y by back substitution */ 967 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 968 969 PetscCall(VecRestoreArrayRead(bb, &b)); 970 PetscCall(VecRestoreArray(xx, &x)); 971 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 972 PetscFunctionReturn(PETSC_SUCCESS); 973 } 974 975 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 976 { 977 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 978 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 979 const MatScalar *aa = a->a; 980 PetscScalar *x; 981 const PetscScalar *b; 982 983 PetscFunctionBegin; 984 PetscCall(VecGetArrayRead(bb, &b)); 985 PetscCall(VecGetArray(xx, &x)); 986 PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */ 987 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 988 PetscCall(VecRestoreArrayRead(bb, &b)); 989 PetscCall(VecRestoreArray(xx, &x)); 990 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 991 PetscFunctionReturn(PETSC_SUCCESS); 992 } 993 994 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 995 { 996 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 997 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 998 const MatScalar *aa = a->a; 999 PetscScalar *x; 1000 const PetscScalar *b; 1001 1002 PetscFunctionBegin; 1003 PetscCall(VecGetArrayRead(bb, &b)); 1004 PetscCall(VecGetArray(xx, &x)); 1005 PetscCall(PetscArraycpy(x, b, 5 * mbs)); 1006 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 1007 PetscCall(VecRestoreArrayRead(bb, &b)); 1008 PetscCall(VecRestoreArray(xx, &x)); 1009 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1010 PetscFunctionReturn(PETSC_SUCCESS); 1011 } 1012 1013 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx) 1014 { 1015 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1016 IS isrow = a->row; 1017 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1018 const PetscInt *r, *vj; 1019 PetscInt nz, k, idx; 1020 const MatScalar *aa = a->a, *v, *diag; 1021 PetscScalar *x, x0, x1, x2, x3, *t, *tp; 1022 const PetscScalar *b; 1023 1024 PetscFunctionBegin; 1025 PetscCall(VecGetArrayRead(bb, &b)); 1026 PetscCall(VecGetArray(xx, &x)); 1027 t = a->solve_work; 1028 PetscCall(ISGetIndices(isrow, &r)); 1029 1030 /* solve U^T * D * y = b by forward substitution */ 1031 tp = t; 1032 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1033 idx = 4 * r[k]; 1034 tp[0] = b[idx]; 1035 tp[1] = b[idx + 1]; 1036 tp[2] = b[idx + 2]; 1037 tp[3] = b[idx + 3]; 1038 tp += 4; 1039 } 1040 1041 for (k = 0; k < mbs; k++) { 1042 v = aa + 16 * ai[k]; 1043 vj = aj + ai[k]; 1044 tp = t + k * 4; 1045 x0 = tp[0]; 1046 x1 = tp[1]; 1047 x2 = tp[2]; 1048 x3 = tp[3]; 1049 nz = ai[k + 1] - ai[k]; 1050 1051 tp = t + (*vj) * 4; 1052 while (nz--) { 1053 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3; 1054 tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3; 1055 tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3; 1056 tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3; 1057 vj++; 1058 tp = t + (*vj) * 4; 1059 v += 16; 1060 } 1061 1062 /* xk = inv(Dk)*(Dk*xk) */ 1063 diag = aa + k * 16; /* ptr to inv(Dk) */ 1064 tp = t + k * 4; 1065 tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3; 1066 tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3; 1067 tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3; 1068 tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3; 1069 } 1070 1071 /* solve U*x = y by back substitution */ 1072 for (k = mbs - 1; k >= 0; k--) { 1073 v = aa + 16 * ai[k]; 1074 vj = aj + ai[k]; 1075 tp = t + k * 4; 1076 x0 = tp[0]; 1077 x1 = tp[1]; 1078 x2 = tp[2]; 1079 x3 = tp[3]; /* xk */ 1080 nz = ai[k + 1] - ai[k]; 1081 1082 tp = t + (*vj) * 4; 1083 while (nz--) { 1084 /* xk += U(k,:)*x(:) */ 1085 x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3]; 1086 x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3]; 1087 x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3]; 1088 x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3]; 1089 vj++; 1090 tp = t + (*vj) * 4; 1091 v += 16; 1092 } 1093 tp = t + k * 4; 1094 tp[0] = x0; 1095 tp[1] = x1; 1096 tp[2] = x2; 1097 tp[3] = x3; 1098 idx = 4 * r[k]; 1099 x[idx] = x0; 1100 x[idx + 1] = x1; 1101 x[idx + 2] = x2; 1102 x[idx + 3] = x3; 1103 } 1104 1105 PetscCall(ISRestoreIndices(isrow, &r)); 1106 PetscCall(VecRestoreArrayRead(bb, &b)); 1107 PetscCall(VecRestoreArray(xx, &x)); 1108 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1109 PetscFunctionReturn(PETSC_SUCCESS); 1110 } 1111 1112 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1113 { 1114 const MatScalar *v, *diag; 1115 PetscScalar *xp, x0, x1, x2, x3; 1116 PetscInt nz, k; 1117 const PetscInt *vj; 1118 1119 PetscFunctionBegin; 1120 for (k = 0; k < mbs; k++) { 1121 v = aa + 16 * ai[k]; 1122 xp = x + k * 4; 1123 x0 = xp[0]; 1124 x1 = xp[1]; 1125 x2 = xp[2]; 1126 x3 = xp[3]; /* Dk*xk = k-th block of x */ 1127 nz = ai[k + 1] - ai[k]; 1128 vj = aj + ai[k]; 1129 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1130 PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1131 while (nz--) { 1132 xp = x + (*vj) * 4; 1133 /* x(:) += U(k,:)^T*(Dk*xk) */ 1134 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3; 1135 xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3; 1136 xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3; 1137 xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3; 1138 vj++; 1139 v += 16; 1140 } 1141 /* xk = inv(Dk)*(Dk*xk) */ 1142 diag = aa + k * 16; /* ptr to inv(Dk) */ 1143 xp = x + k * 4; 1144 xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3; 1145 xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3; 1146 xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3; 1147 xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3; 1148 } 1149 PetscFunctionReturn(PETSC_SUCCESS); 1150 } 1151 1152 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1153 { 1154 const MatScalar *v; 1155 PetscScalar *xp, x0, x1, x2, x3; 1156 PetscInt nz, k; 1157 const PetscInt *vj; 1158 1159 PetscFunctionBegin; 1160 for (k = mbs - 1; k >= 0; k--) { 1161 v = aa + 16 * ai[k]; 1162 xp = x + k * 4; 1163 x0 = xp[0]; 1164 x1 = xp[1]; 1165 x2 = xp[2]; 1166 x3 = xp[3]; /* xk */ 1167 nz = ai[k + 1] - ai[k]; 1168 vj = aj + ai[k]; 1169 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1170 PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1171 while (nz--) { 1172 xp = x + (*vj) * 4; 1173 /* xk += U(k,:)*x(:) */ 1174 x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3]; 1175 x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3]; 1176 x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3]; 1177 x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3]; 1178 vj++; 1179 v += 16; 1180 } 1181 xp = x + k * 4; 1182 xp[0] = x0; 1183 xp[1] = x1; 1184 xp[2] = x2; 1185 xp[3] = x3; 1186 } 1187 PetscFunctionReturn(PETSC_SUCCESS); 1188 } 1189 1190 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1191 { 1192 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1193 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1194 const MatScalar *aa = a->a; 1195 PetscScalar *x; 1196 const PetscScalar *b; 1197 1198 PetscFunctionBegin; 1199 PetscCall(VecGetArrayRead(bb, &b)); 1200 PetscCall(VecGetArray(xx, &x)); 1201 1202 /* solve U^T * D * y = b by forward substitution */ 1203 PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */ 1204 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1205 1206 /* solve U*x = y by back substitution */ 1207 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1208 PetscCall(VecRestoreArrayRead(bb, &b)); 1209 PetscCall(VecRestoreArray(xx, &x)); 1210 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1211 PetscFunctionReturn(PETSC_SUCCESS); 1212 } 1213 1214 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1215 { 1216 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1217 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1218 const MatScalar *aa = a->a; 1219 PetscScalar *x; 1220 const PetscScalar *b; 1221 1222 PetscFunctionBegin; 1223 PetscCall(VecGetArrayRead(bb, &b)); 1224 PetscCall(VecGetArray(xx, &x)); 1225 PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */ 1226 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1227 PetscCall(VecRestoreArrayRead(bb, &b)); 1228 PetscCall(VecRestoreArray(xx, &x)); 1229 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1230 PetscFunctionReturn(PETSC_SUCCESS); 1231 } 1232 1233 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1234 { 1235 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1236 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1237 const MatScalar *aa = a->a; 1238 PetscScalar *x; 1239 const PetscScalar *b; 1240 1241 PetscFunctionBegin; 1242 PetscCall(VecGetArrayRead(bb, &b)); 1243 PetscCall(VecGetArray(xx, &x)); 1244 PetscCall(PetscArraycpy(x, b, 4 * mbs)); 1245 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1246 PetscCall(VecRestoreArrayRead(bb, &b)); 1247 PetscCall(VecRestoreArray(xx, &x)); 1248 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1249 PetscFunctionReturn(PETSC_SUCCESS); 1250 } 1251 1252 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx) 1253 { 1254 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1255 IS isrow = a->row; 1256 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1257 const PetscInt *r; 1258 PetscInt nz, k, idx; 1259 const PetscInt *vj; 1260 const MatScalar *aa = a->a, *v, *diag; 1261 PetscScalar *x, x0, x1, x2, *t, *tp; 1262 const PetscScalar *b; 1263 1264 PetscFunctionBegin; 1265 PetscCall(VecGetArrayRead(bb, &b)); 1266 PetscCall(VecGetArray(xx, &x)); 1267 t = a->solve_work; 1268 PetscCall(ISGetIndices(isrow, &r)); 1269 1270 /* solve U^T * D * y = b by forward substitution */ 1271 tp = t; 1272 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1273 idx = 3 * r[k]; 1274 tp[0] = b[idx]; 1275 tp[1] = b[idx + 1]; 1276 tp[2] = b[idx + 2]; 1277 tp += 3; 1278 } 1279 1280 for (k = 0; k < mbs; k++) { 1281 v = aa + 9 * ai[k]; 1282 vj = aj + ai[k]; 1283 tp = t + k * 3; 1284 x0 = tp[0]; 1285 x1 = tp[1]; 1286 x2 = tp[2]; 1287 nz = ai[k + 1] - ai[k]; 1288 1289 tp = t + (*vj) * 3; 1290 while (nz--) { 1291 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2; 1292 tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2; 1293 tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2; 1294 vj++; 1295 tp = t + (*vj) * 3; 1296 v += 9; 1297 } 1298 1299 /* xk = inv(Dk)*(Dk*xk) */ 1300 diag = aa + k * 9; /* ptr to inv(Dk) */ 1301 tp = t + k * 3; 1302 tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2; 1303 tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2; 1304 tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2; 1305 } 1306 1307 /* solve U*x = y by back substitution */ 1308 for (k = mbs - 1; k >= 0; k--) { 1309 v = aa + 9 * ai[k]; 1310 vj = aj + ai[k]; 1311 tp = t + k * 3; 1312 x0 = tp[0]; 1313 x1 = tp[1]; 1314 x2 = tp[2]; /* xk */ 1315 nz = ai[k + 1] - ai[k]; 1316 1317 tp = t + (*vj) * 3; 1318 while (nz--) { 1319 /* xk += U(k,:)*x(:) */ 1320 x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2]; 1321 x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2]; 1322 x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2]; 1323 vj++; 1324 tp = t + (*vj) * 3; 1325 v += 9; 1326 } 1327 tp = t + k * 3; 1328 tp[0] = x0; 1329 tp[1] = x1; 1330 tp[2] = x2; 1331 idx = 3 * r[k]; 1332 x[idx] = x0; 1333 x[idx + 1] = x1; 1334 x[idx + 2] = x2; 1335 } 1336 1337 PetscCall(ISRestoreIndices(isrow, &r)); 1338 PetscCall(VecRestoreArrayRead(bb, &b)); 1339 PetscCall(VecRestoreArray(xx, &x)); 1340 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1341 PetscFunctionReturn(PETSC_SUCCESS); 1342 } 1343 1344 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1345 { 1346 const MatScalar *v, *diag; 1347 PetscScalar *xp, x0, x1, x2; 1348 PetscInt nz, k; 1349 const PetscInt *vj; 1350 1351 PetscFunctionBegin; 1352 for (k = 0; k < mbs; k++) { 1353 v = aa + 9 * ai[k]; 1354 xp = x + k * 3; 1355 x0 = xp[0]; 1356 x1 = xp[1]; 1357 x2 = xp[2]; /* Dk*xk = k-th block of x */ 1358 nz = ai[k + 1] - ai[k]; 1359 vj = aj + ai[k]; 1360 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1361 PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1362 while (nz--) { 1363 xp = x + (*vj) * 3; 1364 /* x(:) += U(k,:)^T*(Dk*xk) */ 1365 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2; 1366 xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2; 1367 xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2; 1368 vj++; 1369 v += 9; 1370 } 1371 /* xk = inv(Dk)*(Dk*xk) */ 1372 diag = aa + k * 9; /* ptr to inv(Dk) */ 1373 xp = x + k * 3; 1374 xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2; 1375 xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2; 1376 xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2; 1377 } 1378 PetscFunctionReturn(PETSC_SUCCESS); 1379 } 1380 1381 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1382 { 1383 const MatScalar *v; 1384 PetscScalar *xp, x0, x1, x2; 1385 PetscInt nz, k; 1386 const PetscInt *vj; 1387 1388 PetscFunctionBegin; 1389 for (k = mbs - 1; k >= 0; k--) { 1390 v = aa + 9 * ai[k]; 1391 xp = x + k * 3; 1392 x0 = xp[0]; 1393 x1 = xp[1]; 1394 x2 = xp[2]; /* xk */ 1395 nz = ai[k + 1] - ai[k]; 1396 vj = aj + ai[k]; 1397 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1398 PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1399 while (nz--) { 1400 xp = x + (*vj) * 3; 1401 /* xk += U(k,:)*x(:) */ 1402 x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2]; 1403 x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2]; 1404 x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2]; 1405 vj++; 1406 v += 9; 1407 } 1408 xp = x + k * 3; 1409 xp[0] = x0; 1410 xp[1] = x1; 1411 xp[2] = x2; 1412 } 1413 PetscFunctionReturn(PETSC_SUCCESS); 1414 } 1415 1416 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1417 { 1418 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1419 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1420 const MatScalar *aa = a->a; 1421 PetscScalar *x; 1422 const PetscScalar *b; 1423 1424 PetscFunctionBegin; 1425 PetscCall(VecGetArrayRead(bb, &b)); 1426 PetscCall(VecGetArray(xx, &x)); 1427 1428 /* solve U^T * D * y = b by forward substitution */ 1429 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1430 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1431 1432 /* solve U*x = y by back substitution */ 1433 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1434 1435 PetscCall(VecRestoreArrayRead(bb, &b)); 1436 PetscCall(VecRestoreArray(xx, &x)); 1437 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1438 PetscFunctionReturn(PETSC_SUCCESS); 1439 } 1440 1441 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1442 { 1443 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1444 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1445 const MatScalar *aa = a->a; 1446 PetscScalar *x; 1447 const PetscScalar *b; 1448 1449 PetscFunctionBegin; 1450 PetscCall(VecGetArrayRead(bb, &b)); 1451 PetscCall(VecGetArray(xx, &x)); 1452 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1453 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1454 PetscCall(VecRestoreArrayRead(bb, &b)); 1455 PetscCall(VecRestoreArray(xx, &x)); 1456 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1457 PetscFunctionReturn(PETSC_SUCCESS); 1458 } 1459 1460 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1461 { 1462 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1463 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1464 const MatScalar *aa = a->a; 1465 PetscScalar *x; 1466 const PetscScalar *b; 1467 1468 PetscFunctionBegin; 1469 PetscCall(VecGetArrayRead(bb, &b)); 1470 PetscCall(VecGetArray(xx, &x)); 1471 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1472 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1473 PetscCall(VecRestoreArrayRead(bb, &b)); 1474 PetscCall(VecRestoreArray(xx, &x)); 1475 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1476 PetscFunctionReturn(PETSC_SUCCESS); 1477 } 1478 1479 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx) 1480 { 1481 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1482 IS isrow = a->row; 1483 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1484 const PetscInt *r, *vj; 1485 PetscInt nz, k, k2, idx; 1486 const MatScalar *aa = a->a, *v, *diag; 1487 PetscScalar *x, x0, x1, *t; 1488 const PetscScalar *b; 1489 1490 PetscFunctionBegin; 1491 PetscCall(VecGetArrayRead(bb, &b)); 1492 PetscCall(VecGetArray(xx, &x)); 1493 t = a->solve_work; 1494 PetscCall(ISGetIndices(isrow, &r)); 1495 1496 /* solve U^T * D * y = perm(b) by forward substitution */ 1497 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1498 idx = 2 * r[k]; 1499 t[k * 2] = b[idx]; 1500 t[k * 2 + 1] = b[idx + 1]; 1501 } 1502 for (k = 0; k < mbs; k++) { 1503 v = aa + 4 * ai[k]; 1504 vj = aj + ai[k]; 1505 k2 = k * 2; 1506 x0 = t[k2]; 1507 x1 = t[k2 + 1]; 1508 nz = ai[k + 1] - ai[k]; 1509 while (nz--) { 1510 t[(*vj) * 2] += v[0] * x0 + v[1] * x1; 1511 t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1; 1512 vj++; 1513 v += 4; 1514 } 1515 diag = aa + k * 4; /* ptr to inv(Dk) */ 1516 t[k2] = diag[0] * x0 + diag[2] * x1; 1517 t[k2 + 1] = diag[1] * x0 + diag[3] * x1; 1518 } 1519 1520 /* solve U*x = y by back substitution */ 1521 for (k = mbs - 1; k >= 0; k--) { 1522 v = aa + 4 * ai[k]; 1523 vj = aj + ai[k]; 1524 k2 = k * 2; 1525 x0 = t[k2]; 1526 x1 = t[k2 + 1]; 1527 nz = ai[k + 1] - ai[k]; 1528 while (nz--) { 1529 x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1]; 1530 x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1]; 1531 vj++; 1532 v += 4; 1533 } 1534 t[k2] = x0; 1535 t[k2 + 1] = x1; 1536 idx = 2 * r[k]; 1537 x[idx] = x0; 1538 x[idx + 1] = x1; 1539 } 1540 1541 PetscCall(ISRestoreIndices(isrow, &r)); 1542 PetscCall(VecRestoreArrayRead(bb, &b)); 1543 PetscCall(VecRestoreArray(xx, &x)); 1544 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1545 PetscFunctionReturn(PETSC_SUCCESS); 1546 } 1547 1548 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1549 { 1550 const MatScalar *v, *diag; 1551 PetscScalar x0, x1; 1552 PetscInt nz, k, k2; 1553 const PetscInt *vj; 1554 1555 PetscFunctionBegin; 1556 for (k = 0; k < mbs; k++) { 1557 v = aa + 4 * ai[k]; 1558 vj = aj + ai[k]; 1559 k2 = k * 2; 1560 x0 = x[k2]; 1561 x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */ 1562 nz = ai[k + 1] - ai[k]; 1563 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1564 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1565 while (nz--) { 1566 /* x(:) += U(k,:)^T*(Dk*xk) */ 1567 x[(*vj) * 2] += v[0] * x0 + v[1] * x1; 1568 x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1; 1569 vj++; 1570 v += 4; 1571 } 1572 /* xk = inv(Dk)*(Dk*xk) */ 1573 diag = aa + k * 4; /* ptr to inv(Dk) */ 1574 x[k2] = diag[0] * x0 + diag[2] * x1; 1575 x[k2 + 1] = diag[1] * x0 + diag[3] * x1; 1576 } 1577 PetscFunctionReturn(PETSC_SUCCESS); 1578 } 1579 1580 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1581 { 1582 const MatScalar *v; 1583 PetscScalar x0, x1; 1584 PetscInt nz, k, k2; 1585 const PetscInt *vj; 1586 1587 PetscFunctionBegin; 1588 for (k = mbs - 1; k >= 0; k--) { 1589 v = aa + 4 * ai[k]; 1590 vj = aj + ai[k]; 1591 k2 = k * 2; 1592 x0 = x[k2]; 1593 x1 = x[k2 + 1]; /* xk */ 1594 nz = ai[k + 1] - ai[k]; 1595 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1596 PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1597 while (nz--) { 1598 /* xk += U(k,:)*x(:) */ 1599 x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1]; 1600 x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1]; 1601 vj++; 1602 v += 4; 1603 } 1604 x[k2] = x0; 1605 x[k2 + 1] = x1; 1606 } 1607 PetscFunctionReturn(PETSC_SUCCESS); 1608 } 1609 1610 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1611 { 1612 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1613 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1614 const MatScalar *aa = a->a; 1615 PetscScalar *x; 1616 const PetscScalar *b; 1617 1618 PetscFunctionBegin; 1619 PetscCall(VecGetArrayRead(bb, &b)); 1620 PetscCall(VecGetArray(xx, &x)); 1621 1622 /* solve U^T * D * y = b by forward substitution */ 1623 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1624 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1625 1626 /* solve U*x = y by back substitution */ 1627 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1628 1629 PetscCall(VecRestoreArrayRead(bb, &b)); 1630 PetscCall(VecRestoreArray(xx, &x)); 1631 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1632 PetscFunctionReturn(PETSC_SUCCESS); 1633 } 1634 1635 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1636 { 1637 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1638 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1639 const MatScalar *aa = a->a; 1640 PetscScalar *x; 1641 const PetscScalar *b; 1642 1643 PetscFunctionBegin; 1644 PetscCall(VecGetArrayRead(bb, &b)); 1645 PetscCall(VecGetArray(xx, &x)); 1646 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1647 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1648 PetscCall(VecRestoreArrayRead(bb, &b)); 1649 PetscCall(VecRestoreArray(xx, &x)); 1650 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1651 PetscFunctionReturn(PETSC_SUCCESS); 1652 } 1653 1654 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1655 { 1656 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1657 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1658 const MatScalar *aa = a->a; 1659 PetscScalar *x; 1660 const PetscScalar *b; 1661 1662 PetscFunctionBegin; 1663 PetscCall(VecGetArrayRead(bb, &b)); 1664 PetscCall(VecGetArray(xx, &x)); 1665 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1666 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1667 PetscCall(VecRestoreArrayRead(bb, &b)); 1668 PetscCall(VecRestoreArray(xx, &x)); 1669 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1670 PetscFunctionReturn(PETSC_SUCCESS); 1671 } 1672 1673 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) 1674 { 1675 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1676 IS isrow = a->row; 1677 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1678 const MatScalar *aa = a->a, *v; 1679 const PetscScalar *b; 1680 PetscScalar *x, xk, *t; 1681 PetscInt nz, k, j; 1682 1683 PetscFunctionBegin; 1684 PetscCall(VecGetArrayRead(bb, &b)); 1685 PetscCall(VecGetArray(xx, &x)); 1686 t = a->solve_work; 1687 PetscCall(ISGetIndices(isrow, &rp)); 1688 1689 /* solve U^T*D*y = perm(b) by forward substitution */ 1690 for (k = 0; k < mbs; k++) t[k] = b[rp[k]]; 1691 for (k = 0; k < mbs; k++) { 1692 v = aa + ai[k]; 1693 vj = aj + ai[k]; 1694 xk = t[k]; 1695 nz = ai[k + 1] - ai[k] - 1; 1696 for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk; 1697 t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */ 1698 } 1699 1700 /* solve U*perm(x) = y by back substitution */ 1701 for (k = mbs - 1; k >= 0; k--) { 1702 v = aa + adiag[k] - 1; 1703 vj = aj + adiag[k] - 1; 1704 nz = ai[k + 1] - ai[k] - 1; 1705 for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]]; 1706 x[rp[k]] = t[k]; 1707 } 1708 1709 PetscCall(ISRestoreIndices(isrow, &rp)); 1710 PetscCall(VecRestoreArrayRead(bb, &b)); 1711 PetscCall(VecRestoreArray(xx, &x)); 1712 PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs)); 1713 PetscFunctionReturn(PETSC_SUCCESS); 1714 } 1715 1716 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1717 { 1718 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1719 IS isrow = a->row; 1720 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1721 const MatScalar *aa = a->a, *v; 1722 PetscScalar *x, xk, *t; 1723 const PetscScalar *b; 1724 PetscInt nz, k; 1725 1726 PetscFunctionBegin; 1727 PetscCall(VecGetArrayRead(bb, &b)); 1728 PetscCall(VecGetArray(xx, &x)); 1729 t = a->solve_work; 1730 PetscCall(ISGetIndices(isrow, &rp)); 1731 1732 /* solve U^T*D*y = perm(b) by forward substitution */ 1733 for (k = 0; k < mbs; k++) t[k] = b[rp[k]]; 1734 for (k = 0; k < mbs; k++) { 1735 v = aa + ai[k] + 1; 1736 vj = aj + ai[k] + 1; 1737 xk = t[k]; 1738 nz = ai[k + 1] - ai[k] - 1; 1739 while (nz--) t[*vj++] += (*v++) * xk; 1740 t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */ 1741 } 1742 1743 /* solve U*perm(x) = y by back substitution */ 1744 for (k = mbs - 1; k >= 0; k--) { 1745 v = aa + ai[k] + 1; 1746 vj = aj + ai[k] + 1; 1747 nz = ai[k + 1] - ai[k] - 1; 1748 while (nz--) t[k] += (*v++) * t[*vj++]; 1749 x[rp[k]] = t[k]; 1750 } 1751 1752 PetscCall(ISRestoreIndices(isrow, &rp)); 1753 PetscCall(VecRestoreArrayRead(bb, &b)); 1754 PetscCall(VecRestoreArray(xx, &x)); 1755 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 1756 PetscFunctionReturn(PETSC_SUCCESS); 1757 } 1758 1759 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) 1760 { 1761 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1762 IS isrow = a->row; 1763 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1764 const MatScalar *aa = a->a, *v; 1765 PetscReal diagk; 1766 PetscScalar *x, xk; 1767 const PetscScalar *b; 1768 PetscInt nz, k; 1769 1770 PetscFunctionBegin; 1771 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1772 PetscCall(VecGetArrayRead(bb, &b)); 1773 PetscCall(VecGetArray(xx, &x)); 1774 PetscCall(ISGetIndices(isrow, &rp)); 1775 1776 for (k = 0; k < mbs; k++) x[k] = b[rp[k]]; 1777 for (k = 0; k < mbs; k++) { 1778 v = aa + ai[k]; 1779 vj = aj + ai[k]; 1780 xk = x[k]; 1781 nz = ai[k + 1] - ai[k] - 1; 1782 while (nz--) x[*vj++] += (*v++) * xk; 1783 1784 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1785 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1786 x[k] = xk * PetscSqrtReal(diagk); 1787 } 1788 PetscCall(ISRestoreIndices(isrow, &rp)); 1789 PetscCall(VecRestoreArrayRead(bb, &b)); 1790 PetscCall(VecRestoreArray(xx, &x)); 1791 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1792 PetscFunctionReturn(PETSC_SUCCESS); 1793 } 1794 1795 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1796 { 1797 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1798 IS isrow = a->row; 1799 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1800 const MatScalar *aa = a->a, *v; 1801 PetscReal diagk; 1802 PetscScalar *x, xk; 1803 const PetscScalar *b; 1804 PetscInt nz, k; 1805 1806 PetscFunctionBegin; 1807 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1808 PetscCall(VecGetArrayRead(bb, &b)); 1809 PetscCall(VecGetArray(xx, &x)); 1810 PetscCall(ISGetIndices(isrow, &rp)); 1811 1812 for (k = 0; k < mbs; k++) x[k] = b[rp[k]]; 1813 for (k = 0; k < mbs; k++) { 1814 v = aa + ai[k] + 1; 1815 vj = aj + ai[k] + 1; 1816 xk = x[k]; 1817 nz = ai[k + 1] - ai[k] - 1; 1818 while (nz--) x[*vj++] += (*v++) * xk; 1819 1820 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1821 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1822 x[k] = xk * PetscSqrtReal(diagk); 1823 } 1824 PetscCall(ISRestoreIndices(isrow, &rp)); 1825 PetscCall(VecRestoreArrayRead(bb, &b)); 1826 PetscCall(VecRestoreArray(xx, &x)); 1827 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1828 PetscFunctionReturn(PETSC_SUCCESS); 1829 } 1830 1831 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) 1832 { 1833 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1834 IS isrow = a->row; 1835 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1836 const MatScalar *aa = a->a, *v; 1837 PetscReal diagk; 1838 PetscScalar *x, *t; 1839 const PetscScalar *b; 1840 PetscInt nz, k; 1841 1842 PetscFunctionBegin; 1843 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1844 PetscCall(VecGetArrayRead(bb, &b)); 1845 PetscCall(VecGetArray(xx, &x)); 1846 t = a->solve_work; 1847 PetscCall(ISGetIndices(isrow, &rp)); 1848 1849 for (k = mbs - 1; k >= 0; k--) { 1850 v = aa + ai[k]; 1851 vj = aj + ai[k]; 1852 diagk = PetscRealPart(aa[adiag[k]]); 1853 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1854 t[k] = b[k] * PetscSqrtReal(diagk); 1855 nz = ai[k + 1] - ai[k] - 1; 1856 while (nz--) t[k] += (*v++) * t[*vj++]; 1857 x[rp[k]] = t[k]; 1858 } 1859 PetscCall(ISRestoreIndices(isrow, &rp)); 1860 PetscCall(VecRestoreArrayRead(bb, &b)); 1861 PetscCall(VecRestoreArray(xx, &x)); 1862 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1863 PetscFunctionReturn(PETSC_SUCCESS); 1864 } 1865 1866 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1867 { 1868 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1869 IS isrow = a->row; 1870 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1871 const MatScalar *aa = a->a, *v; 1872 PetscReal diagk; 1873 PetscScalar *x, *t; 1874 const PetscScalar *b; 1875 PetscInt nz, k; 1876 1877 PetscFunctionBegin; 1878 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1879 PetscCall(VecGetArrayRead(bb, &b)); 1880 PetscCall(VecGetArray(xx, &x)); 1881 t = a->solve_work; 1882 PetscCall(ISGetIndices(isrow, &rp)); 1883 1884 for (k = mbs - 1; k >= 0; k--) { 1885 v = aa + ai[k] + 1; 1886 vj = aj + ai[k] + 1; 1887 diagk = PetscRealPart(aa[ai[k]]); 1888 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1889 t[k] = b[k] * PetscSqrtReal(diagk); 1890 nz = ai[k + 1] - ai[k] - 1; 1891 while (nz--) t[k] += (*v++) * t[*vj++]; 1892 x[rp[k]] = t[k]; 1893 } 1894 PetscCall(ISRestoreIndices(isrow, &rp)); 1895 PetscCall(VecRestoreArrayRead(bb, &b)); 1896 PetscCall(VecRestoreArray(xx, &x)); 1897 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1898 PetscFunctionReturn(PETSC_SUCCESS); 1899 } 1900 1901 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx) 1902 { 1903 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1904 1905 PetscFunctionBegin; 1906 if (A->rmap->bs == 1) { 1907 PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v)); 1908 } else { 1909 IS isrow = a->row; 1910 const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp; 1911 const MatScalar *aa = a->a, *v; 1912 PetscScalar *x, *t; 1913 const PetscScalar *b; 1914 PetscInt nz, k, n, i, j; 1915 1916 if (bb->n > a->solves_work_n) { 1917 PetscCall(PetscFree(a->solves_work)); 1918 PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work)); 1919 a->solves_work_n = bb->n; 1920 } 1921 n = bb->n; 1922 PetscCall(VecGetArrayRead(bb->v, &b)); 1923 PetscCall(VecGetArray(xx->v, &x)); 1924 t = a->solves_work; 1925 1926 PetscCall(ISGetIndices(isrow, &rp)); 1927 1928 /* solve U^T*D*y = perm(b) by forward substitution */ 1929 for (k = 0; k < mbs; k++) { 1930 for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */ 1931 } 1932 for (k = 0; k < mbs; k++) { 1933 v = aa + ai[k]; 1934 vj = aj + ai[k]; 1935 nz = ai[k + 1] - ai[k] - 1; 1936 for (j = 0; j < nz; j++) { 1937 for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i]; 1938 v++; 1939 vj++; 1940 } 1941 for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1942 } 1943 1944 /* solve U*perm(x) = y by back substitution */ 1945 for (k = mbs - 1; k >= 0; k--) { 1946 v = aa + ai[k] - 1; 1947 vj = aj + ai[k] - 1; 1948 nz = ai[k + 1] - ai[k] - 1; 1949 for (j = 0; j < nz; j++) { 1950 for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i]; 1951 v++; 1952 vj++; 1953 } 1954 for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i]; 1955 } 1956 1957 PetscCall(ISRestoreIndices(isrow, &rp)); 1958 PetscCall(VecRestoreArrayRead(bb->v, &b)); 1959 PetscCall(VecRestoreArray(xx->v, &x)); 1960 PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs))); 1961 } 1962 PetscFunctionReturn(PETSC_SUCCESS); 1963 } 1964 1965 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx) 1966 { 1967 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1968 1969 PetscFunctionBegin; 1970 if (A->rmap->bs == 1) { 1971 PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v)); 1972 } else { 1973 IS isrow = a->row; 1974 const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp; 1975 const MatScalar *aa = a->a, *v; 1976 PetscScalar *x, *t; 1977 const PetscScalar *b; 1978 PetscInt nz, k, n, i; 1979 1980 if (bb->n > a->solves_work_n) { 1981 PetscCall(PetscFree(a->solves_work)); 1982 PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work)); 1983 a->solves_work_n = bb->n; 1984 } 1985 n = bb->n; 1986 PetscCall(VecGetArrayRead(bb->v, &b)); 1987 PetscCall(VecGetArray(xx->v, &x)); 1988 t = a->solves_work; 1989 1990 PetscCall(ISGetIndices(isrow, &rp)); 1991 1992 /* solve U^T*D*y = perm(b) by forward substitution */ 1993 for (k = 0; k < mbs; k++) { 1994 for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */ 1995 } 1996 for (k = 0; k < mbs; k++) { 1997 v = aa + ai[k]; 1998 vj = aj + ai[k]; 1999 nz = ai[k + 1] - ai[k]; 2000 while (nz--) { 2001 for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i]; 2002 v++; 2003 vj++; 2004 } 2005 for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 2006 } 2007 2008 /* solve U*perm(x) = y by back substitution */ 2009 for (k = mbs - 1; k >= 0; k--) { 2010 v = aa + ai[k]; 2011 vj = aj + ai[k]; 2012 nz = ai[k + 1] - ai[k]; 2013 while (nz--) { 2014 for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i]; 2015 v++; 2016 vj++; 2017 } 2018 for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i]; 2019 } 2020 2021 PetscCall(ISRestoreIndices(isrow, &rp)); 2022 PetscCall(VecRestoreArrayRead(bb->v, &b)); 2023 PetscCall(VecRestoreArray(xx->v, &x)); 2024 PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs))); 2025 } 2026 PetscFunctionReturn(PETSC_SUCCESS); 2027 } 2028 2029 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) 2030 { 2031 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2032 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag; 2033 const MatScalar *aa = a->a, *v; 2034 const PetscScalar *b; 2035 PetscScalar *x, xi; 2036 PetscInt nz, i, j; 2037 2038 PetscFunctionBegin; 2039 PetscCall(VecGetArrayRead(bb, &b)); 2040 PetscCall(VecGetArray(xx, &x)); 2041 /* solve U^T*D*y = b by forward substitution */ 2042 PetscCall(PetscArraycpy(x, b, mbs)); 2043 for (i = 0; i < mbs; i++) { 2044 v = aa + ai[i]; 2045 vj = aj + ai[i]; 2046 xi = x[i]; 2047 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 2048 for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi; 2049 x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2050 } 2051 /* solve U*x = y by backward substitution */ 2052 for (i = mbs - 2; i >= 0; i--) { 2053 xi = x[i]; 2054 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2055 vj = aj + adiag[i] - 1; 2056 nz = ai[i + 1] - ai[i] - 1; 2057 for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]]; 2058 x[i] = xi; 2059 } 2060 PetscCall(VecRestoreArrayRead(bb, &b)); 2061 PetscCall(VecRestoreArray(xx, &x)); 2062 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 2063 PetscFunctionReturn(PETSC_SUCCESS); 2064 } 2065 2066 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X) 2067 { 2068 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2069 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag; 2070 const MatScalar *aa = a->a, *v; 2071 const PetscScalar *b; 2072 PetscScalar *x, xi; 2073 PetscInt nz, i, j, neq, ldb, ldx; 2074 PetscBool isdense; 2075 2076 PetscFunctionBegin; 2077 if (!mbs) PetscFunctionReturn(PETSC_SUCCESS); 2078 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense)); 2079 PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix"); 2080 if (X != B) { 2081 PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense)); 2082 PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix"); 2083 } 2084 PetscCall(MatDenseGetArrayRead(B, &b)); 2085 PetscCall(MatDenseGetLDA(B, &ldb)); 2086 PetscCall(MatDenseGetArray(X, &x)); 2087 PetscCall(MatDenseGetLDA(X, &ldx)); 2088 for (neq = 0; neq < B->cmap->n; neq++) { 2089 /* solve U^T*D*y = b by forward substitution */ 2090 PetscCall(PetscArraycpy(x, b, mbs)); 2091 for (i = 0; i < mbs; i++) { 2092 v = aa + ai[i]; 2093 vj = aj + ai[i]; 2094 xi = x[i]; 2095 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 2096 for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi; 2097 x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2098 } 2099 /* solve U*x = y by backward substitution */ 2100 for (i = mbs - 2; i >= 0; i--) { 2101 xi = x[i]; 2102 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2103 vj = aj + adiag[i] - 1; 2104 nz = ai[i + 1] - ai[i] - 1; 2105 for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]]; 2106 x[i] = xi; 2107 } 2108 b += ldb; 2109 x += ldx; 2110 } 2111 PetscCall(MatDenseRestoreArrayRead(B, &b)); 2112 PetscCall(MatDenseRestoreArray(X, &x)); 2113 PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs))); 2114 PetscFunctionReturn(PETSC_SUCCESS); 2115 } 2116 2117 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 2118 { 2119 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2120 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2121 const MatScalar *aa = a->a, *v; 2122 PetscScalar *x, xk; 2123 const PetscScalar *b; 2124 PetscInt nz, k; 2125 2126 PetscFunctionBegin; 2127 PetscCall(VecGetArrayRead(bb, &b)); 2128 PetscCall(VecGetArray(xx, &x)); 2129 2130 /* solve U^T*D*y = b by forward substitution */ 2131 PetscCall(PetscArraycpy(x, b, mbs)); 2132 for (k = 0; k < mbs; k++) { 2133 v = aa + ai[k] + 1; 2134 vj = aj + ai[k] + 1; 2135 xk = x[k]; 2136 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2137 while (nz--) x[*vj++] += (*v++) * xk; 2138 x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2139 } 2140 2141 /* solve U*x = y by back substitution */ 2142 for (k = mbs - 2; k >= 0; k--) { 2143 v = aa + ai[k] + 1; 2144 vj = aj + ai[k] + 1; 2145 xk = x[k]; 2146 nz = ai[k + 1] - ai[k] - 1; 2147 while (nz--) xk += (*v++) * x[*vj++]; 2148 x[k] = xk; 2149 } 2150 2151 PetscCall(VecRestoreArrayRead(bb, &b)); 2152 PetscCall(VecRestoreArray(xx, &x)); 2153 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 2154 PetscFunctionReturn(PETSC_SUCCESS); 2155 } 2156 2157 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) 2158 { 2159 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2160 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj; 2161 const MatScalar *aa = a->a, *v; 2162 PetscReal diagk; 2163 PetscScalar *x; 2164 const PetscScalar *b; 2165 PetscInt nz, k; 2166 2167 PetscFunctionBegin; 2168 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2169 PetscCall(VecGetArrayRead(bb, &b)); 2170 PetscCall(VecGetArray(xx, &x)); 2171 PetscCall(PetscArraycpy(x, b, mbs)); 2172 for (k = 0; k < mbs; k++) { 2173 v = aa + ai[k]; 2174 vj = aj + ai[k]; 2175 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2176 while (nz--) x[*vj++] += (*v++) * x[k]; 2177 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2178 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]])); 2179 x[k] *= PetscSqrtReal(diagk); 2180 } 2181 PetscCall(VecRestoreArrayRead(bb, &b)); 2182 PetscCall(VecRestoreArray(xx, &x)); 2183 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2184 PetscFunctionReturn(PETSC_SUCCESS); 2185 } 2186 2187 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 2188 { 2189 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2190 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2191 const MatScalar *aa = a->a, *v; 2192 PetscReal diagk; 2193 PetscScalar *x; 2194 const PetscScalar *b; 2195 PetscInt nz, k; 2196 2197 PetscFunctionBegin; 2198 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2199 PetscCall(VecGetArrayRead(bb, &b)); 2200 PetscCall(VecGetArray(xx, &x)); 2201 PetscCall(PetscArraycpy(x, b, mbs)); 2202 for (k = 0; k < mbs; k++) { 2203 v = aa + ai[k] + 1; 2204 vj = aj + ai[k] + 1; 2205 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2206 while (nz--) x[*vj++] += (*v++) * x[k]; 2207 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2208 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]])); 2209 x[k] *= PetscSqrtReal(diagk); 2210 } 2211 PetscCall(VecRestoreArrayRead(bb, &b)); 2212 PetscCall(VecRestoreArray(xx, &x)); 2213 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2214 PetscFunctionReturn(PETSC_SUCCESS); 2215 } 2216 2217 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) 2218 { 2219 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2220 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj; 2221 const MatScalar *aa = a->a, *v; 2222 PetscReal diagk; 2223 PetscScalar *x; 2224 const PetscScalar *b; 2225 PetscInt nz, k; 2226 2227 PetscFunctionBegin; 2228 /* solve D^(1/2)*U*x = b by back substitution */ 2229 PetscCall(VecGetArrayRead(bb, &b)); 2230 PetscCall(VecGetArray(xx, &x)); 2231 2232 for (k = mbs - 1; k >= 0; k--) { 2233 v = aa + ai[k]; 2234 vj = aj + ai[k]; 2235 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2236 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 2237 x[k] = PetscSqrtReal(diagk) * b[k]; 2238 nz = ai[k + 1] - ai[k] - 1; 2239 while (nz--) x[k] += (*v++) * x[*vj++]; 2240 } 2241 PetscCall(VecRestoreArrayRead(bb, &b)); 2242 PetscCall(VecRestoreArray(xx, &x)); 2243 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2244 PetscFunctionReturn(PETSC_SUCCESS); 2245 } 2246 2247 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 2248 { 2249 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2250 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2251 const MatScalar *aa = a->a, *v; 2252 PetscReal diagk; 2253 PetscScalar *x; 2254 const PetscScalar *b; 2255 PetscInt nz, k; 2256 2257 PetscFunctionBegin; 2258 /* solve D^(1/2)*U*x = b by back substitution */ 2259 PetscCall(VecGetArrayRead(bb, &b)); 2260 PetscCall(VecGetArray(xx, &x)); 2261 2262 for (k = mbs - 1; k >= 0; k--) { 2263 v = aa + ai[k] + 1; 2264 vj = aj + ai[k] + 1; 2265 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2266 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 2267 x[k] = PetscSqrtReal(diagk) * b[k]; 2268 nz = ai[k + 1] - ai[k] - 1; 2269 while (nz--) x[k] += (*v++) * x[*vj++]; 2270 } 2271 PetscCall(VecRestoreArrayRead(bb, &b)); 2272 PetscCall(VecRestoreArray(xx, &x)); 2273 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2274 PetscFunctionReturn(PETSC_SUCCESS); 2275 } 2276 2277 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2278 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2279 { 2280 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b; 2281 const PetscInt *rip, mbs = a->mbs, *ai, *aj; 2282 PetscInt *jutmp, bs = A->rmap->bs, i; 2283 PetscInt m, reallocs = 0, *levtmp; 2284 PetscInt *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd; 2285 PetscInt incrlev, *lev, shift, prow, nz; 2286 PetscReal f = info->fill, levels = info->levels; 2287 PetscBool perm_identity; 2288 2289 PetscFunctionBegin; 2290 /* check whether perm is the identity mapping */ 2291 PetscCall(ISIdentity(perm, &perm_identity)); 2292 2293 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2294 a->permute = PETSC_FALSE; 2295 ai = a->i; 2296 aj = a->j; 2297 2298 /* initialization */ 2299 PetscCall(ISGetIndices(perm, &rip)); 2300 umax = (PetscInt)(f * ai[mbs] + 1); 2301 PetscCall(PetscMalloc1(umax, &lev)); 2302 umax += mbs + 1; 2303 shift = mbs + 1; 2304 PetscCall(PetscMalloc1(mbs + 1, &iu)); 2305 PetscCall(PetscMalloc1(umax, &ju)); 2306 iu[0] = mbs + 1; 2307 juidx = mbs + 1; 2308 /* prowl: linked list for pivot row */ 2309 PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp)); 2310 /* q: linked list for col index */ 2311 2312 for (i = 0; i < mbs; i++) { 2313 prowl[i] = mbs; 2314 q[i] = 0; 2315 } 2316 2317 /* for each row k */ 2318 for (k = 0; k < mbs; k++) { 2319 nzk = 0; 2320 q[k] = mbs; 2321 /* copy current row into linked list */ 2322 nz = ai[rip[k] + 1] - ai[rip[k]]; 2323 j = ai[rip[k]]; 2324 while (nz--) { 2325 vj = rip[aj[j++]]; 2326 if (vj > k) { 2327 qm = k; 2328 do { 2329 m = qm; 2330 qm = q[m]; 2331 } while (qm < vj); 2332 PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A"); 2333 nzk++; 2334 q[m] = vj; 2335 q[vj] = qm; 2336 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2337 } 2338 } 2339 2340 /* modify nonzero structure of k-th row by computing fill-in 2341 for each row prow to be merged in */ 2342 prow = k; 2343 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2344 2345 while (prow < k) { 2346 /* merge row prow into k-th row */ 2347 jmin = iu[prow] + 1; 2348 jmax = iu[prow + 1]; 2349 qm = k; 2350 for (j = jmin; j < jmax; j++) { 2351 incrlev = lev[j - shift] + 1; 2352 if (incrlev > levels) continue; 2353 vj = ju[j]; 2354 do { 2355 m = qm; 2356 qm = q[m]; 2357 } while (qm < vj); 2358 if (qm != vj) { /* a fill */ 2359 nzk++; 2360 q[m] = vj; 2361 q[vj] = qm; 2362 qm = vj; 2363 levtmp[vj] = incrlev; 2364 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2365 } 2366 prow = prowl[prow]; /* next pivot row */ 2367 } 2368 2369 /* add k to row list for first nonzero element in k-th row */ 2370 if (nzk > 1) { 2371 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2372 prowl[k] = prowl[i]; 2373 prowl[i] = k; 2374 } 2375 iu[k + 1] = iu[k] + nzk; 2376 2377 /* allocate more space to ju and lev if needed */ 2378 if (iu[k + 1] > umax) { 2379 /* estimate how much additional space we will need */ 2380 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2381 /* just double the memory each time */ 2382 maxadd = umax; 2383 if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2; 2384 umax += maxadd; 2385 2386 /* allocate a longer ju */ 2387 PetscCall(PetscMalloc1(umax, &jutmp)); 2388 PetscCall(PetscArraycpy(jutmp, ju, iu[k])); 2389 PetscCall(PetscFree(ju)); 2390 ju = jutmp; 2391 2392 PetscCall(PetscMalloc1(umax, &jutmp)); 2393 PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift)); 2394 PetscCall(PetscFree(lev)); 2395 lev = jutmp; 2396 reallocs += 2; /* count how many times we realloc */ 2397 } 2398 2399 /* save nonzero structure of k-th row in ju */ 2400 i = k; 2401 while (nzk--) { 2402 i = q[i]; 2403 ju[juidx] = i; 2404 lev[juidx - shift] = levtmp[i]; 2405 juidx++; 2406 } 2407 } 2408 2409 #if defined(PETSC_USE_INFO) 2410 if (ai[mbs] != 0) { 2411 PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]); 2412 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af)); 2413 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2414 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af)); 2415 PetscCall(PetscInfo(A, "for best performance.\n")); 2416 } else { 2417 PetscCall(PetscInfo(A, "Empty matrix\n")); 2418 } 2419 #endif 2420 2421 PetscCall(ISRestoreIndices(perm, &rip)); 2422 PetscCall(PetscFree3(prowl, q, levtmp)); 2423 PetscCall(PetscFree(lev)); 2424 2425 /* put together the new matrix */ 2426 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL)); 2427 2428 b = (Mat_SeqSBAIJ *)(B)->data; 2429 PetscCall(PetscFree2(b->imax, b->ilen)); 2430 2431 b->singlemalloc = PETSC_FALSE; 2432 b->free_a = PETSC_TRUE; 2433 b->free_ij = PETSC_TRUE; 2434 /* the next line frees the default space generated by the Create() */ 2435 PetscCall(PetscFree3(b->a, b->j, b->i)); 2436 PetscCall(PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a)); 2437 b->j = ju; 2438 b->i = iu; 2439 b->diag = NULL; 2440 b->ilen = NULL; 2441 b->imax = NULL; 2442 2443 PetscCall(ISDestroy(&b->row)); 2444 PetscCall(ISDestroy(&b->icol)); 2445 b->row = perm; 2446 b->icol = perm; 2447 PetscCall(PetscObjectReference((PetscObject)perm)); 2448 PetscCall(PetscObjectReference((PetscObject)perm)); 2449 PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work)); 2450 /* In b structure: Free imax, ilen, old a, old j. 2451 Allocate idnew, solve_work, new a, new j */ 2452 b->maxnz = b->nz = iu[mbs]; 2453 2454 (B)->info.factor_mallocs = reallocs; 2455 (B)->info.fill_ratio_given = f; 2456 if (ai[mbs] != 0) { 2457 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]); 2458 } else { 2459 (B)->info.fill_ratio_needed = 0.0; 2460 } 2461 PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity)); 2462 PetscFunctionReturn(PETSC_SUCCESS); 2463 } 2464 2465 /* 2466 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2467 */ 2468 #include <petscbt.h> 2469 #include <../src/mat/utils/freespace.h> 2470 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 2471 { 2472 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b; 2473 PetscBool perm_identity, free_ij = PETSC_TRUE, missing; 2474 PetscInt bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j; 2475 const PetscInt *rip; 2476 PetscInt reallocs = 0, i, *ui, *udiag, *cols; 2477 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 2478 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr; 2479 PetscReal fill = info->fill, levels = info->levels; 2480 PetscFreeSpaceList free_space = NULL, current_space = NULL; 2481 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 2482 PetscBT lnkbt; 2483 2484 PetscFunctionBegin; 2485 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2486 PetscCall(MatMissingDiagonal(A, &missing, &d)); 2487 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d); 2488 if (bs > 1) { 2489 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info)); 2490 PetscFunctionReturn(PETSC_SUCCESS); 2491 } 2492 2493 /* check whether perm is the identity mapping */ 2494 PetscCall(ISIdentity(perm, &perm_identity)); 2495 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2496 a->permute = PETSC_FALSE; 2497 2498 PetscCall(PetscMalloc1(am + 1, &ui)); 2499 PetscCall(PetscMalloc1(am + 1, &udiag)); 2500 ui[0] = 0; 2501 2502 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2503 if (!levels) { 2504 /* reuse the column pointers and row offsets for memory savings */ 2505 for (i = 0; i < am; i++) { 2506 ncols = ai[i + 1] - ai[i]; 2507 ui[i + 1] = ui[i] + ncols; 2508 udiag[i] = ui[i + 1] - 1; /* points to the last entry of U(i,:) */ 2509 } 2510 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 2511 cols = uj; 2512 for (i = 0; i < am; i++) { 2513 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2514 ncols = ai[i + 1] - ai[i] - 1; 2515 for (j = 0; j < ncols; j++) *cols++ = aj[j]; 2516 *cols++ = i; /* diagonal is located as the last entry of U(i,:) */ 2517 } 2518 } else { /* case: levels>0 */ 2519 PetscCall(ISGetIndices(perm, &rip)); 2520 2521 /* initialization */ 2522 /* jl: linked list for storing indices of the pivot rows 2523 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2524 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 2525 for (i = 0; i < am; i++) { 2526 jl[i] = am; 2527 il[i] = 0; 2528 } 2529 2530 /* create and initialize a linked list for storing column indices of the active row k */ 2531 nlnk = am + 1; 2532 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 2533 2534 /* initial FreeSpace size is fill*(ai[am]+1) */ 2535 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space)); 2536 2537 current_space = free_space; 2538 2539 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl)); 2540 2541 current_space_lvl = free_space_lvl; 2542 2543 for (k = 0; k < am; k++) { /* for each active row k */ 2544 /* initialize lnk by the column indices of row k */ 2545 nzk = 0; 2546 ncols = ai[k + 1] - ai[k]; 2547 PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k); 2548 cols = aj + ai[k]; 2549 PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt)); 2550 nzk += nlnk; 2551 2552 /* update lnk by computing fill-in for each pivot row to be merged in */ 2553 prow = jl[k]; /* 1st pivot row */ 2554 2555 while (prow < k) { 2556 nextprow = jl[prow]; 2557 2558 /* merge prow into k-th row */ 2559 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2560 jmax = ui[prow + 1]; 2561 ncols = jmax - jmin; 2562 i = jmin - ui[prow]; 2563 2564 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2565 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2566 j = *(uj - 1); 2567 PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j)); 2568 nzk += nlnk; 2569 2570 /* update il and jl for prow */ 2571 if (jmin < jmax) { 2572 il[prow] = jmin; 2573 2574 j = *cols; 2575 jl[prow] = jl[j]; 2576 jl[j] = prow; 2577 } 2578 prow = nextprow; 2579 } 2580 2581 /* if free space is not available, make more free space */ 2582 if (current_space->local_remaining < nzk) { 2583 i = am - k + 1; /* num of unfactored rows */ 2584 i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2585 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 2586 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 2587 reallocs++; 2588 } 2589 2590 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2591 PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k); 2592 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 2593 2594 /* add the k-th row into il and jl */ 2595 if (nzk > 1) { 2596 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2597 jl[k] = jl[i]; 2598 jl[i] = k; 2599 il[k] = ui[k] + 1; 2600 } 2601 uj_ptr[k] = current_space->array; 2602 uj_lvl_ptr[k] = current_space_lvl->array; 2603 2604 current_space->array += nzk; 2605 current_space->local_used += nzk; 2606 current_space->local_remaining -= nzk; 2607 current_space_lvl->array += nzk; 2608 current_space_lvl->local_used += nzk; 2609 current_space_lvl->local_remaining -= nzk; 2610 2611 ui[k + 1] = ui[k] + nzk; 2612 } 2613 2614 PetscCall(ISRestoreIndices(perm, &rip)); 2615 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 2616 2617 /* destroy list of free space and other temporary array(s) */ 2618 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 2619 PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor */ 2620 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 2621 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 2622 2623 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2624 2625 /* put together the new matrix in MATSEQSBAIJ format */ 2626 PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 2627 2628 b = (Mat_SeqSBAIJ *)(fact)->data; 2629 PetscCall(PetscFree2(b->imax, b->ilen)); 2630 2631 b->singlemalloc = PETSC_FALSE; 2632 b->free_a = PETSC_TRUE; 2633 b->free_ij = free_ij; 2634 2635 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 2636 2637 b->j = uj; 2638 b->i = ui; 2639 b->diag = udiag; 2640 b->free_diag = PETSC_TRUE; 2641 b->ilen = NULL; 2642 b->imax = NULL; 2643 b->row = perm; 2644 b->col = perm; 2645 2646 PetscCall(PetscObjectReference((PetscObject)perm)); 2647 PetscCall(PetscObjectReference((PetscObject)perm)); 2648 2649 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2650 2651 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 2652 2653 b->maxnz = b->nz = ui[am]; 2654 2655 fact->info.factor_mallocs = reallocs; 2656 fact->info.fill_ratio_given = fill; 2657 if (ai[am] != 0) { 2658 fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am]; 2659 } else { 2660 fact->info.fill_ratio_needed = 0.0; 2661 } 2662 #if defined(PETSC_USE_INFO) 2663 if (ai[am] != 0) { 2664 PetscReal af = fact->info.fill_ratio_needed; 2665 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 2666 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2667 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 2668 } else { 2669 PetscCall(PetscInfo(A, "Empty matrix\n")); 2670 } 2671 #endif 2672 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2673 PetscFunctionReturn(PETSC_SUCCESS); 2674 } 2675 2676 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 2677 { 2678 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2679 Mat_SeqSBAIJ *b; 2680 PetscBool perm_identity, free_ij = PETSC_TRUE; 2681 PetscInt bs = A->rmap->bs, am = a->mbs; 2682 const PetscInt *cols, *rip, *ai = a->i, *aj = a->j; 2683 PetscInt reallocs = 0, i, *ui; 2684 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 2685 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 2686 PetscReal fill = info->fill, levels = info->levels, ratio_needed; 2687 PetscFreeSpaceList free_space = NULL, current_space = NULL; 2688 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 2689 PetscBT lnkbt; 2690 2691 PetscFunctionBegin; 2692 /* 2693 This code originally uses Modified Sparse Row (MSR) storage 2694 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2695 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2696 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2697 thus the original code in MSR format is still used for these cases. 2698 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2699 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2700 */ 2701 if (bs > 1) { 2702 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info)); 2703 PetscFunctionReturn(PETSC_SUCCESS); 2704 } 2705 2706 /* check whether perm is the identity mapping */ 2707 PetscCall(ISIdentity(perm, &perm_identity)); 2708 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2709 a->permute = PETSC_FALSE; 2710 2711 /* special case that simply copies fill pattern */ 2712 if (!levels) { 2713 /* reuse the column pointers and row offsets for memory savings */ 2714 ui = a->i; 2715 uj = a->j; 2716 free_ij = PETSC_FALSE; 2717 ratio_needed = 1.0; 2718 } else { /* case: levels>0 */ 2719 PetscCall(ISGetIndices(perm, &rip)); 2720 2721 /* initialization */ 2722 PetscCall(PetscMalloc1(am + 1, &ui)); 2723 ui[0] = 0; 2724 2725 /* jl: linked list for storing indices of the pivot rows 2726 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2727 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 2728 for (i = 0; i < am; i++) { 2729 jl[i] = am; 2730 il[i] = 0; 2731 } 2732 2733 /* create and initialize a linked list for storing column indices of the active row k */ 2734 nlnk = am + 1; 2735 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 2736 2737 /* initial FreeSpace size is fill*(ai[am]+1) */ 2738 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space)); 2739 2740 current_space = free_space; 2741 2742 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl)); 2743 2744 current_space_lvl = free_space_lvl; 2745 2746 for (k = 0; k < am; k++) { /* for each active row k */ 2747 /* initialize lnk by the column indices of row rip[k] */ 2748 nzk = 0; 2749 ncols = ai[rip[k] + 1] - ai[rip[k]]; 2750 cols = aj + ai[rip[k]]; 2751 PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt)); 2752 nzk += nlnk; 2753 2754 /* update lnk by computing fill-in for each pivot row to be merged in */ 2755 prow = jl[k]; /* 1st pivot row */ 2756 2757 while (prow < k) { 2758 nextprow = jl[prow]; 2759 2760 /* merge prow into k-th row */ 2761 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2762 jmax = ui[prow + 1]; 2763 ncols = jmax - jmin; 2764 i = jmin - ui[prow]; 2765 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2766 j = *(uj_lvl_ptr[prow] + i - 1); 2767 cols_lvl = uj_lvl_ptr[prow] + i; 2768 PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j)); 2769 nzk += nlnk; 2770 2771 /* update il and jl for prow */ 2772 if (jmin < jmax) { 2773 il[prow] = jmin; 2774 j = *cols; 2775 jl[prow] = jl[j]; 2776 jl[j] = prow; 2777 } 2778 prow = nextprow; 2779 } 2780 2781 /* if free space is not available, make more free space */ 2782 if (current_space->local_remaining < nzk) { 2783 i = am - k + 1; /* num of unfactored rows */ 2784 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2785 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 2786 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 2787 reallocs++; 2788 } 2789 2790 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2791 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 2792 2793 /* add the k-th row into il and jl */ 2794 if (nzk - 1 > 0) { 2795 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2796 jl[k] = jl[i]; 2797 jl[i] = k; 2798 il[k] = ui[k] + 1; 2799 } 2800 uj_ptr[k] = current_space->array; 2801 uj_lvl_ptr[k] = current_space_lvl->array; 2802 2803 current_space->array += nzk; 2804 current_space->local_used += nzk; 2805 current_space->local_remaining -= nzk; 2806 current_space_lvl->array += nzk; 2807 current_space_lvl->local_used += nzk; 2808 current_space_lvl->local_remaining -= nzk; 2809 2810 ui[k + 1] = ui[k] + nzk; 2811 } 2812 2813 PetscCall(ISRestoreIndices(perm, &rip)); 2814 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 2815 2816 /* destroy list of free space and other temporary array(s) */ 2817 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 2818 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 2819 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 2820 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 2821 if (ai[am] != 0) { 2822 ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]); 2823 } else { 2824 ratio_needed = 0.0; 2825 } 2826 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2827 2828 /* put together the new matrix in MATSEQSBAIJ format */ 2829 PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 2830 2831 b = (Mat_SeqSBAIJ *)(fact)->data; 2832 2833 PetscCall(PetscFree2(b->imax, b->ilen)); 2834 2835 b->singlemalloc = PETSC_FALSE; 2836 b->free_a = PETSC_TRUE; 2837 b->free_ij = free_ij; 2838 2839 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 2840 2841 b->j = uj; 2842 b->i = ui; 2843 b->diag = NULL; 2844 b->ilen = NULL; 2845 b->imax = NULL; 2846 b->row = perm; 2847 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2848 2849 PetscCall(PetscObjectReference((PetscObject)perm)); 2850 b->icol = perm; 2851 PetscCall(PetscObjectReference((PetscObject)perm)); 2852 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 2853 2854 b->maxnz = b->nz = ui[am]; 2855 2856 fact->info.factor_mallocs = reallocs; 2857 fact->info.fill_ratio_given = fill; 2858 fact->info.fill_ratio_needed = ratio_needed; 2859 #if defined(PETSC_USE_INFO) 2860 if (ai[am] != 0) { 2861 PetscReal af = fact->info.fill_ratio_needed; 2862 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 2863 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2864 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 2865 } else { 2866 PetscCall(PetscInfo(A, "Empty matrix\n")); 2867 } 2868 #endif 2869 if (perm_identity) { 2870 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2871 } else { 2872 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2873 } 2874 PetscFunctionReturn(PETSC_SUCCESS); 2875 } 2876