1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <petsc/private/kernels/blockinvert.h> 9 10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) { 11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 12 IS isrow = a->row; 13 PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 14 const PetscInt *r; 15 PetscInt nz, *vj, k, idx, k1; 16 PetscInt bs = A->rmap->bs, bs2 = a->bs2; 17 const MatScalar *aa = a->a, *v, *diag; 18 PetscScalar *x, *xk, *xj, *xk_tmp, *t; 19 const PetscScalar *b; 20 21 PetscFunctionBegin; 22 PetscCall(VecGetArrayRead(bb, &b)); 23 PetscCall(VecGetArray(xx, &x)); 24 t = a->solve_work; 25 PetscCall(ISGetIndices(isrow, &r)); 26 PetscCall(PetscMalloc1(bs, &xk_tmp)); 27 28 /* solve U^T * D * y = b by forward substitution */ 29 xk = t; 30 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 31 idx = bs * r[k]; 32 for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1]; 33 } 34 for (k = 0; k < mbs; k++) { 35 v = aa + bs2 * ai[k]; 36 xk = t + k * bs; /* Dk*xk = k-th block of x */ 37 PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */ 38 nz = ai[k + 1] - ai[k]; 39 vj = aj + ai[k]; 40 xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */ 41 while (nz--) { 42 /* x(:) += U(k,:)^T*(Dk*xk) */ 43 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */ 44 vj++; 45 xj = t + (*vj) * bs; 46 v += bs2; 47 } 48 /* xk = inv(Dk)*(Dk*xk) */ 49 diag = aa + k * bs2; /* ptr to inv(Dk) */ 50 PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */ 51 } 52 53 /* solve U*x = y by back substitution */ 54 for (k = mbs - 1; k >= 0; k--) { 55 v = aa + bs2 * ai[k]; 56 xk = t + k * bs; /* xk */ 57 nz = ai[k + 1] - ai[k]; 58 vj = aj + ai[k]; 59 xj = t + (*vj) * bs; 60 while (nz--) { 61 /* xk += U(k,:)*x(:) */ 62 PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */ 63 vj++; 64 v += bs2; 65 xj = t + (*vj) * bs; 66 } 67 idx = bs * r[k]; 68 for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++; 69 } 70 71 PetscCall(PetscFree(xk_tmp)); 72 PetscCall(ISRestoreIndices(isrow, &r)); 73 PetscCall(VecRestoreArrayRead(bb, &b)); 74 PetscCall(VecRestoreArray(xx, &x)); 75 PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs)); 76 PetscFunctionReturn(0); 77 } 78 79 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) { 80 PetscFunctionBegin; 81 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet"); 82 } 83 84 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) { 85 PetscFunctionBegin; 86 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented"); 87 } 88 89 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x) { 90 PetscInt nz, k; 91 const PetscInt *vj, bs2 = bs * bs; 92 const MatScalar *v, *diag; 93 PetscScalar *xk, *xj, *xk_tmp; 94 95 PetscFunctionBegin; 96 PetscCall(PetscMalloc1(bs, &xk_tmp)); 97 for (k = 0; k < mbs; k++) { 98 v = aa + bs2 * ai[k]; 99 xk = x + k * bs; /* Dk*xk = k-th block of x */ 100 PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */ 101 nz = ai[k + 1] - ai[k]; 102 vj = aj + ai[k]; 103 xj = x + (*vj) * bs; /* *vj-th block of x, *vj>k */ 104 while (nz--) { 105 /* x(:) += U(k,:)^T*(Dk*xk) */ 106 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */ 107 vj++; 108 xj = x + (*vj) * bs; 109 v += bs2; 110 } 111 /* xk = inv(Dk)*(Dk*xk) */ 112 diag = aa + k * bs2; /* ptr to inv(Dk) */ 113 PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */ 114 } 115 PetscCall(PetscFree(xk_tmp)); 116 PetscFunctionReturn(0); 117 } 118 119 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x) { 120 PetscInt nz, k; 121 const PetscInt *vj, bs2 = bs * bs; 122 const MatScalar *v; 123 PetscScalar *xk, *xj; 124 125 PetscFunctionBegin; 126 for (k = mbs - 1; k >= 0; k--) { 127 v = aa + bs2 * ai[k]; 128 xk = x + k * bs; /* xk */ 129 nz = ai[k + 1] - ai[k]; 130 vj = aj + ai[k]; 131 xj = x + (*vj) * bs; 132 while (nz--) { 133 /* xk += U(k,:)*x(:) */ 134 PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */ 135 vj++; 136 v += bs2; 137 xj = x + (*vj) * bs; 138 } 139 } 140 PetscFunctionReturn(0); 141 } 142 143 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 144 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 145 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 146 PetscInt bs = A->rmap->bs; 147 const MatScalar *aa = a->a; 148 PetscScalar *x; 149 const PetscScalar *b; 150 151 PetscFunctionBegin; 152 PetscCall(VecGetArrayRead(bb, &b)); 153 PetscCall(VecGetArray(xx, &x)); 154 155 /* solve U^T * D * y = b by forward substitution */ 156 PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */ 157 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 158 159 /* solve U*x = y by back substitution */ 160 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 161 162 PetscCall(VecRestoreArrayRead(bb, &b)); 163 PetscCall(VecRestoreArray(xx, &x)); 164 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs)); 165 PetscFunctionReturn(0); 166 } 167 168 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 169 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 170 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 171 PetscInt bs = A->rmap->bs; 172 const MatScalar *aa = a->a; 173 const PetscScalar *b; 174 PetscScalar *x; 175 176 PetscFunctionBegin; 177 PetscCall(VecGetArrayRead(bb, &b)); 178 PetscCall(VecGetArray(xx, &x)); 179 PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */ 180 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 181 PetscCall(VecRestoreArrayRead(bb, &b)); 182 PetscCall(VecRestoreArray(xx, &x)); 183 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs)); 184 PetscFunctionReturn(0); 185 } 186 187 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 188 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 189 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 190 PetscInt bs = A->rmap->bs; 191 const MatScalar *aa = a->a; 192 const PetscScalar *b; 193 PetscScalar *x; 194 195 PetscFunctionBegin; 196 PetscCall(VecGetArrayRead(bb, &b)); 197 PetscCall(VecGetArray(xx, &x)); 198 PetscCall(PetscArraycpy(x, b, bs * mbs)); 199 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 200 PetscCall(VecRestoreArrayRead(bb, &b)); 201 PetscCall(VecRestoreArray(xx, &x)); 202 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 203 PetscFunctionReturn(0); 204 } 205 206 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx) { 207 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 208 IS isrow = a->row; 209 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj; 210 PetscInt nz, k, idx; 211 const MatScalar *aa = a->a, *v, *d; 212 PetscScalar *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp; 213 const PetscScalar *b; 214 215 PetscFunctionBegin; 216 PetscCall(VecGetArrayRead(bb, &b)); 217 PetscCall(VecGetArray(xx, &x)); 218 t = a->solve_work; 219 PetscCall(ISGetIndices(isrow, &r)); 220 221 /* solve U^T * D * y = b by forward substitution */ 222 tp = t; 223 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 224 idx = 7 * r[k]; 225 tp[0] = b[idx]; 226 tp[1] = b[idx + 1]; 227 tp[2] = b[idx + 2]; 228 tp[3] = b[idx + 3]; 229 tp[4] = b[idx + 4]; 230 tp[5] = b[idx + 5]; 231 tp[6] = b[idx + 6]; 232 tp += 7; 233 } 234 235 for (k = 0; k < mbs; k++) { 236 v = aa + 49 * ai[k]; 237 vj = aj + ai[k]; 238 tp = t + k * 7; 239 x0 = tp[0]; 240 x1 = tp[1]; 241 x2 = tp[2]; 242 x3 = tp[3]; 243 x4 = tp[4]; 244 x5 = tp[5]; 245 x6 = tp[6]; 246 nz = ai[k + 1] - ai[k]; 247 tp = t + (*vj) * 7; 248 while (nz--) { 249 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6; 250 tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6; 251 tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6; 252 tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6; 253 tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6; 254 tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6; 255 tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6; 256 vj++; 257 tp = t + (*vj) * 7; 258 v += 49; 259 } 260 261 /* xk = inv(Dk)*(Dk*xk) */ 262 d = aa + k * 49; /* ptr to inv(Dk) */ 263 tp = t + k * 7; 264 tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6; 265 tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6; 266 tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6; 267 tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6; 268 tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6; 269 tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6; 270 tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6; 271 } 272 273 /* solve U*x = y by back substitution */ 274 for (k = mbs - 1; k >= 0; k--) { 275 v = aa + 49 * ai[k]; 276 vj = aj + ai[k]; 277 tp = t + k * 7; 278 x0 = tp[0]; 279 x1 = tp[1]; 280 x2 = tp[2]; 281 x3 = tp[3]; 282 x4 = tp[4]; 283 x5 = tp[5]; 284 x6 = tp[6]; /* xk */ 285 nz = ai[k + 1] - ai[k]; 286 287 tp = t + (*vj) * 7; 288 while (nz--) { 289 /* xk += U(k,:)*x(:) */ 290 x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6]; 291 x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6]; 292 x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6]; 293 x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6]; 294 x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6]; 295 x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6]; 296 x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6]; 297 vj++; 298 tp = t + (*vj) * 7; 299 v += 49; 300 } 301 tp = t + k * 7; 302 tp[0] = x0; 303 tp[1] = x1; 304 tp[2] = x2; 305 tp[3] = x3; 306 tp[4] = x4; 307 tp[5] = x5; 308 tp[6] = x6; 309 idx = 7 * r[k]; 310 x[idx] = x0; 311 x[idx + 1] = x1; 312 x[idx + 2] = x2; 313 x[idx + 3] = x3; 314 x[idx + 4] = x4; 315 x[idx + 5] = x5; 316 x[idx + 6] = x6; 317 } 318 319 PetscCall(ISRestoreIndices(isrow, &r)); 320 PetscCall(VecRestoreArrayRead(bb, &b)); 321 PetscCall(VecRestoreArray(xx, &x)); 322 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 323 PetscFunctionReturn(0); 324 } 325 326 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 327 const MatScalar *v, *d; 328 PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6; 329 PetscInt nz, k; 330 const PetscInt *vj; 331 332 PetscFunctionBegin; 333 for (k = 0; k < mbs; k++) { 334 v = aa + 49 * ai[k]; 335 xp = x + k * 7; 336 x0 = xp[0]; 337 x1 = xp[1]; 338 x2 = xp[2]; 339 x3 = xp[3]; 340 x4 = xp[4]; 341 x5 = xp[5]; 342 x6 = xp[6]; /* Dk*xk = k-th block of x */ 343 nz = ai[k + 1] - ai[k]; 344 vj = aj + ai[k]; 345 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 346 PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 347 while (nz--) { 348 xp = x + (*vj) * 7; 349 /* x(:) += U(k,:)^T*(Dk*xk) */ 350 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6; 351 xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6; 352 xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6; 353 xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6; 354 xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6; 355 xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6; 356 xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6; 357 vj++; 358 v += 49; 359 } 360 /* xk = inv(Dk)*(Dk*xk) */ 361 d = aa + k * 49; /* ptr to inv(Dk) */ 362 xp = x + k * 7; 363 xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6; 364 xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6; 365 xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6; 366 xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6; 367 xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6; 368 xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6; 369 xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6; 370 } 371 PetscFunctionReturn(0); 372 } 373 374 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 375 const MatScalar *v; 376 PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6; 377 PetscInt nz, k; 378 const PetscInt *vj; 379 380 PetscFunctionBegin; 381 for (k = mbs - 1; k >= 0; k--) { 382 v = aa + 49 * ai[k]; 383 xp = x + k * 7; 384 x0 = xp[0]; 385 x1 = xp[1]; 386 x2 = xp[2]; 387 x3 = xp[3]; 388 x4 = xp[4]; 389 x5 = xp[5]; 390 x6 = xp[6]; /* xk */ 391 nz = ai[k + 1] - ai[k]; 392 vj = aj + ai[k]; 393 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 394 PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 395 while (nz--) { 396 xp = x + (*vj) * 7; 397 /* xk += U(k,:)*x(:) */ 398 x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6]; 399 x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6]; 400 x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6]; 401 x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6]; 402 x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6]; 403 x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6]; 404 x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6]; 405 vj++; 406 v += 49; 407 } 408 xp = x + k * 7; 409 xp[0] = x0; 410 xp[1] = x1; 411 xp[2] = x2; 412 xp[3] = x3; 413 xp[4] = x4; 414 xp[5] = x5; 415 xp[6] = x6; 416 } 417 PetscFunctionReturn(0); 418 } 419 420 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 421 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 422 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 423 const MatScalar *aa = a->a; 424 PetscScalar *x; 425 const PetscScalar *b; 426 427 PetscFunctionBegin; 428 PetscCall(VecGetArrayRead(bb, &b)); 429 PetscCall(VecGetArray(xx, &x)); 430 431 /* solve U^T * D * y = b by forward substitution */ 432 PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */ 433 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 434 435 /* solve U*x = y by back substitution */ 436 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 437 438 PetscCall(VecRestoreArrayRead(bb, &b)); 439 PetscCall(VecRestoreArray(xx, &x)); 440 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 441 PetscFunctionReturn(0); 442 } 443 444 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 445 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 446 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 447 const MatScalar *aa = a->a; 448 PetscScalar *x; 449 const PetscScalar *b; 450 451 PetscFunctionBegin; 452 PetscCall(VecGetArrayRead(bb, &b)); 453 PetscCall(VecGetArray(xx, &x)); 454 PetscCall(PetscArraycpy(x, b, 7 * mbs)); 455 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 456 PetscCall(VecRestoreArrayRead(bb, &b)); 457 PetscCall(VecRestoreArray(xx, &x)); 458 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 459 PetscFunctionReturn(0); 460 } 461 462 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 463 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 464 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 465 const MatScalar *aa = a->a; 466 PetscScalar *x; 467 const PetscScalar *b; 468 469 PetscFunctionBegin; 470 PetscCall(VecGetArrayRead(bb, &b)); 471 PetscCall(VecGetArray(xx, &x)); 472 PetscCall(PetscArraycpy(x, b, 7 * mbs)); 473 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 474 PetscCall(VecRestoreArrayRead(bb, &b)); 475 PetscCall(VecRestoreArray(xx, &x)); 476 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 477 PetscFunctionReturn(0); 478 } 479 480 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx) { 481 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 482 IS isrow = a->row; 483 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj; 484 PetscInt nz, k, idx; 485 const MatScalar *aa = a->a, *v, *d; 486 PetscScalar *x, x0, x1, x2, x3, x4, x5, *t, *tp; 487 const PetscScalar *b; 488 489 PetscFunctionBegin; 490 PetscCall(VecGetArrayRead(bb, &b)); 491 PetscCall(VecGetArray(xx, &x)); 492 t = a->solve_work; 493 PetscCall(ISGetIndices(isrow, &r)); 494 495 /* solve U^T * D * y = b by forward substitution */ 496 tp = t; 497 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 498 idx = 6 * r[k]; 499 tp[0] = b[idx]; 500 tp[1] = b[idx + 1]; 501 tp[2] = b[idx + 2]; 502 tp[3] = b[idx + 3]; 503 tp[4] = b[idx + 4]; 504 tp[5] = b[idx + 5]; 505 tp += 6; 506 } 507 508 for (k = 0; k < mbs; k++) { 509 v = aa + 36 * ai[k]; 510 vj = aj + ai[k]; 511 tp = t + k * 6; 512 x0 = tp[0]; 513 x1 = tp[1]; 514 x2 = tp[2]; 515 x3 = tp[3]; 516 x4 = tp[4]; 517 x5 = tp[5]; 518 nz = ai[k + 1] - ai[k]; 519 tp = t + (*vj) * 6; 520 while (nz--) { 521 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5; 522 tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5; 523 tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5; 524 tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5; 525 tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5; 526 tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5; 527 vj++; 528 tp = t + (*vj) * 6; 529 v += 36; 530 } 531 532 /* xk = inv(Dk)*(Dk*xk) */ 533 d = aa + k * 36; /* ptr to inv(Dk) */ 534 tp = t + k * 6; 535 tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5; 536 tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5; 537 tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5; 538 tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5; 539 tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5; 540 tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5; 541 } 542 543 /* solve U*x = y by back substitution */ 544 for (k = mbs - 1; k >= 0; k--) { 545 v = aa + 36 * ai[k]; 546 vj = aj + ai[k]; 547 tp = t + k * 6; 548 x0 = tp[0]; 549 x1 = tp[1]; 550 x2 = tp[2]; 551 x3 = tp[3]; 552 x4 = tp[4]; 553 x5 = tp[5]; /* xk */ 554 nz = ai[k + 1] - ai[k]; 555 556 tp = t + (*vj) * 6; 557 while (nz--) { 558 /* xk += U(k,:)*x(:) */ 559 x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5]; 560 x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5]; 561 x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5]; 562 x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5]; 563 x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5]; 564 x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5]; 565 vj++; 566 tp = t + (*vj) * 6; 567 v += 36; 568 } 569 tp = t + k * 6; 570 tp[0] = x0; 571 tp[1] = x1; 572 tp[2] = x2; 573 tp[3] = x3; 574 tp[4] = x4; 575 tp[5] = x5; 576 idx = 6 * r[k]; 577 x[idx] = x0; 578 x[idx + 1] = x1; 579 x[idx + 2] = x2; 580 x[idx + 3] = x3; 581 x[idx + 4] = x4; 582 x[idx + 5] = x5; 583 } 584 585 PetscCall(ISRestoreIndices(isrow, &r)); 586 PetscCall(VecRestoreArrayRead(bb, &b)); 587 PetscCall(VecRestoreArray(xx, &x)); 588 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 589 PetscFunctionReturn(0); 590 } 591 592 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 593 const MatScalar *v, *d; 594 PetscScalar *xp, x0, x1, x2, x3, x4, x5; 595 PetscInt nz, k; 596 const PetscInt *vj; 597 598 PetscFunctionBegin; 599 for (k = 0; k < mbs; k++) { 600 v = aa + 36 * ai[k]; 601 xp = x + k * 6; 602 x0 = xp[0]; 603 x1 = xp[1]; 604 x2 = xp[2]; 605 x3 = xp[3]; 606 x4 = xp[4]; 607 x5 = xp[5]; /* Dk*xk = k-th block of x */ 608 nz = ai[k + 1] - ai[k]; 609 vj = aj + ai[k]; 610 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 611 PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 612 while (nz--) { 613 xp = x + (*vj) * 6; 614 /* x(:) += U(k,:)^T*(Dk*xk) */ 615 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5; 616 xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5; 617 xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5; 618 xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5; 619 xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5; 620 xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5; 621 vj++; 622 v += 36; 623 } 624 /* xk = inv(Dk)*(Dk*xk) */ 625 d = aa + k * 36; /* ptr to inv(Dk) */ 626 xp = x + k * 6; 627 xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5; 628 xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5; 629 xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5; 630 xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5; 631 xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5; 632 xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5; 633 } 634 PetscFunctionReturn(0); 635 } 636 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 637 const MatScalar *v; 638 PetscScalar *xp, x0, x1, x2, x3, x4, x5; 639 PetscInt nz, k; 640 const PetscInt *vj; 641 642 PetscFunctionBegin; 643 for (k = mbs - 1; k >= 0; k--) { 644 v = aa + 36 * ai[k]; 645 xp = x + k * 6; 646 x0 = xp[0]; 647 x1 = xp[1]; 648 x2 = xp[2]; 649 x3 = xp[3]; 650 x4 = xp[4]; 651 x5 = xp[5]; /* xk */ 652 nz = ai[k + 1] - ai[k]; 653 vj = aj + ai[k]; 654 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 655 PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 656 while (nz--) { 657 xp = x + (*vj) * 6; 658 /* xk += U(k,:)*x(:) */ 659 x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5]; 660 x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5]; 661 x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5]; 662 x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5]; 663 x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5]; 664 x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5]; 665 vj++; 666 v += 36; 667 } 668 xp = x + k * 6; 669 xp[0] = x0; 670 xp[1] = x1; 671 xp[2] = x2; 672 xp[3] = x3; 673 xp[4] = x4; 674 xp[5] = x5; 675 } 676 PetscFunctionReturn(0); 677 } 678 679 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 680 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 681 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 682 const MatScalar *aa = a->a; 683 PetscScalar *x; 684 const PetscScalar *b; 685 686 PetscFunctionBegin; 687 PetscCall(VecGetArrayRead(bb, &b)); 688 PetscCall(VecGetArray(xx, &x)); 689 690 /* solve U^T * D * y = b by forward substitution */ 691 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 692 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 693 694 /* solve U*x = y by back substitution */ 695 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 696 697 PetscCall(VecRestoreArrayRead(bb, &b)); 698 PetscCall(VecRestoreArray(xx, &x)); 699 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 700 PetscFunctionReturn(0); 701 } 702 703 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 704 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 705 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 706 const MatScalar *aa = a->a; 707 PetscScalar *x; 708 const PetscScalar *b; 709 710 PetscFunctionBegin; 711 PetscCall(VecGetArrayRead(bb, &b)); 712 PetscCall(VecGetArray(xx, &x)); 713 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 714 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 715 PetscCall(VecRestoreArrayRead(bb, &b)); 716 PetscCall(VecRestoreArray(xx, &x)); 717 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 718 PetscFunctionReturn(0); 719 } 720 721 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 722 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 723 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 724 const MatScalar *aa = a->a; 725 PetscScalar *x; 726 const PetscScalar *b; 727 728 PetscFunctionBegin; 729 PetscCall(VecGetArrayRead(bb, &b)); 730 PetscCall(VecGetArray(xx, &x)); 731 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 732 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 733 PetscCall(VecRestoreArrayRead(bb, &b)); 734 PetscCall(VecRestoreArray(xx, &x)); 735 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 736 PetscFunctionReturn(0); 737 } 738 739 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx) { 740 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 741 IS isrow = a->row; 742 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 743 const PetscInt *r, *vj; 744 PetscInt nz, k, idx; 745 const MatScalar *aa = a->a, *v, *diag; 746 PetscScalar *x, x0, x1, x2, x3, x4, *t, *tp; 747 const PetscScalar *b; 748 749 PetscFunctionBegin; 750 PetscCall(VecGetArrayRead(bb, &b)); 751 PetscCall(VecGetArray(xx, &x)); 752 t = a->solve_work; 753 PetscCall(ISGetIndices(isrow, &r)); 754 755 /* solve U^T * D * y = b by forward substitution */ 756 tp = t; 757 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 758 idx = 5 * r[k]; 759 tp[0] = b[idx]; 760 tp[1] = b[idx + 1]; 761 tp[2] = b[idx + 2]; 762 tp[3] = b[idx + 3]; 763 tp[4] = b[idx + 4]; 764 tp += 5; 765 } 766 767 for (k = 0; k < mbs; k++) { 768 v = aa + 25 * ai[k]; 769 vj = aj + ai[k]; 770 tp = t + k * 5; 771 x0 = tp[0]; 772 x1 = tp[1]; 773 x2 = tp[2]; 774 x3 = tp[3]; 775 x4 = tp[4]; 776 nz = ai[k + 1] - ai[k]; 777 778 tp = t + (*vj) * 5; 779 while (nz--) { 780 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4; 781 tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4; 782 tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4; 783 tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4; 784 tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4; 785 vj++; 786 tp = t + (*vj) * 5; 787 v += 25; 788 } 789 790 /* xk = inv(Dk)*(Dk*xk) */ 791 diag = aa + k * 25; /* ptr to inv(Dk) */ 792 tp = t + k * 5; 793 tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4; 794 tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4; 795 tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4; 796 tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4; 797 tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4; 798 } 799 800 /* solve U*x = y by back substitution */ 801 for (k = mbs - 1; k >= 0; k--) { 802 v = aa + 25 * ai[k]; 803 vj = aj + ai[k]; 804 tp = t + k * 5; 805 x0 = tp[0]; 806 x1 = tp[1]; 807 x2 = tp[2]; 808 x3 = tp[3]; 809 x4 = tp[4]; /* xk */ 810 nz = ai[k + 1] - ai[k]; 811 812 tp = t + (*vj) * 5; 813 while (nz--) { 814 /* xk += U(k,:)*x(:) */ 815 x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4]; 816 x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4]; 817 x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4]; 818 x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4]; 819 x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4]; 820 vj++; 821 tp = t + (*vj) * 5; 822 v += 25; 823 } 824 tp = t + k * 5; 825 tp[0] = x0; 826 tp[1] = x1; 827 tp[2] = x2; 828 tp[3] = x3; 829 tp[4] = x4; 830 idx = 5 * r[k]; 831 x[idx] = x0; 832 x[idx + 1] = x1; 833 x[idx + 2] = x2; 834 x[idx + 3] = x3; 835 x[idx + 4] = x4; 836 } 837 838 PetscCall(ISRestoreIndices(isrow, &r)); 839 PetscCall(VecRestoreArrayRead(bb, &b)); 840 PetscCall(VecRestoreArray(xx, &x)); 841 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 842 PetscFunctionReturn(0); 843 } 844 845 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 846 const MatScalar *v, *diag; 847 PetscScalar *xp, x0, x1, x2, x3, x4; 848 PetscInt nz, k; 849 const PetscInt *vj; 850 851 PetscFunctionBegin; 852 for (k = 0; k < mbs; k++) { 853 v = aa + 25 * ai[k]; 854 xp = x + k * 5; 855 x0 = xp[0]; 856 x1 = xp[1]; 857 x2 = xp[2]; 858 x3 = xp[3]; 859 x4 = xp[4]; /* Dk*xk = k-th block of x */ 860 nz = ai[k + 1] - ai[k]; 861 vj = aj + ai[k]; 862 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 863 PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 864 while (nz--) { 865 xp = x + (*vj) * 5; 866 /* x(:) += U(k,:)^T*(Dk*xk) */ 867 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4; 868 xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4; 869 xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4; 870 xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4; 871 xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4; 872 vj++; 873 v += 25; 874 } 875 /* xk = inv(Dk)*(Dk*xk) */ 876 diag = aa + k * 25; /* ptr to inv(Dk) */ 877 xp = x + k * 5; 878 xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4; 879 xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4; 880 xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4; 881 xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4; 882 xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4; 883 } 884 PetscFunctionReturn(0); 885 } 886 887 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 888 const MatScalar *v; 889 PetscScalar *xp, x0, x1, x2, x3, x4; 890 PetscInt nz, k; 891 const PetscInt *vj; 892 893 PetscFunctionBegin; 894 for (k = mbs - 1; k >= 0; k--) { 895 v = aa + 25 * ai[k]; 896 xp = x + k * 5; 897 x0 = xp[0]; 898 x1 = xp[1]; 899 x2 = xp[2]; 900 x3 = xp[3]; 901 x4 = xp[4]; /* xk */ 902 nz = ai[k + 1] - ai[k]; 903 vj = aj + ai[k]; 904 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 905 PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 906 while (nz--) { 907 xp = x + (*vj) * 5; 908 /* xk += U(k,:)*x(:) */ 909 x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4]; 910 x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4]; 911 x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4]; 912 x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4]; 913 x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4]; 914 vj++; 915 v += 25; 916 } 917 xp = x + k * 5; 918 xp[0] = x0; 919 xp[1] = x1; 920 xp[2] = x2; 921 xp[3] = x3; 922 xp[4] = x4; 923 } 924 PetscFunctionReturn(0); 925 } 926 927 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 928 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 929 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 930 const MatScalar *aa = a->a; 931 PetscScalar *x; 932 const PetscScalar *b; 933 934 PetscFunctionBegin; 935 PetscCall(VecGetArrayRead(bb, &b)); 936 PetscCall(VecGetArray(xx, &x)); 937 938 /* solve U^T * D * y = b by forward substitution */ 939 PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */ 940 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 941 942 /* solve U*x = y by back substitution */ 943 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 944 945 PetscCall(VecRestoreArrayRead(bb, &b)); 946 PetscCall(VecRestoreArray(xx, &x)); 947 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 948 PetscFunctionReturn(0); 949 } 950 951 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 952 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 953 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 954 const MatScalar *aa = a->a; 955 PetscScalar *x; 956 const PetscScalar *b; 957 958 PetscFunctionBegin; 959 PetscCall(VecGetArrayRead(bb, &b)); 960 PetscCall(VecGetArray(xx, &x)); 961 PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */ 962 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 963 PetscCall(VecRestoreArrayRead(bb, &b)); 964 PetscCall(VecRestoreArray(xx, &x)); 965 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 966 PetscFunctionReturn(0); 967 } 968 969 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 970 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 971 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 972 const MatScalar *aa = a->a; 973 PetscScalar *x; 974 const PetscScalar *b; 975 976 PetscFunctionBegin; 977 PetscCall(VecGetArrayRead(bb, &b)); 978 PetscCall(VecGetArray(xx, &x)); 979 PetscCall(PetscArraycpy(x, b, 5 * mbs)); 980 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 981 PetscCall(VecRestoreArrayRead(bb, &b)); 982 PetscCall(VecRestoreArray(xx, &x)); 983 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 984 PetscFunctionReturn(0); 985 } 986 987 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx) { 988 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 989 IS isrow = a->row; 990 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 991 const PetscInt *r, *vj; 992 PetscInt nz, k, idx; 993 const MatScalar *aa = a->a, *v, *diag; 994 PetscScalar *x, x0, x1, x2, x3, *t, *tp; 995 const PetscScalar *b; 996 997 PetscFunctionBegin; 998 PetscCall(VecGetArrayRead(bb, &b)); 999 PetscCall(VecGetArray(xx, &x)); 1000 t = a->solve_work; 1001 PetscCall(ISGetIndices(isrow, &r)); 1002 1003 /* solve U^T * D * y = b by forward substitution */ 1004 tp = t; 1005 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1006 idx = 4 * r[k]; 1007 tp[0] = b[idx]; 1008 tp[1] = b[idx + 1]; 1009 tp[2] = b[idx + 2]; 1010 tp[3] = b[idx + 3]; 1011 tp += 4; 1012 } 1013 1014 for (k = 0; k < mbs; k++) { 1015 v = aa + 16 * ai[k]; 1016 vj = aj + ai[k]; 1017 tp = t + k * 4; 1018 x0 = tp[0]; 1019 x1 = tp[1]; 1020 x2 = tp[2]; 1021 x3 = tp[3]; 1022 nz = ai[k + 1] - ai[k]; 1023 1024 tp = t + (*vj) * 4; 1025 while (nz--) { 1026 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3; 1027 tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3; 1028 tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3; 1029 tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3; 1030 vj++; 1031 tp = t + (*vj) * 4; 1032 v += 16; 1033 } 1034 1035 /* xk = inv(Dk)*(Dk*xk) */ 1036 diag = aa + k * 16; /* ptr to inv(Dk) */ 1037 tp = t + k * 4; 1038 tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3; 1039 tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3; 1040 tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3; 1041 tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3; 1042 } 1043 1044 /* solve U*x = y by back substitution */ 1045 for (k = mbs - 1; k >= 0; k--) { 1046 v = aa + 16 * ai[k]; 1047 vj = aj + ai[k]; 1048 tp = t + k * 4; 1049 x0 = tp[0]; 1050 x1 = tp[1]; 1051 x2 = tp[2]; 1052 x3 = tp[3]; /* xk */ 1053 nz = ai[k + 1] - ai[k]; 1054 1055 tp = t + (*vj) * 4; 1056 while (nz--) { 1057 /* xk += U(k,:)*x(:) */ 1058 x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3]; 1059 x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3]; 1060 x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3]; 1061 x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3]; 1062 vj++; 1063 tp = t + (*vj) * 4; 1064 v += 16; 1065 } 1066 tp = t + k * 4; 1067 tp[0] = x0; 1068 tp[1] = x1; 1069 tp[2] = x2; 1070 tp[3] = x3; 1071 idx = 4 * r[k]; 1072 x[idx] = x0; 1073 x[idx + 1] = x1; 1074 x[idx + 2] = x2; 1075 x[idx + 3] = x3; 1076 } 1077 1078 PetscCall(ISRestoreIndices(isrow, &r)); 1079 PetscCall(VecRestoreArrayRead(bb, &b)); 1080 PetscCall(VecRestoreArray(xx, &x)); 1081 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1082 PetscFunctionReturn(0); 1083 } 1084 1085 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 1086 const MatScalar *v, *diag; 1087 PetscScalar *xp, x0, x1, x2, x3; 1088 PetscInt nz, k; 1089 const PetscInt *vj; 1090 1091 PetscFunctionBegin; 1092 for (k = 0; k < mbs; k++) { 1093 v = aa + 16 * ai[k]; 1094 xp = x + k * 4; 1095 x0 = xp[0]; 1096 x1 = xp[1]; 1097 x2 = xp[2]; 1098 x3 = xp[3]; /* Dk*xk = k-th block of x */ 1099 nz = ai[k + 1] - ai[k]; 1100 vj = aj + ai[k]; 1101 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1102 PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1103 while (nz--) { 1104 xp = x + (*vj) * 4; 1105 /* x(:) += U(k,:)^T*(Dk*xk) */ 1106 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3; 1107 xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3; 1108 xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3; 1109 xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3; 1110 vj++; 1111 v += 16; 1112 } 1113 /* xk = inv(Dk)*(Dk*xk) */ 1114 diag = aa + k * 16; /* ptr to inv(Dk) */ 1115 xp = x + k * 4; 1116 xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3; 1117 xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3; 1118 xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3; 1119 xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3; 1120 } 1121 PetscFunctionReturn(0); 1122 } 1123 1124 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 1125 const MatScalar *v; 1126 PetscScalar *xp, x0, x1, x2, x3; 1127 PetscInt nz, k; 1128 const PetscInt *vj; 1129 1130 PetscFunctionBegin; 1131 for (k = mbs - 1; k >= 0; k--) { 1132 v = aa + 16 * ai[k]; 1133 xp = x + k * 4; 1134 x0 = xp[0]; 1135 x1 = xp[1]; 1136 x2 = xp[2]; 1137 x3 = xp[3]; /* xk */ 1138 nz = ai[k + 1] - ai[k]; 1139 vj = aj + ai[k]; 1140 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1141 PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1142 while (nz--) { 1143 xp = x + (*vj) * 4; 1144 /* xk += U(k,:)*x(:) */ 1145 x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3]; 1146 x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3]; 1147 x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3]; 1148 x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3]; 1149 vj++; 1150 v += 16; 1151 } 1152 xp = x + k * 4; 1153 xp[0] = x0; 1154 xp[1] = x1; 1155 xp[2] = x2; 1156 xp[3] = x3; 1157 } 1158 PetscFunctionReturn(0); 1159 } 1160 1161 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 1162 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1163 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1164 const MatScalar *aa = a->a; 1165 PetscScalar *x; 1166 const PetscScalar *b; 1167 1168 PetscFunctionBegin; 1169 PetscCall(VecGetArrayRead(bb, &b)); 1170 PetscCall(VecGetArray(xx, &x)); 1171 1172 /* solve U^T * D * y = b by forward substitution */ 1173 PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */ 1174 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1175 1176 /* solve U*x = y by back substitution */ 1177 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1178 PetscCall(VecRestoreArrayRead(bb, &b)); 1179 PetscCall(VecRestoreArray(xx, &x)); 1180 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1181 PetscFunctionReturn(0); 1182 } 1183 1184 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 1185 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1186 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1187 const MatScalar *aa = a->a; 1188 PetscScalar *x; 1189 const PetscScalar *b; 1190 1191 PetscFunctionBegin; 1192 PetscCall(VecGetArrayRead(bb, &b)); 1193 PetscCall(VecGetArray(xx, &x)); 1194 PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */ 1195 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1196 PetscCall(VecRestoreArrayRead(bb, &b)); 1197 PetscCall(VecRestoreArray(xx, &x)); 1198 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1199 PetscFunctionReturn(0); 1200 } 1201 1202 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 1203 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1204 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1205 const MatScalar *aa = a->a; 1206 PetscScalar *x; 1207 const PetscScalar *b; 1208 1209 PetscFunctionBegin; 1210 PetscCall(VecGetArrayRead(bb, &b)); 1211 PetscCall(VecGetArray(xx, &x)); 1212 PetscCall(PetscArraycpy(x, b, 4 * mbs)); 1213 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1214 PetscCall(VecRestoreArrayRead(bb, &b)); 1215 PetscCall(VecRestoreArray(xx, &x)); 1216 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1217 PetscFunctionReturn(0); 1218 } 1219 1220 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx) { 1221 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1222 IS isrow = a->row; 1223 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1224 const PetscInt *r; 1225 PetscInt nz, k, idx; 1226 const PetscInt *vj; 1227 const MatScalar *aa = a->a, *v, *diag; 1228 PetscScalar *x, x0, x1, x2, *t, *tp; 1229 const PetscScalar *b; 1230 1231 PetscFunctionBegin; 1232 PetscCall(VecGetArrayRead(bb, &b)); 1233 PetscCall(VecGetArray(xx, &x)); 1234 t = a->solve_work; 1235 PetscCall(ISGetIndices(isrow, &r)); 1236 1237 /* solve U^T * D * y = b by forward substitution */ 1238 tp = t; 1239 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1240 idx = 3 * r[k]; 1241 tp[0] = b[idx]; 1242 tp[1] = b[idx + 1]; 1243 tp[2] = b[idx + 2]; 1244 tp += 3; 1245 } 1246 1247 for (k = 0; k < mbs; k++) { 1248 v = aa + 9 * ai[k]; 1249 vj = aj + ai[k]; 1250 tp = t + k * 3; 1251 x0 = tp[0]; 1252 x1 = tp[1]; 1253 x2 = tp[2]; 1254 nz = ai[k + 1] - ai[k]; 1255 1256 tp = t + (*vj) * 3; 1257 while (nz--) { 1258 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2; 1259 tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2; 1260 tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2; 1261 vj++; 1262 tp = t + (*vj) * 3; 1263 v += 9; 1264 } 1265 1266 /* xk = inv(Dk)*(Dk*xk) */ 1267 diag = aa + k * 9; /* ptr to inv(Dk) */ 1268 tp = t + k * 3; 1269 tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2; 1270 tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2; 1271 tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2; 1272 } 1273 1274 /* solve U*x = y by back substitution */ 1275 for (k = mbs - 1; k >= 0; k--) { 1276 v = aa + 9 * ai[k]; 1277 vj = aj + ai[k]; 1278 tp = t + k * 3; 1279 x0 = tp[0]; 1280 x1 = tp[1]; 1281 x2 = tp[2]; /* xk */ 1282 nz = ai[k + 1] - ai[k]; 1283 1284 tp = t + (*vj) * 3; 1285 while (nz--) { 1286 /* xk += U(k,:)*x(:) */ 1287 x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2]; 1288 x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2]; 1289 x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2]; 1290 vj++; 1291 tp = t + (*vj) * 3; 1292 v += 9; 1293 } 1294 tp = t + k * 3; 1295 tp[0] = x0; 1296 tp[1] = x1; 1297 tp[2] = x2; 1298 idx = 3 * r[k]; 1299 x[idx] = x0; 1300 x[idx + 1] = x1; 1301 x[idx + 2] = x2; 1302 } 1303 1304 PetscCall(ISRestoreIndices(isrow, &r)); 1305 PetscCall(VecRestoreArrayRead(bb, &b)); 1306 PetscCall(VecRestoreArray(xx, &x)); 1307 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1308 PetscFunctionReturn(0); 1309 } 1310 1311 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 1312 const MatScalar *v, *diag; 1313 PetscScalar *xp, x0, x1, x2; 1314 PetscInt nz, k; 1315 const PetscInt *vj; 1316 1317 PetscFunctionBegin; 1318 for (k = 0; k < mbs; k++) { 1319 v = aa + 9 * ai[k]; 1320 xp = x + k * 3; 1321 x0 = xp[0]; 1322 x1 = xp[1]; 1323 x2 = xp[2]; /* Dk*xk = k-th block of x */ 1324 nz = ai[k + 1] - ai[k]; 1325 vj = aj + ai[k]; 1326 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1327 PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1328 while (nz--) { 1329 xp = x + (*vj) * 3; 1330 /* x(:) += U(k,:)^T*(Dk*xk) */ 1331 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2; 1332 xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2; 1333 xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2; 1334 vj++; 1335 v += 9; 1336 } 1337 /* xk = inv(Dk)*(Dk*xk) */ 1338 diag = aa + k * 9; /* ptr to inv(Dk) */ 1339 xp = x + k * 3; 1340 xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2; 1341 xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2; 1342 xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2; 1343 } 1344 PetscFunctionReturn(0); 1345 } 1346 1347 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 1348 const MatScalar *v; 1349 PetscScalar *xp, x0, x1, x2; 1350 PetscInt nz, k; 1351 const PetscInt *vj; 1352 1353 PetscFunctionBegin; 1354 for (k = mbs - 1; k >= 0; k--) { 1355 v = aa + 9 * ai[k]; 1356 xp = x + k * 3; 1357 x0 = xp[0]; 1358 x1 = xp[1]; 1359 x2 = xp[2]; /* xk */ 1360 nz = ai[k + 1] - ai[k]; 1361 vj = aj + ai[k]; 1362 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1363 PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1364 while (nz--) { 1365 xp = x + (*vj) * 3; 1366 /* xk += U(k,:)*x(:) */ 1367 x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2]; 1368 x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2]; 1369 x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2]; 1370 vj++; 1371 v += 9; 1372 } 1373 xp = x + k * 3; 1374 xp[0] = x0; 1375 xp[1] = x1; 1376 xp[2] = x2; 1377 } 1378 PetscFunctionReturn(0); 1379 } 1380 1381 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 1382 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1383 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1384 const MatScalar *aa = a->a; 1385 PetscScalar *x; 1386 const PetscScalar *b; 1387 1388 PetscFunctionBegin; 1389 PetscCall(VecGetArrayRead(bb, &b)); 1390 PetscCall(VecGetArray(xx, &x)); 1391 1392 /* solve U^T * D * y = b by forward substitution */ 1393 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1394 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1395 1396 /* solve U*x = y by back substitution */ 1397 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1398 1399 PetscCall(VecRestoreArrayRead(bb, &b)); 1400 PetscCall(VecRestoreArray(xx, &x)); 1401 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1402 PetscFunctionReturn(0); 1403 } 1404 1405 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 1406 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1407 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1408 const MatScalar *aa = a->a; 1409 PetscScalar *x; 1410 const PetscScalar *b; 1411 1412 PetscFunctionBegin; 1413 PetscCall(VecGetArrayRead(bb, &b)); 1414 PetscCall(VecGetArray(xx, &x)); 1415 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1416 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1417 PetscCall(VecRestoreArrayRead(bb, &b)); 1418 PetscCall(VecRestoreArray(xx, &x)); 1419 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1420 PetscFunctionReturn(0); 1421 } 1422 1423 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 1424 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1425 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1426 const MatScalar *aa = a->a; 1427 PetscScalar *x; 1428 const PetscScalar *b; 1429 1430 PetscFunctionBegin; 1431 PetscCall(VecGetArrayRead(bb, &b)); 1432 PetscCall(VecGetArray(xx, &x)); 1433 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1434 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1435 PetscCall(VecRestoreArrayRead(bb, &b)); 1436 PetscCall(VecRestoreArray(xx, &x)); 1437 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1438 PetscFunctionReturn(0); 1439 } 1440 1441 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx) { 1442 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1443 IS isrow = a->row; 1444 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1445 const PetscInt *r, *vj; 1446 PetscInt nz, k, k2, idx; 1447 const MatScalar *aa = a->a, *v, *diag; 1448 PetscScalar *x, x0, x1, *t; 1449 const PetscScalar *b; 1450 1451 PetscFunctionBegin; 1452 PetscCall(VecGetArrayRead(bb, &b)); 1453 PetscCall(VecGetArray(xx, &x)); 1454 t = a->solve_work; 1455 PetscCall(ISGetIndices(isrow, &r)); 1456 1457 /* solve U^T * D * y = perm(b) by forward substitution */ 1458 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1459 idx = 2 * r[k]; 1460 t[k * 2] = b[idx]; 1461 t[k * 2 + 1] = b[idx + 1]; 1462 } 1463 for (k = 0; k < mbs; k++) { 1464 v = aa + 4 * ai[k]; 1465 vj = aj + ai[k]; 1466 k2 = k * 2; 1467 x0 = t[k2]; 1468 x1 = t[k2 + 1]; 1469 nz = ai[k + 1] - ai[k]; 1470 while (nz--) { 1471 t[(*vj) * 2] += v[0] * x0 + v[1] * x1; 1472 t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1; 1473 vj++; 1474 v += 4; 1475 } 1476 diag = aa + k * 4; /* ptr to inv(Dk) */ 1477 t[k2] = diag[0] * x0 + diag[2] * x1; 1478 t[k2 + 1] = diag[1] * x0 + diag[3] * x1; 1479 } 1480 1481 /* solve U*x = y by back substitution */ 1482 for (k = mbs - 1; k >= 0; k--) { 1483 v = aa + 4 * ai[k]; 1484 vj = aj + ai[k]; 1485 k2 = k * 2; 1486 x0 = t[k2]; 1487 x1 = t[k2 + 1]; 1488 nz = ai[k + 1] - ai[k]; 1489 while (nz--) { 1490 x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1]; 1491 x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1]; 1492 vj++; 1493 v += 4; 1494 } 1495 t[k2] = x0; 1496 t[k2 + 1] = x1; 1497 idx = 2 * r[k]; 1498 x[idx] = x0; 1499 x[idx + 1] = x1; 1500 } 1501 1502 PetscCall(ISRestoreIndices(isrow, &r)); 1503 PetscCall(VecRestoreArrayRead(bb, &b)); 1504 PetscCall(VecRestoreArray(xx, &x)); 1505 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1506 PetscFunctionReturn(0); 1507 } 1508 1509 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 1510 const MatScalar *v, *diag; 1511 PetscScalar x0, x1; 1512 PetscInt nz, k, k2; 1513 const PetscInt *vj; 1514 1515 PetscFunctionBegin; 1516 for (k = 0; k < mbs; k++) { 1517 v = aa + 4 * ai[k]; 1518 vj = aj + ai[k]; 1519 k2 = k * 2; 1520 x0 = x[k2]; 1521 x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */ 1522 nz = ai[k + 1] - ai[k]; 1523 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1524 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1525 while (nz--) { 1526 /* x(:) += U(k,:)^T*(Dk*xk) */ 1527 x[(*vj) * 2] += v[0] * x0 + v[1] * x1; 1528 x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1; 1529 vj++; 1530 v += 4; 1531 } 1532 /* xk = inv(Dk)*(Dk*xk) */ 1533 diag = aa + k * 4; /* ptr to inv(Dk) */ 1534 x[k2] = diag[0] * x0 + diag[2] * x1; 1535 x[k2 + 1] = diag[1] * x0 + diag[3] * x1; 1536 } 1537 PetscFunctionReturn(0); 1538 } 1539 1540 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) { 1541 const MatScalar *v; 1542 PetscScalar x0, x1; 1543 PetscInt nz, k, k2; 1544 const PetscInt *vj; 1545 1546 PetscFunctionBegin; 1547 for (k = mbs - 1; k >= 0; k--) { 1548 v = aa + 4 * ai[k]; 1549 vj = aj + ai[k]; 1550 k2 = k * 2; 1551 x0 = x[k2]; 1552 x1 = x[k2 + 1]; /* xk */ 1553 nz = ai[k + 1] - ai[k]; 1554 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1555 PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1556 while (nz--) { 1557 /* xk += U(k,:)*x(:) */ 1558 x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1]; 1559 x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1]; 1560 vj++; 1561 v += 4; 1562 } 1563 x[k2] = x0; 1564 x[k2 + 1] = x1; 1565 } 1566 PetscFunctionReturn(0); 1567 } 1568 1569 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 1570 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1571 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1572 const MatScalar *aa = a->a; 1573 PetscScalar *x; 1574 const PetscScalar *b; 1575 1576 PetscFunctionBegin; 1577 PetscCall(VecGetArrayRead(bb, &b)); 1578 PetscCall(VecGetArray(xx, &x)); 1579 1580 /* solve U^T * D * y = b by forward substitution */ 1581 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1582 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1583 1584 /* solve U*x = y by back substitution */ 1585 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1586 1587 PetscCall(VecRestoreArrayRead(bb, &b)); 1588 PetscCall(VecRestoreArray(xx, &x)); 1589 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1590 PetscFunctionReturn(0); 1591 } 1592 1593 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 1594 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1595 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1596 const MatScalar *aa = a->a; 1597 PetscScalar *x; 1598 const PetscScalar *b; 1599 1600 PetscFunctionBegin; 1601 PetscCall(VecGetArrayRead(bb, &b)); 1602 PetscCall(VecGetArray(xx, &x)); 1603 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1604 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1605 PetscCall(VecRestoreArrayRead(bb, &b)); 1606 PetscCall(VecRestoreArray(xx, &x)); 1607 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1608 PetscFunctionReturn(0); 1609 } 1610 1611 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 1612 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1613 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1614 const MatScalar *aa = a->a; 1615 PetscScalar *x; 1616 const PetscScalar *b; 1617 1618 PetscFunctionBegin; 1619 PetscCall(VecGetArrayRead(bb, &b)); 1620 PetscCall(VecGetArray(xx, &x)); 1621 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1622 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1623 PetscCall(VecRestoreArrayRead(bb, &b)); 1624 PetscCall(VecRestoreArray(xx, &x)); 1625 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1626 PetscFunctionReturn(0); 1627 } 1628 1629 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) { 1630 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1631 IS isrow = a->row; 1632 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1633 const MatScalar *aa = a->a, *v; 1634 const PetscScalar *b; 1635 PetscScalar *x, xk, *t; 1636 PetscInt nz, k, j; 1637 1638 PetscFunctionBegin; 1639 PetscCall(VecGetArrayRead(bb, &b)); 1640 PetscCall(VecGetArray(xx, &x)); 1641 t = a->solve_work; 1642 PetscCall(ISGetIndices(isrow, &rp)); 1643 1644 /* solve U^T*D*y = perm(b) by forward substitution */ 1645 for (k = 0; k < mbs; k++) t[k] = b[rp[k]]; 1646 for (k = 0; k < mbs; k++) { 1647 v = aa + ai[k]; 1648 vj = aj + ai[k]; 1649 xk = t[k]; 1650 nz = ai[k + 1] - ai[k] - 1; 1651 for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk; 1652 t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */ 1653 } 1654 1655 /* solve U*perm(x) = y by back substitution */ 1656 for (k = mbs - 1; k >= 0; k--) { 1657 v = aa + adiag[k] - 1; 1658 vj = aj + adiag[k] - 1; 1659 nz = ai[k + 1] - ai[k] - 1; 1660 for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]]; 1661 x[rp[k]] = t[k]; 1662 } 1663 1664 PetscCall(ISRestoreIndices(isrow, &rp)); 1665 PetscCall(VecRestoreArrayRead(bb, &b)); 1666 PetscCall(VecRestoreArray(xx, &x)); 1667 PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs)); 1668 PetscFunctionReturn(0); 1669 } 1670 1671 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) { 1672 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1673 IS isrow = a->row; 1674 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1675 const MatScalar *aa = a->a, *v; 1676 PetscScalar *x, xk, *t; 1677 const PetscScalar *b; 1678 PetscInt nz, k; 1679 1680 PetscFunctionBegin; 1681 PetscCall(VecGetArrayRead(bb, &b)); 1682 PetscCall(VecGetArray(xx, &x)); 1683 t = a->solve_work; 1684 PetscCall(ISGetIndices(isrow, &rp)); 1685 1686 /* solve U^T*D*y = perm(b) by forward substitution */ 1687 for (k = 0; k < mbs; k++) t[k] = b[rp[k]]; 1688 for (k = 0; k < mbs; k++) { 1689 v = aa + ai[k] + 1; 1690 vj = aj + ai[k] + 1; 1691 xk = t[k]; 1692 nz = ai[k + 1] - ai[k] - 1; 1693 while (nz--) t[*vj++] += (*v++) * xk; 1694 t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */ 1695 } 1696 1697 /* solve U*perm(x) = y by back substitution */ 1698 for (k = mbs - 1; k >= 0; k--) { 1699 v = aa + ai[k] + 1; 1700 vj = aj + ai[k] + 1; 1701 nz = ai[k + 1] - ai[k] - 1; 1702 while (nz--) t[k] += (*v++) * t[*vj++]; 1703 x[rp[k]] = t[k]; 1704 } 1705 1706 PetscCall(ISRestoreIndices(isrow, &rp)); 1707 PetscCall(VecRestoreArrayRead(bb, &b)); 1708 PetscCall(VecRestoreArray(xx, &x)); 1709 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 1710 PetscFunctionReturn(0); 1711 } 1712 1713 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) { 1714 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1715 IS isrow = a->row; 1716 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1717 const MatScalar *aa = a->a, *v; 1718 PetscReal diagk; 1719 PetscScalar *x, xk; 1720 const PetscScalar *b; 1721 PetscInt nz, k; 1722 1723 PetscFunctionBegin; 1724 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1725 PetscCall(VecGetArrayRead(bb, &b)); 1726 PetscCall(VecGetArray(xx, &x)); 1727 PetscCall(ISGetIndices(isrow, &rp)); 1728 1729 for (k = 0; k < mbs; k++) x[k] = b[rp[k]]; 1730 for (k = 0; k < mbs; k++) { 1731 v = aa + ai[k]; 1732 vj = aj + ai[k]; 1733 xk = x[k]; 1734 nz = ai[k + 1] - ai[k] - 1; 1735 while (nz--) x[*vj++] += (*v++) * xk; 1736 1737 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1738 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1739 x[k] = xk * PetscSqrtReal(diagk); 1740 } 1741 PetscCall(ISRestoreIndices(isrow, &rp)); 1742 PetscCall(VecRestoreArrayRead(bb, &b)); 1743 PetscCall(VecRestoreArray(xx, &x)); 1744 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1745 PetscFunctionReturn(0); 1746 } 1747 1748 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) { 1749 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1750 IS isrow = a->row; 1751 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1752 const MatScalar *aa = a->a, *v; 1753 PetscReal diagk; 1754 PetscScalar *x, xk; 1755 const PetscScalar *b; 1756 PetscInt nz, k; 1757 1758 PetscFunctionBegin; 1759 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1760 PetscCall(VecGetArrayRead(bb, &b)); 1761 PetscCall(VecGetArray(xx, &x)); 1762 PetscCall(ISGetIndices(isrow, &rp)); 1763 1764 for (k = 0; k < mbs; k++) x[k] = b[rp[k]]; 1765 for (k = 0; k < mbs; k++) { 1766 v = aa + ai[k] + 1; 1767 vj = aj + ai[k] + 1; 1768 xk = x[k]; 1769 nz = ai[k + 1] - ai[k] - 1; 1770 while (nz--) x[*vj++] += (*v++) * xk; 1771 1772 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1773 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1774 x[k] = xk * PetscSqrtReal(diagk); 1775 } 1776 PetscCall(ISRestoreIndices(isrow, &rp)); 1777 PetscCall(VecRestoreArrayRead(bb, &b)); 1778 PetscCall(VecRestoreArray(xx, &x)); 1779 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1780 PetscFunctionReturn(0); 1781 } 1782 1783 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) { 1784 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1785 IS isrow = a->row; 1786 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1787 const MatScalar *aa = a->a, *v; 1788 PetscReal diagk; 1789 PetscScalar *x, *t; 1790 const PetscScalar *b; 1791 PetscInt nz, k; 1792 1793 PetscFunctionBegin; 1794 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1795 PetscCall(VecGetArrayRead(bb, &b)); 1796 PetscCall(VecGetArray(xx, &x)); 1797 t = a->solve_work; 1798 PetscCall(ISGetIndices(isrow, &rp)); 1799 1800 for (k = mbs - 1; k >= 0; k--) { 1801 v = aa + ai[k]; 1802 vj = aj + ai[k]; 1803 diagk = PetscRealPart(aa[adiag[k]]); 1804 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1805 t[k] = b[k] * PetscSqrtReal(diagk); 1806 nz = ai[k + 1] - ai[k] - 1; 1807 while (nz--) t[k] += (*v++) * t[*vj++]; 1808 x[rp[k]] = t[k]; 1809 } 1810 PetscCall(ISRestoreIndices(isrow, &rp)); 1811 PetscCall(VecRestoreArrayRead(bb, &b)); 1812 PetscCall(VecRestoreArray(xx, &x)); 1813 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1814 PetscFunctionReturn(0); 1815 } 1816 1817 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) { 1818 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1819 IS isrow = a->row; 1820 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1821 const MatScalar *aa = a->a, *v; 1822 PetscReal diagk; 1823 PetscScalar *x, *t; 1824 const PetscScalar *b; 1825 PetscInt nz, k; 1826 1827 PetscFunctionBegin; 1828 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1829 PetscCall(VecGetArrayRead(bb, &b)); 1830 PetscCall(VecGetArray(xx, &x)); 1831 t = a->solve_work; 1832 PetscCall(ISGetIndices(isrow, &rp)); 1833 1834 for (k = mbs - 1; k >= 0; k--) { 1835 v = aa + ai[k] + 1; 1836 vj = aj + ai[k] + 1; 1837 diagk = PetscRealPart(aa[ai[k]]); 1838 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1839 t[k] = b[k] * PetscSqrtReal(diagk); 1840 nz = ai[k + 1] - ai[k] - 1; 1841 while (nz--) t[k] += (*v++) * t[*vj++]; 1842 x[rp[k]] = t[k]; 1843 } 1844 PetscCall(ISRestoreIndices(isrow, &rp)); 1845 PetscCall(VecRestoreArrayRead(bb, &b)); 1846 PetscCall(VecRestoreArray(xx, &x)); 1847 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1848 PetscFunctionReturn(0); 1849 } 1850 1851 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx) { 1852 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1853 1854 PetscFunctionBegin; 1855 if (A->rmap->bs == 1) { 1856 PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v)); 1857 } else { 1858 IS isrow = a->row; 1859 const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp; 1860 const MatScalar *aa = a->a, *v; 1861 PetscScalar *x, *t; 1862 const PetscScalar *b; 1863 PetscInt nz, k, n, i, j; 1864 1865 if (bb->n > a->solves_work_n) { 1866 PetscCall(PetscFree(a->solves_work)); 1867 PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work)); 1868 a->solves_work_n = bb->n; 1869 } 1870 n = bb->n; 1871 PetscCall(VecGetArrayRead(bb->v, &b)); 1872 PetscCall(VecGetArray(xx->v, &x)); 1873 t = a->solves_work; 1874 1875 PetscCall(ISGetIndices(isrow, &rp)); 1876 1877 /* solve U^T*D*y = perm(b) by forward substitution */ 1878 for (k = 0; k < mbs; k++) { 1879 for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */ 1880 } 1881 for (k = 0; k < mbs; k++) { 1882 v = aa + ai[k]; 1883 vj = aj + ai[k]; 1884 nz = ai[k + 1] - ai[k] - 1; 1885 for (j = 0; j < nz; j++) { 1886 for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i]; 1887 v++; 1888 vj++; 1889 } 1890 for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1891 } 1892 1893 /* solve U*perm(x) = y by back substitution */ 1894 for (k = mbs - 1; k >= 0; k--) { 1895 v = aa + ai[k] - 1; 1896 vj = aj + ai[k] - 1; 1897 nz = ai[k + 1] - ai[k] - 1; 1898 for (j = 0; j < nz; j++) { 1899 for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i]; 1900 v++; 1901 vj++; 1902 } 1903 for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i]; 1904 } 1905 1906 PetscCall(ISRestoreIndices(isrow, &rp)); 1907 PetscCall(VecRestoreArrayRead(bb->v, &b)); 1908 PetscCall(VecRestoreArray(xx->v, &x)); 1909 PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs))); 1910 } 1911 PetscFunctionReturn(0); 1912 } 1913 1914 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx) { 1915 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1916 1917 PetscFunctionBegin; 1918 if (A->rmap->bs == 1) { 1919 PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v)); 1920 } else { 1921 IS isrow = a->row; 1922 const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp; 1923 const MatScalar *aa = a->a, *v; 1924 PetscScalar *x, *t; 1925 const PetscScalar *b; 1926 PetscInt nz, k, n, i; 1927 1928 if (bb->n > a->solves_work_n) { 1929 PetscCall(PetscFree(a->solves_work)); 1930 PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work)); 1931 a->solves_work_n = bb->n; 1932 } 1933 n = bb->n; 1934 PetscCall(VecGetArrayRead(bb->v, &b)); 1935 PetscCall(VecGetArray(xx->v, &x)); 1936 t = a->solves_work; 1937 1938 PetscCall(ISGetIndices(isrow, &rp)); 1939 1940 /* solve U^T*D*y = perm(b) by forward substitution */ 1941 for (k = 0; k < mbs; k++) { 1942 for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */ 1943 } 1944 for (k = 0; k < mbs; k++) { 1945 v = aa + ai[k]; 1946 vj = aj + ai[k]; 1947 nz = ai[k + 1] - ai[k]; 1948 while (nz--) { 1949 for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i]; 1950 v++; 1951 vj++; 1952 } 1953 for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 1954 } 1955 1956 /* solve U*perm(x) = y by back substitution */ 1957 for (k = mbs - 1; k >= 0; k--) { 1958 v = aa + ai[k]; 1959 vj = aj + ai[k]; 1960 nz = ai[k + 1] - ai[k]; 1961 while (nz--) { 1962 for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i]; 1963 v++; 1964 vj++; 1965 } 1966 for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i]; 1967 } 1968 1969 PetscCall(ISRestoreIndices(isrow, &rp)); 1970 PetscCall(VecRestoreArrayRead(bb->v, &b)); 1971 PetscCall(VecRestoreArray(xx->v, &x)); 1972 PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs))); 1973 } 1974 PetscFunctionReturn(0); 1975 } 1976 1977 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) { 1978 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1979 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag; 1980 const MatScalar *aa = a->a, *v; 1981 const PetscScalar *b; 1982 PetscScalar *x, xi; 1983 PetscInt nz, i, j; 1984 1985 PetscFunctionBegin; 1986 PetscCall(VecGetArrayRead(bb, &b)); 1987 PetscCall(VecGetArray(xx, &x)); 1988 /* solve U^T*D*y = b by forward substitution */ 1989 PetscCall(PetscArraycpy(x, b, mbs)); 1990 for (i = 0; i < mbs; i++) { 1991 v = aa + ai[i]; 1992 vj = aj + ai[i]; 1993 xi = x[i]; 1994 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 1995 for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi; 1996 x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 1997 } 1998 /* solve U*x = y by backward substitution */ 1999 for (i = mbs - 2; i >= 0; i--) { 2000 xi = x[i]; 2001 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2002 vj = aj + adiag[i] - 1; 2003 nz = ai[i + 1] - ai[i] - 1; 2004 for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]]; 2005 x[i] = xi; 2006 } 2007 PetscCall(VecRestoreArrayRead(bb, &b)); 2008 PetscCall(VecRestoreArray(xx, &x)); 2009 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 2010 PetscFunctionReturn(0); 2011 } 2012 2013 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X) { 2014 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2015 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag; 2016 const MatScalar *aa = a->a, *v; 2017 const PetscScalar *b; 2018 PetscScalar *x, xi; 2019 PetscInt nz, i, j, neq, ldb, ldx; 2020 PetscBool isdense; 2021 2022 PetscFunctionBegin; 2023 if (!mbs) PetscFunctionReturn(0); 2024 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense)); 2025 PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix"); 2026 if (X != B) { 2027 PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense)); 2028 PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix"); 2029 } 2030 PetscCall(MatDenseGetArrayRead(B, &b)); 2031 PetscCall(MatDenseGetLDA(B, &ldb)); 2032 PetscCall(MatDenseGetArray(X, &x)); 2033 PetscCall(MatDenseGetLDA(X, &ldx)); 2034 for (neq = 0; neq < B->cmap->n; neq++) { 2035 /* solve U^T*D*y = b by forward substitution */ 2036 PetscCall(PetscArraycpy(x, b, mbs)); 2037 for (i = 0; i < mbs; i++) { 2038 v = aa + ai[i]; 2039 vj = aj + ai[i]; 2040 xi = x[i]; 2041 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 2042 for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi; 2043 x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2044 } 2045 /* solve U*x = y by backward substitution */ 2046 for (i = mbs - 2; i >= 0; i--) { 2047 xi = x[i]; 2048 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2049 vj = aj + adiag[i] - 1; 2050 nz = ai[i + 1] - ai[i] - 1; 2051 for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]]; 2052 x[i] = xi; 2053 } 2054 b += ldb; 2055 x += ldx; 2056 } 2057 PetscCall(MatDenseRestoreArrayRead(B, &b)); 2058 PetscCall(MatDenseRestoreArray(X, &x)); 2059 PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs))); 2060 PetscFunctionReturn(0); 2061 } 2062 2063 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 2064 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2065 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2066 const MatScalar *aa = a->a, *v; 2067 PetscScalar *x, xk; 2068 const PetscScalar *b; 2069 PetscInt nz, k; 2070 2071 PetscFunctionBegin; 2072 PetscCall(VecGetArrayRead(bb, &b)); 2073 PetscCall(VecGetArray(xx, &x)); 2074 2075 /* solve U^T*D*y = b by forward substitution */ 2076 PetscCall(PetscArraycpy(x, b, mbs)); 2077 for (k = 0; k < mbs; k++) { 2078 v = aa + ai[k] + 1; 2079 vj = aj + ai[k] + 1; 2080 xk = x[k]; 2081 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2082 while (nz--) x[*vj++] += (*v++) * xk; 2083 x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2084 } 2085 2086 /* solve U*x = y by back substitution */ 2087 for (k = mbs - 2; k >= 0; k--) { 2088 v = aa + ai[k] + 1; 2089 vj = aj + ai[k] + 1; 2090 xk = x[k]; 2091 nz = ai[k + 1] - ai[k] - 1; 2092 while (nz--) { xk += (*v++) * x[*vj++]; } 2093 x[k] = xk; 2094 } 2095 2096 PetscCall(VecRestoreArrayRead(bb, &b)); 2097 PetscCall(VecRestoreArray(xx, &x)); 2098 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 2099 PetscFunctionReturn(0); 2100 } 2101 2102 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) { 2103 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2104 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj; 2105 const MatScalar *aa = a->a, *v; 2106 PetscReal diagk; 2107 PetscScalar *x; 2108 const PetscScalar *b; 2109 PetscInt nz, k; 2110 2111 PetscFunctionBegin; 2112 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2113 PetscCall(VecGetArrayRead(bb, &b)); 2114 PetscCall(VecGetArray(xx, &x)); 2115 PetscCall(PetscArraycpy(x, b, mbs)); 2116 for (k = 0; k < mbs; k++) { 2117 v = aa + ai[k]; 2118 vj = aj + ai[k]; 2119 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2120 while (nz--) x[*vj++] += (*v++) * x[k]; 2121 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2122 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]])); 2123 x[k] *= PetscSqrtReal(diagk); 2124 } 2125 PetscCall(VecRestoreArrayRead(bb, &b)); 2126 PetscCall(VecRestoreArray(xx, &x)); 2127 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2128 PetscFunctionReturn(0); 2129 } 2130 2131 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 2132 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2133 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2134 const MatScalar *aa = a->a, *v; 2135 PetscReal diagk; 2136 PetscScalar *x; 2137 const PetscScalar *b; 2138 PetscInt nz, k; 2139 2140 PetscFunctionBegin; 2141 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2142 PetscCall(VecGetArrayRead(bb, &b)); 2143 PetscCall(VecGetArray(xx, &x)); 2144 PetscCall(PetscArraycpy(x, b, mbs)); 2145 for (k = 0; k < mbs; k++) { 2146 v = aa + ai[k] + 1; 2147 vj = aj + ai[k] + 1; 2148 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2149 while (nz--) x[*vj++] += (*v++) * x[k]; 2150 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2151 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]])); 2152 x[k] *= PetscSqrtReal(diagk); 2153 } 2154 PetscCall(VecRestoreArrayRead(bb, &b)); 2155 PetscCall(VecRestoreArray(xx, &x)); 2156 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2157 PetscFunctionReturn(0); 2158 } 2159 2160 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) { 2161 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2162 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj; 2163 const MatScalar *aa = a->a, *v; 2164 PetscReal diagk; 2165 PetscScalar *x; 2166 const PetscScalar *b; 2167 PetscInt nz, k; 2168 2169 PetscFunctionBegin; 2170 /* solve D^(1/2)*U*x = b by back substitution */ 2171 PetscCall(VecGetArrayRead(bb, &b)); 2172 PetscCall(VecGetArray(xx, &x)); 2173 2174 for (k = mbs - 1; k >= 0; k--) { 2175 v = aa + ai[k]; 2176 vj = aj + ai[k]; 2177 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2178 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 2179 x[k] = PetscSqrtReal(diagk) * b[k]; 2180 nz = ai[k + 1] - ai[k] - 1; 2181 while (nz--) x[k] += (*v++) * x[*vj++]; 2182 } 2183 PetscCall(VecRestoreArrayRead(bb, &b)); 2184 PetscCall(VecRestoreArray(xx, &x)); 2185 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2186 PetscFunctionReturn(0); 2187 } 2188 2189 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 2190 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2191 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2192 const MatScalar *aa = a->a, *v; 2193 PetscReal diagk; 2194 PetscScalar *x; 2195 const PetscScalar *b; 2196 PetscInt nz, k; 2197 2198 PetscFunctionBegin; 2199 /* solve D^(1/2)*U*x = b by back substitution */ 2200 PetscCall(VecGetArrayRead(bb, &b)); 2201 PetscCall(VecGetArray(xx, &x)); 2202 2203 for (k = mbs - 1; k >= 0; k--) { 2204 v = aa + ai[k] + 1; 2205 vj = aj + ai[k] + 1; 2206 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2207 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 2208 x[k] = PetscSqrtReal(diagk) * b[k]; 2209 nz = ai[k + 1] - ai[k] - 1; 2210 while (nz--) x[k] += (*v++) * x[*vj++]; 2211 } 2212 PetscCall(VecRestoreArrayRead(bb, &b)); 2213 PetscCall(VecRestoreArray(xx, &x)); 2214 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2215 PetscFunctionReturn(0); 2216 } 2217 2218 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2219 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info) { 2220 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b; 2221 const PetscInt *rip, mbs = a->mbs, *ai, *aj; 2222 PetscInt *jutmp, bs = A->rmap->bs, i; 2223 PetscInt m, reallocs = 0, *levtmp; 2224 PetscInt *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd; 2225 PetscInt incrlev, *lev, shift, prow, nz; 2226 PetscReal f = info->fill, levels = info->levels; 2227 PetscBool perm_identity; 2228 2229 PetscFunctionBegin; 2230 /* check whether perm is the identity mapping */ 2231 PetscCall(ISIdentity(perm, &perm_identity)); 2232 2233 if (perm_identity) { 2234 a->permute = PETSC_FALSE; 2235 ai = a->i; 2236 aj = a->j; 2237 } else { /* non-trivial permutation */ 2238 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2239 } 2240 2241 /* initialization */ 2242 PetscCall(ISGetIndices(perm, &rip)); 2243 umax = (PetscInt)(f * ai[mbs] + 1); 2244 PetscCall(PetscMalloc1(umax, &lev)); 2245 umax += mbs + 1; 2246 shift = mbs + 1; 2247 PetscCall(PetscMalloc1(mbs + 1, &iu)); 2248 PetscCall(PetscMalloc1(umax, &ju)); 2249 iu[0] = mbs + 1; 2250 juidx = mbs + 1; 2251 /* prowl: linked list for pivot row */ 2252 PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp)); 2253 /* q: linked list for col index */ 2254 2255 for (i = 0; i < mbs; i++) { 2256 prowl[i] = mbs; 2257 q[i] = 0; 2258 } 2259 2260 /* for each row k */ 2261 for (k = 0; k < mbs; k++) { 2262 nzk = 0; 2263 q[k] = mbs; 2264 /* copy current row into linked list */ 2265 nz = ai[rip[k] + 1] - ai[rip[k]]; 2266 j = ai[rip[k]]; 2267 while (nz--) { 2268 vj = rip[aj[j++]]; 2269 if (vj > k) { 2270 qm = k; 2271 do { 2272 m = qm; 2273 qm = q[m]; 2274 } while (qm < vj); 2275 PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A"); 2276 nzk++; 2277 q[m] = vj; 2278 q[vj] = qm; 2279 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2280 } 2281 } 2282 2283 /* modify nonzero structure of k-th row by computing fill-in 2284 for each row prow to be merged in */ 2285 prow = k; 2286 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2287 2288 while (prow < k) { 2289 /* merge row prow into k-th row */ 2290 jmin = iu[prow] + 1; 2291 jmax = iu[prow + 1]; 2292 qm = k; 2293 for (j = jmin; j < jmax; j++) { 2294 incrlev = lev[j - shift] + 1; 2295 if (incrlev > levels) continue; 2296 vj = ju[j]; 2297 do { 2298 m = qm; 2299 qm = q[m]; 2300 } while (qm < vj); 2301 if (qm != vj) { /* a fill */ 2302 nzk++; 2303 q[m] = vj; 2304 q[vj] = qm; 2305 qm = vj; 2306 levtmp[vj] = incrlev; 2307 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2308 } 2309 prow = prowl[prow]; /* next pivot row */ 2310 } 2311 2312 /* add k to row list for first nonzero element in k-th row */ 2313 if (nzk > 1) { 2314 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2315 prowl[k] = prowl[i]; 2316 prowl[i] = k; 2317 } 2318 iu[k + 1] = iu[k] + nzk; 2319 2320 /* allocate more space to ju and lev if needed */ 2321 if (iu[k + 1] > umax) { 2322 /* estimate how much additional space we will need */ 2323 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2324 /* just double the memory each time */ 2325 maxadd = umax; 2326 if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2; 2327 umax += maxadd; 2328 2329 /* allocate a longer ju */ 2330 PetscCall(PetscMalloc1(umax, &jutmp)); 2331 PetscCall(PetscArraycpy(jutmp, ju, iu[k])); 2332 PetscCall(PetscFree(ju)); 2333 ju = jutmp; 2334 2335 PetscCall(PetscMalloc1(umax, &jutmp)); 2336 PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift)); 2337 PetscCall(PetscFree(lev)); 2338 lev = jutmp; 2339 reallocs += 2; /* count how many times we realloc */ 2340 } 2341 2342 /* save nonzero structure of k-th row in ju */ 2343 i = k; 2344 while (nzk--) { 2345 i = q[i]; 2346 ju[juidx] = i; 2347 lev[juidx - shift] = levtmp[i]; 2348 juidx++; 2349 } 2350 } 2351 2352 #if defined(PETSC_USE_INFO) 2353 if (ai[mbs] != 0) { 2354 PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]); 2355 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af)); 2356 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2357 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af)); 2358 PetscCall(PetscInfo(A, "for best performance.\n")); 2359 } else { 2360 PetscCall(PetscInfo(A, "Empty matrix\n")); 2361 } 2362 #endif 2363 2364 PetscCall(ISRestoreIndices(perm, &rip)); 2365 PetscCall(PetscFree3(prowl, q, levtmp)); 2366 PetscCall(PetscFree(lev)); 2367 2368 /* put together the new matrix */ 2369 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL)); 2370 2371 /* PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)iperm)); */ 2372 b = (Mat_SeqSBAIJ *)(B)->data; 2373 PetscCall(PetscFree2(b->imax, b->ilen)); 2374 2375 b->singlemalloc = PETSC_FALSE; 2376 b->free_a = PETSC_TRUE; 2377 b->free_ij = PETSC_TRUE; 2378 /* the next line frees the default space generated by the Create() */ 2379 PetscCall(PetscFree3(b->a, b->j, b->i)); 2380 PetscCall(PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a)); 2381 b->j = ju; 2382 b->i = iu; 2383 b->diag = NULL; 2384 b->ilen = NULL; 2385 b->imax = NULL; 2386 2387 PetscCall(ISDestroy(&b->row)); 2388 PetscCall(ISDestroy(&b->icol)); 2389 b->row = perm; 2390 b->icol = perm; 2391 PetscCall(PetscObjectReference((PetscObject)perm)); 2392 PetscCall(PetscObjectReference((PetscObject)perm)); 2393 PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work)); 2394 /* In b structure: Free imax, ilen, old a, old j. 2395 Allocate idnew, solve_work, new a, new j */ 2396 PetscCall(PetscLogObjectMemory((PetscObject)B, (iu[mbs] - mbs) * (sizeof(PetscInt) + sizeof(MatScalar)))); 2397 b->maxnz = b->nz = iu[mbs]; 2398 2399 (B)->info.factor_mallocs = reallocs; 2400 (B)->info.fill_ratio_given = f; 2401 if (ai[mbs] != 0) { 2402 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]); 2403 } else { 2404 (B)->info.fill_ratio_needed = 0.0; 2405 } 2406 PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity)); 2407 PetscFunctionReturn(0); 2408 } 2409 2410 /* 2411 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2412 */ 2413 #include <petscbt.h> 2414 #include <../src/mat/utils/freespace.h> 2415 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) { 2416 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b; 2417 PetscBool perm_identity, free_ij = PETSC_TRUE, missing; 2418 PetscInt bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j; 2419 const PetscInt *rip; 2420 PetscInt reallocs = 0, i, *ui, *udiag, *cols; 2421 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 2422 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr; 2423 PetscReal fill = info->fill, levels = info->levels; 2424 PetscFreeSpaceList free_space = NULL, current_space = NULL; 2425 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 2426 PetscBT lnkbt; 2427 2428 PetscFunctionBegin; 2429 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2430 PetscCall(MatMissingDiagonal(A, &missing, &d)); 2431 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d); 2432 if (bs > 1) { 2433 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info)); 2434 PetscFunctionReturn(0); 2435 } 2436 2437 /* check whether perm is the identity mapping */ 2438 PetscCall(ISIdentity(perm, &perm_identity)); 2439 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2440 a->permute = PETSC_FALSE; 2441 2442 PetscCall(PetscMalloc1(am + 1, &ui)); 2443 PetscCall(PetscMalloc1(am + 1, &udiag)); 2444 ui[0] = 0; 2445 2446 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2447 if (!levels) { 2448 /* reuse the column pointers and row offsets for memory savings */ 2449 for (i = 0; i < am; i++) { 2450 ncols = ai[i + 1] - ai[i]; 2451 ui[i + 1] = ui[i] + ncols; 2452 udiag[i] = ui[i + 1] - 1; /* points to the last entry of U(i,:) */ 2453 } 2454 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 2455 cols = uj; 2456 for (i = 0; i < am; i++) { 2457 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2458 ncols = ai[i + 1] - ai[i] - 1; 2459 for (j = 0; j < ncols; j++) *cols++ = aj[j]; 2460 *cols++ = i; /* diagonal is located as the last entry of U(i,:) */ 2461 } 2462 } else { /* case: levels>0 */ 2463 PetscCall(ISGetIndices(perm, &rip)); 2464 2465 /* initialization */ 2466 /* jl: linked list for storing indices of the pivot rows 2467 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2468 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 2469 for (i = 0; i < am; i++) { 2470 jl[i] = am; 2471 il[i] = 0; 2472 } 2473 2474 /* create and initialize a linked list for storing column indices of the active row k */ 2475 nlnk = am + 1; 2476 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 2477 2478 /* initial FreeSpace size is fill*(ai[am]+1) */ 2479 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space)); 2480 2481 current_space = free_space; 2482 2483 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl)); 2484 2485 current_space_lvl = free_space_lvl; 2486 2487 for (k = 0; k < am; k++) { /* for each active row k */ 2488 /* initialize lnk by the column indices of row k */ 2489 nzk = 0; 2490 ncols = ai[k + 1] - ai[k]; 2491 PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k); 2492 cols = aj + ai[k]; 2493 PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt)); 2494 nzk += nlnk; 2495 2496 /* update lnk by computing fill-in for each pivot row to be merged in */ 2497 prow = jl[k]; /* 1st pivot row */ 2498 2499 while (prow < k) { 2500 nextprow = jl[prow]; 2501 2502 /* merge prow into k-th row */ 2503 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2504 jmax = ui[prow + 1]; 2505 ncols = jmax - jmin; 2506 i = jmin - ui[prow]; 2507 2508 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2509 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2510 j = *(uj - 1); 2511 PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j)); 2512 nzk += nlnk; 2513 2514 /* update il and jl for prow */ 2515 if (jmin < jmax) { 2516 il[prow] = jmin; 2517 2518 j = *cols; 2519 jl[prow] = jl[j]; 2520 jl[j] = prow; 2521 } 2522 prow = nextprow; 2523 } 2524 2525 /* if free space is not available, make more free space */ 2526 if (current_space->local_remaining < nzk) { 2527 i = am - k + 1; /* num of unfactored rows */ 2528 i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2529 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 2530 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 2531 reallocs++; 2532 } 2533 2534 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2535 PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k); 2536 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 2537 2538 /* add the k-th row into il and jl */ 2539 if (nzk > 1) { 2540 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2541 jl[k] = jl[i]; 2542 jl[i] = k; 2543 il[k] = ui[k] + 1; 2544 } 2545 uj_ptr[k] = current_space->array; 2546 uj_lvl_ptr[k] = current_space_lvl->array; 2547 2548 current_space->array += nzk; 2549 current_space->local_used += nzk; 2550 current_space->local_remaining -= nzk; 2551 current_space_lvl->array += nzk; 2552 current_space_lvl->local_used += nzk; 2553 current_space_lvl->local_remaining -= nzk; 2554 2555 ui[k + 1] = ui[k] + nzk; 2556 } 2557 2558 PetscCall(ISRestoreIndices(perm, &rip)); 2559 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 2560 2561 /* destroy list of free space and other temporary array(s) */ 2562 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 2563 PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor */ 2564 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 2565 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 2566 2567 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2568 2569 /* put together the new matrix in MATSEQSBAIJ format */ 2570 PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 2571 2572 b = (Mat_SeqSBAIJ *)(fact)->data; 2573 PetscCall(PetscFree2(b->imax, b->ilen)); 2574 2575 b->singlemalloc = PETSC_FALSE; 2576 b->free_a = PETSC_TRUE; 2577 b->free_ij = free_ij; 2578 2579 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 2580 2581 b->j = uj; 2582 b->i = ui; 2583 b->diag = udiag; 2584 b->free_diag = PETSC_TRUE; 2585 b->ilen = NULL; 2586 b->imax = NULL; 2587 b->row = perm; 2588 b->col = perm; 2589 2590 PetscCall(PetscObjectReference((PetscObject)perm)); 2591 PetscCall(PetscObjectReference((PetscObject)perm)); 2592 2593 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2594 2595 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 2596 PetscCall(PetscLogObjectMemory((PetscObject)fact, ui[am] * (sizeof(PetscInt) + sizeof(MatScalar)))); 2597 2598 b->maxnz = b->nz = ui[am]; 2599 2600 fact->info.factor_mallocs = reallocs; 2601 fact->info.fill_ratio_given = fill; 2602 if (ai[am] != 0) { 2603 fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am]; 2604 } else { 2605 fact->info.fill_ratio_needed = 0.0; 2606 } 2607 #if defined(PETSC_USE_INFO) 2608 if (ai[am] != 0) { 2609 PetscReal af = fact->info.fill_ratio_needed; 2610 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 2611 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2612 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 2613 } else { 2614 PetscCall(PetscInfo(A, "Empty matrix\n")); 2615 } 2616 #endif 2617 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2618 PetscFunctionReturn(0); 2619 } 2620 2621 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info) { 2622 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2623 Mat_SeqSBAIJ *b; 2624 PetscBool perm_identity, free_ij = PETSC_TRUE; 2625 PetscInt bs = A->rmap->bs, am = a->mbs; 2626 const PetscInt *cols, *rip, *ai = a->i, *aj = a->j; 2627 PetscInt reallocs = 0, i, *ui; 2628 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 2629 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 2630 PetscReal fill = info->fill, levels = info->levels, ratio_needed; 2631 PetscFreeSpaceList free_space = NULL, current_space = NULL; 2632 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 2633 PetscBT lnkbt; 2634 2635 PetscFunctionBegin; 2636 /* 2637 This code originally uses Modified Sparse Row (MSR) storage 2638 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2639 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2640 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2641 thus the original code in MSR format is still used for these cases. 2642 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2643 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2644 */ 2645 if (bs > 1) { 2646 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info)); 2647 PetscFunctionReturn(0); 2648 } 2649 2650 /* check whether perm is the identity mapping */ 2651 PetscCall(ISIdentity(perm, &perm_identity)); 2652 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2653 a->permute = PETSC_FALSE; 2654 2655 /* special case that simply copies fill pattern */ 2656 if (!levels) { 2657 /* reuse the column pointers and row offsets for memory savings */ 2658 ui = a->i; 2659 uj = a->j; 2660 free_ij = PETSC_FALSE; 2661 ratio_needed = 1.0; 2662 } else { /* case: levels>0 */ 2663 PetscCall(ISGetIndices(perm, &rip)); 2664 2665 /* initialization */ 2666 PetscCall(PetscMalloc1(am + 1, &ui)); 2667 ui[0] = 0; 2668 2669 /* jl: linked list for storing indices of the pivot rows 2670 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2671 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 2672 for (i = 0; i < am; i++) { 2673 jl[i] = am; 2674 il[i] = 0; 2675 } 2676 2677 /* create and initialize a linked list for storing column indices of the active row k */ 2678 nlnk = am + 1; 2679 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 2680 2681 /* initial FreeSpace size is fill*(ai[am]+1) */ 2682 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space)); 2683 2684 current_space = free_space; 2685 2686 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl)); 2687 2688 current_space_lvl = free_space_lvl; 2689 2690 for (k = 0; k < am; k++) { /* for each active row k */ 2691 /* initialize lnk by the column indices of row rip[k] */ 2692 nzk = 0; 2693 ncols = ai[rip[k] + 1] - ai[rip[k]]; 2694 cols = aj + ai[rip[k]]; 2695 PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt)); 2696 nzk += nlnk; 2697 2698 /* update lnk by computing fill-in for each pivot row to be merged in */ 2699 prow = jl[k]; /* 1st pivot row */ 2700 2701 while (prow < k) { 2702 nextprow = jl[prow]; 2703 2704 /* merge prow into k-th row */ 2705 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2706 jmax = ui[prow + 1]; 2707 ncols = jmax - jmin; 2708 i = jmin - ui[prow]; 2709 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2710 j = *(uj_lvl_ptr[prow] + i - 1); 2711 cols_lvl = uj_lvl_ptr[prow] + i; 2712 PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j)); 2713 nzk += nlnk; 2714 2715 /* update il and jl for prow */ 2716 if (jmin < jmax) { 2717 il[prow] = jmin; 2718 j = *cols; 2719 jl[prow] = jl[j]; 2720 jl[j] = prow; 2721 } 2722 prow = nextprow; 2723 } 2724 2725 /* if free space is not available, make more free space */ 2726 if (current_space->local_remaining < nzk) { 2727 i = am - k + 1; /* num of unfactored rows */ 2728 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2729 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 2730 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 2731 reallocs++; 2732 } 2733 2734 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2735 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 2736 2737 /* add the k-th row into il and jl */ 2738 if (nzk - 1 > 0) { 2739 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2740 jl[k] = jl[i]; 2741 jl[i] = k; 2742 il[k] = ui[k] + 1; 2743 } 2744 uj_ptr[k] = current_space->array; 2745 uj_lvl_ptr[k] = current_space_lvl->array; 2746 2747 current_space->array += nzk; 2748 current_space->local_used += nzk; 2749 current_space->local_remaining -= nzk; 2750 current_space_lvl->array += nzk; 2751 current_space_lvl->local_used += nzk; 2752 current_space_lvl->local_remaining -= nzk; 2753 2754 ui[k + 1] = ui[k] + nzk; 2755 } 2756 2757 PetscCall(ISRestoreIndices(perm, &rip)); 2758 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 2759 2760 /* destroy list of free space and other temporary array(s) */ 2761 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 2762 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 2763 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 2764 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 2765 if (ai[am] != 0) { 2766 ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]); 2767 } else { 2768 ratio_needed = 0.0; 2769 } 2770 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2771 2772 /* put together the new matrix in MATSEQSBAIJ format */ 2773 PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 2774 2775 b = (Mat_SeqSBAIJ *)(fact)->data; 2776 2777 PetscCall(PetscFree2(b->imax, b->ilen)); 2778 2779 b->singlemalloc = PETSC_FALSE; 2780 b->free_a = PETSC_TRUE; 2781 b->free_ij = free_ij; 2782 2783 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 2784 2785 b->j = uj; 2786 b->i = ui; 2787 b->diag = NULL; 2788 b->ilen = NULL; 2789 b->imax = NULL; 2790 b->row = perm; 2791 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2792 2793 PetscCall(PetscObjectReference((PetscObject)perm)); 2794 b->icol = perm; 2795 PetscCall(PetscObjectReference((PetscObject)perm)); 2796 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 2797 2798 b->maxnz = b->nz = ui[am]; 2799 2800 fact->info.factor_mallocs = reallocs; 2801 fact->info.fill_ratio_given = fill; 2802 fact->info.fill_ratio_needed = ratio_needed; 2803 #if defined(PETSC_USE_INFO) 2804 if (ai[am] != 0) { 2805 PetscReal af = fact->info.fill_ratio_needed; 2806 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 2807 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2808 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 2809 } else { 2810 PetscCall(PetscInfo(A, "Empty matrix\n")); 2811 } 2812 #endif 2813 if (perm_identity) { 2814 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2815 } else { 2816 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2817 } 2818 PetscFunctionReturn(0); 2819 } 2820