1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) 9 { 10 Mat C = B; 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 12 IS isrow = b->row, isicol = b->icol; 13 const PetscInt *r, *ic; 14 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 15 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 16 MatScalar *rtmp, *pc, *mwork, *pv; 17 MatScalar *aa = a->a, *v; 18 PetscReal shift = info->shiftamount; 19 const PetscBool allowzeropivot = PetscNot(A->erroriffailure); 20 21 PetscFunctionBegin; 22 PetscCall(ISGetIndices(isrow, &r)); 23 PetscCall(ISGetIndices(isicol, &ic)); 24 25 /* generate work space needed by the factorization */ 26 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 27 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 28 29 for (PetscInt i = 0; i < n; i++) { 30 /* zero rtmp */ 31 /* L part */ 32 PetscInt *pj; 33 PetscInt nzL, nz = bi[i + 1] - bi[i]; 34 bjtmp = bj + bi[i]; 35 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 36 37 /* U part */ 38 nz = bdiag[i] - bdiag[i + 1]; 39 bjtmp = bj + bdiag[i + 1] + 1; 40 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 41 42 /* load in initial (unfactored row) */ 43 nz = ai[r[i] + 1] - ai[r[i]]; 44 ajtmp = aj + ai[r[i]]; 45 v = aa + bs2 * ai[r[i]]; 46 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 47 48 /* elimination */ 49 bjtmp = bj + bi[i]; 50 nzL = bi[i + 1] - bi[i]; 51 for (PetscInt k = 0; k < nzL; k++) { 52 PetscBool flg = PETSC_FALSE; 53 PetscInt row = bjtmp[k]; 54 55 pc = rtmp + bs2 * row; 56 for (PetscInt j = 0; j < bs2; j++) { 57 if (pc[j] != (PetscScalar)0.0) { 58 flg = PETSC_TRUE; 59 break; 60 } 61 } 62 if (flg) { 63 pv = b->a + bs2 * bdiag[row]; 64 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 65 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 66 67 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 68 pv = b->a + bs2 * (bdiag[row + 1] + 1); 69 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 70 for (PetscInt j = 0; j < nz; j++) { 71 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 72 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 73 v = rtmp + 4 * pj[j]; 74 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 75 pv += 4; 76 } 77 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 78 } 79 } 80 81 /* finished row so stick it into b->a */ 82 /* L part */ 83 pv = b->a + bs2 * bi[i]; 84 pj = b->j + bi[i]; 85 nz = bi[i + 1] - bi[i]; 86 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 87 88 /* Mark diagonal and invert diagonal for simpler triangular solves */ 89 pv = b->a + bs2 * bdiag[i]; 90 pj = b->j + bdiag[i]; 91 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 92 { 93 PetscBool zeropivotdetected; 94 95 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 96 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 97 } 98 99 /* U part */ 100 pv = b->a + bs2 * (bdiag[i + 1] + 1); 101 pj = b->j + bdiag[i + 1] + 1; 102 nz = bdiag[i] - bdiag[i + 1] - 1; 103 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 104 } 105 106 PetscCall(PetscFree2(rtmp, mwork)); 107 PetscCall(ISRestoreIndices(isicol, &ic)); 108 PetscCall(ISRestoreIndices(isrow, &r)); 109 110 C->ops->solve = MatSolve_SeqBAIJ_2; 111 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 112 C->assembled = PETSC_TRUE; 113 114 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 115 PetscFunctionReturn(PETSC_SUCCESS); 116 } 117 118 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 119 { 120 Mat C = B; 121 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 122 PetscInt i, j, k, nz, nzL, row, *pj; 123 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 124 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 125 MatScalar *rtmp, *pc, *mwork, *pv; 126 MatScalar *aa = a->a, *v; 127 PetscInt flg; 128 PetscReal shift = info->shiftamount; 129 PetscBool allowzeropivot, zeropivotdetected; 130 131 PetscFunctionBegin; 132 allowzeropivot = PetscNot(A->erroriffailure); 133 134 /* generate work space needed by the factorization */ 135 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 136 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 137 138 for (i = 0; i < n; i++) { 139 /* zero rtmp */ 140 /* L part */ 141 nz = bi[i + 1] - bi[i]; 142 bjtmp = bj + bi[i]; 143 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 144 145 /* U part */ 146 nz = bdiag[i] - bdiag[i + 1]; 147 bjtmp = bj + bdiag[i + 1] + 1; 148 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 149 150 /* load in initial (unfactored row) */ 151 nz = ai[i + 1] - ai[i]; 152 ajtmp = aj + ai[i]; 153 v = aa + bs2 * ai[i]; 154 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 155 156 /* elimination */ 157 bjtmp = bj + bi[i]; 158 nzL = bi[i + 1] - bi[i]; 159 for (k = 0; k < nzL; k++) { 160 row = bjtmp[k]; 161 pc = rtmp + bs2 * row; 162 for (flg = 0, j = 0; j < bs2; j++) { 163 if (pc[j] != (PetscScalar)0.0) { 164 flg = 1; 165 break; 166 } 167 } 168 if (flg) { 169 pv = b->a + bs2 * bdiag[row]; 170 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 171 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 172 173 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 174 pv = b->a + bs2 * (bdiag[row + 1] + 1); 175 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 176 for (j = 0; j < nz; j++) { 177 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 178 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 179 v = rtmp + 4 * pj[j]; 180 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 181 pv += 4; 182 } 183 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 184 } 185 } 186 187 /* finished row so stick it into b->a */ 188 /* L part */ 189 pv = b->a + bs2 * bi[i]; 190 pj = b->j + bi[i]; 191 nz = bi[i + 1] - bi[i]; 192 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 193 194 /* Mark diagonal and invert diagonal for simpler triangular solves */ 195 pv = b->a + bs2 * bdiag[i]; 196 pj = b->j + bdiag[i]; 197 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 198 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 199 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 200 201 /* U part */ 202 /* 203 pv = b->a + bs2*bi[2*n-i]; 204 pj = b->j + bi[2*n-i]; 205 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 206 */ 207 pv = b->a + bs2 * (bdiag[i + 1] + 1); 208 pj = b->j + bdiag[i + 1] + 1; 209 nz = bdiag[i] - bdiag[i + 1] - 1; 210 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 211 } 212 PetscCall(PetscFree2(rtmp, mwork)); 213 214 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 215 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering; 216 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering; 217 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 218 C->assembled = PETSC_TRUE; 219 220 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 221 PetscFunctionReturn(PETSC_SUCCESS); 222 } 223 224 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) 225 { 226 Mat C = B; 227 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 228 IS isrow = b->row, isicol = b->icol; 229 const PetscInt *r, *ic; 230 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 231 PetscInt *ajtmpold, *ajtmp, nz, row; 232 PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 233 MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4; 234 MatScalar p1, p2, p3, p4; 235 MatScalar *ba = b->a, *aa = a->a; 236 PetscReal shift = info->shiftamount; 237 PetscBool allowzeropivot, zeropivotdetected; 238 239 PetscFunctionBegin; 240 allowzeropivot = PetscNot(A->erroriffailure); 241 PetscCall(ISGetIndices(isrow, &r)); 242 PetscCall(ISGetIndices(isicol, &ic)); 243 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 244 245 for (i = 0; i < n; i++) { 246 nz = bi[i + 1] - bi[i]; 247 ajtmp = bj + bi[i]; 248 for (j = 0; j < nz; j++) { 249 x = rtmp + 4 * ajtmp[j]; 250 x[0] = x[1] = x[2] = x[3] = 0.0; 251 } 252 /* load in initial (unfactored row) */ 253 idx = r[i]; 254 nz = ai[idx + 1] - ai[idx]; 255 ajtmpold = aj + ai[idx]; 256 v = aa + 4 * ai[idx]; 257 for (j = 0; j < nz; j++) { 258 x = rtmp + 4 * ic[ajtmpold[j]]; 259 x[0] = v[0]; 260 x[1] = v[1]; 261 x[2] = v[2]; 262 x[3] = v[3]; 263 v += 4; 264 } 265 row = *ajtmp++; 266 while (row < i) { 267 pc = rtmp + 4 * row; 268 p1 = pc[0]; 269 p2 = pc[1]; 270 p3 = pc[2]; 271 p4 = pc[3]; 272 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 273 pv = ba + 4 * diag_offset[row]; 274 pj = bj + diag_offset[row] + 1; 275 x1 = pv[0]; 276 x2 = pv[1]; 277 x3 = pv[2]; 278 x4 = pv[3]; 279 pc[0] = m1 = p1 * x1 + p3 * x2; 280 pc[1] = m2 = p2 * x1 + p4 * x2; 281 pc[2] = m3 = p1 * x3 + p3 * x4; 282 pc[3] = m4 = p2 * x3 + p4 * x4; 283 nz = bi[row + 1] - diag_offset[row] - 1; 284 pv += 4; 285 for (j = 0; j < nz; j++) { 286 x1 = pv[0]; 287 x2 = pv[1]; 288 x3 = pv[2]; 289 x4 = pv[3]; 290 x = rtmp + 4 * pj[j]; 291 x[0] -= m1 * x1 + m3 * x2; 292 x[1] -= m2 * x1 + m4 * x2; 293 x[2] -= m1 * x3 + m3 * x4; 294 x[3] -= m2 * x3 + m4 * x4; 295 pv += 4; 296 } 297 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 298 } 299 row = *ajtmp++; 300 } 301 /* finished row so stick it into b->a */ 302 pv = ba + 4 * bi[i]; 303 pj = bj + bi[i]; 304 nz = bi[i + 1] - bi[i]; 305 for (j = 0; j < nz; j++) { 306 x = rtmp + 4 * pj[j]; 307 pv[0] = x[0]; 308 pv[1] = x[1]; 309 pv[2] = x[2]; 310 pv[3] = x[3]; 311 pv += 4; 312 } 313 /* invert diagonal block */ 314 w = ba + 4 * diag_offset[i]; 315 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 316 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 317 } 318 319 PetscCall(PetscFree(rtmp)); 320 PetscCall(ISRestoreIndices(isicol, &ic)); 321 PetscCall(ISRestoreIndices(isrow, &r)); 322 323 C->ops->solve = MatSolve_SeqBAIJ_2_inplace; 324 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace; 325 C->assembled = PETSC_TRUE; 326 327 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 328 PetscFunctionReturn(PETSC_SUCCESS); 329 } 330 /* 331 Version for when blocks are 2 by 2 Using natural ordering 332 */ 333 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 334 { 335 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 336 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 337 PetscInt *ajtmpold, *ajtmp, nz, row; 338 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 339 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 340 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4; 341 MatScalar *ba = b->a, *aa = a->a; 342 PetscReal shift = info->shiftamount; 343 PetscBool allowzeropivot, zeropivotdetected; 344 345 PetscFunctionBegin; 346 allowzeropivot = PetscNot(A->erroriffailure); 347 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 348 for (i = 0; i < n; i++) { 349 nz = bi[i + 1] - bi[i]; 350 ajtmp = bj + bi[i]; 351 for (j = 0; j < nz; j++) { 352 x = rtmp + 4 * ajtmp[j]; 353 x[0] = x[1] = x[2] = x[3] = 0.0; 354 } 355 /* load in initial (unfactored row) */ 356 nz = ai[i + 1] - ai[i]; 357 ajtmpold = aj + ai[i]; 358 v = aa + 4 * ai[i]; 359 for (j = 0; j < nz; j++) { 360 x = rtmp + 4 * ajtmpold[j]; 361 x[0] = v[0]; 362 x[1] = v[1]; 363 x[2] = v[2]; 364 x[3] = v[3]; 365 v += 4; 366 } 367 row = *ajtmp++; 368 while (row < i) { 369 pc = rtmp + 4 * row; 370 p1 = pc[0]; 371 p2 = pc[1]; 372 p3 = pc[2]; 373 p4 = pc[3]; 374 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 375 pv = ba + 4 * diag_offset[row]; 376 pj = bj + diag_offset[row] + 1; 377 x1 = pv[0]; 378 x2 = pv[1]; 379 x3 = pv[2]; 380 x4 = pv[3]; 381 pc[0] = m1 = p1 * x1 + p3 * x2; 382 pc[1] = m2 = p2 * x1 + p4 * x2; 383 pc[2] = m3 = p1 * x3 + p3 * x4; 384 pc[3] = m4 = p2 * x3 + p4 * x4; 385 nz = bi[row + 1] - diag_offset[row] - 1; 386 pv += 4; 387 for (j = 0; j < nz; j++) { 388 x1 = pv[0]; 389 x2 = pv[1]; 390 x3 = pv[2]; 391 x4 = pv[3]; 392 x = rtmp + 4 * pj[j]; 393 x[0] -= m1 * x1 + m3 * x2; 394 x[1] -= m2 * x1 + m4 * x2; 395 x[2] -= m1 * x3 + m3 * x4; 396 x[3] -= m2 * x3 + m4 * x4; 397 pv += 4; 398 } 399 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 400 } 401 row = *ajtmp++; 402 } 403 /* finished row so stick it into b->a */ 404 pv = ba + 4 * bi[i]; 405 pj = bj + bi[i]; 406 nz = bi[i + 1] - bi[i]; 407 for (j = 0; j < nz; j++) { 408 x = rtmp + 4 * pj[j]; 409 pv[0] = x[0]; 410 pv[1] = x[1]; 411 pv[2] = x[2]; 412 pv[3] = x[3]; 413 /* 414 printf(" col %d:",pj[j]); 415 PetscInt j1; 416 for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 417 printf("\n"); 418 */ 419 pv += 4; 420 } 421 /* invert diagonal block */ 422 w = ba + 4 * diag_offset[i]; 423 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 424 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 425 } 426 427 PetscCall(PetscFree(rtmp)); 428 429 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace; 430 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace; 431 C->assembled = PETSC_TRUE; 432 433 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 434 PetscFunctionReturn(PETSC_SUCCESS); 435 } 436 437 /* ----------------------------------------------------------- */ 438 /* 439 Version for when blocks are 1 by 1. 440 */ 441 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) 442 { 443 Mat C = B; 444 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 445 IS isrow = b->row, isicol = b->icol; 446 const PetscInt *r, *ic, *ics; 447 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 448 PetscInt i, j, k, nz, nzL, row, *pj; 449 const PetscInt *ajtmp, *bjtmp; 450 MatScalar *rtmp, *pc, multiplier, *pv; 451 const MatScalar *aa = a->a, *v; 452 PetscBool row_identity, col_identity; 453 FactorShiftCtx sctx; 454 const PetscInt *ddiag; 455 PetscReal rs; 456 MatScalar d; 457 458 PetscFunctionBegin; 459 /* MatPivotSetUp(): initialize shift context sctx */ 460 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 461 462 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 463 ddiag = a->diag; 464 sctx.shift_top = info->zeropivot; 465 for (i = 0; i < n; i++) { 466 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 467 d = (aa)[ddiag[i]]; 468 rs = -PetscAbsScalar(d) - PetscRealPart(d); 469 v = aa + ai[i]; 470 nz = ai[i + 1] - ai[i]; 471 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 472 if (rs > sctx.shift_top) sctx.shift_top = rs; 473 } 474 sctx.shift_top *= 1.1; 475 sctx.nshift_max = 5; 476 sctx.shift_lo = 0.; 477 sctx.shift_hi = 1.; 478 } 479 480 PetscCall(ISGetIndices(isrow, &r)); 481 PetscCall(ISGetIndices(isicol, &ic)); 482 PetscCall(PetscMalloc1(n + 1, &rtmp)); 483 ics = ic; 484 485 do { 486 sctx.newshift = PETSC_FALSE; 487 for (i = 0; i < n; i++) { 488 /* zero rtmp */ 489 /* L part */ 490 nz = bi[i + 1] - bi[i]; 491 bjtmp = bj + bi[i]; 492 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 493 494 /* U part */ 495 nz = bdiag[i] - bdiag[i + 1]; 496 bjtmp = bj + bdiag[i + 1] + 1; 497 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 498 499 /* load in initial (unfactored row) */ 500 nz = ai[r[i] + 1] - ai[r[i]]; 501 ajtmp = aj + ai[r[i]]; 502 v = aa + ai[r[i]]; 503 for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j]; 504 505 /* ZeropivotApply() */ 506 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 507 508 /* elimination */ 509 bjtmp = bj + bi[i]; 510 row = *bjtmp++; 511 nzL = bi[i + 1] - bi[i]; 512 for (k = 0; k < nzL; k++) { 513 pc = rtmp + row; 514 if (*pc != (PetscScalar)0.0) { 515 pv = b->a + bdiag[row]; 516 multiplier = *pc * (*pv); 517 *pc = multiplier; 518 519 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 520 pv = b->a + bdiag[row + 1] + 1; 521 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 522 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 523 PetscCall(PetscLogFlops(2.0 * nz)); 524 } 525 row = *bjtmp++; 526 } 527 528 /* finished row so stick it into b->a */ 529 rs = 0.0; 530 /* L part */ 531 pv = b->a + bi[i]; 532 pj = b->j + bi[i]; 533 nz = bi[i + 1] - bi[i]; 534 for (j = 0; j < nz; j++) { 535 pv[j] = rtmp[pj[j]]; 536 rs += PetscAbsScalar(pv[j]); 537 } 538 539 /* U part */ 540 pv = b->a + bdiag[i + 1] + 1; 541 pj = b->j + bdiag[i + 1] + 1; 542 nz = bdiag[i] - bdiag[i + 1] - 1; 543 for (j = 0; j < nz; j++) { 544 pv[j] = rtmp[pj[j]]; 545 rs += PetscAbsScalar(pv[j]); 546 } 547 548 sctx.rs = rs; 549 sctx.pv = rtmp[i]; 550 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 551 if (sctx.newshift) break; /* break for-loop */ 552 rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */ 553 554 /* Mark diagonal and invert diagonal for simpler triangular solves */ 555 pv = b->a + bdiag[i]; 556 *pv = (PetscScalar)1.0 / rtmp[i]; 557 558 } /* endof for (i=0; i<n; i++) { */ 559 560 /* MatPivotRefine() */ 561 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 562 /* 563 * if no shift in this attempt & shifting & started shifting & can refine, 564 * then try lower shift 565 */ 566 sctx.shift_hi = sctx.shift_fraction; 567 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 568 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 569 sctx.newshift = PETSC_TRUE; 570 sctx.nshift++; 571 } 572 } while (sctx.newshift); 573 574 PetscCall(PetscFree(rtmp)); 575 PetscCall(ISRestoreIndices(isicol, &ic)); 576 PetscCall(ISRestoreIndices(isrow, &r)); 577 578 PetscCall(ISIdentity(isrow, &row_identity)); 579 PetscCall(ISIdentity(isicol, &col_identity)); 580 if (row_identity && col_identity) { 581 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 582 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering; 583 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering; 584 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 585 } else { 586 C->ops->solve = MatSolve_SeqBAIJ_1; 587 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 588 } 589 C->assembled = PETSC_TRUE; 590 PetscCall(PetscLogFlops(C->cmap->n)); 591 592 /* MatShiftView(A,info,&sctx) */ 593 if (sctx.nshift) { 594 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 595 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 596 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 597 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 598 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 599 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 600 } 601 } 602 PetscFunctionReturn(PETSC_SUCCESS); 603 } 604 605 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) 606 { 607 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 608 IS isrow = b->row, isicol = b->icol; 609 const PetscInt *r, *ic; 610 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 611 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 612 PetscInt *diag_offset = b->diag, diag, *pj; 613 MatScalar *pv, *v, *rtmp, multiplier, *pc; 614 MatScalar *ba = b->a, *aa = a->a; 615 PetscBool row_identity, col_identity; 616 617 PetscFunctionBegin; 618 PetscCall(ISGetIndices(isrow, &r)); 619 PetscCall(ISGetIndices(isicol, &ic)); 620 PetscCall(PetscMalloc1(n + 1, &rtmp)); 621 622 for (i = 0; i < n; i++) { 623 nz = bi[i + 1] - bi[i]; 624 ajtmp = bj + bi[i]; 625 for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0; 626 627 /* load in initial (unfactored row) */ 628 nz = ai[r[i] + 1] - ai[r[i]]; 629 ajtmpold = aj + ai[r[i]]; 630 v = aa + ai[r[i]]; 631 for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 632 633 row = *ajtmp++; 634 while (row < i) { 635 pc = rtmp + row; 636 if (*pc != 0.0) { 637 pv = ba + diag_offset[row]; 638 pj = bj + diag_offset[row] + 1; 639 multiplier = *pc * *pv++; 640 *pc = multiplier; 641 nz = bi[row + 1] - diag_offset[row] - 1; 642 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 643 PetscCall(PetscLogFlops(1.0 + 2.0 * nz)); 644 } 645 row = *ajtmp++; 646 } 647 /* finished row so stick it into b->a */ 648 pv = ba + bi[i]; 649 pj = bj + bi[i]; 650 nz = bi[i + 1] - bi[i]; 651 for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]]; 652 diag = diag_offset[i] - bi[i]; 653 /* check pivot entry for current row */ 654 PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 655 pv[diag] = 1.0 / pv[diag]; 656 } 657 658 PetscCall(PetscFree(rtmp)); 659 PetscCall(ISRestoreIndices(isicol, &ic)); 660 PetscCall(ISRestoreIndices(isrow, &r)); 661 PetscCall(ISIdentity(isrow, &row_identity)); 662 PetscCall(ISIdentity(isicol, &col_identity)); 663 if (row_identity && col_identity) { 664 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace; 665 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace; 666 } else { 667 C->ops->solve = MatSolve_SeqBAIJ_1_inplace; 668 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace; 669 } 670 C->assembled = PETSC_TRUE; 671 PetscCall(PetscLogFlops(C->cmap->n)); 672 PetscFunctionReturn(PETSC_SUCCESS); 673 } 674 675 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) 676 { 677 PetscFunctionBegin; 678 *type = MATSOLVERPETSC; 679 PetscFunctionReturn(PETSC_SUCCESS); 680 } 681 682 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) 683 { 684 PetscInt n = A->rmap->n; 685 686 PetscFunctionBegin; 687 #if defined(PETSC_USE_COMPLEX) 688 PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported"); 689 #endif 690 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 691 PetscCall(MatSetSizes(*B, n, n, n, n)); 692 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 693 PetscCall(MatSetType(*B, MATSEQBAIJ)); 694 695 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 696 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 697 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 698 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 699 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 700 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 701 PetscCall(MatSetType(*B, MATSEQSBAIJ)); 702 PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL)); 703 704 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 705 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 706 /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */ 707 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 708 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 709 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported"); 710 (*B)->factortype = ftype; 711 (*B)->canuseordering = PETSC_TRUE; 712 713 PetscCall(PetscFree((*B)->solvertype)); 714 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype)); 715 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc)); 716 PetscFunctionReturn(PETSC_SUCCESS); 717 } 718 719 /* ----------------------------------------------------------- */ 720 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) 721 { 722 Mat C; 723 724 PetscFunctionBegin; 725 PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C)); 726 PetscCall(MatLUFactorSymbolic(C, A, row, col, info)); 727 PetscCall(MatLUFactorNumeric(C, A, info)); 728 729 A->ops->solve = C->ops->solve; 730 A->ops->solvetranspose = C->ops->solvetranspose; 731 732 PetscCall(MatHeaderMerge(A, &C)); 733 PetscFunctionReturn(PETSC_SUCCESS); 734 } 735 736 #include <../src/mat/impls/sbaij/seq/sbaij.h> 737 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) 738 { 739 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 740 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 741 IS ip = b->row; 742 const PetscInt *rip; 743 PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol; 744 PetscInt *ai = a->i, *aj = a->j; 745 PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz; 746 MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi; 747 PetscReal rs; 748 FactorShiftCtx sctx; 749 750 PetscFunctionBegin; 751 if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */ 752 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 753 PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info)); 754 PetscCall(MatDestroy(&a->sbaijMat)); 755 PetscFunctionReturn(PETSC_SUCCESS); 756 } 757 758 /* MatPivotSetUp(): initialize shift context sctx */ 759 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 760 761 PetscCall(ISGetIndices(ip, &rip)); 762 PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl)); 763 764 sctx.shift_amount = 0.; 765 sctx.nshift = 0; 766 do { 767 sctx.newshift = PETSC_FALSE; 768 for (i = 0; i < mbs; i++) { 769 rtmp[i] = 0.0; 770 jl[i] = mbs; 771 il[0] = 0; 772 } 773 774 for (k = 0; k < mbs; k++) { 775 bval = ba + bi[k]; 776 /* initialize k-th row by the perm[k]-th row of A */ 777 jmin = ai[rip[k]]; 778 jmax = ai[rip[k] + 1]; 779 for (j = jmin; j < jmax; j++) { 780 col = rip[aj[j]]; 781 if (col >= k) { /* only take upper triangular entry */ 782 rtmp[col] = aa[j]; 783 *bval++ = 0.0; /* for in-place factorization */ 784 } 785 } 786 787 /* shift the diagonal of the matrix */ 788 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 789 790 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 791 dk = rtmp[k]; 792 i = jl[k]; /* first row to be added to k_th row */ 793 794 while (i < k) { 795 nexti = jl[i]; /* next row to be added to k_th row */ 796 797 /* compute multiplier, update diag(k) and U(i,k) */ 798 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 799 uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */ 800 dk += uikdi * ba[ili]; 801 ba[ili] = uikdi; /* -U(i,k) */ 802 803 /* add multiple of row i to k-th row */ 804 jmin = ili + 1; 805 jmax = bi[i + 1]; 806 if (jmin < jmax) { 807 for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j]; 808 /* update il and jl for row i */ 809 il[i] = jmin; 810 j = bj[jmin]; 811 jl[i] = jl[j]; 812 jl[j] = i; 813 } 814 i = nexti; 815 } 816 817 /* shift the diagonals when zero pivot is detected */ 818 /* compute rs=sum of abs(off-diagonal) */ 819 rs = 0.0; 820 jmin = bi[k] + 1; 821 nz = bi[k + 1] - jmin; 822 if (nz) { 823 bcol = bj + jmin; 824 while (nz--) { 825 rs += PetscAbsScalar(rtmp[*bcol]); 826 bcol++; 827 } 828 } 829 830 sctx.rs = rs; 831 sctx.pv = dk; 832 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 833 if (sctx.newshift) break; 834 dk = sctx.pv; 835 836 /* copy data into U(k,:) */ 837 ba[bi[k]] = 1.0 / dk; /* U(k,k) */ 838 jmin = bi[k] + 1; 839 jmax = bi[k + 1]; 840 if (jmin < jmax) { 841 for (j = jmin; j < jmax; j++) { 842 col = bj[j]; 843 ba[j] = rtmp[col]; 844 rtmp[col] = 0.0; 845 } 846 /* add the k-th row into il and jl */ 847 il[k] = jmin; 848 i = bj[jmin]; 849 jl[k] = jl[i]; 850 jl[i] = k; 851 } 852 } 853 } while (sctx.newshift); 854 PetscCall(PetscFree3(rtmp, il, jl)); 855 856 PetscCall(ISRestoreIndices(ip, &rip)); 857 858 C->assembled = PETSC_TRUE; 859 C->preallocated = PETSC_TRUE; 860 861 PetscCall(PetscLogFlops(C->rmap->N)); 862 if (sctx.nshift) { 863 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 864 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 865 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 866 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 867 } 868 } 869 PetscFunctionReturn(PETSC_SUCCESS); 870 } 871 872 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) 873 { 874 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 875 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 876 PetscInt i, j, am = a->mbs; 877 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 878 PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz; 879 MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval; 880 PetscReal rs; 881 FactorShiftCtx sctx; 882 883 PetscFunctionBegin; 884 /* MatPivotSetUp(): initialize shift context sctx */ 885 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 886 887 PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl)); 888 889 do { 890 sctx.newshift = PETSC_FALSE; 891 for (i = 0; i < am; i++) { 892 rtmp[i] = 0.0; 893 jl[i] = am; 894 il[0] = 0; 895 } 896 897 for (k = 0; k < am; k++) { 898 /* initialize k-th row with elements nonzero in row perm(k) of A */ 899 nz = ai[k + 1] - ai[k]; 900 acol = aj + ai[k]; 901 aval = aa + ai[k]; 902 bval = ba + bi[k]; 903 while (nz--) { 904 if (*acol < k) { /* skip lower triangular entries */ 905 acol++; 906 aval++; 907 } else { 908 rtmp[*acol++] = *aval++; 909 *bval++ = 0.0; /* for in-place factorization */ 910 } 911 } 912 913 /* shift the diagonal of the matrix */ 914 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 915 916 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 917 dk = rtmp[k]; 918 i = jl[k]; /* first row to be added to k_th row */ 919 920 while (i < k) { 921 nexti = jl[i]; /* next row to be added to k_th row */ 922 /* compute multiplier, update D(k) and U(i,k) */ 923 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 924 uikdi = -ba[ili] * ba[bi[i]]; 925 dk += uikdi * ba[ili]; 926 ba[ili] = uikdi; /* -U(i,k) */ 927 928 /* add multiple of row i to k-th row ... */ 929 jmin = ili + 1; 930 nz = bi[i + 1] - jmin; 931 if (nz > 0) { 932 bcol = bj + jmin; 933 bval = ba + jmin; 934 while (nz--) rtmp[*bcol++] += uikdi * (*bval++); 935 /* update il and jl for i-th row */ 936 il[i] = jmin; 937 j = bj[jmin]; 938 jl[i] = jl[j]; 939 jl[j] = i; 940 } 941 i = nexti; 942 } 943 944 /* shift the diagonals when zero pivot is detected */ 945 /* compute rs=sum of abs(off-diagonal) */ 946 rs = 0.0; 947 jmin = bi[k] + 1; 948 nz = bi[k + 1] - jmin; 949 if (nz) { 950 bcol = bj + jmin; 951 while (nz--) { 952 rs += PetscAbsScalar(rtmp[*bcol]); 953 bcol++; 954 } 955 } 956 957 sctx.rs = rs; 958 sctx.pv = dk; 959 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 960 if (sctx.newshift) break; /* sctx.shift_amount is updated */ 961 dk = sctx.pv; 962 963 /* copy data into U(k,:) */ 964 ba[bi[k]] = 1.0 / dk; 965 jmin = bi[k] + 1; 966 nz = bi[k + 1] - jmin; 967 if (nz) { 968 bcol = bj + jmin; 969 bval = ba + jmin; 970 while (nz--) { 971 *bval++ = rtmp[*bcol]; 972 rtmp[*bcol++] = 0.0; 973 } 974 /* add k-th row into il and jl */ 975 il[k] = jmin; 976 i = bj[jmin]; 977 jl[k] = jl[i]; 978 jl[i] = k; 979 } 980 } 981 } while (sctx.newshift); 982 PetscCall(PetscFree3(rtmp, il, jl)); 983 984 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 985 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 986 C->assembled = PETSC_TRUE; 987 C->preallocated = PETSC_TRUE; 988 989 PetscCall(PetscLogFlops(C->rmap->N)); 990 if (sctx.nshift) { 991 if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 992 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 993 } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 994 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 995 } 996 } 997 PetscFunctionReturn(PETSC_SUCCESS); 998 } 999 1000 #include <petscbt.h> 1001 #include <../src/mat/utils/freespace.h> 1002 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1003 { 1004 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1005 Mat_SeqSBAIJ *b; 1006 Mat B; 1007 PetscBool perm_identity, missing; 1008 PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui; 1009 const PetscInt *rip; 1010 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 1011 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 1012 PetscReal fill = info->fill, levels = info->levels; 1013 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1014 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 1015 PetscBT lnkbt; 1016 1017 PetscFunctionBegin; 1018 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1019 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1020 1021 if (bs > 1) { 1022 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1023 (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1024 1025 PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info)); 1026 PetscFunctionReturn(PETSC_SUCCESS); 1027 } 1028 1029 PetscCall(ISIdentity(perm, &perm_identity)); 1030 PetscCall(ISGetIndices(perm, &rip)); 1031 1032 /* special case that simply copies fill pattern */ 1033 if (!levels && perm_identity) { 1034 PetscCall(PetscMalloc1(am + 1, &ui)); 1035 for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */ 1036 B = fact; 1037 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui)); 1038 1039 b = (Mat_SeqSBAIJ *)B->data; 1040 uj = b->j; 1041 for (i = 0; i < am; i++) { 1042 aj = a->j + a->diag[i]; 1043 for (j = 0; j < ui[i]; j++) *uj++ = *aj++; 1044 b->ilen[i] = ui[i]; 1045 } 1046 PetscCall(PetscFree(ui)); 1047 1048 B->factortype = MAT_FACTOR_NONE; 1049 1050 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1051 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1052 B->factortype = MAT_FACTOR_ICC; 1053 1054 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1055 PetscFunctionReturn(PETSC_SUCCESS); 1056 } 1057 1058 /* initialization */ 1059 PetscCall(PetscMalloc1(am + 1, &ui)); 1060 ui[0] = 0; 1061 PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl)); 1062 1063 /* jl: linked list for storing indices of the pivot rows 1064 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 1065 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 1066 for (i = 0; i < am; i++) { 1067 jl[i] = am; 1068 il[i] = 0; 1069 } 1070 1071 /* create and initialize a linked list for storing column indices of the active row k */ 1072 nlnk = am + 1; 1073 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 1074 1075 /* initial FreeSpace size is fill*(ai[am]+am)/2 */ 1076 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space)); 1077 1078 current_space = free_space; 1079 1080 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl)); 1081 current_space_lvl = free_space_lvl; 1082 1083 for (k = 0; k < am; k++) { /* for each active row k */ 1084 /* initialize lnk by the column indices of row rip[k] of A */ 1085 nzk = 0; 1086 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1087 ncols_upper = 0; 1088 cols = cols_lvl + am; 1089 for (j = 0; j < ncols; j++) { 1090 i = rip[*(aj + ai[rip[k]] + j)]; 1091 if (i >= k) { /* only take upper triangular entry */ 1092 cols[ncols_upper] = i; 1093 cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 1094 ncols_upper++; 1095 } 1096 } 1097 PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1098 nzk += nlnk; 1099 1100 /* update lnk by computing fill-in for each pivot row to be merged in */ 1101 prow = jl[k]; /* 1st pivot row */ 1102 1103 while (prow < k) { 1104 nextprow = jl[prow]; 1105 1106 /* merge prow into k-th row */ 1107 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 1108 jmax = ui[prow + 1]; 1109 ncols = jmax - jmin; 1110 i = jmin - ui[prow]; 1111 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 1112 for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 1113 PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1114 nzk += nlnk; 1115 1116 /* update il and jl for prow */ 1117 if (jmin < jmax) { 1118 il[prow] = jmin; 1119 1120 j = *cols; 1121 jl[prow] = jl[j]; 1122 jl[j] = prow; 1123 } 1124 prow = nextprow; 1125 } 1126 1127 /* if free space is not available, make more free space */ 1128 if (current_space->local_remaining < nzk) { 1129 i = am - k + 1; /* num of unfactored rows */ 1130 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1131 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1132 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 1133 reallocs++; 1134 } 1135 1136 /* copy data into free_space and free_space_lvl, then initialize lnk */ 1137 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 1138 1139 /* add the k-th row into il and jl */ 1140 if (nzk - 1 > 0) { 1141 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 1142 jl[k] = jl[i]; 1143 jl[i] = k; 1144 il[k] = ui[k] + 1; 1145 } 1146 uj_ptr[k] = current_space->array; 1147 uj_lvl_ptr[k] = current_space_lvl->array; 1148 1149 current_space->array += nzk; 1150 current_space->local_used += nzk; 1151 current_space->local_remaining -= nzk; 1152 1153 current_space_lvl->array += nzk; 1154 current_space_lvl->local_used += nzk; 1155 current_space_lvl->local_remaining -= nzk; 1156 1157 ui[k + 1] = ui[k] + nzk; 1158 } 1159 1160 PetscCall(ISRestoreIndices(perm, &rip)); 1161 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 1162 PetscCall(PetscFree(cols_lvl)); 1163 1164 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1165 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 1166 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1167 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 1168 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 1169 1170 /* put together the new matrix in MATSEQSBAIJ format */ 1171 B = fact; 1172 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 1173 1174 b = (Mat_SeqSBAIJ *)B->data; 1175 b->singlemalloc = PETSC_FALSE; 1176 b->free_a = PETSC_TRUE; 1177 b->free_ij = PETSC_TRUE; 1178 1179 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 1180 1181 b->j = uj; 1182 b->i = ui; 1183 b->diag = NULL; 1184 b->ilen = NULL; 1185 b->imax = NULL; 1186 b->row = perm; 1187 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1188 1189 PetscCall(PetscObjectReference((PetscObject)perm)); 1190 1191 b->icol = perm; 1192 1193 PetscCall(PetscObjectReference((PetscObject)perm)); 1194 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 1195 1196 b->maxnz = b->nz = ui[am]; 1197 1198 B->info.factor_mallocs = reallocs; 1199 B->info.fill_ratio_given = fill; 1200 if (ai[am] != 0.) { 1201 /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */ 1202 B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am); 1203 } else { 1204 B->info.fill_ratio_needed = 0.0; 1205 } 1206 #if defined(PETSC_USE_INFO) 1207 if (ai[am] != 0) { 1208 PetscReal af = B->info.fill_ratio_needed; 1209 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1210 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1211 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1212 } else { 1213 PetscCall(PetscInfo(A, "Empty matrix\n")); 1214 } 1215 #endif 1216 if (perm_identity) { 1217 B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1218 B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1219 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1220 } else { 1221 (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1222 } 1223 PetscFunctionReturn(PETSC_SUCCESS); 1224 } 1225 1226 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1227 { 1228 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1229 Mat_SeqSBAIJ *b; 1230 Mat B; 1231 PetscBool perm_identity, missing; 1232 PetscReal fill = info->fill; 1233 const PetscInt *rip; 1234 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow; 1235 PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow; 1236 PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr; 1237 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1238 PetscBT lnkbt; 1239 1240 PetscFunctionBegin; 1241 if (bs > 1) { /* convert to seqsbaij */ 1242 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1243 (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1244 1245 PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info)); 1246 PetscFunctionReturn(PETSC_SUCCESS); 1247 } 1248 1249 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1250 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1251 1252 /* check whether perm is the identity mapping */ 1253 PetscCall(ISIdentity(perm, &perm_identity)); 1254 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported"); 1255 PetscCall(ISGetIndices(perm, &rip)); 1256 1257 /* initialization */ 1258 PetscCall(PetscMalloc1(mbs + 1, &ui)); 1259 ui[0] = 0; 1260 1261 /* jl: linked list for storing indices of the pivot rows 1262 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 1263 PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols)); 1264 for (i = 0; i < mbs; i++) { 1265 jl[i] = mbs; 1266 il[i] = 0; 1267 } 1268 1269 /* create and initialize a linked list for storing column indices of the active row k */ 1270 nlnk = mbs + 1; 1271 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1272 1273 /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */ 1274 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space)); 1275 1276 current_space = free_space; 1277 1278 for (k = 0; k < mbs; k++) { /* for each active row k */ 1279 /* initialize lnk by the column indices of row rip[k] of A */ 1280 nzk = 0; 1281 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1282 ncols_upper = 0; 1283 for (j = 0; j < ncols; j++) { 1284 i = rip[*(aj + ai[rip[k]] + j)]; 1285 if (i >= k) { /* only take upper triangular entry */ 1286 cols[ncols_upper] = i; 1287 ncols_upper++; 1288 } 1289 } 1290 PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt)); 1291 nzk += nlnk; 1292 1293 /* update lnk by computing fill-in for each pivot row to be merged in */ 1294 prow = jl[k]; /* 1st pivot row */ 1295 1296 while (prow < k) { 1297 nextprow = jl[prow]; 1298 /* merge prow into k-th row */ 1299 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 1300 jmax = ui[prow + 1]; 1301 ncols = jmax - jmin; 1302 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 1303 PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt)); 1304 nzk += nlnk; 1305 1306 /* update il and jl for prow */ 1307 if (jmin < jmax) { 1308 il[prow] = jmin; 1309 j = *uj_ptr; 1310 jl[prow] = jl[j]; 1311 jl[j] = prow; 1312 } 1313 prow = nextprow; 1314 } 1315 1316 /* if free space is not available, make more free space */ 1317 if (current_space->local_remaining < nzk) { 1318 i = mbs - k + 1; /* num of unfactored rows */ 1319 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1320 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1321 reallocs++; 1322 } 1323 1324 /* copy data into free space, then initialize lnk */ 1325 PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt)); 1326 1327 /* add the k-th row into il and jl */ 1328 if (nzk - 1 > 0) { 1329 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 1330 jl[k] = jl[i]; 1331 jl[i] = k; 1332 il[k] = ui[k] + 1; 1333 } 1334 ui_ptr[k] = current_space->array; 1335 current_space->array += nzk; 1336 current_space->local_used += nzk; 1337 current_space->local_remaining -= nzk; 1338 1339 ui[k + 1] = ui[k] + nzk; 1340 } 1341 1342 PetscCall(ISRestoreIndices(perm, &rip)); 1343 PetscCall(PetscFree4(ui_ptr, il, jl, cols)); 1344 1345 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1346 PetscCall(PetscMalloc1(ui[mbs] + 1, &uj)); 1347 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1348 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1349 1350 /* put together the new matrix in MATSEQSBAIJ format */ 1351 B = fact; 1352 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1353 1354 b = (Mat_SeqSBAIJ *)B->data; 1355 b->singlemalloc = PETSC_FALSE; 1356 b->free_a = PETSC_TRUE; 1357 b->free_ij = PETSC_TRUE; 1358 1359 PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a)); 1360 1361 b->j = uj; 1362 b->i = ui; 1363 b->diag = NULL; 1364 b->ilen = NULL; 1365 b->imax = NULL; 1366 b->row = perm; 1367 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1368 1369 PetscCall(PetscObjectReference((PetscObject)perm)); 1370 b->icol = perm; 1371 PetscCall(PetscObjectReference((PetscObject)perm)); 1372 PetscCall(PetscMalloc1(mbs + 1, &b->solve_work)); 1373 b->maxnz = b->nz = ui[mbs]; 1374 1375 B->info.factor_mallocs = reallocs; 1376 B->info.fill_ratio_given = fill; 1377 if (ai[mbs] != 0.) { 1378 /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */ 1379 B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs); 1380 } else { 1381 B->info.fill_ratio_needed = 0.0; 1382 } 1383 #if defined(PETSC_USE_INFO) 1384 if (ai[mbs] != 0.) { 1385 PetscReal af = B->info.fill_ratio_needed; 1386 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1387 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1388 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1389 } else { 1390 PetscCall(PetscInfo(A, "Empty matrix\n")); 1391 } 1392 #endif 1393 if (perm_identity) { 1394 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1395 } else { 1396 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1397 } 1398 PetscFunctionReturn(PETSC_SUCCESS); 1399 } 1400 1401 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) 1402 { 1403 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1404 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1405 PetscInt i, k, n = a->mbs; 1406 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1407 const MatScalar *aa = a->a, *v; 1408 PetscScalar *x, *s, *t, *ls; 1409 const PetscScalar *b; 1410 1411 PetscFunctionBegin; 1412 PetscCall(VecGetArrayRead(bb, &b)); 1413 PetscCall(VecGetArray(xx, &x)); 1414 t = a->solve_work; 1415 1416 /* forward solve the lower triangular */ 1417 PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */ 1418 1419 for (i = 1; i < n; i++) { 1420 v = aa + bs2 * ai[i]; 1421 vi = aj + ai[i]; 1422 nz = ai[i + 1] - ai[i]; 1423 s = t + bs * i; 1424 PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */ 1425 for (k = 0; k < nz; k++) { 1426 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]); 1427 v += bs2; 1428 } 1429 } 1430 1431 /* backward solve the upper triangular */ 1432 ls = a->solve_work + A->cmap->n; 1433 for (i = n - 1; i >= 0; i--) { 1434 v = aa + bs2 * (adiag[i + 1] + 1); 1435 vi = aj + adiag[i + 1] + 1; 1436 nz = adiag[i] - adiag[i + 1] - 1; 1437 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1438 for (k = 0; k < nz; k++) { 1439 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]); 1440 v += bs2; 1441 } 1442 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */ 1443 PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs)); 1444 } 1445 1446 PetscCall(VecRestoreArrayRead(bb, &b)); 1447 PetscCall(VecRestoreArray(xx, &x)); 1448 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1449 PetscFunctionReturn(PETSC_SUCCESS); 1450 } 1451 1452 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) 1453 { 1454 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1455 IS iscol = a->col, isrow = a->row; 1456 const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1457 PetscInt i, m, n = a->mbs; 1458 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1459 const MatScalar *aa = a->a, *v; 1460 PetscScalar *x, *s, *t, *ls; 1461 const PetscScalar *b; 1462 1463 PetscFunctionBegin; 1464 PetscCall(VecGetArrayRead(bb, &b)); 1465 PetscCall(VecGetArray(xx, &x)); 1466 t = a->solve_work; 1467 1468 PetscCall(ISGetIndices(isrow, &rout)); 1469 r = rout; 1470 PetscCall(ISGetIndices(iscol, &cout)); 1471 c = cout; 1472 1473 /* forward solve the lower triangular */ 1474 PetscCall(PetscArraycpy(t, b + bs * r[0], bs)); 1475 for (i = 1; i < n; i++) { 1476 v = aa + bs2 * ai[i]; 1477 vi = aj + ai[i]; 1478 nz = ai[i + 1] - ai[i]; 1479 s = t + bs * i; 1480 PetscCall(PetscArraycpy(s, b + bs * r[i], bs)); 1481 for (m = 0; m < nz; m++) { 1482 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]); 1483 v += bs2; 1484 } 1485 } 1486 1487 /* backward solve the upper triangular */ 1488 ls = a->solve_work + A->cmap->n; 1489 for (i = n - 1; i >= 0; i--) { 1490 v = aa + bs2 * (adiag[i + 1] + 1); 1491 vi = aj + adiag[i + 1] + 1; 1492 nz = adiag[i] - adiag[i + 1] - 1; 1493 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1494 for (m = 0; m < nz; m++) { 1495 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]); 1496 v += bs2; 1497 } 1498 PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */ 1499 PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs)); 1500 } 1501 PetscCall(ISRestoreIndices(isrow, &rout)); 1502 PetscCall(ISRestoreIndices(iscol, &cout)); 1503 PetscCall(VecRestoreArrayRead(bb, &b)); 1504 PetscCall(VecRestoreArray(xx, &x)); 1505 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1506 PetscFunctionReturn(PETSC_SUCCESS); 1507 } 1508 1509 /* 1510 For each block in an block array saves the largest absolute value in the block into another array 1511 */ 1512 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray) 1513 { 1514 PetscInt i, j; 1515 1516 PetscFunctionBegin; 1517 PetscCall(PetscArrayzero(absarray, nbs + 1)); 1518 for (i = 0; i < nbs; i++) { 1519 for (j = 0; j < bs2; j++) { 1520 if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]); 1521 } 1522 } 1523 PetscFunctionReturn(PETSC_SUCCESS); 1524 } 1525 1526 /* 1527 This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used 1528 */ 1529 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact) 1530 { 1531 Mat B = *fact; 1532 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 1533 IS isicol; 1534 const PetscInt *r, *ic; 1535 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag; 1536 PetscInt *bi, *bj, *bdiag; 1537 1538 PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au; 1539 PetscInt nlnk, *lnk; 1540 PetscBT lnkbt; 1541 PetscBool row_identity, icol_identity; 1542 MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp; 1543 PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp; 1544 1545 PetscReal dt = info->dt; /* shift=info->shiftamount; */ 1546 PetscInt nnz_max; 1547 PetscBool missing; 1548 PetscReal *vtmp_abs; 1549 MatScalar *v_work; 1550 PetscInt *v_pivots; 1551 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1552 1553 PetscFunctionBegin; 1554 /* ------- symbolic factorization, can be reused ---------*/ 1555 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1556 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1557 adiag = a->diag; 1558 1559 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 1560 1561 /* bdiag is location of diagonal in factor */ 1562 PetscCall(PetscMalloc1(mbs + 1, &bdiag)); 1563 1564 /* allocate row pointers bi */ 1565 PetscCall(PetscMalloc1(2 * mbs + 2, &bi)); 1566 1567 /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */ 1568 dtcount = (PetscInt)info->dtcount; 1569 if (dtcount > mbs - 1) dtcount = mbs - 1; 1570 nnz_max = ai[mbs] + 2 * mbs * dtcount + 2; 1571 /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */ 1572 PetscCall(PetscMalloc1(nnz_max, &bj)); 1573 nnz_max = nnz_max * bs2; 1574 PetscCall(PetscMalloc1(nnz_max, &ba)); 1575 1576 /* put together the new matrix */ 1577 PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1578 1579 b = (Mat_SeqBAIJ *)(B)->data; 1580 b->free_a = PETSC_TRUE; 1581 b->free_ij = PETSC_TRUE; 1582 b->singlemalloc = PETSC_FALSE; 1583 1584 b->a = ba; 1585 b->j = bj; 1586 b->i = bi; 1587 b->diag = bdiag; 1588 b->ilen = NULL; 1589 b->imax = NULL; 1590 b->row = isrow; 1591 b->col = iscol; 1592 1593 PetscCall(PetscObjectReference((PetscObject)isrow)); 1594 PetscCall(PetscObjectReference((PetscObject)iscol)); 1595 1596 b->icol = isicol; 1597 PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work)); 1598 b->maxnz = nnz_max / bs2; 1599 1600 (B)->factortype = MAT_FACTOR_ILUDT; 1601 (B)->info.factor_mallocs = 0; 1602 (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2)); 1603 /* ------- end of symbolic factorization ---------*/ 1604 PetscCall(ISGetIndices(isrow, &r)); 1605 PetscCall(ISGetIndices(isicol, &ic)); 1606 1607 /* linked list for storing column indices of the active row */ 1608 nlnk = mbs + 1; 1609 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1610 1611 /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */ 1612 PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp)); 1613 /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */ 1614 PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp)); 1615 PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs)); 1616 PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots)); 1617 1618 allowzeropivot = PetscNot(A->erroriffailure); 1619 bi[0] = 0; 1620 bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */ 1621 bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */ 1622 for (i = 0; i < mbs; i++) { 1623 /* copy initial fill into linked list */ 1624 nzi = ai[r[i] + 1] - ai[r[i]]; 1625 PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 1626 nzi_al = adiag[r[i]] - ai[r[i]]; 1627 nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1; 1628 1629 /* load in initial unfactored row */ 1630 ajtmp = aj + ai[r[i]]; 1631 PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt)); 1632 PetscCall(PetscArrayzero(rtmp, mbs * bs2)); 1633 aatmp = a->a + bs2 * ai[r[i]]; 1634 for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2)); 1635 1636 /* add pivot rows into linked list */ 1637 row = lnk[mbs]; 1638 while (row < i) { 1639 nzi_bl = bi[row + 1] - bi[row] + 1; 1640 bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */ 1641 PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im)); 1642 nzi += nlnk; 1643 row = lnk[row]; 1644 } 1645 1646 /* copy data from lnk into jtmp, then initialize lnk */ 1647 PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt)); 1648 1649 /* numerical factorization */ 1650 bjtmp = jtmp; 1651 row = *bjtmp++; /* 1st pivot row */ 1652 1653 while (row < i) { 1654 pc = rtmp + bs2 * row; 1655 pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */ 1656 PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */ 1657 PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs)); 1658 if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */ 1659 pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */ 1660 pv = ba + bs2 * (bdiag[row + 1] + 1); 1661 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */ 1662 for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 1663 /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */ 1664 } 1665 row = *bjtmp++; 1666 } 1667 1668 /* copy sparse rtmp into contiguous vtmp; separate L and U part */ 1669 nzi_bl = 0; 1670 j = 0; 1671 while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */ 1672 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1673 nzi_bl++; 1674 j++; 1675 } 1676 nzi_bu = nzi - nzi_bl - 1; 1677 1678 while (j < nzi) { /* U-part */ 1679 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1680 j++; 1681 } 1682 1683 PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs)); 1684 1685 bjtmp = bj + bi[i]; 1686 batmp = ba + bs2 * bi[i]; 1687 /* apply level dropping rule to L part */ 1688 ncut = nzi_al + dtcount; 1689 if (ncut < nzi_bl) { 1690 PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp)); 1691 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp)); 1692 } else { 1693 ncut = nzi_bl; 1694 } 1695 for (j = 0; j < ncut; j++) { 1696 bjtmp[j] = jtmp[j]; 1697 PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2)); 1698 } 1699 bi[i + 1] = bi[i] + ncut; 1700 nzi = ncut + 1; 1701 1702 /* apply level dropping rule to U part */ 1703 ncut = nzi_au + dtcount; 1704 if (ncut < nzi_bu) { 1705 PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1)); 1706 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1)); 1707 } else { 1708 ncut = nzi_bu; 1709 } 1710 nzi += ncut; 1711 1712 /* mark bdiagonal */ 1713 bdiag[i + 1] = bdiag[i] - (ncut + 1); 1714 bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1); 1715 1716 bjtmp = bj + bdiag[i]; 1717 batmp = ba + bs2 * bdiag[i]; 1718 PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2)); 1719 *bjtmp = i; 1720 1721 bjtmp = bj + bdiag[i + 1] + 1; 1722 batmp = ba + (bdiag[i + 1] + 1) * bs2; 1723 1724 for (k = 0; k < ncut; k++) { 1725 bjtmp[k] = jtmp[nzi_bl + 1 + k]; 1726 PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2)); 1727 } 1728 1729 im[i] = nzi; /* used by PetscLLAddSortedLU() */ 1730 1731 /* invert diagonal block for simpler triangular solves - add shift??? */ 1732 batmp = ba + bs2 * bdiag[i]; 1733 1734 PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 1735 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1736 } /* for (i=0; i<mbs; i++) */ 1737 PetscCall(PetscFree3(v_work, multiplier, v_pivots)); 1738 1739 /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */ 1740 PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]); 1741 1742 PetscCall(ISRestoreIndices(isrow, &r)); 1743 PetscCall(ISRestoreIndices(isicol, &ic)); 1744 1745 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1746 1747 PetscCall(PetscFree2(im, jtmp)); 1748 PetscCall(PetscFree2(rtmp, vtmp)); 1749 1750 PetscCall(PetscLogFlops(bs2 * B->cmap->n)); 1751 b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs]; 1752 1753 PetscCall(ISIdentity(isrow, &row_identity)); 1754 PetscCall(ISIdentity(isicol, &icol_identity)); 1755 if (row_identity && icol_identity) { 1756 B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 1757 } else { 1758 B->ops->solve = MatSolve_SeqBAIJ_N; 1759 } 1760 1761 B->ops->solveadd = NULL; 1762 B->ops->solvetranspose = NULL; 1763 B->ops->solvetransposeadd = NULL; 1764 B->ops->matsolve = NULL; 1765 B->assembled = PETSC_TRUE; 1766 B->preallocated = PETSC_TRUE; 1767 PetscFunctionReturn(PETSC_SUCCESS); 1768 } 1769