1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) { 9 Mat C = B; 10 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 11 IS isrow = b->row, isicol = b->icol; 12 const PetscInt *r, *ic; 13 PetscInt i, j, k, nz, nzL, row, *pj; 14 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 15 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 16 MatScalar *rtmp, *pc, *mwork, *pv; 17 MatScalar *aa = a->a, *v; 18 PetscInt flg; 19 PetscReal shift = info->shiftamount; 20 PetscBool allowzeropivot, zeropivotdetected; 21 22 PetscFunctionBegin; 23 PetscCall(ISGetIndices(isrow, &r)); 24 PetscCall(ISGetIndices(isicol, &ic)); 25 allowzeropivot = PetscNot(A->erroriffailure); 26 27 /* generate work space needed by the factorization */ 28 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 29 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 30 31 for (i = 0; i < n; i++) { 32 /* zero rtmp */ 33 /* L part */ 34 nz = bi[i + 1] - bi[i]; 35 bjtmp = bj + bi[i]; 36 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 37 38 /* U part */ 39 nz = bdiag[i] - bdiag[i + 1]; 40 bjtmp = bj + bdiag[i + 1] + 1; 41 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 42 43 /* load in initial (unfactored row) */ 44 nz = ai[r[i] + 1] - ai[r[i]]; 45 ajtmp = aj + ai[r[i]]; 46 v = aa + bs2 * ai[r[i]]; 47 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 48 49 /* elimination */ 50 bjtmp = bj + bi[i]; 51 nzL = bi[i + 1] - bi[i]; 52 for (k = 0; k < nzL; k++) { 53 row = bjtmp[k]; 54 pc = rtmp + bs2 * row; 55 for (flg = 0, j = 0; j < bs2; j++) { 56 if (pc[j] != (PetscScalar)0.0) { 57 flg = 1; 58 break; 59 } 60 } 61 if (flg) { 62 pv = b->a + bs2 * bdiag[row]; 63 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 64 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 65 66 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 67 pv = b->a + bs2 * (bdiag[row + 1] + 1); 68 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 69 for (j = 0; j < nz; j++) { 70 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 71 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 72 v = rtmp + 4 * pj[j]; 73 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 74 pv += 4; 75 } 76 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 77 } 78 } 79 80 /* finished row so stick it into b->a */ 81 /* L part */ 82 pv = b->a + bs2 * bi[i]; 83 pj = b->j + bi[i]; 84 nz = bi[i + 1] - bi[i]; 85 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 86 87 /* Mark diagonal and invert diagonal for simpler triangular solves */ 88 pv = b->a + bs2 * bdiag[i]; 89 pj = b->j + bdiag[i]; 90 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 91 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 92 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 93 94 /* U part */ 95 pv = b->a + bs2 * (bdiag[i + 1] + 1); 96 pj = b->j + bdiag[i + 1] + 1; 97 nz = bdiag[i] - bdiag[i + 1] - 1; 98 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 99 } 100 101 PetscCall(PetscFree2(rtmp, mwork)); 102 PetscCall(ISRestoreIndices(isicol, &ic)); 103 PetscCall(ISRestoreIndices(isrow, &r)); 104 105 C->ops->solve = MatSolve_SeqBAIJ_2; 106 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 107 C->assembled = PETSC_TRUE; 108 109 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 110 PetscFunctionReturn(0); 111 } 112 113 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) { 114 Mat C = B; 115 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 116 PetscInt i, j, k, nz, nzL, row, *pj; 117 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 118 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 119 MatScalar *rtmp, *pc, *mwork, *pv; 120 MatScalar *aa = a->a, *v; 121 PetscInt flg; 122 PetscReal shift = info->shiftamount; 123 PetscBool allowzeropivot, zeropivotdetected; 124 125 PetscFunctionBegin; 126 allowzeropivot = PetscNot(A->erroriffailure); 127 128 /* generate work space needed by the factorization */ 129 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 130 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 131 132 for (i = 0; i < n; i++) { 133 /* zero rtmp */ 134 /* L part */ 135 nz = bi[i + 1] - bi[i]; 136 bjtmp = bj + bi[i]; 137 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 138 139 /* U part */ 140 nz = bdiag[i] - bdiag[i + 1]; 141 bjtmp = bj + bdiag[i + 1] + 1; 142 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 143 144 /* load in initial (unfactored row) */ 145 nz = ai[i + 1] - ai[i]; 146 ajtmp = aj + ai[i]; 147 v = aa + bs2 * ai[i]; 148 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 149 150 /* elimination */ 151 bjtmp = bj + bi[i]; 152 nzL = bi[i + 1] - bi[i]; 153 for (k = 0; k < nzL; k++) { 154 row = bjtmp[k]; 155 pc = rtmp + bs2 * row; 156 for (flg = 0, j = 0; j < bs2; j++) { 157 if (pc[j] != (PetscScalar)0.0) { 158 flg = 1; 159 break; 160 } 161 } 162 if (flg) { 163 pv = b->a + bs2 * bdiag[row]; 164 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 165 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 166 167 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 168 pv = b->a + bs2 * (bdiag[row + 1] + 1); 169 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 170 for (j = 0; j < nz; j++) { 171 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 172 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 173 v = rtmp + 4 * pj[j]; 174 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 175 pv += 4; 176 } 177 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 178 } 179 } 180 181 /* finished row so stick it into b->a */ 182 /* L part */ 183 pv = b->a + bs2 * bi[i]; 184 pj = b->j + bi[i]; 185 nz = bi[i + 1] - bi[i]; 186 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 187 188 /* Mark diagonal and invert diagonal for simpler triangular solves */ 189 pv = b->a + bs2 * bdiag[i]; 190 pj = b->j + bdiag[i]; 191 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 192 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 193 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 194 195 /* U part */ 196 /* 197 pv = b->a + bs2*bi[2*n-i]; 198 pj = b->j + bi[2*n-i]; 199 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 200 */ 201 pv = b->a + bs2 * (bdiag[i + 1] + 1); 202 pj = b->j + bdiag[i + 1] + 1; 203 nz = bdiag[i] - bdiag[i + 1] - 1; 204 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 205 } 206 PetscCall(PetscFree2(rtmp, mwork)); 207 208 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 209 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering; 210 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering; 211 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 212 C->assembled = PETSC_TRUE; 213 214 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 215 PetscFunctionReturn(0); 216 } 217 218 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) { 219 Mat C = B; 220 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 221 IS isrow = b->row, isicol = b->icol; 222 const PetscInt *r, *ic; 223 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 224 PetscInt *ajtmpold, *ajtmp, nz, row; 225 PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 226 MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4; 227 MatScalar p1, p2, p3, p4; 228 MatScalar *ba = b->a, *aa = a->a; 229 PetscReal shift = info->shiftamount; 230 PetscBool allowzeropivot, zeropivotdetected; 231 232 PetscFunctionBegin; 233 allowzeropivot = PetscNot(A->erroriffailure); 234 PetscCall(ISGetIndices(isrow, &r)); 235 PetscCall(ISGetIndices(isicol, &ic)); 236 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 237 238 for (i = 0; i < n; i++) { 239 nz = bi[i + 1] - bi[i]; 240 ajtmp = bj + bi[i]; 241 for (j = 0; j < nz; j++) { 242 x = rtmp + 4 * ajtmp[j]; 243 x[0] = x[1] = x[2] = x[3] = 0.0; 244 } 245 /* load in initial (unfactored row) */ 246 idx = r[i]; 247 nz = ai[idx + 1] - ai[idx]; 248 ajtmpold = aj + ai[idx]; 249 v = aa + 4 * ai[idx]; 250 for (j = 0; j < nz; j++) { 251 x = rtmp + 4 * ic[ajtmpold[j]]; 252 x[0] = v[0]; 253 x[1] = v[1]; 254 x[2] = v[2]; 255 x[3] = v[3]; 256 v += 4; 257 } 258 row = *ajtmp++; 259 while (row < i) { 260 pc = rtmp + 4 * row; 261 p1 = pc[0]; 262 p2 = pc[1]; 263 p3 = pc[2]; 264 p4 = pc[3]; 265 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 266 pv = ba + 4 * diag_offset[row]; 267 pj = bj + diag_offset[row] + 1; 268 x1 = pv[0]; 269 x2 = pv[1]; 270 x3 = pv[2]; 271 x4 = pv[3]; 272 pc[0] = m1 = p1 * x1 + p3 * x2; 273 pc[1] = m2 = p2 * x1 + p4 * x2; 274 pc[2] = m3 = p1 * x3 + p3 * x4; 275 pc[3] = m4 = p2 * x3 + p4 * x4; 276 nz = bi[row + 1] - diag_offset[row] - 1; 277 pv += 4; 278 for (j = 0; j < nz; j++) { 279 x1 = pv[0]; 280 x2 = pv[1]; 281 x3 = pv[2]; 282 x4 = pv[3]; 283 x = rtmp + 4 * pj[j]; 284 x[0] -= m1 * x1 + m3 * x2; 285 x[1] -= m2 * x1 + m4 * x2; 286 x[2] -= m1 * x3 + m3 * x4; 287 x[3] -= m2 * x3 + m4 * x4; 288 pv += 4; 289 } 290 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 291 } 292 row = *ajtmp++; 293 } 294 /* finished row so stick it into b->a */ 295 pv = ba + 4 * bi[i]; 296 pj = bj + bi[i]; 297 nz = bi[i + 1] - bi[i]; 298 for (j = 0; j < nz; j++) { 299 x = rtmp + 4 * pj[j]; 300 pv[0] = x[0]; 301 pv[1] = x[1]; 302 pv[2] = x[2]; 303 pv[3] = x[3]; 304 pv += 4; 305 } 306 /* invert diagonal block */ 307 w = ba + 4 * diag_offset[i]; 308 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 309 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 310 } 311 312 PetscCall(PetscFree(rtmp)); 313 PetscCall(ISRestoreIndices(isicol, &ic)); 314 PetscCall(ISRestoreIndices(isrow, &r)); 315 316 C->ops->solve = MatSolve_SeqBAIJ_2_inplace; 317 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace; 318 C->assembled = PETSC_TRUE; 319 320 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 321 PetscFunctionReturn(0); 322 } 323 /* 324 Version for when blocks are 2 by 2 Using natural ordering 325 */ 326 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) { 327 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 328 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 329 PetscInt *ajtmpold, *ajtmp, nz, row; 330 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 331 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 332 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4; 333 MatScalar *ba = b->a, *aa = a->a; 334 PetscReal shift = info->shiftamount; 335 PetscBool allowzeropivot, zeropivotdetected; 336 337 PetscFunctionBegin; 338 allowzeropivot = PetscNot(A->erroriffailure); 339 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 340 for (i = 0; i < n; i++) { 341 nz = bi[i + 1] - bi[i]; 342 ajtmp = bj + bi[i]; 343 for (j = 0; j < nz; j++) { 344 x = rtmp + 4 * ajtmp[j]; 345 x[0] = x[1] = x[2] = x[3] = 0.0; 346 } 347 /* load in initial (unfactored row) */ 348 nz = ai[i + 1] - ai[i]; 349 ajtmpold = aj + ai[i]; 350 v = aa + 4 * ai[i]; 351 for (j = 0; j < nz; j++) { 352 x = rtmp + 4 * ajtmpold[j]; 353 x[0] = v[0]; 354 x[1] = v[1]; 355 x[2] = v[2]; 356 x[3] = v[3]; 357 v += 4; 358 } 359 row = *ajtmp++; 360 while (row < i) { 361 pc = rtmp + 4 * row; 362 p1 = pc[0]; 363 p2 = pc[1]; 364 p3 = pc[2]; 365 p4 = pc[3]; 366 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 367 pv = ba + 4 * diag_offset[row]; 368 pj = bj + diag_offset[row] + 1; 369 x1 = pv[0]; 370 x2 = pv[1]; 371 x3 = pv[2]; 372 x4 = pv[3]; 373 pc[0] = m1 = p1 * x1 + p3 * x2; 374 pc[1] = m2 = p2 * x1 + p4 * x2; 375 pc[2] = m3 = p1 * x3 + p3 * x4; 376 pc[3] = m4 = p2 * x3 + p4 * x4; 377 nz = bi[row + 1] - diag_offset[row] - 1; 378 pv += 4; 379 for (j = 0; j < nz; j++) { 380 x1 = pv[0]; 381 x2 = pv[1]; 382 x3 = pv[2]; 383 x4 = pv[3]; 384 x = rtmp + 4 * pj[j]; 385 x[0] -= m1 * x1 + m3 * x2; 386 x[1] -= m2 * x1 + m4 * x2; 387 x[2] -= m1 * x3 + m3 * x4; 388 x[3] -= m2 * x3 + m4 * x4; 389 pv += 4; 390 } 391 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 392 } 393 row = *ajtmp++; 394 } 395 /* finished row so stick it into b->a */ 396 pv = ba + 4 * bi[i]; 397 pj = bj + bi[i]; 398 nz = bi[i + 1] - bi[i]; 399 for (j = 0; j < nz; j++) { 400 x = rtmp + 4 * pj[j]; 401 pv[0] = x[0]; 402 pv[1] = x[1]; 403 pv[2] = x[2]; 404 pv[3] = x[3]; 405 /* 406 printf(" col %d:",pj[j]); 407 PetscInt j1; 408 for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 409 printf("\n"); 410 */ 411 pv += 4; 412 } 413 /* invert diagonal block */ 414 w = ba + 4 * diag_offset[i]; 415 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 416 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 417 } 418 419 PetscCall(PetscFree(rtmp)); 420 421 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace; 422 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace; 423 C->assembled = PETSC_TRUE; 424 425 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 426 PetscFunctionReturn(0); 427 } 428 429 /* ----------------------------------------------------------- */ 430 /* 431 Version for when blocks are 1 by 1. 432 */ 433 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) { 434 Mat C = B; 435 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 436 IS isrow = b->row, isicol = b->icol; 437 const PetscInt *r, *ic, *ics; 438 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 439 PetscInt i, j, k, nz, nzL, row, *pj; 440 const PetscInt *ajtmp, *bjtmp; 441 MatScalar *rtmp, *pc, multiplier, *pv; 442 const MatScalar *aa = a->a, *v; 443 PetscBool row_identity, col_identity; 444 FactorShiftCtx sctx; 445 const PetscInt *ddiag; 446 PetscReal rs; 447 MatScalar d; 448 449 PetscFunctionBegin; 450 /* MatPivotSetUp(): initialize shift context sctx */ 451 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 452 453 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 454 ddiag = a->diag; 455 sctx.shift_top = info->zeropivot; 456 for (i = 0; i < n; i++) { 457 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 458 d = (aa)[ddiag[i]]; 459 rs = -PetscAbsScalar(d) - PetscRealPart(d); 460 v = aa + ai[i]; 461 nz = ai[i + 1] - ai[i]; 462 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 463 if (rs > sctx.shift_top) sctx.shift_top = rs; 464 } 465 sctx.shift_top *= 1.1; 466 sctx.nshift_max = 5; 467 sctx.shift_lo = 0.; 468 sctx.shift_hi = 1.; 469 } 470 471 PetscCall(ISGetIndices(isrow, &r)); 472 PetscCall(ISGetIndices(isicol, &ic)); 473 PetscCall(PetscMalloc1(n + 1, &rtmp)); 474 ics = ic; 475 476 do { 477 sctx.newshift = PETSC_FALSE; 478 for (i = 0; i < n; i++) { 479 /* zero rtmp */ 480 /* L part */ 481 nz = bi[i + 1] - bi[i]; 482 bjtmp = bj + bi[i]; 483 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 484 485 /* U part */ 486 nz = bdiag[i] - bdiag[i + 1]; 487 bjtmp = bj + bdiag[i + 1] + 1; 488 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 489 490 /* load in initial (unfactored row) */ 491 nz = ai[r[i] + 1] - ai[r[i]]; 492 ajtmp = aj + ai[r[i]]; 493 v = aa + ai[r[i]]; 494 for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j]; 495 496 /* ZeropivotApply() */ 497 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 498 499 /* elimination */ 500 bjtmp = bj + bi[i]; 501 row = *bjtmp++; 502 nzL = bi[i + 1] - bi[i]; 503 for (k = 0; k < nzL; k++) { 504 pc = rtmp + row; 505 if (*pc != (PetscScalar)0.0) { 506 pv = b->a + bdiag[row]; 507 multiplier = *pc * (*pv); 508 *pc = multiplier; 509 510 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 511 pv = b->a + bdiag[row + 1] + 1; 512 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 513 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 514 PetscCall(PetscLogFlops(2.0 * nz)); 515 } 516 row = *bjtmp++; 517 } 518 519 /* finished row so stick it into b->a */ 520 rs = 0.0; 521 /* L part */ 522 pv = b->a + bi[i]; 523 pj = b->j + bi[i]; 524 nz = bi[i + 1] - bi[i]; 525 for (j = 0; j < nz; j++) { 526 pv[j] = rtmp[pj[j]]; 527 rs += PetscAbsScalar(pv[j]); 528 } 529 530 /* U part */ 531 pv = b->a + bdiag[i + 1] + 1; 532 pj = b->j + bdiag[i + 1] + 1; 533 nz = bdiag[i] - bdiag[i + 1] - 1; 534 for (j = 0; j < nz; j++) { 535 pv[j] = rtmp[pj[j]]; 536 rs += PetscAbsScalar(pv[j]); 537 } 538 539 sctx.rs = rs; 540 sctx.pv = rtmp[i]; 541 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 542 if (sctx.newshift) break; /* break for-loop */ 543 rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */ 544 545 /* Mark diagonal and invert diagonal for simpler triangular solves */ 546 pv = b->a + bdiag[i]; 547 *pv = (PetscScalar)1.0 / rtmp[i]; 548 549 } /* endof for (i=0; i<n; i++) { */ 550 551 /* MatPivotRefine() */ 552 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 553 /* 554 * if no shift in this attempt & shifting & started shifting & can refine, 555 * then try lower shift 556 */ 557 sctx.shift_hi = sctx.shift_fraction; 558 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 559 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 560 sctx.newshift = PETSC_TRUE; 561 sctx.nshift++; 562 } 563 } while (sctx.newshift); 564 565 PetscCall(PetscFree(rtmp)); 566 PetscCall(ISRestoreIndices(isicol, &ic)); 567 PetscCall(ISRestoreIndices(isrow, &r)); 568 569 PetscCall(ISIdentity(isrow, &row_identity)); 570 PetscCall(ISIdentity(isicol, &col_identity)); 571 if (row_identity && col_identity) { 572 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 573 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering; 574 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering; 575 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 576 } else { 577 C->ops->solve = MatSolve_SeqBAIJ_1; 578 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 579 } 580 C->assembled = PETSC_TRUE; 581 PetscCall(PetscLogFlops(C->cmap->n)); 582 583 /* MatShiftView(A,info,&sctx) */ 584 if (sctx.nshift) { 585 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 586 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 587 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 588 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 589 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 590 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 591 } 592 } 593 PetscFunctionReturn(0); 594 } 595 596 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) { 597 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 598 IS isrow = b->row, isicol = b->icol; 599 const PetscInt *r, *ic; 600 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 601 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 602 PetscInt *diag_offset = b->diag, diag, *pj; 603 MatScalar *pv, *v, *rtmp, multiplier, *pc; 604 MatScalar *ba = b->a, *aa = a->a; 605 PetscBool row_identity, col_identity; 606 607 PetscFunctionBegin; 608 PetscCall(ISGetIndices(isrow, &r)); 609 PetscCall(ISGetIndices(isicol, &ic)); 610 PetscCall(PetscMalloc1(n + 1, &rtmp)); 611 612 for (i = 0; i < n; i++) { 613 nz = bi[i + 1] - bi[i]; 614 ajtmp = bj + bi[i]; 615 for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0; 616 617 /* load in initial (unfactored row) */ 618 nz = ai[r[i] + 1] - ai[r[i]]; 619 ajtmpold = aj + ai[r[i]]; 620 v = aa + ai[r[i]]; 621 for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 622 623 row = *ajtmp++; 624 while (row < i) { 625 pc = rtmp + row; 626 if (*pc != 0.0) { 627 pv = ba + diag_offset[row]; 628 pj = bj + diag_offset[row] + 1; 629 multiplier = *pc * *pv++; 630 *pc = multiplier; 631 nz = bi[row + 1] - diag_offset[row] - 1; 632 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 633 PetscCall(PetscLogFlops(1.0 + 2.0 * nz)); 634 } 635 row = *ajtmp++; 636 } 637 /* finished row so stick it into b->a */ 638 pv = ba + bi[i]; 639 pj = bj + bi[i]; 640 nz = bi[i + 1] - bi[i]; 641 for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]]; 642 diag = diag_offset[i] - bi[i]; 643 /* check pivot entry for current row */ 644 PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 645 pv[diag] = 1.0 / pv[diag]; 646 } 647 648 PetscCall(PetscFree(rtmp)); 649 PetscCall(ISRestoreIndices(isicol, &ic)); 650 PetscCall(ISRestoreIndices(isrow, &r)); 651 PetscCall(ISIdentity(isrow, &row_identity)); 652 PetscCall(ISIdentity(isicol, &col_identity)); 653 if (row_identity && col_identity) { 654 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace; 655 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace; 656 } else { 657 C->ops->solve = MatSolve_SeqBAIJ_1_inplace; 658 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace; 659 } 660 C->assembled = PETSC_TRUE; 661 PetscCall(PetscLogFlops(C->cmap->n)); 662 PetscFunctionReturn(0); 663 } 664 665 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) { 666 PetscFunctionBegin; 667 *type = MATSOLVERPETSC; 668 PetscFunctionReturn(0); 669 } 670 671 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) { 672 PetscInt n = A->rmap->n; 673 674 PetscFunctionBegin; 675 #if defined(PETSC_USE_COMPLEX) 676 PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported"); 677 #endif 678 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 679 PetscCall(MatSetSizes(*B, n, n, n, n)); 680 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 681 PetscCall(MatSetType(*B, MATSEQBAIJ)); 682 683 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 684 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 685 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 686 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 687 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 688 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 689 PetscCall(MatSetType(*B, MATSEQSBAIJ)); 690 PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL)); 691 692 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 693 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 694 /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */ 695 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 696 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 697 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported"); 698 (*B)->factortype = ftype; 699 (*B)->canuseordering = PETSC_TRUE; 700 701 PetscCall(PetscFree((*B)->solvertype)); 702 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype)); 703 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc)); 704 PetscFunctionReturn(0); 705 } 706 707 /* ----------------------------------------------------------- */ 708 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) { 709 Mat C; 710 711 PetscFunctionBegin; 712 PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C)); 713 PetscCall(MatLUFactorSymbolic(C, A, row, col, info)); 714 PetscCall(MatLUFactorNumeric(C, A, info)); 715 716 A->ops->solve = C->ops->solve; 717 A->ops->solvetranspose = C->ops->solvetranspose; 718 719 PetscCall(MatHeaderMerge(A, &C)); 720 PetscFunctionReturn(0); 721 } 722 723 #include <../src/mat/impls/sbaij/seq/sbaij.h> 724 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) { 725 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 726 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 727 IS ip = b->row; 728 const PetscInt *rip; 729 PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol; 730 PetscInt *ai = a->i, *aj = a->j; 731 PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz; 732 MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi; 733 PetscReal rs; 734 FactorShiftCtx sctx; 735 736 PetscFunctionBegin; 737 if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */ 738 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 739 PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info)); 740 PetscCall(MatDestroy(&a->sbaijMat)); 741 PetscFunctionReturn(0); 742 } 743 744 /* MatPivotSetUp(): initialize shift context sctx */ 745 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 746 747 PetscCall(ISGetIndices(ip, &rip)); 748 PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl)); 749 750 sctx.shift_amount = 0.; 751 sctx.nshift = 0; 752 do { 753 sctx.newshift = PETSC_FALSE; 754 for (i = 0; i < mbs; i++) { 755 rtmp[i] = 0.0; 756 jl[i] = mbs; 757 il[0] = 0; 758 } 759 760 for (k = 0; k < mbs; k++) { 761 bval = ba + bi[k]; 762 /* initialize k-th row by the perm[k]-th row of A */ 763 jmin = ai[rip[k]]; 764 jmax = ai[rip[k] + 1]; 765 for (j = jmin; j < jmax; j++) { 766 col = rip[aj[j]]; 767 if (col >= k) { /* only take upper triangular entry */ 768 rtmp[col] = aa[j]; 769 *bval++ = 0.0; /* for in-place factorization */ 770 } 771 } 772 773 /* shift the diagonal of the matrix */ 774 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 775 776 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 777 dk = rtmp[k]; 778 i = jl[k]; /* first row to be added to k_th row */ 779 780 while (i < k) { 781 nexti = jl[i]; /* next row to be added to k_th row */ 782 783 /* compute multiplier, update diag(k) and U(i,k) */ 784 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 785 uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */ 786 dk += uikdi * ba[ili]; 787 ba[ili] = uikdi; /* -U(i,k) */ 788 789 /* add multiple of row i to k-th row */ 790 jmin = ili + 1; 791 jmax = bi[i + 1]; 792 if (jmin < jmax) { 793 for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j]; 794 /* update il and jl for row i */ 795 il[i] = jmin; 796 j = bj[jmin]; 797 jl[i] = jl[j]; 798 jl[j] = i; 799 } 800 i = nexti; 801 } 802 803 /* shift the diagonals when zero pivot is detected */ 804 /* compute rs=sum of abs(off-diagonal) */ 805 rs = 0.0; 806 jmin = bi[k] + 1; 807 nz = bi[k + 1] - jmin; 808 if (nz) { 809 bcol = bj + jmin; 810 while (nz--) { 811 rs += PetscAbsScalar(rtmp[*bcol]); 812 bcol++; 813 } 814 } 815 816 sctx.rs = rs; 817 sctx.pv = dk; 818 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 819 if (sctx.newshift) break; 820 dk = sctx.pv; 821 822 /* copy data into U(k,:) */ 823 ba[bi[k]] = 1.0 / dk; /* U(k,k) */ 824 jmin = bi[k] + 1; 825 jmax = bi[k + 1]; 826 if (jmin < jmax) { 827 for (j = jmin; j < jmax; j++) { 828 col = bj[j]; 829 ba[j] = rtmp[col]; 830 rtmp[col] = 0.0; 831 } 832 /* add the k-th row into il and jl */ 833 il[k] = jmin; 834 i = bj[jmin]; 835 jl[k] = jl[i]; 836 jl[i] = k; 837 } 838 } 839 } while (sctx.newshift); 840 PetscCall(PetscFree3(rtmp, il, jl)); 841 842 PetscCall(ISRestoreIndices(ip, &rip)); 843 844 C->assembled = PETSC_TRUE; 845 C->preallocated = PETSC_TRUE; 846 847 PetscCall(PetscLogFlops(C->rmap->N)); 848 if (sctx.nshift) { 849 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 850 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 851 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 852 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 853 } 854 } 855 PetscFunctionReturn(0); 856 } 857 858 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) { 859 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 860 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 861 PetscInt i, j, am = a->mbs; 862 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 863 PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz; 864 MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval; 865 PetscReal rs; 866 FactorShiftCtx sctx; 867 868 PetscFunctionBegin; 869 /* MatPivotSetUp(): initialize shift context sctx */ 870 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 871 872 PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl)); 873 874 do { 875 sctx.newshift = PETSC_FALSE; 876 for (i = 0; i < am; i++) { 877 rtmp[i] = 0.0; 878 jl[i] = am; 879 il[0] = 0; 880 } 881 882 for (k = 0; k < am; k++) { 883 /* initialize k-th row with elements nonzero in row perm(k) of A */ 884 nz = ai[k + 1] - ai[k]; 885 acol = aj + ai[k]; 886 aval = aa + ai[k]; 887 bval = ba + bi[k]; 888 while (nz--) { 889 if (*acol < k) { /* skip lower triangular entries */ 890 acol++; 891 aval++; 892 } else { 893 rtmp[*acol++] = *aval++; 894 *bval++ = 0.0; /* for in-place factorization */ 895 } 896 } 897 898 /* shift the diagonal of the matrix */ 899 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 900 901 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 902 dk = rtmp[k]; 903 i = jl[k]; /* first row to be added to k_th row */ 904 905 while (i < k) { 906 nexti = jl[i]; /* next row to be added to k_th row */ 907 /* compute multiplier, update D(k) and U(i,k) */ 908 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 909 uikdi = -ba[ili] * ba[bi[i]]; 910 dk += uikdi * ba[ili]; 911 ba[ili] = uikdi; /* -U(i,k) */ 912 913 /* add multiple of row i to k-th row ... */ 914 jmin = ili + 1; 915 nz = bi[i + 1] - jmin; 916 if (nz > 0) { 917 bcol = bj + jmin; 918 bval = ba + jmin; 919 while (nz--) rtmp[*bcol++] += uikdi * (*bval++); 920 /* update il and jl for i-th row */ 921 il[i] = jmin; 922 j = bj[jmin]; 923 jl[i] = jl[j]; 924 jl[j] = i; 925 } 926 i = nexti; 927 } 928 929 /* shift the diagonals when zero pivot is detected */ 930 /* compute rs=sum of abs(off-diagonal) */ 931 rs = 0.0; 932 jmin = bi[k] + 1; 933 nz = bi[k + 1] - jmin; 934 if (nz) { 935 bcol = bj + jmin; 936 while (nz--) { 937 rs += PetscAbsScalar(rtmp[*bcol]); 938 bcol++; 939 } 940 } 941 942 sctx.rs = rs; 943 sctx.pv = dk; 944 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 945 if (sctx.newshift) break; /* sctx.shift_amount is updated */ 946 dk = sctx.pv; 947 948 /* copy data into U(k,:) */ 949 ba[bi[k]] = 1.0 / dk; 950 jmin = bi[k] + 1; 951 nz = bi[k + 1] - jmin; 952 if (nz) { 953 bcol = bj + jmin; 954 bval = ba + jmin; 955 while (nz--) { 956 *bval++ = rtmp[*bcol]; 957 rtmp[*bcol++] = 0.0; 958 } 959 /* add k-th row into il and jl */ 960 il[k] = jmin; 961 i = bj[jmin]; 962 jl[k] = jl[i]; 963 jl[i] = k; 964 } 965 } 966 } while (sctx.newshift); 967 PetscCall(PetscFree3(rtmp, il, jl)); 968 969 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 970 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 971 C->assembled = PETSC_TRUE; 972 C->preallocated = PETSC_TRUE; 973 974 PetscCall(PetscLogFlops(C->rmap->N)); 975 if (sctx.nshift) { 976 if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 977 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 978 } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 979 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 980 } 981 } 982 PetscFunctionReturn(0); 983 } 984 985 #include <petscbt.h> 986 #include <../src/mat/utils/freespace.h> 987 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) { 988 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 989 Mat_SeqSBAIJ *b; 990 Mat B; 991 PetscBool perm_identity, missing; 992 PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui; 993 const PetscInt *rip; 994 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 995 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 996 PetscReal fill = info->fill, levels = info->levels; 997 PetscFreeSpaceList free_space = NULL, current_space = NULL; 998 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 999 PetscBT lnkbt; 1000 1001 PetscFunctionBegin; 1002 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1003 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1004 1005 if (bs > 1) { 1006 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1007 (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1008 1009 PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info)); 1010 PetscFunctionReturn(0); 1011 } 1012 1013 PetscCall(ISIdentity(perm, &perm_identity)); 1014 PetscCall(ISGetIndices(perm, &rip)); 1015 1016 /* special case that simply copies fill pattern */ 1017 if (!levels && perm_identity) { 1018 PetscCall(PetscMalloc1(am + 1, &ui)); 1019 for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */ 1020 B = fact; 1021 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui)); 1022 1023 b = (Mat_SeqSBAIJ *)B->data; 1024 uj = b->j; 1025 for (i = 0; i < am; i++) { 1026 aj = a->j + a->diag[i]; 1027 for (j = 0; j < ui[i]; j++) *uj++ = *aj++; 1028 b->ilen[i] = ui[i]; 1029 } 1030 PetscCall(PetscFree(ui)); 1031 1032 B->factortype = MAT_FACTOR_NONE; 1033 1034 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1035 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1036 B->factortype = MAT_FACTOR_ICC; 1037 1038 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1039 PetscFunctionReturn(0); 1040 } 1041 1042 /* initialization */ 1043 PetscCall(PetscMalloc1(am + 1, &ui)); 1044 ui[0] = 0; 1045 PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl)); 1046 1047 /* jl: linked list for storing indices of the pivot rows 1048 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 1049 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 1050 for (i = 0; i < am; i++) { 1051 jl[i] = am; 1052 il[i] = 0; 1053 } 1054 1055 /* create and initialize a linked list for storing column indices of the active row k */ 1056 nlnk = am + 1; 1057 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 1058 1059 /* initial FreeSpace size is fill*(ai[am]+am)/2 */ 1060 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space)); 1061 1062 current_space = free_space; 1063 1064 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl)); 1065 current_space_lvl = free_space_lvl; 1066 1067 for (k = 0; k < am; k++) { /* for each active row k */ 1068 /* initialize lnk by the column indices of row rip[k] of A */ 1069 nzk = 0; 1070 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1071 ncols_upper = 0; 1072 cols = cols_lvl + am; 1073 for (j = 0; j < ncols; j++) { 1074 i = rip[*(aj + ai[rip[k]] + j)]; 1075 if (i >= k) { /* only take upper triangular entry */ 1076 cols[ncols_upper] = i; 1077 cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 1078 ncols_upper++; 1079 } 1080 } 1081 PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1082 nzk += nlnk; 1083 1084 /* update lnk by computing fill-in for each pivot row to be merged in */ 1085 prow = jl[k]; /* 1st pivot row */ 1086 1087 while (prow < k) { 1088 nextprow = jl[prow]; 1089 1090 /* merge prow into k-th row */ 1091 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 1092 jmax = ui[prow + 1]; 1093 ncols = jmax - jmin; 1094 i = jmin - ui[prow]; 1095 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 1096 for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 1097 PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1098 nzk += nlnk; 1099 1100 /* update il and jl for prow */ 1101 if (jmin < jmax) { 1102 il[prow] = jmin; 1103 1104 j = *cols; 1105 jl[prow] = jl[j]; 1106 jl[j] = prow; 1107 } 1108 prow = nextprow; 1109 } 1110 1111 /* if free space is not available, make more free space */ 1112 if (current_space->local_remaining < nzk) { 1113 i = am - k + 1; /* num of unfactored rows */ 1114 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1115 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1116 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 1117 reallocs++; 1118 } 1119 1120 /* copy data into free_space and free_space_lvl, then initialize lnk */ 1121 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 1122 1123 /* add the k-th row into il and jl */ 1124 if (nzk - 1 > 0) { 1125 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 1126 jl[k] = jl[i]; 1127 jl[i] = k; 1128 il[k] = ui[k] + 1; 1129 } 1130 uj_ptr[k] = current_space->array; 1131 uj_lvl_ptr[k] = current_space_lvl->array; 1132 1133 current_space->array += nzk; 1134 current_space->local_used += nzk; 1135 current_space->local_remaining -= nzk; 1136 1137 current_space_lvl->array += nzk; 1138 current_space_lvl->local_used += nzk; 1139 current_space_lvl->local_remaining -= nzk; 1140 1141 ui[k + 1] = ui[k] + nzk; 1142 } 1143 1144 PetscCall(ISRestoreIndices(perm, &rip)); 1145 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 1146 PetscCall(PetscFree(cols_lvl)); 1147 1148 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1149 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 1150 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1151 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 1152 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 1153 1154 /* put together the new matrix in MATSEQSBAIJ format */ 1155 B = fact; 1156 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 1157 1158 b = (Mat_SeqSBAIJ *)B->data; 1159 b->singlemalloc = PETSC_FALSE; 1160 b->free_a = PETSC_TRUE; 1161 b->free_ij = PETSC_TRUE; 1162 1163 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 1164 1165 b->j = uj; 1166 b->i = ui; 1167 b->diag = NULL; 1168 b->ilen = NULL; 1169 b->imax = NULL; 1170 b->row = perm; 1171 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1172 1173 PetscCall(PetscObjectReference((PetscObject)perm)); 1174 1175 b->icol = perm; 1176 1177 PetscCall(PetscObjectReference((PetscObject)perm)); 1178 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 1179 1180 b->maxnz = b->nz = ui[am]; 1181 1182 B->info.factor_mallocs = reallocs; 1183 B->info.fill_ratio_given = fill; 1184 if (ai[am] != 0.) { 1185 /* nonzeros in lower triangular part of A (includign diagonals)= (ai[am]+am)/2 */ 1186 B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am); 1187 } else { 1188 B->info.fill_ratio_needed = 0.0; 1189 } 1190 #if defined(PETSC_USE_INFO) 1191 if (ai[am] != 0) { 1192 PetscReal af = B->info.fill_ratio_needed; 1193 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1194 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1195 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1196 } else { 1197 PetscCall(PetscInfo(A, "Empty matrix\n")); 1198 } 1199 #endif 1200 if (perm_identity) { 1201 B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1202 B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1203 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1204 } else { 1205 (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1206 } 1207 PetscFunctionReturn(0); 1208 } 1209 1210 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) { 1211 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1212 Mat_SeqSBAIJ *b; 1213 Mat B; 1214 PetscBool perm_identity, missing; 1215 PetscReal fill = info->fill; 1216 const PetscInt *rip; 1217 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow; 1218 PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow; 1219 PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr; 1220 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1221 PetscBT lnkbt; 1222 1223 PetscFunctionBegin; 1224 if (bs > 1) { /* convert to seqsbaij */ 1225 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1226 (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1227 1228 PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info)); 1229 PetscFunctionReturn(0); 1230 } 1231 1232 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1233 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1234 1235 /* check whether perm is the identity mapping */ 1236 PetscCall(ISIdentity(perm, &perm_identity)); 1237 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported"); 1238 PetscCall(ISGetIndices(perm, &rip)); 1239 1240 /* initialization */ 1241 PetscCall(PetscMalloc1(mbs + 1, &ui)); 1242 ui[0] = 0; 1243 1244 /* jl: linked list for storing indices of the pivot rows 1245 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 1246 PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols)); 1247 for (i = 0; i < mbs; i++) { 1248 jl[i] = mbs; 1249 il[i] = 0; 1250 } 1251 1252 /* create and initialize a linked list for storing column indices of the active row k */ 1253 nlnk = mbs + 1; 1254 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1255 1256 /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */ 1257 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space)); 1258 1259 current_space = free_space; 1260 1261 for (k = 0; k < mbs; k++) { /* for each active row k */ 1262 /* initialize lnk by the column indices of row rip[k] of A */ 1263 nzk = 0; 1264 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1265 ncols_upper = 0; 1266 for (j = 0; j < ncols; j++) { 1267 i = rip[*(aj + ai[rip[k]] + j)]; 1268 if (i >= k) { /* only take upper triangular entry */ 1269 cols[ncols_upper] = i; 1270 ncols_upper++; 1271 } 1272 } 1273 PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt)); 1274 nzk += nlnk; 1275 1276 /* update lnk by computing fill-in for each pivot row to be merged in */ 1277 prow = jl[k]; /* 1st pivot row */ 1278 1279 while (prow < k) { 1280 nextprow = jl[prow]; 1281 /* merge prow into k-th row */ 1282 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 1283 jmax = ui[prow + 1]; 1284 ncols = jmax - jmin; 1285 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 1286 PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt)); 1287 nzk += nlnk; 1288 1289 /* update il and jl for prow */ 1290 if (jmin < jmax) { 1291 il[prow] = jmin; 1292 j = *uj_ptr; 1293 jl[prow] = jl[j]; 1294 jl[j] = prow; 1295 } 1296 prow = nextprow; 1297 } 1298 1299 /* if free space is not available, make more free space */ 1300 if (current_space->local_remaining < nzk) { 1301 i = mbs - k + 1; /* num of unfactored rows */ 1302 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1303 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1304 reallocs++; 1305 } 1306 1307 /* copy data into free space, then initialize lnk */ 1308 PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt)); 1309 1310 /* add the k-th row into il and jl */ 1311 if (nzk - 1 > 0) { 1312 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 1313 jl[k] = jl[i]; 1314 jl[i] = k; 1315 il[k] = ui[k] + 1; 1316 } 1317 ui_ptr[k] = current_space->array; 1318 current_space->array += nzk; 1319 current_space->local_used += nzk; 1320 current_space->local_remaining -= nzk; 1321 1322 ui[k + 1] = ui[k] + nzk; 1323 } 1324 1325 PetscCall(ISRestoreIndices(perm, &rip)); 1326 PetscCall(PetscFree4(ui_ptr, il, jl, cols)); 1327 1328 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1329 PetscCall(PetscMalloc1(ui[mbs] + 1, &uj)); 1330 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1331 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1332 1333 /* put together the new matrix in MATSEQSBAIJ format */ 1334 B = fact; 1335 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1336 1337 b = (Mat_SeqSBAIJ *)B->data; 1338 b->singlemalloc = PETSC_FALSE; 1339 b->free_a = PETSC_TRUE; 1340 b->free_ij = PETSC_TRUE; 1341 1342 PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a)); 1343 1344 b->j = uj; 1345 b->i = ui; 1346 b->diag = NULL; 1347 b->ilen = NULL; 1348 b->imax = NULL; 1349 b->row = perm; 1350 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1351 1352 PetscCall(PetscObjectReference((PetscObject)perm)); 1353 b->icol = perm; 1354 PetscCall(PetscObjectReference((PetscObject)perm)); 1355 PetscCall(PetscMalloc1(mbs + 1, &b->solve_work)); 1356 b->maxnz = b->nz = ui[mbs]; 1357 1358 B->info.factor_mallocs = reallocs; 1359 B->info.fill_ratio_given = fill; 1360 if (ai[mbs] != 0.) { 1361 /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */ 1362 B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs); 1363 } else { 1364 B->info.fill_ratio_needed = 0.0; 1365 } 1366 #if defined(PETSC_USE_INFO) 1367 if (ai[mbs] != 0.) { 1368 PetscReal af = B->info.fill_ratio_needed; 1369 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1370 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1371 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1372 } else { 1373 PetscCall(PetscInfo(A, "Empty matrix\n")); 1374 } 1375 #endif 1376 if (perm_identity) { 1377 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1378 } else { 1379 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1380 } 1381 PetscFunctionReturn(0); 1382 } 1383 1384 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) { 1385 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1386 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1387 PetscInt i, k, n = a->mbs; 1388 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1389 const MatScalar *aa = a->a, *v; 1390 PetscScalar *x, *s, *t, *ls; 1391 const PetscScalar *b; 1392 1393 PetscFunctionBegin; 1394 PetscCall(VecGetArrayRead(bb, &b)); 1395 PetscCall(VecGetArray(xx, &x)); 1396 t = a->solve_work; 1397 1398 /* forward solve the lower triangular */ 1399 PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */ 1400 1401 for (i = 1; i < n; i++) { 1402 v = aa + bs2 * ai[i]; 1403 vi = aj + ai[i]; 1404 nz = ai[i + 1] - ai[i]; 1405 s = t + bs * i; 1406 PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */ 1407 for (k = 0; k < nz; k++) { 1408 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]); 1409 v += bs2; 1410 } 1411 } 1412 1413 /* backward solve the upper triangular */ 1414 ls = a->solve_work + A->cmap->n; 1415 for (i = n - 1; i >= 0; i--) { 1416 v = aa + bs2 * (adiag[i + 1] + 1); 1417 vi = aj + adiag[i + 1] + 1; 1418 nz = adiag[i] - adiag[i + 1] - 1; 1419 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1420 for (k = 0; k < nz; k++) { 1421 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]); 1422 v += bs2; 1423 } 1424 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */ 1425 PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs)); 1426 } 1427 1428 PetscCall(VecRestoreArrayRead(bb, &b)); 1429 PetscCall(VecRestoreArray(xx, &x)); 1430 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1431 PetscFunctionReturn(0); 1432 } 1433 1434 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) { 1435 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1436 IS iscol = a->col, isrow = a->row; 1437 const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1438 PetscInt i, m, n = a->mbs; 1439 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1440 const MatScalar *aa = a->a, *v; 1441 PetscScalar *x, *s, *t, *ls; 1442 const PetscScalar *b; 1443 1444 PetscFunctionBegin; 1445 PetscCall(VecGetArrayRead(bb, &b)); 1446 PetscCall(VecGetArray(xx, &x)); 1447 t = a->solve_work; 1448 1449 PetscCall(ISGetIndices(isrow, &rout)); 1450 r = rout; 1451 PetscCall(ISGetIndices(iscol, &cout)); 1452 c = cout; 1453 1454 /* forward solve the lower triangular */ 1455 PetscCall(PetscArraycpy(t, b + bs * r[0], bs)); 1456 for (i = 1; i < n; i++) { 1457 v = aa + bs2 * ai[i]; 1458 vi = aj + ai[i]; 1459 nz = ai[i + 1] - ai[i]; 1460 s = t + bs * i; 1461 PetscCall(PetscArraycpy(s, b + bs * r[i], bs)); 1462 for (m = 0; m < nz; m++) { 1463 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]); 1464 v += bs2; 1465 } 1466 } 1467 1468 /* backward solve the upper triangular */ 1469 ls = a->solve_work + A->cmap->n; 1470 for (i = n - 1; i >= 0; i--) { 1471 v = aa + bs2 * (adiag[i + 1] + 1); 1472 vi = aj + adiag[i + 1] + 1; 1473 nz = adiag[i] - adiag[i + 1] - 1; 1474 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1475 for (m = 0; m < nz; m++) { 1476 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]); 1477 v += bs2; 1478 } 1479 PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */ 1480 PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs)); 1481 } 1482 PetscCall(ISRestoreIndices(isrow, &rout)); 1483 PetscCall(ISRestoreIndices(iscol, &cout)); 1484 PetscCall(VecRestoreArrayRead(bb, &b)); 1485 PetscCall(VecRestoreArray(xx, &x)); 1486 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1487 PetscFunctionReturn(0); 1488 } 1489 1490 /* 1491 For each block in an block array saves the largest absolute value in the block into another array 1492 */ 1493 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray) { 1494 PetscInt i, j; 1495 1496 PetscFunctionBegin; 1497 PetscCall(PetscArrayzero(absarray, nbs + 1)); 1498 for (i = 0; i < nbs; i++) { 1499 for (j = 0; j < bs2; j++) { 1500 if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]); 1501 } 1502 } 1503 PetscFunctionReturn(0); 1504 } 1505 1506 /* 1507 This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used 1508 */ 1509 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact) { 1510 Mat B = *fact; 1511 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 1512 IS isicol; 1513 const PetscInt *r, *ic; 1514 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag; 1515 PetscInt *bi, *bj, *bdiag; 1516 1517 PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au; 1518 PetscInt nlnk, *lnk; 1519 PetscBT lnkbt; 1520 PetscBool row_identity, icol_identity; 1521 MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp; 1522 PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp; 1523 1524 PetscReal dt = info->dt; /* shift=info->shiftamount; */ 1525 PetscInt nnz_max; 1526 PetscBool missing; 1527 PetscReal *vtmp_abs; 1528 MatScalar *v_work; 1529 PetscInt *v_pivots; 1530 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1531 1532 PetscFunctionBegin; 1533 /* ------- symbolic factorization, can be reused ---------*/ 1534 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1535 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1536 adiag = a->diag; 1537 1538 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 1539 1540 /* bdiag is location of diagonal in factor */ 1541 PetscCall(PetscMalloc1(mbs + 1, &bdiag)); 1542 1543 /* allocate row pointers bi */ 1544 PetscCall(PetscMalloc1(2 * mbs + 2, &bi)); 1545 1546 /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */ 1547 dtcount = (PetscInt)info->dtcount; 1548 if (dtcount > mbs - 1) dtcount = mbs - 1; 1549 nnz_max = ai[mbs] + 2 * mbs * dtcount + 2; 1550 /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */ 1551 PetscCall(PetscMalloc1(nnz_max, &bj)); 1552 nnz_max = nnz_max * bs2; 1553 PetscCall(PetscMalloc1(nnz_max, &ba)); 1554 1555 /* put together the new matrix */ 1556 PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1557 1558 b = (Mat_SeqBAIJ *)(B)->data; 1559 b->free_a = PETSC_TRUE; 1560 b->free_ij = PETSC_TRUE; 1561 b->singlemalloc = PETSC_FALSE; 1562 1563 b->a = ba; 1564 b->j = bj; 1565 b->i = bi; 1566 b->diag = bdiag; 1567 b->ilen = NULL; 1568 b->imax = NULL; 1569 b->row = isrow; 1570 b->col = iscol; 1571 1572 PetscCall(PetscObjectReference((PetscObject)isrow)); 1573 PetscCall(PetscObjectReference((PetscObject)iscol)); 1574 1575 b->icol = isicol; 1576 PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work)); 1577 b->maxnz = nnz_max / bs2; 1578 1579 (B)->factortype = MAT_FACTOR_ILUDT; 1580 (B)->info.factor_mallocs = 0; 1581 (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2)); 1582 /* ------- end of symbolic factorization ---------*/ 1583 PetscCall(ISGetIndices(isrow, &r)); 1584 PetscCall(ISGetIndices(isicol, &ic)); 1585 1586 /* linked list for storing column indices of the active row */ 1587 nlnk = mbs + 1; 1588 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1589 1590 /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */ 1591 PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp)); 1592 /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */ 1593 PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp)); 1594 PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs)); 1595 PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots)); 1596 1597 allowzeropivot = PetscNot(A->erroriffailure); 1598 bi[0] = 0; 1599 bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */ 1600 bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */ 1601 for (i = 0; i < mbs; i++) { 1602 /* copy initial fill into linked list */ 1603 nzi = ai[r[i] + 1] - ai[r[i]]; 1604 PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 1605 nzi_al = adiag[r[i]] - ai[r[i]]; 1606 nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1; 1607 1608 /* load in initial unfactored row */ 1609 ajtmp = aj + ai[r[i]]; 1610 PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt)); 1611 PetscCall(PetscArrayzero(rtmp, mbs * bs2)); 1612 aatmp = a->a + bs2 * ai[r[i]]; 1613 for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2)); 1614 1615 /* add pivot rows into linked list */ 1616 row = lnk[mbs]; 1617 while (row < i) { 1618 nzi_bl = bi[row + 1] - bi[row] + 1; 1619 bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */ 1620 PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im)); 1621 nzi += nlnk; 1622 row = lnk[row]; 1623 } 1624 1625 /* copy data from lnk into jtmp, then initialize lnk */ 1626 PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt)); 1627 1628 /* numerical factorization */ 1629 bjtmp = jtmp; 1630 row = *bjtmp++; /* 1st pivot row */ 1631 1632 while (row < i) { 1633 pc = rtmp + bs2 * row; 1634 pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */ 1635 PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */ 1636 PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs)); 1637 if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */ 1638 pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */ 1639 pv = ba + bs2 * (bdiag[row + 1] + 1); 1640 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */ 1641 for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 1642 /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */ 1643 } 1644 row = *bjtmp++; 1645 } 1646 1647 /* copy sparse rtmp into contiguous vtmp; separate L and U part */ 1648 nzi_bl = 0; 1649 j = 0; 1650 while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */ 1651 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1652 nzi_bl++; 1653 j++; 1654 } 1655 nzi_bu = nzi - nzi_bl - 1; 1656 1657 while (j < nzi) { /* U-part */ 1658 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1659 j++; 1660 } 1661 1662 PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs)); 1663 1664 bjtmp = bj + bi[i]; 1665 batmp = ba + bs2 * bi[i]; 1666 /* apply level dropping rule to L part */ 1667 ncut = nzi_al + dtcount; 1668 if (ncut < nzi_bl) { 1669 PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp)); 1670 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp)); 1671 } else { 1672 ncut = nzi_bl; 1673 } 1674 for (j = 0; j < ncut; j++) { 1675 bjtmp[j] = jtmp[j]; 1676 PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2)); 1677 } 1678 bi[i + 1] = bi[i] + ncut; 1679 nzi = ncut + 1; 1680 1681 /* apply level dropping rule to U part */ 1682 ncut = nzi_au + dtcount; 1683 if (ncut < nzi_bu) { 1684 PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1)); 1685 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1)); 1686 } else { 1687 ncut = nzi_bu; 1688 } 1689 nzi += ncut; 1690 1691 /* mark bdiagonal */ 1692 bdiag[i + 1] = bdiag[i] - (ncut + 1); 1693 bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1); 1694 1695 bjtmp = bj + bdiag[i]; 1696 batmp = ba + bs2 * bdiag[i]; 1697 PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2)); 1698 *bjtmp = i; 1699 1700 bjtmp = bj + bdiag[i + 1] + 1; 1701 batmp = ba + (bdiag[i + 1] + 1) * bs2; 1702 1703 for (k = 0; k < ncut; k++) { 1704 bjtmp[k] = jtmp[nzi_bl + 1 + k]; 1705 PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2)); 1706 } 1707 1708 im[i] = nzi; /* used by PetscLLAddSortedLU() */ 1709 1710 /* invert diagonal block for simpler triangular solves - add shift??? */ 1711 batmp = ba + bs2 * bdiag[i]; 1712 1713 PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 1714 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1715 } /* for (i=0; i<mbs; i++) */ 1716 PetscCall(PetscFree3(v_work, multiplier, v_pivots)); 1717 1718 /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */ 1719 PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]); 1720 1721 PetscCall(ISRestoreIndices(isrow, &r)); 1722 PetscCall(ISRestoreIndices(isicol, &ic)); 1723 1724 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1725 1726 PetscCall(PetscFree2(im, jtmp)); 1727 PetscCall(PetscFree2(rtmp, vtmp)); 1728 1729 PetscCall(PetscLogFlops(bs2 * B->cmap->n)); 1730 b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs]; 1731 1732 PetscCall(ISIdentity(isrow, &row_identity)); 1733 PetscCall(ISIdentity(isicol, &icol_identity)); 1734 if (row_identity && icol_identity) { 1735 B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 1736 } else { 1737 B->ops->solve = MatSolve_SeqBAIJ_N; 1738 } 1739 1740 B->ops->solveadd = NULL; 1741 B->ops->solvetranspose = NULL; 1742 B->ops->solvetransposeadd = NULL; 1743 B->ops->matsolve = NULL; 1744 B->assembled = PETSC_TRUE; 1745 B->preallocated = PETSC_TRUE; 1746 PetscFunctionReturn(0); 1747 } 1748