1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 /* ------------------------------------------------------------*/ 9 /* 10 Version for when blocks are 5 by 5 11 */ 12 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_inplace(Mat C, Mat A, const MatFactorInfo *info) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 15 IS isrow = b->row, isicol = b->icol; 16 const PetscInt *r, *ic, *bi = b->i, *bj = b->j, *ajtmpold, *ajtmp; 17 PetscInt i, j, n = a->mbs, nz, row, idx, ipvt[5]; 18 const PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 19 MatScalar *w, *pv, *rtmp, *x, *pc; 20 const MatScalar *v, *aa = a->a; 21 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 22 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 23 MatScalar x17, x18, x19, x20, x21, x22, x23, x24, x25, p10, p11, p12, p13, p14; 24 MatScalar p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25, m10, m11, m12; 25 MatScalar m13, m14, m15, m16, m17, m18, m19, m20, m21, m22, m23, m24, m25; 26 MatScalar *ba = b->a, work[25]; 27 PetscReal shift = info->shiftamount; 28 PetscBool allowzeropivot, zeropivotdetected; 29 30 PetscFunctionBegin; 31 allowzeropivot = PetscNot(A->erroriffailure); 32 PetscCall(ISGetIndices(isrow, &r)); 33 PetscCall(ISGetIndices(isicol, &ic)); 34 PetscCall(PetscMalloc1(25 * (n + 1), &rtmp)); 35 36 #define PETSC_USE_MEMZERO 1 37 #define PETSC_USE_MEMCPY 1 38 39 for (i = 0; i < n; i++) { 40 nz = bi[i + 1] - bi[i]; 41 ajtmp = bj + bi[i]; 42 for (j = 0; j < nz; j++) { 43 #if defined(PETSC_USE_MEMZERO) 44 PetscCall(PetscArrayzero(rtmp + 25 * ajtmp[j], 25)); 45 #else 46 x = rtmp + 25 * ajtmp[j]; 47 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 48 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 49 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0; 50 #endif 51 } 52 /* load in initial (unfactored row) */ 53 idx = r[i]; 54 nz = ai[idx + 1] - ai[idx]; 55 ajtmpold = aj + ai[idx]; 56 v = aa + 25 * ai[idx]; 57 for (j = 0; j < nz; j++) { 58 #if defined(PETSC_USE_MEMCPY) 59 PetscCall(PetscArraycpy(rtmp + 25 * ic[ajtmpold[j]], v, 25)); 60 #else 61 x = rtmp + 25 * ic[ajtmpold[j]]; 62 x[0] = v[0]; 63 x[1] = v[1]; 64 x[2] = v[2]; 65 x[3] = v[3]; 66 x[4] = v[4]; 67 x[5] = v[5]; 68 x[6] = v[6]; 69 x[7] = v[7]; 70 x[8] = v[8]; 71 x[9] = v[9]; 72 x[10] = v[10]; 73 x[11] = v[11]; 74 x[12] = v[12]; 75 x[13] = v[13]; 76 x[14] = v[14]; 77 x[15] = v[15]; 78 x[16] = v[16]; 79 x[17] = v[17]; 80 x[18] = v[18]; 81 x[19] = v[19]; 82 x[20] = v[20]; 83 x[21] = v[21]; 84 x[22] = v[22]; 85 x[23] = v[23]; 86 x[24] = v[24]; 87 #endif 88 v += 25; 89 } 90 row = *ajtmp++; 91 while (row < i) { 92 pc = rtmp + 25 * row; 93 p1 = pc[0]; 94 p2 = pc[1]; 95 p3 = pc[2]; 96 p4 = pc[3]; 97 p5 = pc[4]; 98 p6 = pc[5]; 99 p7 = pc[6]; 100 p8 = pc[7]; 101 p9 = pc[8]; 102 p10 = pc[9]; 103 p11 = pc[10]; 104 p12 = pc[11]; 105 p13 = pc[12]; 106 p14 = pc[13]; 107 p15 = pc[14]; 108 p16 = pc[15]; 109 p17 = pc[16]; 110 p18 = pc[17]; 111 p19 = pc[18]; 112 p20 = pc[19]; 113 p21 = pc[20]; 114 p22 = pc[21]; 115 p23 = pc[22]; 116 p24 = pc[23]; 117 p25 = pc[24]; 118 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) { 119 pv = ba + 25 * diag_offset[row]; 120 pj = bj + diag_offset[row] + 1; 121 x1 = pv[0]; 122 x2 = pv[1]; 123 x3 = pv[2]; 124 x4 = pv[3]; 125 x5 = pv[4]; 126 x6 = pv[5]; 127 x7 = pv[6]; 128 x8 = pv[7]; 129 x9 = pv[8]; 130 x10 = pv[9]; 131 x11 = pv[10]; 132 x12 = pv[11]; 133 x13 = pv[12]; 134 x14 = pv[13]; 135 x15 = pv[14]; 136 x16 = pv[15]; 137 x17 = pv[16]; 138 x18 = pv[17]; 139 x19 = pv[18]; 140 x20 = pv[19]; 141 x21 = pv[20]; 142 x22 = pv[21]; 143 x23 = pv[22]; 144 x24 = pv[23]; 145 x25 = pv[24]; 146 pc[0] = m1 = p1 * x1 + p6 * x2 + p11 * x3 + p16 * x4 + p21 * x5; 147 pc[1] = m2 = p2 * x1 + p7 * x2 + p12 * x3 + p17 * x4 + p22 * x5; 148 pc[2] = m3 = p3 * x1 + p8 * x2 + p13 * x3 + p18 * x4 + p23 * x5; 149 pc[3] = m4 = p4 * x1 + p9 * x2 + p14 * x3 + p19 * x4 + p24 * x5; 150 pc[4] = m5 = p5 * x1 + p10 * x2 + p15 * x3 + p20 * x4 + p25 * x5; 151 152 pc[5] = m6 = p1 * x6 + p6 * x7 + p11 * x8 + p16 * x9 + p21 * x10; 153 pc[6] = m7 = p2 * x6 + p7 * x7 + p12 * x8 + p17 * x9 + p22 * x10; 154 pc[7] = m8 = p3 * x6 + p8 * x7 + p13 * x8 + p18 * x9 + p23 * x10; 155 pc[8] = m9 = p4 * x6 + p9 * x7 + p14 * x8 + p19 * x9 + p24 * x10; 156 pc[9] = m10 = p5 * x6 + p10 * x7 + p15 * x8 + p20 * x9 + p25 * x10; 157 158 pc[10] = m11 = p1 * x11 + p6 * x12 + p11 * x13 + p16 * x14 + p21 * x15; 159 pc[11] = m12 = p2 * x11 + p7 * x12 + p12 * x13 + p17 * x14 + p22 * x15; 160 pc[12] = m13 = p3 * x11 + p8 * x12 + p13 * x13 + p18 * x14 + p23 * x15; 161 pc[13] = m14 = p4 * x11 + p9 * x12 + p14 * x13 + p19 * x14 + p24 * x15; 162 pc[14] = m15 = p5 * x11 + p10 * x12 + p15 * x13 + p20 * x14 + p25 * x15; 163 164 pc[15] = m16 = p1 * x16 + p6 * x17 + p11 * x18 + p16 * x19 + p21 * x20; 165 pc[16] = m17 = p2 * x16 + p7 * x17 + p12 * x18 + p17 * x19 + p22 * x20; 166 pc[17] = m18 = p3 * x16 + p8 * x17 + p13 * x18 + p18 * x19 + p23 * x20; 167 pc[18] = m19 = p4 * x16 + p9 * x17 + p14 * x18 + p19 * x19 + p24 * x20; 168 pc[19] = m20 = p5 * x16 + p10 * x17 + p15 * x18 + p20 * x19 + p25 * x20; 169 170 pc[20] = m21 = p1 * x21 + p6 * x22 + p11 * x23 + p16 * x24 + p21 * x25; 171 pc[21] = m22 = p2 * x21 + p7 * x22 + p12 * x23 + p17 * x24 + p22 * x25; 172 pc[22] = m23 = p3 * x21 + p8 * x22 + p13 * x23 + p18 * x24 + p23 * x25; 173 pc[23] = m24 = p4 * x21 + p9 * x22 + p14 * x23 + p19 * x24 + p24 * x25; 174 pc[24] = m25 = p5 * x21 + p10 * x22 + p15 * x23 + p20 * x24 + p25 * x25; 175 176 nz = bi[row + 1] - diag_offset[row] - 1; 177 pv += 25; 178 for (j = 0; j < nz; j++) { 179 x1 = pv[0]; 180 x2 = pv[1]; 181 x3 = pv[2]; 182 x4 = pv[3]; 183 x5 = pv[4]; 184 x6 = pv[5]; 185 x7 = pv[6]; 186 x8 = pv[7]; 187 x9 = pv[8]; 188 x10 = pv[9]; 189 x11 = pv[10]; 190 x12 = pv[11]; 191 x13 = pv[12]; 192 x14 = pv[13]; 193 x15 = pv[14]; 194 x16 = pv[15]; 195 x17 = pv[16]; 196 x18 = pv[17]; 197 x19 = pv[18]; 198 x20 = pv[19]; 199 x21 = pv[20]; 200 x22 = pv[21]; 201 x23 = pv[22]; 202 x24 = pv[23]; 203 x25 = pv[24]; 204 x = rtmp + 25 * pj[j]; 205 x[0] -= m1 * x1 + m6 * x2 + m11 * x3 + m16 * x4 + m21 * x5; 206 x[1] -= m2 * x1 + m7 * x2 + m12 * x3 + m17 * x4 + m22 * x5; 207 x[2] -= m3 * x1 + m8 * x2 + m13 * x3 + m18 * x4 + m23 * x5; 208 x[3] -= m4 * x1 + m9 * x2 + m14 * x3 + m19 * x4 + m24 * x5; 209 x[4] -= m5 * x1 + m10 * x2 + m15 * x3 + m20 * x4 + m25 * x5; 210 211 x[5] -= m1 * x6 + m6 * x7 + m11 * x8 + m16 * x9 + m21 * x10; 212 x[6] -= m2 * x6 + m7 * x7 + m12 * x8 + m17 * x9 + m22 * x10; 213 x[7] -= m3 * x6 + m8 * x7 + m13 * x8 + m18 * x9 + m23 * x10; 214 x[8] -= m4 * x6 + m9 * x7 + m14 * x8 + m19 * x9 + m24 * x10; 215 x[9] -= m5 * x6 + m10 * x7 + m15 * x8 + m20 * x9 + m25 * x10; 216 217 x[10] -= m1 * x11 + m6 * x12 + m11 * x13 + m16 * x14 + m21 * x15; 218 x[11] -= m2 * x11 + m7 * x12 + m12 * x13 + m17 * x14 + m22 * x15; 219 x[12] -= m3 * x11 + m8 * x12 + m13 * x13 + m18 * x14 + m23 * x15; 220 x[13] -= m4 * x11 + m9 * x12 + m14 * x13 + m19 * x14 + m24 * x15; 221 x[14] -= m5 * x11 + m10 * x12 + m15 * x13 + m20 * x14 + m25 * x15; 222 223 x[15] -= m1 * x16 + m6 * x17 + m11 * x18 + m16 * x19 + m21 * x20; 224 x[16] -= m2 * x16 + m7 * x17 + m12 * x18 + m17 * x19 + m22 * x20; 225 x[17] -= m3 * x16 + m8 * x17 + m13 * x18 + m18 * x19 + m23 * x20; 226 x[18] -= m4 * x16 + m9 * x17 + m14 * x18 + m19 * x19 + m24 * x20; 227 x[19] -= m5 * x16 + m10 * x17 + m15 * x18 + m20 * x19 + m25 * x20; 228 229 x[20] -= m1 * x21 + m6 * x22 + m11 * x23 + m16 * x24 + m21 * x25; 230 x[21] -= m2 * x21 + m7 * x22 + m12 * x23 + m17 * x24 + m22 * x25; 231 x[22] -= m3 * x21 + m8 * x22 + m13 * x23 + m18 * x24 + m23 * x25; 232 x[23] -= m4 * x21 + m9 * x22 + m14 * x23 + m19 * x24 + m24 * x25; 233 x[24] -= m5 * x21 + m10 * x22 + m15 * x23 + m20 * x24 + m25 * x25; 234 235 pv += 25; 236 } 237 PetscCall(PetscLogFlops(250.0 * nz + 225.0)); 238 } 239 row = *ajtmp++; 240 } 241 /* finished row so stick it into b->a */ 242 pv = ba + 25 * bi[i]; 243 pj = bj + bi[i]; 244 nz = bi[i + 1] - bi[i]; 245 for (j = 0; j < nz; j++) { 246 #if defined(PETSC_USE_MEMCPY) 247 PetscCall(PetscArraycpy(pv, rtmp + 25 * pj[j], 25)); 248 #else 249 x = rtmp + 25 * pj[j]; 250 pv[0] = x[0]; 251 pv[1] = x[1]; 252 pv[2] = x[2]; 253 pv[3] = x[3]; 254 pv[4] = x[4]; 255 pv[5] = x[5]; 256 pv[6] = x[6]; 257 pv[7] = x[7]; 258 pv[8] = x[8]; 259 pv[9] = x[9]; 260 pv[10] = x[10]; 261 pv[11] = x[11]; 262 pv[12] = x[12]; 263 pv[13] = x[13]; 264 pv[14] = x[14]; 265 pv[15] = x[15]; 266 pv[16] = x[16]; 267 pv[17] = x[17]; 268 pv[18] = x[18]; 269 pv[19] = x[19]; 270 pv[20] = x[20]; 271 pv[21] = x[21]; 272 pv[22] = x[22]; 273 pv[23] = x[23]; 274 pv[24] = x[24]; 275 #endif 276 pv += 25; 277 } 278 /* invert diagonal block */ 279 w = ba + 25 * diag_offset[i]; 280 PetscCall(PetscKernel_A_gets_inverse_A_5(w, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 281 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 282 } 283 284 PetscCall(PetscFree(rtmp)); 285 PetscCall(ISRestoreIndices(isicol, &ic)); 286 PetscCall(ISRestoreIndices(isrow, &r)); 287 288 C->ops->solve = MatSolve_SeqBAIJ_5_inplace; 289 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_inplace; 290 C->assembled = PETSC_TRUE; 291 292 PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * b->mbs)); /* from inverting diagonal blocks */ 293 PetscFunctionReturn(0); 294 } 295 296 /* MatLUFactorNumeric_SeqBAIJ_5 - 297 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 298 PetscKernel_A_gets_A_times_B() 299 PetscKernel_A_gets_A_minus_B_times_C() 300 PetscKernel_A_gets_inverse_A() 301 */ 302 303 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5(Mat B, Mat A, const MatFactorInfo *info) 304 { 305 Mat C = B; 306 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 307 IS isrow = b->row, isicol = b->icol; 308 const PetscInt *r, *ic; 309 PetscInt i, j, k, nz, nzL, row; 310 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 311 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 312 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a, work[25]; 313 PetscInt flg, ipvt[5]; 314 PetscReal shift = info->shiftamount; 315 PetscBool allowzeropivot, zeropivotdetected; 316 317 PetscFunctionBegin; 318 allowzeropivot = PetscNot(A->erroriffailure); 319 PetscCall(ISGetIndices(isrow, &r)); 320 PetscCall(ISGetIndices(isicol, &ic)); 321 322 /* generate work space needed by the factorization */ 323 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 324 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 325 326 for (i = 0; i < n; i++) { 327 /* zero rtmp */ 328 /* L part */ 329 nz = bi[i + 1] - bi[i]; 330 bjtmp = bj + bi[i]; 331 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 332 333 /* U part */ 334 nz = bdiag[i] - bdiag[i + 1]; 335 bjtmp = bj + bdiag[i + 1] + 1; 336 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 337 338 /* load in initial (unfactored row) */ 339 nz = ai[r[i] + 1] - ai[r[i]]; 340 ajtmp = aj + ai[r[i]]; 341 v = aa + bs2 * ai[r[i]]; 342 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 343 344 /* elimination */ 345 bjtmp = bj + bi[i]; 346 nzL = bi[i + 1] - bi[i]; 347 for (k = 0; k < nzL; k++) { 348 row = bjtmp[k]; 349 pc = rtmp + bs2 * row; 350 for (flg = 0, j = 0; j < bs2; j++) { 351 if (pc[j] != 0.0) { 352 flg = 1; 353 break; 354 } 355 } 356 if (flg) { 357 pv = b->a + bs2 * bdiag[row]; 358 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 359 PetscCall(PetscKernel_A_gets_A_times_B_5(pc, pv, mwork)); 360 361 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 362 pv = b->a + bs2 * (bdiag[row + 1] + 1); 363 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 364 for (j = 0; j < nz; j++) { 365 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 366 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 367 v = rtmp + bs2 * pj[j]; 368 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_5(v, pc, pv)); 369 pv += bs2; 370 } 371 PetscCall(PetscLogFlops(250.0 * nz + 225)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 372 } 373 } 374 375 /* finished row so stick it into b->a */ 376 /* L part */ 377 pv = b->a + bs2 * bi[i]; 378 pj = b->j + bi[i]; 379 nz = bi[i + 1] - bi[i]; 380 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 381 382 /* Mark diagonal and invert diagonal for simpler triangular solves */ 383 pv = b->a + bs2 * bdiag[i]; 384 pj = b->j + bdiag[i]; 385 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 386 PetscCall(PetscKernel_A_gets_inverse_A_5(pv, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 387 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 388 389 /* U part */ 390 pv = b->a + bs2 * (bdiag[i + 1] + 1); 391 pj = b->j + bdiag[i + 1] + 1; 392 nz = bdiag[i] - bdiag[i + 1] - 1; 393 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 394 } 395 396 PetscCall(PetscFree2(rtmp, mwork)); 397 PetscCall(ISRestoreIndices(isicol, &ic)); 398 PetscCall(ISRestoreIndices(isrow, &r)); 399 400 C->ops->solve = MatSolve_SeqBAIJ_5; 401 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 402 C->assembled = PETSC_TRUE; 403 404 PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * n)); /* from inverting diagonal blocks */ 405 PetscFunctionReturn(0); 406 } 407 408 /* 409 Version for when blocks are 5 by 5 Using natural ordering 410 */ 411 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 412 { 413 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 414 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j, ipvt[5]; 415 PetscInt *ajtmpold, *ajtmp, nz, row; 416 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 417 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 418 MatScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; 419 MatScalar x16, x17, x18, x19, x20, x21, x22, x23, x24, x25; 420 MatScalar p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15; 421 MatScalar p16, p17, p18, p19, p20, p21, p22, p23, p24, p25; 422 MatScalar m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15; 423 MatScalar m16, m17, m18, m19, m20, m21, m22, m23, m24, m25; 424 MatScalar *ba = b->a, *aa = a->a, work[25]; 425 PetscReal shift = info->shiftamount; 426 PetscBool allowzeropivot, zeropivotdetected; 427 428 PetscFunctionBegin; 429 allowzeropivot = PetscNot(A->erroriffailure); 430 PetscCall(PetscMalloc1(25 * (n + 1), &rtmp)); 431 for (i = 0; i < n; i++) { 432 nz = bi[i + 1] - bi[i]; 433 ajtmp = bj + bi[i]; 434 for (j = 0; j < nz; j++) { 435 x = rtmp + 25 * ajtmp[j]; 436 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 437 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 438 x[16] = x[17] = x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0; 439 } 440 /* load in initial (unfactored row) */ 441 nz = ai[i + 1] - ai[i]; 442 ajtmpold = aj + ai[i]; 443 v = aa + 25 * ai[i]; 444 for (j = 0; j < nz; j++) { 445 x = rtmp + 25 * ajtmpold[j]; 446 x[0] = v[0]; 447 x[1] = v[1]; 448 x[2] = v[2]; 449 x[3] = v[3]; 450 x[4] = v[4]; 451 x[5] = v[5]; 452 x[6] = v[6]; 453 x[7] = v[7]; 454 x[8] = v[8]; 455 x[9] = v[9]; 456 x[10] = v[10]; 457 x[11] = v[11]; 458 x[12] = v[12]; 459 x[13] = v[13]; 460 x[14] = v[14]; 461 x[15] = v[15]; 462 x[16] = v[16]; 463 x[17] = v[17]; 464 x[18] = v[18]; 465 x[19] = v[19]; 466 x[20] = v[20]; 467 x[21] = v[21]; 468 x[22] = v[22]; 469 x[23] = v[23]; 470 x[24] = v[24]; 471 v += 25; 472 } 473 row = *ajtmp++; 474 while (row < i) { 475 pc = rtmp + 25 * row; 476 p1 = pc[0]; 477 p2 = pc[1]; 478 p3 = pc[2]; 479 p4 = pc[3]; 480 p5 = pc[4]; 481 p6 = pc[5]; 482 p7 = pc[6]; 483 p8 = pc[7]; 484 p9 = pc[8]; 485 p10 = pc[9]; 486 p11 = pc[10]; 487 p12 = pc[11]; 488 p13 = pc[12]; 489 p14 = pc[13]; 490 p15 = pc[14]; 491 p16 = pc[15]; 492 p17 = pc[16]; 493 p18 = pc[17]; 494 p19 = pc[18]; 495 p20 = pc[19]; 496 p21 = pc[20]; 497 p22 = pc[21]; 498 p23 = pc[22]; 499 p24 = pc[23]; 500 p25 = pc[24]; 501 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) { 502 pv = ba + 25 * diag_offset[row]; 503 pj = bj + diag_offset[row] + 1; 504 x1 = pv[0]; 505 x2 = pv[1]; 506 x3 = pv[2]; 507 x4 = pv[3]; 508 x5 = pv[4]; 509 x6 = pv[5]; 510 x7 = pv[6]; 511 x8 = pv[7]; 512 x9 = pv[8]; 513 x10 = pv[9]; 514 x11 = pv[10]; 515 x12 = pv[11]; 516 x13 = pv[12]; 517 x14 = pv[13]; 518 x15 = pv[14]; 519 x16 = pv[15]; 520 x17 = pv[16]; 521 x18 = pv[17]; 522 x19 = pv[18]; 523 x20 = pv[19]; 524 x21 = pv[20]; 525 x22 = pv[21]; 526 x23 = pv[22]; 527 x24 = pv[23]; 528 x25 = pv[24]; 529 pc[0] = m1 = p1 * x1 + p6 * x2 + p11 * x3 + p16 * x4 + p21 * x5; 530 pc[1] = m2 = p2 * x1 + p7 * x2 + p12 * x3 + p17 * x4 + p22 * x5; 531 pc[2] = m3 = p3 * x1 + p8 * x2 + p13 * x3 + p18 * x4 + p23 * x5; 532 pc[3] = m4 = p4 * x1 + p9 * x2 + p14 * x3 + p19 * x4 + p24 * x5; 533 pc[4] = m5 = p5 * x1 + p10 * x2 + p15 * x3 + p20 * x4 + p25 * x5; 534 535 pc[5] = m6 = p1 * x6 + p6 * x7 + p11 * x8 + p16 * x9 + p21 * x10; 536 pc[6] = m7 = p2 * x6 + p7 * x7 + p12 * x8 + p17 * x9 + p22 * x10; 537 pc[7] = m8 = p3 * x6 + p8 * x7 + p13 * x8 + p18 * x9 + p23 * x10; 538 pc[8] = m9 = p4 * x6 + p9 * x7 + p14 * x8 + p19 * x9 + p24 * x10; 539 pc[9] = m10 = p5 * x6 + p10 * x7 + p15 * x8 + p20 * x9 + p25 * x10; 540 541 pc[10] = m11 = p1 * x11 + p6 * x12 + p11 * x13 + p16 * x14 + p21 * x15; 542 pc[11] = m12 = p2 * x11 + p7 * x12 + p12 * x13 + p17 * x14 + p22 * x15; 543 pc[12] = m13 = p3 * x11 + p8 * x12 + p13 * x13 + p18 * x14 + p23 * x15; 544 pc[13] = m14 = p4 * x11 + p9 * x12 + p14 * x13 + p19 * x14 + p24 * x15; 545 pc[14] = m15 = p5 * x11 + p10 * x12 + p15 * x13 + p20 * x14 + p25 * x15; 546 547 pc[15] = m16 = p1 * x16 + p6 * x17 + p11 * x18 + p16 * x19 + p21 * x20; 548 pc[16] = m17 = p2 * x16 + p7 * x17 + p12 * x18 + p17 * x19 + p22 * x20; 549 pc[17] = m18 = p3 * x16 + p8 * x17 + p13 * x18 + p18 * x19 + p23 * x20; 550 pc[18] = m19 = p4 * x16 + p9 * x17 + p14 * x18 + p19 * x19 + p24 * x20; 551 pc[19] = m20 = p5 * x16 + p10 * x17 + p15 * x18 + p20 * x19 + p25 * x20; 552 553 pc[20] = m21 = p1 * x21 + p6 * x22 + p11 * x23 + p16 * x24 + p21 * x25; 554 pc[21] = m22 = p2 * x21 + p7 * x22 + p12 * x23 + p17 * x24 + p22 * x25; 555 pc[22] = m23 = p3 * x21 + p8 * x22 + p13 * x23 + p18 * x24 + p23 * x25; 556 pc[23] = m24 = p4 * x21 + p9 * x22 + p14 * x23 + p19 * x24 + p24 * x25; 557 pc[24] = m25 = p5 * x21 + p10 * x22 + p15 * x23 + p20 * x24 + p25 * x25; 558 559 nz = bi[row + 1] - diag_offset[row] - 1; 560 pv += 25; 561 for (j = 0; j < nz; j++) { 562 x1 = pv[0]; 563 x2 = pv[1]; 564 x3 = pv[2]; 565 x4 = pv[3]; 566 x5 = pv[4]; 567 x6 = pv[5]; 568 x7 = pv[6]; 569 x8 = pv[7]; 570 x9 = pv[8]; 571 x10 = pv[9]; 572 x11 = pv[10]; 573 x12 = pv[11]; 574 x13 = pv[12]; 575 x14 = pv[13]; 576 x15 = pv[14]; 577 x16 = pv[15]; 578 x17 = pv[16]; 579 x18 = pv[17]; 580 x19 = pv[18]; 581 x20 = pv[19]; 582 x21 = pv[20]; 583 x22 = pv[21]; 584 x23 = pv[22]; 585 x24 = pv[23]; 586 x25 = pv[24]; 587 x = rtmp + 25 * pj[j]; 588 x[0] -= m1 * x1 + m6 * x2 + m11 * x3 + m16 * x4 + m21 * x5; 589 x[1] -= m2 * x1 + m7 * x2 + m12 * x3 + m17 * x4 + m22 * x5; 590 x[2] -= m3 * x1 + m8 * x2 + m13 * x3 + m18 * x4 + m23 * x5; 591 x[3] -= m4 * x1 + m9 * x2 + m14 * x3 + m19 * x4 + m24 * x5; 592 x[4] -= m5 * x1 + m10 * x2 + m15 * x3 + m20 * x4 + m25 * x5; 593 594 x[5] -= m1 * x6 + m6 * x7 + m11 * x8 + m16 * x9 + m21 * x10; 595 x[6] -= m2 * x6 + m7 * x7 + m12 * x8 + m17 * x9 + m22 * x10; 596 x[7] -= m3 * x6 + m8 * x7 + m13 * x8 + m18 * x9 + m23 * x10; 597 x[8] -= m4 * x6 + m9 * x7 + m14 * x8 + m19 * x9 + m24 * x10; 598 x[9] -= m5 * x6 + m10 * x7 + m15 * x8 + m20 * x9 + m25 * x10; 599 600 x[10] -= m1 * x11 + m6 * x12 + m11 * x13 + m16 * x14 + m21 * x15; 601 x[11] -= m2 * x11 + m7 * x12 + m12 * x13 + m17 * x14 + m22 * x15; 602 x[12] -= m3 * x11 + m8 * x12 + m13 * x13 + m18 * x14 + m23 * x15; 603 x[13] -= m4 * x11 + m9 * x12 + m14 * x13 + m19 * x14 + m24 * x15; 604 x[14] -= m5 * x11 + m10 * x12 + m15 * x13 + m20 * x14 + m25 * x15; 605 606 x[15] -= m1 * x16 + m6 * x17 + m11 * x18 + m16 * x19 + m21 * x20; 607 x[16] -= m2 * x16 + m7 * x17 + m12 * x18 + m17 * x19 + m22 * x20; 608 x[17] -= m3 * x16 + m8 * x17 + m13 * x18 + m18 * x19 + m23 * x20; 609 x[18] -= m4 * x16 + m9 * x17 + m14 * x18 + m19 * x19 + m24 * x20; 610 x[19] -= m5 * x16 + m10 * x17 + m15 * x18 + m20 * x19 + m25 * x20; 611 612 x[20] -= m1 * x21 + m6 * x22 + m11 * x23 + m16 * x24 + m21 * x25; 613 x[21] -= m2 * x21 + m7 * x22 + m12 * x23 + m17 * x24 + m22 * x25; 614 x[22] -= m3 * x21 + m8 * x22 + m13 * x23 + m18 * x24 + m23 * x25; 615 x[23] -= m4 * x21 + m9 * x22 + m14 * x23 + m19 * x24 + m24 * x25; 616 x[24] -= m5 * x21 + m10 * x22 + m15 * x23 + m20 * x24 + m25 * x25; 617 pv += 25; 618 } 619 PetscCall(PetscLogFlops(250.0 * nz + 225.0)); 620 } 621 row = *ajtmp++; 622 } 623 /* finished row so stick it into b->a */ 624 pv = ba + 25 * bi[i]; 625 pj = bj + bi[i]; 626 nz = bi[i + 1] - bi[i]; 627 for (j = 0; j < nz; j++) { 628 x = rtmp + 25 * pj[j]; 629 pv[0] = x[0]; 630 pv[1] = x[1]; 631 pv[2] = x[2]; 632 pv[3] = x[3]; 633 pv[4] = x[4]; 634 pv[5] = x[5]; 635 pv[6] = x[6]; 636 pv[7] = x[7]; 637 pv[8] = x[8]; 638 pv[9] = x[9]; 639 pv[10] = x[10]; 640 pv[11] = x[11]; 641 pv[12] = x[12]; 642 pv[13] = x[13]; 643 pv[14] = x[14]; 644 pv[15] = x[15]; 645 pv[16] = x[16]; 646 pv[17] = x[17]; 647 pv[18] = x[18]; 648 pv[19] = x[19]; 649 pv[20] = x[20]; 650 pv[21] = x[21]; 651 pv[22] = x[22]; 652 pv[23] = x[23]; 653 pv[24] = x[24]; 654 pv += 25; 655 } 656 /* invert diagonal block */ 657 w = ba + 25 * diag_offset[i]; 658 PetscCall(PetscKernel_A_gets_inverse_A_5(w, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 659 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 660 } 661 662 PetscCall(PetscFree(rtmp)); 663 664 C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_inplace; 665 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace; 666 C->assembled = PETSC_TRUE; 667 668 PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * b->mbs)); /* from inverting diagonal blocks */ 669 PetscFunctionReturn(0); 670 } 671 672 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 673 { 674 Mat C = B; 675 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 676 PetscInt i, j, k, nz, nzL, row; 677 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 678 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 679 MatScalar *rtmp, *pc, *mwork, *v, *vv, *pv, *aa = a->a, work[25]; 680 PetscInt flg, ipvt[5]; 681 PetscReal shift = info->shiftamount; 682 PetscBool allowzeropivot, zeropivotdetected; 683 684 PetscFunctionBegin; 685 allowzeropivot = PetscNot(A->erroriffailure); 686 687 /* generate work space needed by the factorization */ 688 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 689 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 690 691 for (i = 0; i < n; i++) { 692 /* zero rtmp */ 693 /* L part */ 694 nz = bi[i + 1] - bi[i]; 695 bjtmp = bj + bi[i]; 696 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 697 698 /* U part */ 699 nz = bdiag[i] - bdiag[i + 1]; 700 bjtmp = bj + bdiag[i + 1] + 1; 701 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 702 703 /* load in initial (unfactored row) */ 704 nz = ai[i + 1] - ai[i]; 705 ajtmp = aj + ai[i]; 706 v = aa + bs2 * ai[i]; 707 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 708 709 /* elimination */ 710 bjtmp = bj + bi[i]; 711 nzL = bi[i + 1] - bi[i]; 712 for (k = 0; k < nzL; k++) { 713 row = bjtmp[k]; 714 pc = rtmp + bs2 * row; 715 for (flg = 0, j = 0; j < bs2; j++) { 716 if (pc[j] != 0.0) { 717 flg = 1; 718 break; 719 } 720 } 721 if (flg) { 722 pv = b->a + bs2 * bdiag[row]; 723 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 724 PetscCall(PetscKernel_A_gets_A_times_B_5(pc, pv, mwork)); 725 726 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 727 pv = b->a + bs2 * (bdiag[row + 1] + 1); 728 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 729 for (j = 0; j < nz; j++) { 730 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 731 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 732 vv = rtmp + bs2 * pj[j]; 733 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_5(vv, pc, pv)); 734 pv += bs2; 735 } 736 PetscCall(PetscLogFlops(250.0 * nz + 225)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 737 } 738 } 739 740 /* finished row so stick it into b->a */ 741 /* L part */ 742 pv = b->a + bs2 * bi[i]; 743 pj = b->j + bi[i]; 744 nz = bi[i + 1] - bi[i]; 745 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 746 747 /* Mark diagonal and invert diagonal for simpler triangular solves */ 748 pv = b->a + bs2 * bdiag[i]; 749 pj = b->j + bdiag[i]; 750 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 751 PetscCall(PetscKernel_A_gets_inverse_A_5(pv, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 752 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 753 754 /* U part */ 755 pv = b->a + bs2 * (bdiag[i + 1] + 1); 756 pj = b->j + bdiag[i + 1] + 1; 757 nz = bdiag[i] - bdiag[i + 1] - 1; 758 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 759 } 760 PetscCall(PetscFree2(rtmp, mwork)); 761 762 C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 763 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 764 C->assembled = PETSC_TRUE; 765 766 PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * n)); /* from inverting diagonal blocks */ 767 PetscFunctionReturn(0); 768 } 769