1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 /* ------------------------------------------------------------*/ 9 /* 10 Version for when blocks are 4 by 4 11 */ 12 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 15 IS isrow = b->row, isicol = b->icol; 16 const PetscInt *r, *ic; 17 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 18 PetscInt *ajtmpold, *ajtmp, nz, row; 19 PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 20 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 21 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 22 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 23 MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 24 MatScalar m13, m14, m15, m16; 25 MatScalar *ba = b->a, *aa = a->a; 26 PetscBool pivotinblocks = b->pivotinblocks; 27 PetscReal shift = info->shiftamount; 28 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 29 30 PetscFunctionBegin; 31 PetscCall(ISGetIndices(isrow, &r)); 32 PetscCall(ISGetIndices(isicol, &ic)); 33 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 34 allowzeropivot = PetscNot(A->erroriffailure); 35 36 for (i = 0; i < n; i++) { 37 nz = bi[i + 1] - bi[i]; 38 ajtmp = bj + bi[i]; 39 for (j = 0; j < nz; j++) { 40 x = rtmp + 16 * ajtmp[j]; 41 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 42 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 43 } 44 /* load in initial (unfactored row) */ 45 idx = r[i]; 46 nz = ai[idx + 1] - ai[idx]; 47 ajtmpold = aj + ai[idx]; 48 v = aa + 16 * ai[idx]; 49 for (j = 0; j < nz; j++) { 50 x = rtmp + 16 * ic[ajtmpold[j]]; 51 x[0] = v[0]; 52 x[1] = v[1]; 53 x[2] = v[2]; 54 x[3] = v[3]; 55 x[4] = v[4]; 56 x[5] = v[5]; 57 x[6] = v[6]; 58 x[7] = v[7]; 59 x[8] = v[8]; 60 x[9] = v[9]; 61 x[10] = v[10]; 62 x[11] = v[11]; 63 x[12] = v[12]; 64 x[13] = v[13]; 65 x[14] = v[14]; 66 x[15] = v[15]; 67 v += 16; 68 } 69 row = *ajtmp++; 70 while (row < i) { 71 pc = rtmp + 16 * row; 72 p1 = pc[0]; 73 p2 = pc[1]; 74 p3 = pc[2]; 75 p4 = pc[3]; 76 p5 = pc[4]; 77 p6 = pc[5]; 78 p7 = pc[6]; 79 p8 = pc[7]; 80 p9 = pc[8]; 81 p10 = pc[9]; 82 p11 = pc[10]; 83 p12 = pc[11]; 84 p13 = pc[12]; 85 p14 = pc[13]; 86 p15 = pc[14]; 87 p16 = pc[15]; 88 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 89 pv = ba + 16 * diag_offset[row]; 90 pj = bj + diag_offset[row] + 1; 91 x1 = pv[0]; 92 x2 = pv[1]; 93 x3 = pv[2]; 94 x4 = pv[3]; 95 x5 = pv[4]; 96 x6 = pv[5]; 97 x7 = pv[6]; 98 x8 = pv[7]; 99 x9 = pv[8]; 100 x10 = pv[9]; 101 x11 = pv[10]; 102 x12 = pv[11]; 103 x13 = pv[12]; 104 x14 = pv[13]; 105 x15 = pv[14]; 106 x16 = pv[15]; 107 pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 108 pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 109 pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 110 pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 111 112 pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 113 pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 114 pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 115 pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 116 117 pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 118 pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 119 pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 120 pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 121 122 pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 123 pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 124 pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 125 pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 126 127 nz = bi[row + 1] - diag_offset[row] - 1; 128 pv += 16; 129 for (j = 0; j < nz; j++) { 130 x1 = pv[0]; 131 x2 = pv[1]; 132 x3 = pv[2]; 133 x4 = pv[3]; 134 x5 = pv[4]; 135 x6 = pv[5]; 136 x7 = pv[6]; 137 x8 = pv[7]; 138 x9 = pv[8]; 139 x10 = pv[9]; 140 x11 = pv[10]; 141 x12 = pv[11]; 142 x13 = pv[12]; 143 x14 = pv[13]; 144 x15 = pv[14]; 145 x16 = pv[15]; 146 x = rtmp + 16 * pj[j]; 147 x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 148 x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 149 x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 150 x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 151 152 x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 153 x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 154 x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 155 x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 156 157 x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 158 x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 159 x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 160 x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 161 162 x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 163 x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 164 x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 165 x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 166 167 pv += 16; 168 } 169 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 170 } 171 row = *ajtmp++; 172 } 173 /* finished row so stick it into b->a */ 174 pv = ba + 16 * bi[i]; 175 pj = bj + bi[i]; 176 nz = bi[i + 1] - bi[i]; 177 for (j = 0; j < nz; j++) { 178 x = rtmp + 16 * pj[j]; 179 pv[0] = x[0]; 180 pv[1] = x[1]; 181 pv[2] = x[2]; 182 pv[3] = x[3]; 183 pv[4] = x[4]; 184 pv[5] = x[5]; 185 pv[6] = x[6]; 186 pv[7] = x[7]; 187 pv[8] = x[8]; 188 pv[9] = x[9]; 189 pv[10] = x[10]; 190 pv[11] = x[11]; 191 pv[12] = x[12]; 192 pv[13] = x[13]; 193 pv[14] = x[14]; 194 pv[15] = x[15]; 195 pv += 16; 196 } 197 /* invert diagonal block */ 198 w = ba + 16 * diag_offset[i]; 199 if (pivotinblocks) { 200 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 201 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 202 } else { 203 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 204 } 205 } 206 207 PetscCall(PetscFree(rtmp)); 208 PetscCall(ISRestoreIndices(isicol, &ic)); 209 PetscCall(ISRestoreIndices(isrow, &r)); 210 211 C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 212 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 213 C->assembled = PETSC_TRUE; 214 215 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 216 PetscFunctionReturn(PETSC_SUCCESS); 217 } 218 219 /* MatLUFactorNumeric_SeqBAIJ_4 - 220 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 221 PetscKernel_A_gets_A_times_B() 222 PetscKernel_A_gets_A_minus_B_times_C() 223 PetscKernel_A_gets_inverse_A() 224 */ 225 226 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info) 227 { 228 Mat C = B; 229 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 230 IS isrow = b->row, isicol = b->icol; 231 const PetscInt *r, *ic; 232 PetscInt i, j, k, nz, nzL, row; 233 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 234 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 235 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 236 PetscInt flg; 237 PetscReal shift; 238 PetscBool allowzeropivot, zeropivotdetected; 239 240 PetscFunctionBegin; 241 allowzeropivot = PetscNot(A->erroriffailure); 242 PetscCall(ISGetIndices(isrow, &r)); 243 PetscCall(ISGetIndices(isicol, &ic)); 244 245 if (info->shifttype == MAT_SHIFT_NONE) { 246 shift = 0; 247 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 248 shift = info->shiftamount; 249 } 250 251 /* generate work space needed by the factorization */ 252 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 253 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 254 255 for (i = 0; i < n; i++) { 256 /* zero rtmp */ 257 /* L part */ 258 nz = bi[i + 1] - bi[i]; 259 bjtmp = bj + bi[i]; 260 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 261 262 /* U part */ 263 nz = bdiag[i] - bdiag[i + 1]; 264 bjtmp = bj + bdiag[i + 1] + 1; 265 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 266 267 /* load in initial (unfactored row) */ 268 nz = ai[r[i] + 1] - ai[r[i]]; 269 ajtmp = aj + ai[r[i]]; 270 v = aa + bs2 * ai[r[i]]; 271 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 272 273 /* elimination */ 274 bjtmp = bj + bi[i]; 275 nzL = bi[i + 1] - bi[i]; 276 for (k = 0; k < nzL; k++) { 277 row = bjtmp[k]; 278 pc = rtmp + bs2 * row; 279 for (flg = 0, j = 0; j < bs2; j++) { 280 if (pc[j] != 0.0) { 281 flg = 1; 282 break; 283 } 284 } 285 if (flg) { 286 pv = b->a + bs2 * bdiag[row]; 287 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 288 PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork)); 289 290 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 291 pv = b->a + bs2 * (bdiag[row + 1] + 1); 292 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 293 for (j = 0; j < nz; j++) { 294 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 295 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 296 v = rtmp + bs2 * pj[j]; 297 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv)); 298 pv += bs2; 299 } 300 PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 301 } 302 } 303 304 /* finished row so stick it into b->a */ 305 /* L part */ 306 pv = b->a + bs2 * bi[i]; 307 pj = b->j + bi[i]; 308 nz = bi[i + 1] - bi[i]; 309 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 310 311 /* Mark diagonal and invert diagonal for simpler triangular solves */ 312 pv = b->a + bs2 * bdiag[i]; 313 pj = b->j + bdiag[i]; 314 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 315 PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected)); 316 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 317 318 /* U part */ 319 pv = b->a + bs2 * (bdiag[i + 1] + 1); 320 pj = b->j + bdiag[i + 1] + 1; 321 nz = bdiag[i] - bdiag[i + 1] - 1; 322 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 323 } 324 325 PetscCall(PetscFree2(rtmp, mwork)); 326 PetscCall(ISRestoreIndices(isicol, &ic)); 327 PetscCall(ISRestoreIndices(isrow, &r)); 328 329 C->ops->solve = MatSolve_SeqBAIJ_4; 330 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 331 C->assembled = PETSC_TRUE; 332 333 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 334 PetscFunctionReturn(PETSC_SUCCESS); 335 } 336 337 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 338 { 339 /* 340 Default Version for when blocks are 4 by 4 Using natural ordering 341 */ 342 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 343 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 344 PetscInt *ajtmpold, *ajtmp, nz, row; 345 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 346 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 347 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 348 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 349 MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 350 MatScalar m13, m14, m15, m16; 351 MatScalar *ba = b->a, *aa = a->a; 352 PetscBool pivotinblocks = b->pivotinblocks; 353 PetscReal shift = info->shiftamount; 354 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 355 356 PetscFunctionBegin; 357 allowzeropivot = PetscNot(A->erroriffailure); 358 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 359 360 for (i = 0; i < n; i++) { 361 nz = bi[i + 1] - bi[i]; 362 ajtmp = bj + bi[i]; 363 for (j = 0; j < nz; j++) { 364 x = rtmp + 16 * ajtmp[j]; 365 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 366 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 367 } 368 /* load in initial (unfactored row) */ 369 nz = ai[i + 1] - ai[i]; 370 ajtmpold = aj + ai[i]; 371 v = aa + 16 * ai[i]; 372 for (j = 0; j < nz; j++) { 373 x = rtmp + 16 * ajtmpold[j]; 374 x[0] = v[0]; 375 x[1] = v[1]; 376 x[2] = v[2]; 377 x[3] = v[3]; 378 x[4] = v[4]; 379 x[5] = v[5]; 380 x[6] = v[6]; 381 x[7] = v[7]; 382 x[8] = v[8]; 383 x[9] = v[9]; 384 x[10] = v[10]; 385 x[11] = v[11]; 386 x[12] = v[12]; 387 x[13] = v[13]; 388 x[14] = v[14]; 389 x[15] = v[15]; 390 v += 16; 391 } 392 row = *ajtmp++; 393 while (row < i) { 394 pc = rtmp + 16 * row; 395 p1 = pc[0]; 396 p2 = pc[1]; 397 p3 = pc[2]; 398 p4 = pc[3]; 399 p5 = pc[4]; 400 p6 = pc[5]; 401 p7 = pc[6]; 402 p8 = pc[7]; 403 p9 = pc[8]; 404 p10 = pc[9]; 405 p11 = pc[10]; 406 p12 = pc[11]; 407 p13 = pc[12]; 408 p14 = pc[13]; 409 p15 = pc[14]; 410 p16 = pc[15]; 411 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 412 pv = ba + 16 * diag_offset[row]; 413 pj = bj + diag_offset[row] + 1; 414 x1 = pv[0]; 415 x2 = pv[1]; 416 x3 = pv[2]; 417 x4 = pv[3]; 418 x5 = pv[4]; 419 x6 = pv[5]; 420 x7 = pv[6]; 421 x8 = pv[7]; 422 x9 = pv[8]; 423 x10 = pv[9]; 424 x11 = pv[10]; 425 x12 = pv[11]; 426 x13 = pv[12]; 427 x14 = pv[13]; 428 x15 = pv[14]; 429 x16 = pv[15]; 430 pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 431 pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 432 pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 433 pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 434 435 pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 436 pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 437 pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 438 pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 439 440 pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 441 pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 442 pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 443 pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 444 445 pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 446 pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 447 pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 448 pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 449 nz = bi[row + 1] - diag_offset[row] - 1; 450 pv += 16; 451 for (j = 0; j < nz; j++) { 452 x1 = pv[0]; 453 x2 = pv[1]; 454 x3 = pv[2]; 455 x4 = pv[3]; 456 x5 = pv[4]; 457 x6 = pv[5]; 458 x7 = pv[6]; 459 x8 = pv[7]; 460 x9 = pv[8]; 461 x10 = pv[9]; 462 x11 = pv[10]; 463 x12 = pv[11]; 464 x13 = pv[12]; 465 x14 = pv[13]; 466 x15 = pv[14]; 467 x16 = pv[15]; 468 x = rtmp + 16 * pj[j]; 469 x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 470 x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 471 x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 472 x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 473 474 x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 475 x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 476 x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 477 x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 478 479 x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 480 x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 481 x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 482 x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 483 484 x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 485 x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 486 x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 487 x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 488 489 pv += 16; 490 } 491 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 492 } 493 row = *ajtmp++; 494 } 495 /* finished row so stick it into b->a */ 496 pv = ba + 16 * bi[i]; 497 pj = bj + bi[i]; 498 nz = bi[i + 1] - bi[i]; 499 for (j = 0; j < nz; j++) { 500 x = rtmp + 16 * pj[j]; 501 pv[0] = x[0]; 502 pv[1] = x[1]; 503 pv[2] = x[2]; 504 pv[3] = x[3]; 505 pv[4] = x[4]; 506 pv[5] = x[5]; 507 pv[6] = x[6]; 508 pv[7] = x[7]; 509 pv[8] = x[8]; 510 pv[9] = x[9]; 511 pv[10] = x[10]; 512 pv[11] = x[11]; 513 pv[12] = x[12]; 514 pv[13] = x[13]; 515 pv[14] = x[14]; 516 pv[15] = x[15]; 517 pv += 16; 518 } 519 /* invert diagonal block */ 520 w = ba + 16 * diag_offset[i]; 521 if (pivotinblocks) { 522 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 523 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 524 } else { 525 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 526 } 527 } 528 529 PetscCall(PetscFree(rtmp)); 530 531 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 532 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 533 C->assembled = PETSC_TRUE; 534 535 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 536 PetscFunctionReturn(PETSC_SUCCESS); 537 } 538 539 /* 540 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 541 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 542 */ 543 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 544 { 545 Mat C = B; 546 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 547 PetscInt i, j, k, nz, nzL, row; 548 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 549 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 550 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 551 PetscInt flg; 552 PetscReal shift; 553 PetscBool allowzeropivot, zeropivotdetected; 554 555 PetscFunctionBegin; 556 allowzeropivot = PetscNot(A->erroriffailure); 557 558 /* generate work space needed by the factorization */ 559 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 560 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 561 562 if (info->shifttype == MAT_SHIFT_NONE) { 563 shift = 0; 564 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 565 shift = info->shiftamount; 566 } 567 568 for (i = 0; i < n; i++) { 569 /* zero rtmp */ 570 /* L part */ 571 nz = bi[i + 1] - bi[i]; 572 bjtmp = bj + bi[i]; 573 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 574 575 /* U part */ 576 nz = bdiag[i] - bdiag[i + 1]; 577 bjtmp = bj + bdiag[i + 1] + 1; 578 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 579 580 /* load in initial (unfactored row) */ 581 nz = ai[i + 1] - ai[i]; 582 ajtmp = aj + ai[i]; 583 v = aa + bs2 * ai[i]; 584 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 585 586 /* elimination */ 587 bjtmp = bj + bi[i]; 588 nzL = bi[i + 1] - bi[i]; 589 for (k = 0; k < nzL; k++) { 590 row = bjtmp[k]; 591 pc = rtmp + bs2 * row; 592 for (flg = 0, j = 0; j < bs2; j++) { 593 if (pc[j] != 0.0) { 594 flg = 1; 595 break; 596 } 597 } 598 if (flg) { 599 pv = b->a + bs2 * bdiag[row]; 600 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 601 PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork)); 602 603 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 604 pv = b->a + bs2 * (bdiag[row + 1] + 1); 605 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 606 for (j = 0; j < nz; j++) { 607 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 608 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 609 v = rtmp + bs2 * pj[j]; 610 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv)); 611 pv += bs2; 612 } 613 PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 614 } 615 } 616 617 /* finished row so stick it into b->a */ 618 /* L part */ 619 pv = b->a + bs2 * bi[i]; 620 pj = b->j + bi[i]; 621 nz = bi[i + 1] - bi[i]; 622 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 623 624 /* Mark diagonal and invert diagonal for simpler triangular solves */ 625 pv = b->a + bs2 * bdiag[i]; 626 pj = b->j + bdiag[i]; 627 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 628 PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected)); 629 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 630 631 /* U part */ 632 pv = b->a + bs2 * (bdiag[i + 1] + 1); 633 pj = b->j + bdiag[i + 1] + 1; 634 nz = bdiag[i] - bdiag[i + 1] - 1; 635 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 636 } 637 PetscCall(PetscFree2(rtmp, mwork)); 638 639 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 640 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 641 C->assembled = PETSC_TRUE; 642 643 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 644 PetscFunctionReturn(PETSC_SUCCESS); 645 } 646 647 #if defined(PETSC_HAVE_SSE) 648 649 #include PETSC_HAVE_SSE 650 651 /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 652 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const MatFactorInfo *info) 653 { 654 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 655 int i, j, n = a->mbs; 656 int *bj = b->j, *bjtmp, *pj; 657 int row; 658 int *ajtmpold, nz, *bi = b->i; 659 int *diag_offset = b->diag, *ai = a->i, *aj = a->j; 660 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 661 MatScalar *ba = b->a, *aa = a->a; 662 int nonzero = 0; 663 PetscBool pivotinblocks = b->pivotinblocks; 664 PetscReal shift = info->shiftamount; 665 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 666 667 PetscFunctionBegin; 668 allowzeropivot = PetscNot(A->erroriffailure); 669 SSE_SCOPE_BEGIN; 670 671 PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 672 PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 673 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 674 PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 675 /* if ((unsigned long)bj==(unsigned long)aj) { */ 676 /* colscale = 4; */ 677 /* } */ 678 for (i = 0; i < n; i++) { 679 nz = bi[i + 1] - bi[i]; 680 bjtmp = bj + bi[i]; 681 /* zero out the 4x4 block accumulators */ 682 /* zero out one register */ 683 XOR_PS(XMM7, XMM7); 684 for (j = 0; j < nz; j++) { 685 x = rtmp + 16 * bjtmp[j]; 686 /* x = rtmp+4*bjtmp[j]; */ 687 SSE_INLINE_BEGIN_1(x) 688 /* Copy zero register to memory locations */ 689 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 690 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 691 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 692 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 693 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 694 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 695 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 696 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 697 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 698 SSE_INLINE_END_1; 699 } 700 /* load in initial (unfactored row) */ 701 nz = ai[i + 1] - ai[i]; 702 ajtmpold = aj + ai[i]; 703 v = aa + 16 * ai[i]; 704 for (j = 0; j < nz; j++) { 705 x = rtmp + 16 * ajtmpold[j]; 706 /* x = rtmp+colscale*ajtmpold[j]; */ 707 /* Copy v block into x block */ 708 SSE_INLINE_BEGIN_2(v, x) 709 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 710 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 711 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 712 713 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 714 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 715 716 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 717 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 718 719 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 720 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 721 722 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 723 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 724 725 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 726 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 727 728 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 729 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 730 731 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 732 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 733 SSE_INLINE_END_2; 734 735 v += 16; 736 } 737 /* row = (*bjtmp++)/4; */ 738 row = *bjtmp++; 739 while (row < i) { 740 pc = rtmp + 16 * row; 741 SSE_INLINE_BEGIN_1(pc) 742 /* Load block from lower triangle */ 743 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 744 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 745 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 746 747 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 748 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 749 750 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 751 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 752 753 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 754 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 755 756 /* Compare block to zero block */ 757 758 SSE_COPY_PS(XMM4, XMM7) 759 SSE_CMPNEQ_PS(XMM4, XMM0) 760 761 SSE_COPY_PS(XMM5, XMM7) 762 SSE_CMPNEQ_PS(XMM5, XMM1) 763 764 SSE_COPY_PS(XMM6, XMM7) 765 SSE_CMPNEQ_PS(XMM6, XMM2) 766 767 SSE_CMPNEQ_PS(XMM7, XMM3) 768 769 /* Reduce the comparisons to one SSE register */ 770 SSE_OR_PS(XMM6, XMM7) 771 SSE_OR_PS(XMM5, XMM4) 772 SSE_OR_PS(XMM5, XMM6) 773 SSE_INLINE_END_1; 774 775 /* Reduce the one SSE register to an integer register for branching */ 776 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 777 MOVEMASK(nonzero, XMM5); 778 779 /* If block is nonzero ... */ 780 if (nonzero) { 781 pv = ba + 16 * diag_offset[row]; 782 PREFETCH_L1(&pv[16]); 783 pj = bj + diag_offset[row] + 1; 784 785 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 786 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 787 /* but the diagonal was inverted already */ 788 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 789 790 SSE_INLINE_BEGIN_2(pv, pc) 791 /* Column 0, product is accumulated in XMM4 */ 792 SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 793 SSE_SHUFFLE(XMM4, XMM4, 0x00) 794 SSE_MULT_PS(XMM4, XMM0) 795 796 SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 797 SSE_SHUFFLE(XMM5, XMM5, 0x00) 798 SSE_MULT_PS(XMM5, XMM1) 799 SSE_ADD_PS(XMM4, XMM5) 800 801 SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 802 SSE_SHUFFLE(XMM6, XMM6, 0x00) 803 SSE_MULT_PS(XMM6, XMM2) 804 SSE_ADD_PS(XMM4, XMM6) 805 806 SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 807 SSE_SHUFFLE(XMM7, XMM7, 0x00) 808 SSE_MULT_PS(XMM7, XMM3) 809 SSE_ADD_PS(XMM4, XMM7) 810 811 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 812 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 813 814 /* Column 1, product is accumulated in XMM5 */ 815 SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 816 SSE_SHUFFLE(XMM5, XMM5, 0x00) 817 SSE_MULT_PS(XMM5, XMM0) 818 819 SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 820 SSE_SHUFFLE(XMM6, XMM6, 0x00) 821 SSE_MULT_PS(XMM6, XMM1) 822 SSE_ADD_PS(XMM5, XMM6) 823 824 SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 825 SSE_SHUFFLE(XMM7, XMM7, 0x00) 826 SSE_MULT_PS(XMM7, XMM2) 827 SSE_ADD_PS(XMM5, XMM7) 828 829 SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 830 SSE_SHUFFLE(XMM6, XMM6, 0x00) 831 SSE_MULT_PS(XMM6, XMM3) 832 SSE_ADD_PS(XMM5, XMM6) 833 834 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 835 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 836 837 SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 838 839 /* Column 2, product is accumulated in XMM6 */ 840 SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 841 SSE_SHUFFLE(XMM6, XMM6, 0x00) 842 SSE_MULT_PS(XMM6, XMM0) 843 844 SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 845 SSE_SHUFFLE(XMM7, XMM7, 0x00) 846 SSE_MULT_PS(XMM7, XMM1) 847 SSE_ADD_PS(XMM6, XMM7) 848 849 SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 850 SSE_SHUFFLE(XMM7, XMM7, 0x00) 851 SSE_MULT_PS(XMM7, XMM2) 852 SSE_ADD_PS(XMM6, XMM7) 853 854 SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 855 SSE_SHUFFLE(XMM7, XMM7, 0x00) 856 SSE_MULT_PS(XMM7, XMM3) 857 SSE_ADD_PS(XMM6, XMM7) 858 859 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 860 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 861 862 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 863 /* Column 3, product is accumulated in XMM0 */ 864 SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 865 SSE_SHUFFLE(XMM7, XMM7, 0x00) 866 SSE_MULT_PS(XMM0, XMM7) 867 868 SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 869 SSE_SHUFFLE(XMM7, XMM7, 0x00) 870 SSE_MULT_PS(XMM1, XMM7) 871 SSE_ADD_PS(XMM0, XMM1) 872 873 SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 874 SSE_SHUFFLE(XMM1, XMM1, 0x00) 875 SSE_MULT_PS(XMM1, XMM2) 876 SSE_ADD_PS(XMM0, XMM1) 877 878 SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 879 SSE_SHUFFLE(XMM7, XMM7, 0x00) 880 SSE_MULT_PS(XMM3, XMM7) 881 SSE_ADD_PS(XMM0, XMM3) 882 883 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 884 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 885 886 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 887 /* This is code to be maintained and read by humans after all. */ 888 /* Copy Multiplier Col 3 into XMM3 */ 889 SSE_COPY_PS(XMM3, XMM0) 890 /* Copy Multiplier Col 2 into XMM2 */ 891 SSE_COPY_PS(XMM2, XMM6) 892 /* Copy Multiplier Col 1 into XMM1 */ 893 SSE_COPY_PS(XMM1, XMM5) 894 /* Copy Multiplier Col 0 into XMM0 */ 895 SSE_COPY_PS(XMM0, XMM4) 896 SSE_INLINE_END_2; 897 898 /* Update the row: */ 899 nz = bi[row + 1] - diag_offset[row] - 1; 900 pv += 16; 901 for (j = 0; j < nz; j++) { 902 PREFETCH_L1(&pv[16]); 903 x = rtmp + 16 * pj[j]; 904 /* x = rtmp + 4*pj[j]; */ 905 906 /* X:=X-M*PV, One column at a time */ 907 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 908 SSE_INLINE_BEGIN_2(x, pv) 909 /* Load First Column of X*/ 910 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 911 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 912 913 /* Matrix-Vector Product: */ 914 SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 915 SSE_SHUFFLE(XMM5, XMM5, 0x00) 916 SSE_MULT_PS(XMM5, XMM0) 917 SSE_SUB_PS(XMM4, XMM5) 918 919 SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 920 SSE_SHUFFLE(XMM6, XMM6, 0x00) 921 SSE_MULT_PS(XMM6, XMM1) 922 SSE_SUB_PS(XMM4, XMM6) 923 924 SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 925 SSE_SHUFFLE(XMM7, XMM7, 0x00) 926 SSE_MULT_PS(XMM7, XMM2) 927 SSE_SUB_PS(XMM4, XMM7) 928 929 SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 930 SSE_SHUFFLE(XMM5, XMM5, 0x00) 931 SSE_MULT_PS(XMM5, XMM3) 932 SSE_SUB_PS(XMM4, XMM5) 933 934 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 935 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 936 937 /* Second Column */ 938 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 939 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 940 941 /* Matrix-Vector Product: */ 942 SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 943 SSE_SHUFFLE(XMM6, XMM6, 0x00) 944 SSE_MULT_PS(XMM6, XMM0) 945 SSE_SUB_PS(XMM5, XMM6) 946 947 SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 948 SSE_SHUFFLE(XMM7, XMM7, 0x00) 949 SSE_MULT_PS(XMM7, XMM1) 950 SSE_SUB_PS(XMM5, XMM7) 951 952 SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 953 SSE_SHUFFLE(XMM4, XMM4, 0x00) 954 SSE_MULT_PS(XMM4, XMM2) 955 SSE_SUB_PS(XMM5, XMM4) 956 957 SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 958 SSE_SHUFFLE(XMM6, XMM6, 0x00) 959 SSE_MULT_PS(XMM6, XMM3) 960 SSE_SUB_PS(XMM5, XMM6) 961 962 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 963 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 964 965 SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 966 967 /* Third Column */ 968 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 969 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 970 971 /* Matrix-Vector Product: */ 972 SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 973 SSE_SHUFFLE(XMM7, XMM7, 0x00) 974 SSE_MULT_PS(XMM7, XMM0) 975 SSE_SUB_PS(XMM6, XMM7) 976 977 SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 978 SSE_SHUFFLE(XMM4, XMM4, 0x00) 979 SSE_MULT_PS(XMM4, XMM1) 980 SSE_SUB_PS(XMM6, XMM4) 981 982 SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 983 SSE_SHUFFLE(XMM5, XMM5, 0x00) 984 SSE_MULT_PS(XMM5, XMM2) 985 SSE_SUB_PS(XMM6, XMM5) 986 987 SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 988 SSE_SHUFFLE(XMM7, XMM7, 0x00) 989 SSE_MULT_PS(XMM7, XMM3) 990 SSE_SUB_PS(XMM6, XMM7) 991 992 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 993 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 994 995 /* Fourth Column */ 996 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 997 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 998 999 /* Matrix-Vector Product: */ 1000 SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 1001 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1002 SSE_MULT_PS(XMM5, XMM0) 1003 SSE_SUB_PS(XMM4, XMM5) 1004 1005 SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1006 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1007 SSE_MULT_PS(XMM6, XMM1) 1008 SSE_SUB_PS(XMM4, XMM6) 1009 1010 SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1011 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1012 SSE_MULT_PS(XMM7, XMM2) 1013 SSE_SUB_PS(XMM4, XMM7) 1014 1015 SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1016 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1017 SSE_MULT_PS(XMM5, XMM3) 1018 SSE_SUB_PS(XMM4, XMM5) 1019 1020 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1021 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1022 SSE_INLINE_END_2; 1023 pv += 16; 1024 } 1025 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1026 } 1027 row = *bjtmp++; 1028 /* row = (*bjtmp++)/4; */ 1029 } 1030 /* finished row so stick it into b->a */ 1031 pv = ba + 16 * bi[i]; 1032 pj = bj + bi[i]; 1033 nz = bi[i + 1] - bi[i]; 1034 1035 /* Copy x block back into pv block */ 1036 for (j = 0; j < nz; j++) { 1037 x = rtmp + 16 * pj[j]; 1038 /* x = rtmp+4*pj[j]; */ 1039 1040 SSE_INLINE_BEGIN_2(x, pv) 1041 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1042 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1043 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1044 1045 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1046 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1047 1048 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1049 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1050 1051 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1052 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1053 1054 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1055 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1056 1057 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1058 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1059 1060 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1061 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1062 1063 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1064 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1065 SSE_INLINE_END_2; 1066 pv += 16; 1067 } 1068 /* invert diagonal block */ 1069 w = ba + 16 * diag_offset[i]; 1070 if (pivotinblocks) { 1071 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 1072 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1073 } else { 1074 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1075 } 1076 /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */ 1077 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1078 } 1079 1080 PetscCall(PetscFree(rtmp)); 1081 1082 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1083 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1084 C->assembled = PETSC_TRUE; 1085 1086 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); 1087 /* Flop Count from inverting diagonal blocks */ 1088 SSE_SCOPE_END; 1089 PetscFunctionReturn(PETSC_SUCCESS); 1090 } 1091 1092 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 1093 { 1094 Mat A = C; 1095 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1096 int i, j, n = a->mbs; 1097 unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj; 1098 unsigned short *aj = (unsigned short *)(a->j), *ajtmp; 1099 unsigned int row; 1100 int nz, *bi = b->i; 1101 int *diag_offset = b->diag, *ai = a->i; 1102 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 1103 MatScalar *ba = b->a, *aa = a->a; 1104 int nonzero = 0; 1105 PetscBool pivotinblocks = b->pivotinblocks; 1106 PetscReal shift = info->shiftamount; 1107 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1108 1109 PetscFunctionBegin; 1110 allowzeropivot = PetscNot(A->erroriffailure); 1111 SSE_SCOPE_BEGIN; 1112 1113 PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 1114 PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 1115 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 1116 PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 1117 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1118 /* colscale = 4; */ 1119 /* } */ 1120 1121 for (i = 0; i < n; i++) { 1122 nz = bi[i + 1] - bi[i]; 1123 bjtmp = bj + bi[i]; 1124 /* zero out the 4x4 block accumulators */ 1125 /* zero out one register */ 1126 XOR_PS(XMM7, XMM7); 1127 for (j = 0; j < nz; j++) { 1128 x = rtmp + 16 * ((unsigned int)bjtmp[j]); 1129 /* x = rtmp+4*bjtmp[j]; */ 1130 SSE_INLINE_BEGIN_1(x) 1131 /* Copy zero register to memory locations */ 1132 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1133 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 1134 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 1135 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 1136 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 1137 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 1138 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 1139 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1140 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 1141 SSE_INLINE_END_1; 1142 } 1143 /* load in initial (unfactored row) */ 1144 nz = ai[i + 1] - ai[i]; 1145 ajtmp = aj + ai[i]; 1146 v = aa + 16 * ai[i]; 1147 for (j = 0; j < nz; j++) { 1148 x = rtmp + 16 * ((unsigned int)ajtmp[j]); 1149 /* x = rtmp+colscale*ajtmp[j]; */ 1150 /* Copy v block into x block */ 1151 SSE_INLINE_BEGIN_2(v, x) 1152 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1153 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1154 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 1155 1156 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 1157 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 1158 1159 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 1160 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 1161 1162 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 1163 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 1164 1165 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 1166 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 1167 1168 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 1169 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 1170 1171 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 1172 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 1173 1174 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1175 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1176 SSE_INLINE_END_2; 1177 1178 v += 16; 1179 } 1180 /* row = (*bjtmp++)/4; */ 1181 row = (unsigned int)(*bjtmp++); 1182 while (row < i) { 1183 pc = rtmp + 16 * row; 1184 SSE_INLINE_BEGIN_1(pc) 1185 /* Load block from lower triangle */ 1186 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1187 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1188 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 1189 1190 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 1191 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 1192 1193 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 1194 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 1195 1196 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 1197 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 1198 1199 /* Compare block to zero block */ 1200 1201 SSE_COPY_PS(XMM4, XMM7) 1202 SSE_CMPNEQ_PS(XMM4, XMM0) 1203 1204 SSE_COPY_PS(XMM5, XMM7) 1205 SSE_CMPNEQ_PS(XMM5, XMM1) 1206 1207 SSE_COPY_PS(XMM6, XMM7) 1208 SSE_CMPNEQ_PS(XMM6, XMM2) 1209 1210 SSE_CMPNEQ_PS(XMM7, XMM3) 1211 1212 /* Reduce the comparisons to one SSE register */ 1213 SSE_OR_PS(XMM6, XMM7) 1214 SSE_OR_PS(XMM5, XMM4) 1215 SSE_OR_PS(XMM5, XMM6) 1216 SSE_INLINE_END_1; 1217 1218 /* Reduce the one SSE register to an integer register for branching */ 1219 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1220 MOVEMASK(nonzero, XMM5); 1221 1222 /* If block is nonzero ... */ 1223 if (nonzero) { 1224 pv = ba + 16 * diag_offset[row]; 1225 PREFETCH_L1(&pv[16]); 1226 pj = bj + diag_offset[row] + 1; 1227 1228 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1229 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1230 /* but the diagonal was inverted already */ 1231 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1232 1233 SSE_INLINE_BEGIN_2(pv, pc) 1234 /* Column 0, product is accumulated in XMM4 */ 1235 SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 1236 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1237 SSE_MULT_PS(XMM4, XMM0) 1238 1239 SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 1240 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1241 SSE_MULT_PS(XMM5, XMM1) 1242 SSE_ADD_PS(XMM4, XMM5) 1243 1244 SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 1245 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1246 SSE_MULT_PS(XMM6, XMM2) 1247 SSE_ADD_PS(XMM4, XMM6) 1248 1249 SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 1250 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1251 SSE_MULT_PS(XMM7, XMM3) 1252 SSE_ADD_PS(XMM4, XMM7) 1253 1254 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 1255 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 1256 1257 /* Column 1, product is accumulated in XMM5 */ 1258 SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 1259 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1260 SSE_MULT_PS(XMM5, XMM0) 1261 1262 SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 1263 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1264 SSE_MULT_PS(XMM6, XMM1) 1265 SSE_ADD_PS(XMM5, XMM6) 1266 1267 SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 1268 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1269 SSE_MULT_PS(XMM7, XMM2) 1270 SSE_ADD_PS(XMM5, XMM7) 1271 1272 SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 1273 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1274 SSE_MULT_PS(XMM6, XMM3) 1275 SSE_ADD_PS(XMM5, XMM6) 1276 1277 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 1278 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 1279 1280 SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 1281 1282 /* Column 2, product is accumulated in XMM6 */ 1283 SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 1284 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1285 SSE_MULT_PS(XMM6, XMM0) 1286 1287 SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 1288 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1289 SSE_MULT_PS(XMM7, XMM1) 1290 SSE_ADD_PS(XMM6, XMM7) 1291 1292 SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 1293 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1294 SSE_MULT_PS(XMM7, XMM2) 1295 SSE_ADD_PS(XMM6, XMM7) 1296 1297 SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 1298 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1299 SSE_MULT_PS(XMM7, XMM3) 1300 SSE_ADD_PS(XMM6, XMM7) 1301 1302 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 1303 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1304 1305 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1306 /* Column 3, product is accumulated in XMM0 */ 1307 SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 1308 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1309 SSE_MULT_PS(XMM0, XMM7) 1310 1311 SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 1312 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1313 SSE_MULT_PS(XMM1, XMM7) 1314 SSE_ADD_PS(XMM0, XMM1) 1315 1316 SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 1317 SSE_SHUFFLE(XMM1, XMM1, 0x00) 1318 SSE_MULT_PS(XMM1, XMM2) 1319 SSE_ADD_PS(XMM0, XMM1) 1320 1321 SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 1322 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1323 SSE_MULT_PS(XMM3, XMM7) 1324 SSE_ADD_PS(XMM0, XMM3) 1325 1326 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 1327 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1328 1329 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1330 /* This is code to be maintained and read by humans after all. */ 1331 /* Copy Multiplier Col 3 into XMM3 */ 1332 SSE_COPY_PS(XMM3, XMM0) 1333 /* Copy Multiplier Col 2 into XMM2 */ 1334 SSE_COPY_PS(XMM2, XMM6) 1335 /* Copy Multiplier Col 1 into XMM1 */ 1336 SSE_COPY_PS(XMM1, XMM5) 1337 /* Copy Multiplier Col 0 into XMM0 */ 1338 SSE_COPY_PS(XMM0, XMM4) 1339 SSE_INLINE_END_2; 1340 1341 /* Update the row: */ 1342 nz = bi[row + 1] - diag_offset[row] - 1; 1343 pv += 16; 1344 for (j = 0; j < nz; j++) { 1345 PREFETCH_L1(&pv[16]); 1346 x = rtmp + 16 * ((unsigned int)pj[j]); 1347 /* x = rtmp + 4*pj[j]; */ 1348 1349 /* X:=X-M*PV, One column at a time */ 1350 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1351 SSE_INLINE_BEGIN_2(x, pv) 1352 /* Load First Column of X*/ 1353 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1354 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1355 1356 /* Matrix-Vector Product: */ 1357 SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 1358 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1359 SSE_MULT_PS(XMM5, XMM0) 1360 SSE_SUB_PS(XMM4, XMM5) 1361 1362 SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 1363 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1364 SSE_MULT_PS(XMM6, XMM1) 1365 SSE_SUB_PS(XMM4, XMM6) 1366 1367 SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 1368 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1369 SSE_MULT_PS(XMM7, XMM2) 1370 SSE_SUB_PS(XMM4, XMM7) 1371 1372 SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 1373 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1374 SSE_MULT_PS(XMM5, XMM3) 1375 SSE_SUB_PS(XMM4, XMM5) 1376 1377 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1378 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1379 1380 /* Second Column */ 1381 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1382 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1383 1384 /* Matrix-Vector Product: */ 1385 SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 1386 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1387 SSE_MULT_PS(XMM6, XMM0) 1388 SSE_SUB_PS(XMM5, XMM6) 1389 1390 SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 1391 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1392 SSE_MULT_PS(XMM7, XMM1) 1393 SSE_SUB_PS(XMM5, XMM7) 1394 1395 SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 1396 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1397 SSE_MULT_PS(XMM4, XMM2) 1398 SSE_SUB_PS(XMM5, XMM4) 1399 1400 SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 1401 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1402 SSE_MULT_PS(XMM6, XMM3) 1403 SSE_SUB_PS(XMM5, XMM6) 1404 1405 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1406 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1407 1408 SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 1409 1410 /* Third Column */ 1411 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1412 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1413 1414 /* Matrix-Vector Product: */ 1415 SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 1416 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1417 SSE_MULT_PS(XMM7, XMM0) 1418 SSE_SUB_PS(XMM6, XMM7) 1419 1420 SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 1421 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1422 SSE_MULT_PS(XMM4, XMM1) 1423 SSE_SUB_PS(XMM6, XMM4) 1424 1425 SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 1426 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1427 SSE_MULT_PS(XMM5, XMM2) 1428 SSE_SUB_PS(XMM6, XMM5) 1429 1430 SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 1431 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1432 SSE_MULT_PS(XMM7, XMM3) 1433 SSE_SUB_PS(XMM6, XMM7) 1434 1435 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1436 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1437 1438 /* Fourth Column */ 1439 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1440 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1441 1442 /* Matrix-Vector Product: */ 1443 SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 1444 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1445 SSE_MULT_PS(XMM5, XMM0) 1446 SSE_SUB_PS(XMM4, XMM5) 1447 1448 SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1449 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1450 SSE_MULT_PS(XMM6, XMM1) 1451 SSE_SUB_PS(XMM4, XMM6) 1452 1453 SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1454 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1455 SSE_MULT_PS(XMM7, XMM2) 1456 SSE_SUB_PS(XMM4, XMM7) 1457 1458 SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1459 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1460 SSE_MULT_PS(XMM5, XMM3) 1461 SSE_SUB_PS(XMM4, XMM5) 1462 1463 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1464 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1465 SSE_INLINE_END_2; 1466 pv += 16; 1467 } 1468 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1469 } 1470 row = (unsigned int)(*bjtmp++); 1471 /* row = (*bjtmp++)/4; */ 1472 /* bjtmp++; */ 1473 } 1474 /* finished row so stick it into b->a */ 1475 pv = ba + 16 * bi[i]; 1476 pj = bj + bi[i]; 1477 nz = bi[i + 1] - bi[i]; 1478 1479 /* Copy x block back into pv block */ 1480 for (j = 0; j < nz; j++) { 1481 x = rtmp + 16 * ((unsigned int)pj[j]); 1482 /* x = rtmp+4*pj[j]; */ 1483 1484 SSE_INLINE_BEGIN_2(x, pv) 1485 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1486 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1487 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1488 1489 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1490 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1491 1492 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1493 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1494 1495 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1496 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1497 1498 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1499 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1500 1501 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1502 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1503 1504 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1505 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1506 1507 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1508 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1509 SSE_INLINE_END_2; 1510 pv += 16; 1511 } 1512 /* invert diagonal block */ 1513 w = ba + 16 * diag_offset[i]; 1514 if (pivotinblocks) { 1515 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 1516 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1517 } else { 1518 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1519 } 1520 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1521 } 1522 1523 PetscCall(PetscFree(rtmp)); 1524 1525 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1526 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1527 C->assembled = PETSC_TRUE; 1528 1529 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); 1530 /* Flop Count from inverting diagonal blocks */ 1531 SSE_SCOPE_END; 1532 PetscFunctionReturn(PETSC_SUCCESS); 1533 } 1534 1535 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info) 1536 { 1537 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1538 int i, j, n = a->mbs; 1539 unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj; 1540 unsigned int row; 1541 int *ajtmpold, nz, *bi = b->i; 1542 int *diag_offset = b->diag, *ai = a->i, *aj = a->j; 1543 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 1544 MatScalar *ba = b->a, *aa = a->a; 1545 int nonzero = 0; 1546 PetscBool pivotinblocks = b->pivotinblocks; 1547 PetscReal shift = info->shiftamount; 1548 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1549 1550 PetscFunctionBegin; 1551 allowzeropivot = PetscNot(A->erroriffailure); 1552 SSE_SCOPE_BEGIN; 1553 1554 PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 1555 PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 1556 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 1557 PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 1558 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1559 /* colscale = 4; */ 1560 /* } */ 1561 if ((unsigned long)bj == (unsigned long)aj) return (MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1562 1563 for (i = 0; i < n; i++) { 1564 nz = bi[i + 1] - bi[i]; 1565 bjtmp = bj + bi[i]; 1566 /* zero out the 4x4 block accumulators */ 1567 /* zero out one register */ 1568 XOR_PS(XMM7, XMM7); 1569 for (j = 0; j < nz; j++) { 1570 x = rtmp + 16 * ((unsigned int)bjtmp[j]); 1571 /* x = rtmp+4*bjtmp[j]; */ 1572 SSE_INLINE_BEGIN_1(x) 1573 /* Copy zero register to memory locations */ 1574 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1575 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 1576 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 1577 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 1578 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 1579 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 1580 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 1581 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1582 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 1583 SSE_INLINE_END_1; 1584 } 1585 /* load in initial (unfactored row) */ 1586 nz = ai[i + 1] - ai[i]; 1587 ajtmpold = aj + ai[i]; 1588 v = aa + 16 * ai[i]; 1589 for (j = 0; j < nz; j++) { 1590 x = rtmp + 16 * ajtmpold[j]; 1591 /* x = rtmp+colscale*ajtmpold[j]; */ 1592 /* Copy v block into x block */ 1593 SSE_INLINE_BEGIN_2(v, x) 1594 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1595 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1596 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 1597 1598 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 1599 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 1600 1601 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 1602 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 1603 1604 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 1605 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 1606 1607 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 1608 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 1609 1610 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 1611 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 1612 1613 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 1614 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 1615 1616 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1617 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1618 SSE_INLINE_END_2; 1619 1620 v += 16; 1621 } 1622 /* row = (*bjtmp++)/4; */ 1623 row = (unsigned int)(*bjtmp++); 1624 while (row < i) { 1625 pc = rtmp + 16 * row; 1626 SSE_INLINE_BEGIN_1(pc) 1627 /* Load block from lower triangle */ 1628 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1629 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1630 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 1631 1632 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 1633 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 1634 1635 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 1636 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 1637 1638 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 1639 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 1640 1641 /* Compare block to zero block */ 1642 1643 SSE_COPY_PS(XMM4, XMM7) 1644 SSE_CMPNEQ_PS(XMM4, XMM0) 1645 1646 SSE_COPY_PS(XMM5, XMM7) 1647 SSE_CMPNEQ_PS(XMM5, XMM1) 1648 1649 SSE_COPY_PS(XMM6, XMM7) 1650 SSE_CMPNEQ_PS(XMM6, XMM2) 1651 1652 SSE_CMPNEQ_PS(XMM7, XMM3) 1653 1654 /* Reduce the comparisons to one SSE register */ 1655 SSE_OR_PS(XMM6, XMM7) 1656 SSE_OR_PS(XMM5, XMM4) 1657 SSE_OR_PS(XMM5, XMM6) 1658 SSE_INLINE_END_1; 1659 1660 /* Reduce the one SSE register to an integer register for branching */ 1661 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1662 MOVEMASK(nonzero, XMM5); 1663 1664 /* If block is nonzero ... */ 1665 if (nonzero) { 1666 pv = ba + 16 * diag_offset[row]; 1667 PREFETCH_L1(&pv[16]); 1668 pj = bj + diag_offset[row] + 1; 1669 1670 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1671 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1672 /* but the diagonal was inverted already */ 1673 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1674 1675 SSE_INLINE_BEGIN_2(pv, pc) 1676 /* Column 0, product is accumulated in XMM4 */ 1677 SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 1678 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1679 SSE_MULT_PS(XMM4, XMM0) 1680 1681 SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 1682 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1683 SSE_MULT_PS(XMM5, XMM1) 1684 SSE_ADD_PS(XMM4, XMM5) 1685 1686 SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 1687 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1688 SSE_MULT_PS(XMM6, XMM2) 1689 SSE_ADD_PS(XMM4, XMM6) 1690 1691 SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 1692 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1693 SSE_MULT_PS(XMM7, XMM3) 1694 SSE_ADD_PS(XMM4, XMM7) 1695 1696 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 1697 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 1698 1699 /* Column 1, product is accumulated in XMM5 */ 1700 SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 1701 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1702 SSE_MULT_PS(XMM5, XMM0) 1703 1704 SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 1705 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1706 SSE_MULT_PS(XMM6, XMM1) 1707 SSE_ADD_PS(XMM5, XMM6) 1708 1709 SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 1710 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1711 SSE_MULT_PS(XMM7, XMM2) 1712 SSE_ADD_PS(XMM5, XMM7) 1713 1714 SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 1715 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1716 SSE_MULT_PS(XMM6, XMM3) 1717 SSE_ADD_PS(XMM5, XMM6) 1718 1719 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 1720 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 1721 1722 SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 1723 1724 /* Column 2, product is accumulated in XMM6 */ 1725 SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 1726 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1727 SSE_MULT_PS(XMM6, XMM0) 1728 1729 SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 1730 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1731 SSE_MULT_PS(XMM7, XMM1) 1732 SSE_ADD_PS(XMM6, XMM7) 1733 1734 SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 1735 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1736 SSE_MULT_PS(XMM7, XMM2) 1737 SSE_ADD_PS(XMM6, XMM7) 1738 1739 SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 1740 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1741 SSE_MULT_PS(XMM7, XMM3) 1742 SSE_ADD_PS(XMM6, XMM7) 1743 1744 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 1745 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1746 1747 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1748 /* Column 3, product is accumulated in XMM0 */ 1749 SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 1750 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1751 SSE_MULT_PS(XMM0, XMM7) 1752 1753 SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 1754 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1755 SSE_MULT_PS(XMM1, XMM7) 1756 SSE_ADD_PS(XMM0, XMM1) 1757 1758 SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 1759 SSE_SHUFFLE(XMM1, XMM1, 0x00) 1760 SSE_MULT_PS(XMM1, XMM2) 1761 SSE_ADD_PS(XMM0, XMM1) 1762 1763 SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 1764 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1765 SSE_MULT_PS(XMM3, XMM7) 1766 SSE_ADD_PS(XMM0, XMM3) 1767 1768 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 1769 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1770 1771 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1772 /* This is code to be maintained and read by humans after all. */ 1773 /* Copy Multiplier Col 3 into XMM3 */ 1774 SSE_COPY_PS(XMM3, XMM0) 1775 /* Copy Multiplier Col 2 into XMM2 */ 1776 SSE_COPY_PS(XMM2, XMM6) 1777 /* Copy Multiplier Col 1 into XMM1 */ 1778 SSE_COPY_PS(XMM1, XMM5) 1779 /* Copy Multiplier Col 0 into XMM0 */ 1780 SSE_COPY_PS(XMM0, XMM4) 1781 SSE_INLINE_END_2; 1782 1783 /* Update the row: */ 1784 nz = bi[row + 1] - diag_offset[row] - 1; 1785 pv += 16; 1786 for (j = 0; j < nz; j++) { 1787 PREFETCH_L1(&pv[16]); 1788 x = rtmp + 16 * ((unsigned int)pj[j]); 1789 /* x = rtmp + 4*pj[j]; */ 1790 1791 /* X:=X-M*PV, One column at a time */ 1792 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1793 SSE_INLINE_BEGIN_2(x, pv) 1794 /* Load First Column of X*/ 1795 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1796 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1797 1798 /* Matrix-Vector Product: */ 1799 SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 1800 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1801 SSE_MULT_PS(XMM5, XMM0) 1802 SSE_SUB_PS(XMM4, XMM5) 1803 1804 SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 1805 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1806 SSE_MULT_PS(XMM6, XMM1) 1807 SSE_SUB_PS(XMM4, XMM6) 1808 1809 SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 1810 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1811 SSE_MULT_PS(XMM7, XMM2) 1812 SSE_SUB_PS(XMM4, XMM7) 1813 1814 SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 1815 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1816 SSE_MULT_PS(XMM5, XMM3) 1817 SSE_SUB_PS(XMM4, XMM5) 1818 1819 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1820 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1821 1822 /* Second Column */ 1823 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1824 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1825 1826 /* Matrix-Vector Product: */ 1827 SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 1828 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1829 SSE_MULT_PS(XMM6, XMM0) 1830 SSE_SUB_PS(XMM5, XMM6) 1831 1832 SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 1833 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1834 SSE_MULT_PS(XMM7, XMM1) 1835 SSE_SUB_PS(XMM5, XMM7) 1836 1837 SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 1838 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1839 SSE_MULT_PS(XMM4, XMM2) 1840 SSE_SUB_PS(XMM5, XMM4) 1841 1842 SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 1843 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1844 SSE_MULT_PS(XMM6, XMM3) 1845 SSE_SUB_PS(XMM5, XMM6) 1846 1847 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1848 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1849 1850 SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 1851 1852 /* Third Column */ 1853 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1854 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1855 1856 /* Matrix-Vector Product: */ 1857 SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 1858 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1859 SSE_MULT_PS(XMM7, XMM0) 1860 SSE_SUB_PS(XMM6, XMM7) 1861 1862 SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 1863 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1864 SSE_MULT_PS(XMM4, XMM1) 1865 SSE_SUB_PS(XMM6, XMM4) 1866 1867 SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 1868 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1869 SSE_MULT_PS(XMM5, XMM2) 1870 SSE_SUB_PS(XMM6, XMM5) 1871 1872 SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 1873 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1874 SSE_MULT_PS(XMM7, XMM3) 1875 SSE_SUB_PS(XMM6, XMM7) 1876 1877 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1878 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1879 1880 /* Fourth Column */ 1881 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1882 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1883 1884 /* Matrix-Vector Product: */ 1885 SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 1886 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1887 SSE_MULT_PS(XMM5, XMM0) 1888 SSE_SUB_PS(XMM4, XMM5) 1889 1890 SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1891 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1892 SSE_MULT_PS(XMM6, XMM1) 1893 SSE_SUB_PS(XMM4, XMM6) 1894 1895 SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1896 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1897 SSE_MULT_PS(XMM7, XMM2) 1898 SSE_SUB_PS(XMM4, XMM7) 1899 1900 SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1901 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1902 SSE_MULT_PS(XMM5, XMM3) 1903 SSE_SUB_PS(XMM4, XMM5) 1904 1905 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1906 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1907 SSE_INLINE_END_2; 1908 pv += 16; 1909 } 1910 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1911 } 1912 row = (unsigned int)(*bjtmp++); 1913 /* row = (*bjtmp++)/4; */ 1914 /* bjtmp++; */ 1915 } 1916 /* finished row so stick it into b->a */ 1917 pv = ba + 16 * bi[i]; 1918 pj = bj + bi[i]; 1919 nz = bi[i + 1] - bi[i]; 1920 1921 /* Copy x block back into pv block */ 1922 for (j = 0; j < nz; j++) { 1923 x = rtmp + 16 * ((unsigned int)pj[j]); 1924 /* x = rtmp+4*pj[j]; */ 1925 1926 SSE_INLINE_BEGIN_2(x, pv) 1927 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1928 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1929 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1930 1931 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1932 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1933 1934 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1935 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1936 1937 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1938 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1939 1940 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1941 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1942 1943 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1944 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1945 1946 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1947 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1948 1949 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1950 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1951 SSE_INLINE_END_2; 1952 pv += 16; 1953 } 1954 /* invert diagonal block */ 1955 w = ba + 16 * diag_offset[i]; 1956 if (pivotinblocks) { 1957 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, , &zeropivotdetected)); 1958 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1959 } else { 1960 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1961 } 1962 /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */ 1963 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1964 } 1965 1966 PetscCall(PetscFree(rtmp)); 1967 1968 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1969 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1970 C->assembled = PETSC_TRUE; 1971 1972 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); 1973 /* Flop Count from inverting diagonal blocks */ 1974 SSE_SCOPE_END; 1975 PetscFunctionReturn(PETSC_SUCCESS); 1976 } 1977 1978 #endif 1979