#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/*
  Special case where the matrix was ILU(0) factored in the natural
  ordering. This eliminates the need for the column and row permutation.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt           n = a->mbs;
  const PetscInt    *ai = a->i, *aj = a->j;
  const PetscInt    *diag = a->diag;
  const MatScalar   *aa = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
#else
  {
    PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    x[0] = b[0];
    x[1] = b[1];
    x[2] = b[2];
    x[3] = b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;
      s1 = b[idx];
      s2 = b[1 + idx];
      s3 = b[2 + idx];
      s4 = b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = x[jdx];
        x2  = x[1 + jdx];
        x3  = x[2 + jdx];
        x4  = x[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      x[idx]     = s1;
      x[1 + idx] = s2;
      x[2 + idx] = s3;
      x[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;
      s1   = x[idt];
      s2   = x[1 + idt];
      s3   = x[2 + idt];
      s4   = x[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = x[idx];
        x2  = x[1 + idx];
        x3  = x[2 + idx];
        x4  = x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      v          = aa + ai16;
      x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
      x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
      x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
      x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
      idt -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(0);
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
  PetscInt           i, k, nz, idx, jdx, idt;
  const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
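  /*
    Each nonzero of the factor is a dense 4x4 block stored column-major, so entry
    (r,c) of a block v is v[r + 4 * c].  The update s -= B * xb for a block B and
    a 4-vector xb therefore unrolls to

       s_r -= v[r] * xb_0 + v[r + 4] * xb_1 + v[r + 8] * xb_2 + v[r + 12] * xb_3,  r = 0,...,3,

    which is the pattern repeated throughout the forward and backward sweeps below.
  */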
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx];
  x[1] = b[1 + idx];
  x[2] = b[2 + idx];
  x[3] = b[3 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + bs2 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i];
    idx = bs * i;
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    for (k = 0; k < nz; k++) {
      jdx = bs * vi[k];
      x1  = x[jdx];
      x2  = x[1 + jdx];
      x3  = x[2 + jdx];
      x4  = x[3 + jdx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

      v += bs2;
    }

    x[idx]     = s1;
    x[1 + idx] = s2;
    x[2 + idx] = s3;
    x[3 + idx] = s4;
  }

  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + bs2 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = bs * i;
    s1  = x[idt];
    s2  = x[1 + idt];
    s3  = x[2 + idt];
    s4  = x[3 + idt];

    for (k = 0; k < nz; k++) {
      idx = bs * vi[k];
      x1  = x[idx];
      x2  = x[1 + idx];
      x3  = x[2 + idx];
      x4  = x[3 + idx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

      v += bs2;
    }
    /* x = inv_diagonal*x */
    x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
    x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
    x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
    x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
  PetscFunctionReturn(0);
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *diag = a->diag;
  const MatScalar   *aa = a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

  {
    /* The forward sweep accumulates the solution in MatScalar (single) precision in t,
       which aliases the storage of x; the backward sweep then writes the final values
       into x in full PetscScalar precision. */
    MatScalar        s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    MatScalar       *t = (MatScalar *)x;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;
      s1 = (MatScalar)b[idx];
      s2 = (MatScalar)b[1 + idx];
      s3 = (MatScalar)b[2 + idx];
      s4 = (MatScalar)b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = t[jdx];
        x2  = t[1 + jdx];
        x3  = t[2 + jdx];
        x4  = t[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      t[idx]     = s1;
      t[1 + idx] = s2;
      t[2 + idx] = s3;
      t[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;
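      /* v now points just past the diagonal block of block-row i; the strictly upper
         off-diagonal blocks are eliminated first, and the inverse of the diagonal
         block (stored at aa + ai16 by the factorization) is applied afterwards. */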
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;
      s1   = t[idt];
      s2   = t[1 + idt];
      s3   = t[2 + idt];
      s4   = t[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1 + idx];
        x3  = (MatScalar)x[2 + idx];
        x4  = (MatScalar)x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      v          = aa + ai16;
      x[idt]     = (PetscScalar)(v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4);
      x[1 + idt] = (PetscScalar)(v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4);
      x[2 + idt] = (PetscScalar)(v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4);
      x[3 + idt] = (PetscScalar)(v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4);
      idt -= 4;
    }
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
  unsigned short *aj = (unsigned short *)a->j;
  int            *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar      *aa = a->a;
  PetscScalar    *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v, *t = (MatScalar *)x;
    int             nz, i, idt, ai16;
    unsigned int    jdx, idx;
    unsigned short *vi;

    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ((unsigned int)ai[1]);

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;
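      /* Register scheme for the 4x4 block kernels below: XMM7 holds the running
         4-vector of the current block-row, XMM6 holds the gathered x block, and each
         component of XMM6 is broadcast (SSE_SHUFFLE) and multiplied against one
         column of the column-major 4x4 block before being subtracted from XMM7. */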
      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
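      /* Promote from the last block backwards, and within a block from component 3
         down to 0, so the wider double writes never overwrite float entries of t
         that have not yet been read. */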
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
  int         *aj = a->j;
  int         *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar   *aa = a->a;
  PetscScalar *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v, *t = (MatScalar *)x;
    int        nz, i, idt, ai16;
    int        jdx, idx;
    int       *vi;

    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ai[1];

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * (*vi++);
        /* jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * (*vi++);
        /* idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
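
/*
  Usage sketch (illustrative only; not part of this file's compiled code).
  These kernels are used for the triangular solves of a block-size-4 SeqBAIJ
  matrix that was ILU(0) factored in the natural ordering.  One typical way to
  reach them from user code, assuming an assembled matrix A, vectors b and x,
  and a block count nb (placeholder names), is:

      Mat A;
      KSP ksp;
      PC  pc;
      PetscCall(MatCreateSeqBAIJ(PETSC_COMM_SELF, 4, 4 * nb, 4 * nb, 5, NULL, &A));
      ... assemble A ...
      PetscCall(KSPCreate(PETSC_COMM_SELF, &ksp));
      PetscCall(KSPSetOperators(ksp, A, A));
      PetscCall(KSPGetPC(ksp, &pc));
      PetscCall(PCSetType(pc, PCILU));
      PetscCall(PCFactorSetMatOrderingType(pc, MATORDERINGNATURAL));
      PetscCall(KSPSolve(ksp, b, x));
*/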