1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 /* 5 Special case where the matrix was ILU(0) factored in the natural 6 ordering. This eliminates the need for the column and row permutation. 7 */ 8 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 9 { 10 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 11 PetscInt n =a->mbs; 12 const PetscInt *ai=a->i,*aj=a->j; 13 const PetscInt *diag = a->diag; 14 const MatScalar *aa =a->a; 15 PetscScalar *x; 16 const PetscScalar *b; 17 18 PetscFunctionBegin; 19 PetscCall(VecGetArrayRead(bb,&b)); 20 PetscCall(VecGetArray(xx,&x)); 21 22 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 23 { 24 static PetscScalar w[2000]; /* very BAD need to fix */ 25 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 26 } 27 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 28 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 29 #else 30 { 31 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 32 const MatScalar *v; 33 PetscInt jdx,idt,idx,nz,i,ai16; 34 const PetscInt *vi; 35 36 /* forward solve the lower triangular */ 37 idx = 0; 38 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 39 for (i=1; i<n; i++) { 40 v = aa + 16*ai[i]; 41 vi = aj + ai[i]; 42 nz = diag[i] - ai[i]; 43 idx += 4; 44 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 45 while (nz--) { 46 jdx = 4*(*vi++); 47 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 48 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 49 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 50 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 51 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 52 v += 16; 53 } 54 x[idx] = s1; 55 x[1+idx] = s2; 56 x[2+idx] = s3; 57 x[3+idx] = s4; 58 } 59 /* backward solve the upper triangular */ 60 idt = 4*(n-1); 61 for (i=n-1; i>=0; i--) { 62 ai16 = 16*diag[i]; 63 v = aa + ai16 + 16; 64 vi = aj + diag[i] + 1; 65 nz = ai[i+1] - diag[i] - 1; 66 s1 = x[idt]; s2 = x[1+idt]; 67 s3 = x[2+idt];s4 = x[3+idt]; 68 while (nz--) { 69 idx = 4*(*vi++); 70 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 71 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 72 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 73 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 74 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 75 v += 16; 76 } 77 v = aa + ai16; 78 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 79 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 80 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 81 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 82 idt -= 4; 83 } 84 } 85 #endif 86 87 PetscCall(VecRestoreArrayRead(bb,&b)); 88 PetscCall(VecRestoreArray(xx,&x)); 89 PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 90 PetscFunctionReturn(0); 91 } 92 93 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 94 { 95 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 96 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 97 PetscInt i,k,nz,idx,jdx,idt; 98 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 99 const MatScalar *aa=a->a,*v; 100 PetscScalar *x; 101 const PetscScalar *b; 102 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 103 104 PetscFunctionBegin; 105 PetscCall(VecGetArrayRead(bb,&b)); 106 PetscCall(VecGetArray(xx,&x)); 107 /* forward solve the lower triangular */ 108 idx = 0; 109 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 110 for (i=1; i<n; i++) { 111 v = aa + bs2*ai[i]; 112 vi = aj + ai[i]; 113 nz = ai[i+1] - ai[i]; 114 idx = bs*i; 115 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 116 for (k=0; k<nz; k++) { 117 jdx = bs*vi[k]; 118 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 119 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 120 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 121 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 122 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 123 124 v += bs2; 125 } 126 127 x[idx] = s1; 128 x[1+idx] = s2; 129 x[2+idx] = s3; 130 x[3+idx] = s4; 131 } 132 133 /* backward solve the upper triangular */ 134 for (i=n-1; i>=0; i--) { 135 v = aa + bs2*(adiag[i+1]+1); 136 vi = aj + adiag[i+1]+1; 137 nz = adiag[i] - adiag[i+1]-1; 138 idt = bs*i; 139 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 140 141 for (k=0; k<nz; k++) { 142 idx = bs*vi[k]; 143 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 144 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 145 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 146 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 147 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 148 149 v += bs2; 150 } 151 /* x = inv_diagonal*x */ 152 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 153 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 154 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 155 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 156 157 } 158 159 PetscCall(VecRestoreArrayRead(bb,&b)); 160 PetscCall(VecRestoreArray(xx,&x)); 161 PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n)); 162 PetscFunctionReturn(0); 163 } 164 165 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 166 { 167 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 168 const PetscInt n =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 169 const MatScalar *aa=a->a; 170 const PetscScalar *b; 171 PetscScalar *x; 172 173 PetscFunctionBegin; 174 PetscCall(VecGetArrayRead(bb,&b)); 175 PetscCall(VecGetArray(xx,&x)); 176 177 { 178 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 179 const MatScalar *v; 180 MatScalar *t=(MatScalar*)x; 181 PetscInt jdx,idt,idx,nz,i,ai16; 182 const PetscInt *vi; 183 184 /* forward solve the lower triangular */ 185 idx = 0; 186 t[0] = (MatScalar)b[0]; 187 t[1] = (MatScalar)b[1]; 188 t[2] = (MatScalar)b[2]; 189 t[3] = (MatScalar)b[3]; 190 for (i=1; i<n; i++) { 191 v = aa + 16*ai[i]; 192 vi = aj + ai[i]; 193 nz = diag[i] - ai[i]; 194 idx += 4; 195 s1 = (MatScalar)b[idx]; 196 s2 = (MatScalar)b[1+idx]; 197 s3 = (MatScalar)b[2+idx]; 198 s4 = (MatScalar)b[3+idx]; 199 while (nz--) { 200 jdx = 4*(*vi++); 201 x1 = t[jdx]; 202 x2 = t[1+jdx]; 203 x3 = t[2+jdx]; 204 x4 = t[3+jdx]; 205 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 206 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 207 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 208 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 209 v += 16; 210 } 211 t[idx] = s1; 212 t[1+idx] = s2; 213 t[2+idx] = s3; 214 t[3+idx] = s4; 215 } 216 /* backward solve the upper triangular */ 217 idt = 4*(n-1); 218 for (i=n-1; i>=0; i--) { 219 ai16 = 16*diag[i]; 220 v = aa + ai16 + 16; 221 vi = aj + diag[i] + 1; 222 nz = ai[i+1] - diag[i] - 1; 223 s1 = t[idt]; 224 s2 = t[1+idt]; 225 s3 = t[2+idt]; 226 s4 = t[3+idt]; 227 while (nz--) { 228 idx = 4*(*vi++); 229 x1 = (MatScalar)x[idx]; 230 x2 = (MatScalar)x[1+idx]; 231 x3 = (MatScalar)x[2+idx]; 232 x4 = (MatScalar)x[3+idx]; 233 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 234 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 235 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 236 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 237 v += 16; 238 } 239 v = aa + ai16; 240 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 241 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 242 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 243 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 244 idt -= 4; 245 } 246 } 247 248 PetscCall(VecRestoreArrayRead(bb,&b)); 249 PetscCall(VecRestoreArray(xx,&x)); 250 PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 251 PetscFunctionReturn(0); 252 } 253 254 #if defined(PETSC_HAVE_SSE) 255 256 #include PETSC_HAVE_SSE 257 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 258 { 259 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 260 unsigned short *aj=(unsigned short*)a->j; 261 PetscErrorCode ierr; 262 int *ai=a->i,n=a->mbs,*diag = a->diag; 263 MatScalar *aa=a->a; 264 PetscScalar *x,*b; 265 266 PetscFunctionBegin; 267 SSE_SCOPE_BEGIN; 268 /* 269 Note: This code currently uses demotion of double 270 to float when performing the mixed-mode computation. 271 This may not be numerically reasonable for all applications. 272 */ 273 PREFETCH_NTA(aa+16*ai[1]); 274 275 PetscCall(VecGetArray(bb,&b)); 276 PetscCall(VecGetArray(xx,&x)); 277 { 278 /* x will first be computed in single precision then promoted inplace to double */ 279 MatScalar *v,*t=(MatScalar*)x; 280 int nz,i,idt,ai16; 281 unsigned int jdx,idx; 282 unsigned short *vi; 283 /* Forward solve the lower triangular factor. */ 284 285 /* First block is the identity. */ 286 idx = 0; 287 CONVERT_DOUBLE4_FLOAT4(t,b); 288 v = aa + 16*((unsigned int)ai[1]); 289 290 for (i=1; i<n;) { 291 PREFETCH_NTA(&v[8]); 292 vi = aj + ai[i]; 293 nz = diag[i] - ai[i]; 294 idx += 4; 295 296 /* Demote RHS from double to float. */ 297 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 298 LOAD_PS(&t[idx],XMM7); 299 300 while (nz--) { 301 PREFETCH_NTA(&v[16]); 302 jdx = 4*((unsigned int)(*vi++)); 303 304 /* 4x4 Matrix-Vector product with negative accumulation: */ 305 SSE_INLINE_BEGIN_2(&t[jdx],v) 306 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 307 308 /* First Column */ 309 SSE_COPY_PS(XMM0,XMM6) 310 SSE_SHUFFLE(XMM0,XMM0,0x00) 311 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 312 SSE_SUB_PS(XMM7,XMM0) 313 314 /* Second Column */ 315 SSE_COPY_PS(XMM1,XMM6) 316 SSE_SHUFFLE(XMM1,XMM1,0x55) 317 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 318 SSE_SUB_PS(XMM7,XMM1) 319 320 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 321 322 /* Third Column */ 323 SSE_COPY_PS(XMM2,XMM6) 324 SSE_SHUFFLE(XMM2,XMM2,0xAA) 325 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 326 SSE_SUB_PS(XMM7,XMM2) 327 328 /* Fourth Column */ 329 SSE_COPY_PS(XMM3,XMM6) 330 SSE_SHUFFLE(XMM3,XMM3,0xFF) 331 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 332 SSE_SUB_PS(XMM7,XMM3) 333 SSE_INLINE_END_2 334 335 v += 16; 336 } 337 v = aa + 16*ai[++i]; 338 PREFETCH_NTA(v); 339 STORE_PS(&t[idx],XMM7); 340 } 341 342 /* Backward solve the upper triangular factor.*/ 343 344 idt = 4*(n-1); 345 ai16 = 16*diag[n-1]; 346 v = aa + ai16 + 16; 347 for (i=n-1; i>=0;) { 348 PREFETCH_NTA(&v[8]); 349 vi = aj + diag[i] + 1; 350 nz = ai[i+1] - diag[i] - 1; 351 352 LOAD_PS(&t[idt],XMM7); 353 354 while (nz--) { 355 PREFETCH_NTA(&v[16]); 356 idx = 4*((unsigned int)(*vi++)); 357 358 /* 4x4 Matrix-Vector Product with negative accumulation: */ 359 SSE_INLINE_BEGIN_2(&t[idx],v) 360 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 361 362 /* First Column */ 363 SSE_COPY_PS(XMM0,XMM6) 364 SSE_SHUFFLE(XMM0,XMM0,0x00) 365 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 366 SSE_SUB_PS(XMM7,XMM0) 367 368 /* Second Column */ 369 SSE_COPY_PS(XMM1,XMM6) 370 SSE_SHUFFLE(XMM1,XMM1,0x55) 371 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 372 SSE_SUB_PS(XMM7,XMM1) 373 374 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 375 376 /* Third Column */ 377 SSE_COPY_PS(XMM2,XMM6) 378 SSE_SHUFFLE(XMM2,XMM2,0xAA) 379 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 380 SSE_SUB_PS(XMM7,XMM2) 381 382 /* Fourth Column */ 383 SSE_COPY_PS(XMM3,XMM6) 384 SSE_SHUFFLE(XMM3,XMM3,0xFF) 385 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 386 SSE_SUB_PS(XMM7,XMM3) 387 SSE_INLINE_END_2 388 v += 16; 389 } 390 v = aa + ai16; 391 ai16 = 16*diag[--i]; 392 PREFETCH_NTA(aa+ai16+16); 393 /* 394 Scale the result by the diagonal 4x4 block, 395 which was inverted as part of the factorization 396 */ 397 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 398 /* First Column */ 399 SSE_COPY_PS(XMM0,XMM7) 400 SSE_SHUFFLE(XMM0,XMM0,0x00) 401 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 402 403 /* Second Column */ 404 SSE_COPY_PS(XMM1,XMM7) 405 SSE_SHUFFLE(XMM1,XMM1,0x55) 406 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 407 SSE_ADD_PS(XMM0,XMM1) 408 409 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 410 411 /* Third Column */ 412 SSE_COPY_PS(XMM2,XMM7) 413 SSE_SHUFFLE(XMM2,XMM2,0xAA) 414 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 415 SSE_ADD_PS(XMM0,XMM2) 416 417 /* Fourth Column */ 418 SSE_COPY_PS(XMM3,XMM7) 419 SSE_SHUFFLE(XMM3,XMM3,0xFF) 420 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 421 SSE_ADD_PS(XMM0,XMM3) 422 423 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 424 SSE_INLINE_END_3 425 426 v = aa + ai16 + 16; 427 idt -= 4; 428 } 429 430 /* Convert t from single precision back to double precision (inplace)*/ 431 idt = 4*(n-1); 432 for (i=n-1; i>=0; i--) { 433 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 434 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 435 PetscScalar *xtemp=&x[idt]; 436 MatScalar *ttemp=&t[idt]; 437 xtemp[3] = (PetscScalar)ttemp[3]; 438 xtemp[2] = (PetscScalar)ttemp[2]; 439 xtemp[1] = (PetscScalar)ttemp[1]; 440 xtemp[0] = (PetscScalar)ttemp[0]; 441 idt -= 4; 442 } 443 444 } /* End of artificial scope. */ 445 PetscCall(VecRestoreArray(bb,&b)); 446 PetscCall(VecRestoreArray(xx,&x)); 447 PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 448 SSE_SCOPE_END; 449 PetscFunctionReturn(0); 450 } 451 452 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 453 { 454 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 455 int *aj=a->j; 456 PetscErrorCode ierr; 457 int *ai=a->i,n=a->mbs,*diag = a->diag; 458 MatScalar *aa=a->a; 459 PetscScalar *x,*b; 460 461 PetscFunctionBegin; 462 SSE_SCOPE_BEGIN; 463 /* 464 Note: This code currently uses demotion of double 465 to float when performing the mixed-mode computation. 466 This may not be numerically reasonable for all applications. 467 */ 468 PREFETCH_NTA(aa+16*ai[1]); 469 470 PetscCall(VecGetArray(bb,&b)); 471 PetscCall(VecGetArray(xx,&x)); 472 { 473 /* x will first be computed in single precision then promoted inplace to double */ 474 MatScalar *v,*t=(MatScalar*)x; 475 int nz,i,idt,ai16; 476 int jdx,idx; 477 int *vi; 478 /* Forward solve the lower triangular factor. */ 479 480 /* First block is the identity. */ 481 idx = 0; 482 CONVERT_DOUBLE4_FLOAT4(t,b); 483 v = aa + 16*ai[1]; 484 485 for (i=1; i<n;) { 486 PREFETCH_NTA(&v[8]); 487 vi = aj + ai[i]; 488 nz = diag[i] - ai[i]; 489 idx += 4; 490 491 /* Demote RHS from double to float. */ 492 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 493 LOAD_PS(&t[idx],XMM7); 494 495 while (nz--) { 496 PREFETCH_NTA(&v[16]); 497 jdx = 4*(*vi++); 498 /* jdx = *vi++; */ 499 500 /* 4x4 Matrix-Vector product with negative accumulation: */ 501 SSE_INLINE_BEGIN_2(&t[jdx],v) 502 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 503 504 /* First Column */ 505 SSE_COPY_PS(XMM0,XMM6) 506 SSE_SHUFFLE(XMM0,XMM0,0x00) 507 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 508 SSE_SUB_PS(XMM7,XMM0) 509 510 /* Second Column */ 511 SSE_COPY_PS(XMM1,XMM6) 512 SSE_SHUFFLE(XMM1,XMM1,0x55) 513 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 514 SSE_SUB_PS(XMM7,XMM1) 515 516 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 517 518 /* Third Column */ 519 SSE_COPY_PS(XMM2,XMM6) 520 SSE_SHUFFLE(XMM2,XMM2,0xAA) 521 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 522 SSE_SUB_PS(XMM7,XMM2) 523 524 /* Fourth Column */ 525 SSE_COPY_PS(XMM3,XMM6) 526 SSE_SHUFFLE(XMM3,XMM3,0xFF) 527 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 528 SSE_SUB_PS(XMM7,XMM3) 529 SSE_INLINE_END_2 530 531 v += 16; 532 } 533 v = aa + 16*ai[++i]; 534 PREFETCH_NTA(v); 535 STORE_PS(&t[idx],XMM7); 536 } 537 538 /* Backward solve the upper triangular factor.*/ 539 540 idt = 4*(n-1); 541 ai16 = 16*diag[n-1]; 542 v = aa + ai16 + 16; 543 for (i=n-1; i>=0;) { 544 PREFETCH_NTA(&v[8]); 545 vi = aj + diag[i] + 1; 546 nz = ai[i+1] - diag[i] - 1; 547 548 LOAD_PS(&t[idt],XMM7); 549 550 while (nz--) { 551 PREFETCH_NTA(&v[16]); 552 idx = 4*(*vi++); 553 /* idx = *vi++; */ 554 555 /* 4x4 Matrix-Vector Product with negative accumulation: */ 556 SSE_INLINE_BEGIN_2(&t[idx],v) 557 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 558 559 /* First Column */ 560 SSE_COPY_PS(XMM0,XMM6) 561 SSE_SHUFFLE(XMM0,XMM0,0x00) 562 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 563 SSE_SUB_PS(XMM7,XMM0) 564 565 /* Second Column */ 566 SSE_COPY_PS(XMM1,XMM6) 567 SSE_SHUFFLE(XMM1,XMM1,0x55) 568 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 569 SSE_SUB_PS(XMM7,XMM1) 570 571 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 572 573 /* Third Column */ 574 SSE_COPY_PS(XMM2,XMM6) 575 SSE_SHUFFLE(XMM2,XMM2,0xAA) 576 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 577 SSE_SUB_PS(XMM7,XMM2) 578 579 /* Fourth Column */ 580 SSE_COPY_PS(XMM3,XMM6) 581 SSE_SHUFFLE(XMM3,XMM3,0xFF) 582 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 583 SSE_SUB_PS(XMM7,XMM3) 584 SSE_INLINE_END_2 585 v += 16; 586 } 587 v = aa + ai16; 588 ai16 = 16*diag[--i]; 589 PREFETCH_NTA(aa+ai16+16); 590 /* 591 Scale the result by the diagonal 4x4 block, 592 which was inverted as part of the factorization 593 */ 594 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 595 /* First Column */ 596 SSE_COPY_PS(XMM0,XMM7) 597 SSE_SHUFFLE(XMM0,XMM0,0x00) 598 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 599 600 /* Second Column */ 601 SSE_COPY_PS(XMM1,XMM7) 602 SSE_SHUFFLE(XMM1,XMM1,0x55) 603 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 604 SSE_ADD_PS(XMM0,XMM1) 605 606 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 607 608 /* Third Column */ 609 SSE_COPY_PS(XMM2,XMM7) 610 SSE_SHUFFLE(XMM2,XMM2,0xAA) 611 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 612 SSE_ADD_PS(XMM0,XMM2) 613 614 /* Fourth Column */ 615 SSE_COPY_PS(XMM3,XMM7) 616 SSE_SHUFFLE(XMM3,XMM3,0xFF) 617 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 618 SSE_ADD_PS(XMM0,XMM3) 619 620 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 621 SSE_INLINE_END_3 622 623 v = aa + ai16 + 16; 624 idt -= 4; 625 } 626 627 /* Convert t from single precision back to double precision (inplace)*/ 628 idt = 4*(n-1); 629 for (i=n-1; i>=0; i--) { 630 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 631 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 632 PetscScalar *xtemp=&x[idt]; 633 MatScalar *ttemp=&t[idt]; 634 xtemp[3] = (PetscScalar)ttemp[3]; 635 xtemp[2] = (PetscScalar)ttemp[2]; 636 xtemp[1] = (PetscScalar)ttemp[1]; 637 xtemp[0] = (PetscScalar)ttemp[0]; 638 idt -= 4; 639 } 640 641 } /* End of artificial scope. */ 642 PetscCall(VecRestoreArray(bb,&b)); 643 PetscCall(VecRestoreArray(xx,&x)); 644 PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 645 SSE_SCOPE_END; 646 PetscFunctionReturn(0); 647 } 648 649 #endif 650