/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/*
   Version for when blocks are 4 by 4
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS              isrow = b->row, isicol = b->icol;
  const PetscInt *r, *ic;
  PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
  PetscInt       *ajtmpold, *ajtmp, nz, row;
  PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
  MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
  MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
  MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
  MatScalar       m13, m14, m15, m16;
  MatScalar      *ba = b->a, *aa = a->a;
  PetscBool       pivotinblocks = b->pivotinblocks;
  PetscReal       shift = info->shiftamount;
  PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  allowzeropivot = PetscNot(A->erroriffailure);

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    ajtmp = bj + bi[i];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 16 * ajtmp[j];
      x[0]  = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx + 1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16 * ai[idx];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 16 * ic[ajtmpold[j]];
      x[0]  = v[0];
      x[1]  = v[1];
      x[2]  = v[2];
      x[3]  = v[3];
      x[4]  = v[4];
      x[5]  = v[5];
      x[6]  = v[6];
      x[7]  = v[7];
      x[8]  = v[8];
      x[9]  = v[9];
      x[10] = v[10];
      x[11] = v[11];
      x[12] = v[12];
      x[13] = v[13];
      x[14] = v[14];
      x[15] = v[15];
      v += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16 * row;
      p1  = pc[0];
      p2  = pc[1];
      p3  = pc[2];
      p4  = pc[3];
      p5  = pc[4];
      p6  = pc[5];
      p7  = pc[6];
      p8  = pc[7];
      p9  = pc[8];
      p10 = pc[9];
      p11 = pc[10];
      p12 = pc[11];
      p13 = pc[12];
      p14 = pc[13];
      p15 = pc[14];
      p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
        pv  = ba + 16 * diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];
        x2  = pv[1];
        x3  = pv[2];
        x4  = pv[3];
        x5  = pv[4];
        x6  = pv[5];
        x7  = pv[6];
        x8  = pv[7];
        x9  = pv[8];
        x10 = pv[9];
        x11 = pv[10];
        x12 = pv[11];
        x13 = pv[12];
        x14 = pv[13];
        x15 = pv[14];
        x16 = pv[15];
        pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
        pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
        pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
        pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;

        pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
        pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
        pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
        pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;

        pc[8]  = m9  = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
        pc[9]  = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
        pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
        pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;

        pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
        pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
        pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
        pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;

        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          x1  = pv[0];
          x2  = pv[1];
          x3  = pv[2];
          x4  = pv[3];
          x5  = pv[4];
          x6  = pv[5];
          x7  = pv[6];
          x8  = pv[7];
          x9  = pv[8];
          x10 = pv[9];
          x11 = pv[10];
          x12 = pv[11];
          x13 = pv[12];
          x14 = pv[13];
          x15 = pv[14];
          x16 = pv[15];
          x   = rtmp + 16 * pj[j];
          x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
          x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
          x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
          x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;

          x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
          x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
          x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
          x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;

          x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
          x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
          x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
          x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;

          x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
          x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
          x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
          x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;

          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) {
      x      = rtmp + 16 * pj[j];
      pv[0]  = x[0];
      pv[1]  = x[1];
      pv[2]  = x[2];
      pv[3]  = x[3];
      pv[4]  = x[4];
      pv[5]  = x[5];
      pv[6]  = x[6];
      pv[7]  = x[7];
      pv[8]  = x[8];
      pv[9]  = x[9];
      pv[10] = x[10];
      pv[11] = x[11];
      pv[12] = x[12];
      pv[13] = x[13];
      pv[14] = x[14];
      pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
  }

  PetscCall(PetscFree(rtmp));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}
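/*
   Orientation for the unrolled 4x4 kernels in this file: within each block PETSc stores
   entries in column-major order.  Because the diagonal block of each pivot row has
   already been inverted, the elimination step above forms the multiplier M = P*D
   (P = the current L block `pc`, D = the inverted diagonal block `pv`) and then applies
   the update X -= M*U to every remaining block of the work row.  A plain-C sketch of
   the same multiplier computation (illustrative only; M, P, D are hypothetical names):

     for (c = 0; c < 4; c++)
       for (r = 0; r < 4; r++) {
         MatScalar s = 0.0;
         for (k = 0; k < 4; k++) s += P[4 * k + r] * D[4 * c + k];
         M[4 * c + r] = s;
       }

   which, fully unrolled, is exactly the m1..m16 assignments above.
*/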
/*
  MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       PetscKernel_A_gets_A_times_B()
       PetscKernel_A_gets_A_minus_B_times_C()
       PetscKernel_A_gets_inverse_A()
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat             C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS              isrow = b->row, isicol = b->icol;
  const PetscInt *r, *ic;
  PetscInt        i, j, k, nz, nzL, row;
  const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
  const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
  MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
  PetscInt        flg;
  PetscReal       shift;
  PetscBool       allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));

  if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
  PetscCall(PetscArrayzero(rtmp, bs2 * n));

  for (i = 0; i < n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* U part */
    nz    = bdiag[i] - bdiag[i + 1];
    bjtmp = bj + bdiag[i + 1] + 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* load in initial (unfactored row) */
    nz    = ai[r[i] + 1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2 * ai[r[i]];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i + 1] - bi[i];
    for (k = 0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2 * row;
      for (flg = 0, j = 0; j < bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2 * bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));

        pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
        pv = b->a + bs2 * (bdiag[row + 1] + 1);
        nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j = 0; j < nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2 * pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
          pv += bs2;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2 * bi[i];
    pj = b->j + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2 * bdiag[i];
    pj = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2 * (bdiag[i + 1] + 1);
    pj = b->j + bdiag[i + 1] + 1;
    nz = bdiag[i] - bdiag[i + 1] - 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
  }

  PetscCall(PetscFree2(rtmp, mwork));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}
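/*
   Index layout used by MatLUFactorNumeric_SeqBAIJ_4() above and by its natural-ordering
   twin below: for block row `row`, the L part occupies b->j/b->a starting at bi[row],
   the strictly upper part of U starts at bdiag[row+1] + 1, and the (inverted) diagonal
   block sits at bdiag[row]; hence nz = bdiag[row] - bdiag[row+1] - 1 entries of U
   excluding the diagonal.  The logged flop count also checks out for bs = 4:
   2*bs^3*nz + 2*bs^3 - bs^2 = 2*64*nz + 2*64 - 16 = 128*nz + 112.
*/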
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
{
  /*
     Default Version for when blocks are 4 by 4 Using natural ordering
  */
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
  PetscInt    *ajtmpold, *ajtmp, nz, row;
  PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
  MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
  MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
  MatScalar    p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
  MatScalar    m13, m14, m15, m16;
  MatScalar   *ba = b->a, *aa = a->a;
  PetscBool    pivotinblocks = b->pivotinblocks;
  PetscReal    shift = info->shiftamount;
  PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    ajtmp = bj + bi[i];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 16 * ajtmp[j];
      x[0]  = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i + 1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 16 * ajtmpold[j];
      x[0]  = v[0];
      x[1]  = v[1];
      x[2]  = v[2];
      x[3]  = v[3];
      x[4]  = v[4];
      x[5]  = v[5];
      x[6]  = v[6];
      x[7]  = v[7];
      x[8]  = v[8];
      x[9]  = v[9];
      x[10] = v[10];
      x[11] = v[11];
      x[12] = v[12];
      x[13] = v[13];
      x[14] = v[14];
      x[15] = v[15];
      v += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16 * row;
      p1  = pc[0];
      p2  = pc[1];
      p3  = pc[2];
      p4  = pc[3];
      p5  = pc[4];
      p6  = pc[5];
      p7  = pc[6];
      p8  = pc[7];
      p9  = pc[8];
      p10 = pc[9];
      p11 = pc[10];
      p12 = pc[11];
      p13 = pc[12];
      p14 = pc[13];
      p15 = pc[14];
      p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
        pv  = ba + 16 * diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];
        x2  = pv[1];
        x3  = pv[2];
        x4  = pv[3];
        x5  = pv[4];
        x6  = pv[5];
        x7  = pv[6];
        x8  = pv[7];
        x9  = pv[8];
        x10 = pv[9];
        x11 = pv[10];
        x12 = pv[11];
        x13 = pv[12];
        x14 = pv[13];
        x15 = pv[14];
        x16 = pv[15];
        pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
        pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
        pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
        pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;

        pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
        pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
        pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
        pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;

        pc[8]  = m9  = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
        pc[9]  = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
        pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
        pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
        pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
        pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
        pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
        pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          x1  = pv[0];
          x2  = pv[1];
          x3  = pv[2];
          x4  = pv[3];
          x5  = pv[4];
          x6  = pv[5];
          x7  = pv[6];
          x8  = pv[7];
          x9  = pv[8];
          x10 = pv[9];
          x11 = pv[10];
          x12 = pv[11];
          x13 = pv[12];
          x14 = pv[13];
          x15 = pv[14];
          x16 = pv[15];
          x   = rtmp + 16 * pj[j];
          x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
          x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
          x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
          x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;

          x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
          x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
          x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
          x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;

          x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
          x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
          x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
          x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;

          x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
          x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
          x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
          x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;

          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) {
      x      = rtmp + 16 * pj[j];
      pv[0]  = x[0];
      pv[1]  = x[1];
      pv[2]  = x[2];
      pv[3]  = x[3];
      pv[4]  = x[4];
      pv[5]  = x[5];
      pv[6]  = x[6];
      pv[7]  = x[7];
      pv[8]  = x[8];
      pv[9]  = x[9];
      pv[10] = x[10];
      pv[11] = x[11];
      pv[12] = x[12];
      pv[13] = x[13];
      pv[14] = x[14];
      pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}
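/*
   The PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n) calls at the end of these routines
   charge (4/3)*bs^3, roughly 85 flops, per 4x4 diagonal-block inversion, the standard
   leading-order cost of inverting a dense bs-by-bs matrix.
*/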
/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat             C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  PetscInt        i, j, k, nz, nzL, row;
  const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
  const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
  MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
  PetscInt        flg;
  PetscReal       shift;
  PetscBool       allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
  PetscCall(PetscArrayzero(rtmp, bs2 * n));

  if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i = 0; i < n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* U part */
    nz    = bdiag[i] - bdiag[i + 1];
    bjtmp = bj + bdiag[i + 1] + 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* load in initial (unfactored row) */
    nz    = ai[i + 1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2 * ai[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i + 1] - bi[i];
    for (k = 0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2 * row;
      for (flg = 0, j = 0; j < bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2 * bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));

        pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
        pv = b->a + bs2 * (bdiag[row + 1] + 1);
        nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j = 0; j < nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2 * pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
          pv += bs2;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2 * bi[i];
    pj = b->j + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2 * bdiag[i];
    pj = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2 * (bdiag[i + 1] + 1);
    pj = b->j + bdiag[i + 1] + 1;
    nz = bdiag[i] - bdiag[i + 1] - 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
  }
  PetscCall(PetscFree2(rtmp, mwork));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
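/*
   The SSE kernels below keep one 4x4 block in four XMM registers, one column per
   register; SSE_LOADL_PS/SSE_LOADH_PS move two packed singles into the low/high half
   of a register, so four load pairs move a whole block.  Note that the *_PS (packed
   single) operations appear to assume a 4-byte single-precision MatScalar, and the
   16-byte alignment of aa, ba and rtmp is checked before any SSE load is issued.
*/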
/* SSE Version for when blocks are 4 by 4 Using natural ordering */
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat          C = B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int          i, j, n = a->mbs;
  int         *bj = b->j, *bjtmp, *pj;
  int          row;
  int         *ajtmpold, nz, *bi = b->i;
  int         *diag_offset = b->diag, *ai = a->i, *aj = a->j;
  MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar   *ba = b->a, *aa = a->a;
  int          nonzero = 0;
  PetscBool    pivotinblocks = b->pivotinblocks;
  PetscReal    shift = info->shiftamount;
  PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
  PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7, XMM7);
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * bjtmp[j];
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i + 1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v, x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16 * row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4, XMM7)
      SSE_CMPNEQ_PS(XMM4, XMM0)

      SSE_COPY_PS(XMM5, XMM7)
      SSE_CMPNEQ_PS(XMM5, XMM1)

      SSE_COPY_PS(XMM6, XMM7)
      SSE_CMPNEQ_PS(XMM6, XMM2)

      SSE_CMPNEQ_PS(XMM7, XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6, XMM7)
      SSE_OR_PS(XMM5, XMM4)
      SSE_OR_PS(XMM5, XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero, XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16 * diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv, pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
        SSE_SHUFFLE(XMM4, XMM4, 0x00)
        SSE_MULT_PS(XMM4, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM1)
        SSE_ADD_PS(XMM4, XMM5)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM2)
        SSE_ADD_PS(XMM4, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM4, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM1)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM5, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM3)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM1)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM0, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM1, XMM7)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
        SSE_SHUFFLE(XMM1, XMM1, 0x00)
        SSE_MULT_PS(XMM1, XMM2)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM3, XMM7)
        SSE_ADD_PS(XMM0, XMM3)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3, XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2, XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1, XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0, XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16 * pj[j];
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x, pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM0)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM1)
          SSE_SUB_PS(XMM5, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM2)
          SSE_SUB_PS(XMM5, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM3)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM0)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM1)
          SSE_SUB_PS(XMM6, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM2)
          SSE_SUB_PS(XMM6, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM3)
          SSE_SUB_PS(XMM6, XMM7)
          SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = *bjtmp++;
      /* row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];

    /* Copy x block back into pv block */
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * pj[j];
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x, pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
    /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
    /* Note: Using Cramer's rule, flop count below might be unfairly high or low? */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}
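/*
   The _usj variants below walk the factor's column indices as unsigned short, halving
   the index traffic of the sweep; this apparently presumes the indices fit in 16 bits
   (fewer than 65536 block rows).  The _Inplace variant handles the case in which the
   factor shares its column-index array with the original matrix (bj == aj), i.e. the
   factorization reuses the nonzero pattern of A.
*/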
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C, const MatFactorInfo *info)
{
  Mat             A = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int             i, j, n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
  unsigned short *aj = (unsigned short *)(a->j), *ajtmp;
  unsigned int    row;
  int             nz, *bi = b->i;
  int            *diag_offset = b->diag, *ai = a->i;
  MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar      *ba = b->a, *aa = a->a;
  int             nonzero = 0;
  PetscBool       pivotinblocks = b->pivotinblocks;
  PetscReal       shift = info->shiftamount;
  PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
  PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7, XMM7);
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i + 1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)ajtmp[j]);
      /* x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v, x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16 * row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4, XMM7)
      SSE_CMPNEQ_PS(XMM4, XMM0)

      SSE_COPY_PS(XMM5, XMM7)
      SSE_CMPNEQ_PS(XMM5, XMM1)

      SSE_COPY_PS(XMM6, XMM7)
      SSE_CMPNEQ_PS(XMM6, XMM2)

      SSE_CMPNEQ_PS(XMM7, XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6, XMM7)
      SSE_OR_PS(XMM5, XMM4)
      SSE_OR_PS(XMM5, XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero, XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16 * diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv, pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
        SSE_SHUFFLE(XMM4, XMM4, 0x00)
        SSE_MULT_PS(XMM4, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM1)
        SSE_ADD_PS(XMM4, XMM5)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM2)
        SSE_ADD_PS(XMM4, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM4, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM1)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM5, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM3)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM1)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM0, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM1, XMM7)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
        SSE_SHUFFLE(XMM1, XMM1, 0x00)
        SSE_MULT_PS(XMM1, XMM2)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM3, XMM7)
        SSE_ADD_PS(XMM0, XMM3)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3, XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2, XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1, XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0, XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16 * ((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x, pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM0)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM1)
          SSE_SUB_PS(XMM5, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM2)
          SSE_SUB_PS(XMM5, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM3)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM0)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM1)
          SSE_SUB_PS(XMM6, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM2)
          SSE_SUB_PS(XMM6, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM3)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];

    /* Copy x block back into pv block */
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x, pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
    /* Note: Using Cramer's rule, flop count below might be unfairly high or low? */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}
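/*
   The general _usj routine follows; its first act after the alignment checks is to
   dispatch to the _Inplace variant above whenever b->j aliases a->j.
*/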
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int             i, j, n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
  unsigned int    row;
  int            *ajtmpold, nz, *bi = b->i;
  int            *diag_offset = b->diag, *ai = a->i, *aj = a->j;
  MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar      *ba = b->a, *aa = a->a;
  int             nonzero = 0;
  PetscBool       pivotinblocks = b->pivotinblocks;
  PetscReal       shift = info->shiftamount;
  PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
  PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  if ((unsigned long)bj == (unsigned long)aj) return MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C, info);

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7, XMM7);
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i + 1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v, x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16 * row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4, XMM7)
      SSE_CMPNEQ_PS(XMM4, XMM0)

      SSE_COPY_PS(XMM5, XMM7)
      SSE_CMPNEQ_PS(XMM5, XMM1)

      SSE_COPY_PS(XMM6, XMM7)
      SSE_CMPNEQ_PS(XMM6, XMM2)

      SSE_CMPNEQ_PS(XMM7, XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6, XMM7)
      SSE_OR_PS(XMM5, XMM4)
      SSE_OR_PS(XMM5, XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
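      /* MOVEMASK presumably wraps x86 movmskps: it extracts the four sign bits of the
         packed comparison result into an ordinary int, so nonzero == 0 exactly when the
         whole 4x4 block compared equal to the zero block. */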
      MOVEMASK(nonzero, XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16 * diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv, pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
        SSE_SHUFFLE(XMM4, XMM4, 0x00)
        SSE_MULT_PS(XMM4, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM1)
        SSE_ADD_PS(XMM4, XMM5)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM2)
        SSE_ADD_PS(XMM4, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM4, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM1)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM5, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM3)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM1)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM0, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM1, XMM7)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
        SSE_SHUFFLE(XMM1, XMM1, 0x00)
        SSE_MULT_PS(XMM1, XMM2)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM3, XMM7)
        SSE_ADD_PS(XMM0, XMM3)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Update the row: */
        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16 * ((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x, pv)
          /* Load First Column of X */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM0)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM1)
          SSE_SUB_PS(XMM5, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM2)
          SSE_SUB_PS(XMM5, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM3)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM0)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM1)
          SSE_SUB_PS(XMM6, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM2)
          SSE_SUB_PS(XMM6, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM3)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
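        /* Flop accounting for the block row just processed: forming the multiplier M is
           one 4x4 matrix product (64 multiplies + 48 adds = 112 flops), and each of the
           nz trailing blocks receives X -= M*PV (64 multiplies + 64 subtracts = 128
           flops), matching the 128*nz + 112 logged below. */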
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];

    /* Copy x block back into pv block */
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x, pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
    /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
    /* Note: since Cramer's rule is used, the flop count below may be unfairly high or low */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs));
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#endif