1be1d678aSKris Buschelman 283287d42SBarry Smith /* 383287d42SBarry Smith Factorization code for BAIJ format. 483287d42SBarry Smith */ 5c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 6af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 783287d42SBarry Smith 883287d42SBarry Smith /* ------------------------------------------------------------*/ 983287d42SBarry Smith /* 1083287d42SBarry Smith Version for when blocks are 4 by 4 1183287d42SBarry Smith */ 12d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info) 13d71ae5a4SJacob Faibussowitsch { 1483287d42SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1583287d42SBarry Smith IS isrow = b->row, isicol = b->icol; 165d0c19d7SBarry Smith const PetscInt *r, *ic; 175d0c19d7SBarry Smith PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 18690b6cddSBarry Smith PetscInt *ajtmpold, *ajtmp, nz, row; 19690b6cddSBarry Smith PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 2083287d42SBarry Smith MatScalar *pv, *v, *rtmp, *pc, *w, *x; 2183287d42SBarry Smith MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 2283287d42SBarry Smith MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 2383287d42SBarry Smith MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 2483287d42SBarry Smith MatScalar m13, m14, m15, m16; 2583287d42SBarry Smith MatScalar *ba = b->a, *aa = a->a; 26a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 27182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 280164db54SHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 2983287d42SBarry Smith 3083287d42SBarry Smith PetscFunctionBegin; 319566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 329566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 339566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 340164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 3583287d42SBarry Smith 3683287d42SBarry Smith for (i = 0; i < n; i++) { 3783287d42SBarry Smith nz = bi[i + 1] - bi[i]; 3883287d42SBarry Smith ajtmp = bj + bi[i]; 3983287d42SBarry Smith for (j = 0; j < nz; j++) { 4083287d42SBarry Smith x = rtmp + 16 * ajtmp[j]; 4183287d42SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 4283287d42SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 4383287d42SBarry Smith } 4483287d42SBarry Smith /* load in initial (unfactored row) */ 4583287d42SBarry Smith idx = r[i]; 4683287d42SBarry Smith nz = ai[idx + 1] - ai[idx]; 4783287d42SBarry Smith ajtmpold = aj + ai[idx]; 4883287d42SBarry Smith v = aa + 16 * ai[idx]; 4983287d42SBarry Smith for (j = 0; j < nz; j++) { 5083287d42SBarry Smith x = rtmp + 16 * ic[ajtmpold[j]]; 519371c9d4SSatish Balay x[0] = v[0]; 529371c9d4SSatish Balay x[1] = v[1]; 539371c9d4SSatish Balay x[2] = v[2]; 549371c9d4SSatish Balay x[3] = v[3]; 559371c9d4SSatish Balay x[4] = v[4]; 569371c9d4SSatish Balay x[5] = v[5]; 579371c9d4SSatish Balay x[6] = v[6]; 589371c9d4SSatish Balay x[7] = v[7]; 599371c9d4SSatish Balay x[8] = v[8]; 609371c9d4SSatish Balay x[9] = v[9]; 619371c9d4SSatish Balay x[10] = v[10]; 629371c9d4SSatish Balay x[11] = v[11]; 639371c9d4SSatish Balay x[12] = v[12]; 649371c9d4SSatish Balay x[13] = v[13]; 659371c9d4SSatish Balay x[14] = v[14]; 669371c9d4SSatish Balay x[15] = v[15]; 6783287d42SBarry Smith v += 16; 6883287d42SBarry Smith } 6983287d42SBarry Smith row = *ajtmp++; 7083287d42SBarry Smith while (row < i) { 7183287d42SBarry Smith pc = rtmp + 16 * row; 729371c9d4SSatish Balay p1 = pc[0]; 739371c9d4SSatish Balay p2 = pc[1]; 749371c9d4SSatish Balay p3 = pc[2]; 759371c9d4SSatish Balay p4 = pc[3]; 769371c9d4SSatish Balay p5 = pc[4]; 779371c9d4SSatish Balay p6 = pc[5]; 789371c9d4SSatish Balay p7 = pc[6]; 799371c9d4SSatish Balay p8 = pc[7]; 809371c9d4SSatish Balay p9 = pc[8]; 819371c9d4SSatish Balay p10 = pc[9]; 829371c9d4SSatish Balay p11 = pc[10]; 839371c9d4SSatish Balay p12 = pc[11]; 849371c9d4SSatish Balay p13 = pc[12]; 859371c9d4SSatish Balay p14 = pc[13]; 869371c9d4SSatish Balay p15 = pc[14]; 879371c9d4SSatish Balay p16 = pc[15]; 889371c9d4SSatish Balay if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 8983287d42SBarry Smith pv = ba + 16 * diag_offset[row]; 9083287d42SBarry Smith pj = bj + diag_offset[row] + 1; 919371c9d4SSatish Balay x1 = pv[0]; 929371c9d4SSatish Balay x2 = pv[1]; 939371c9d4SSatish Balay x3 = pv[2]; 949371c9d4SSatish Balay x4 = pv[3]; 959371c9d4SSatish Balay x5 = pv[4]; 969371c9d4SSatish Balay x6 = pv[5]; 979371c9d4SSatish Balay x7 = pv[6]; 989371c9d4SSatish Balay x8 = pv[7]; 999371c9d4SSatish Balay x9 = pv[8]; 1009371c9d4SSatish Balay x10 = pv[9]; 1019371c9d4SSatish Balay x11 = pv[10]; 1029371c9d4SSatish Balay x12 = pv[11]; 1039371c9d4SSatish Balay x13 = pv[12]; 1049371c9d4SSatish Balay x14 = pv[13]; 1059371c9d4SSatish Balay x15 = pv[14]; 1069371c9d4SSatish Balay x16 = pv[15]; 10783287d42SBarry Smith pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 10883287d42SBarry Smith pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 10983287d42SBarry Smith pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 11083287d42SBarry Smith pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 11183287d42SBarry Smith 11283287d42SBarry Smith pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 11383287d42SBarry Smith pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 11483287d42SBarry Smith pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 11583287d42SBarry Smith pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 11683287d42SBarry Smith 11783287d42SBarry Smith pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 11883287d42SBarry Smith pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 11983287d42SBarry Smith pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 12083287d42SBarry Smith pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 12183287d42SBarry Smith 12283287d42SBarry Smith pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 12383287d42SBarry Smith pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 12483287d42SBarry Smith pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 12583287d42SBarry Smith pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 12683287d42SBarry Smith 12783287d42SBarry Smith nz = bi[row + 1] - diag_offset[row] - 1; 12883287d42SBarry Smith pv += 16; 12983287d42SBarry Smith for (j = 0; j < nz; j++) { 1309371c9d4SSatish Balay x1 = pv[0]; 1319371c9d4SSatish Balay x2 = pv[1]; 1329371c9d4SSatish Balay x3 = pv[2]; 1339371c9d4SSatish Balay x4 = pv[3]; 1349371c9d4SSatish Balay x5 = pv[4]; 1359371c9d4SSatish Balay x6 = pv[5]; 1369371c9d4SSatish Balay x7 = pv[6]; 1379371c9d4SSatish Balay x8 = pv[7]; 1389371c9d4SSatish Balay x9 = pv[8]; 1399371c9d4SSatish Balay x10 = pv[9]; 1409371c9d4SSatish Balay x11 = pv[10]; 1419371c9d4SSatish Balay x12 = pv[11]; 1429371c9d4SSatish Balay x13 = pv[12]; 1439371c9d4SSatish Balay x14 = pv[13]; 1449371c9d4SSatish Balay x15 = pv[14]; 1459371c9d4SSatish Balay x16 = pv[15]; 14683287d42SBarry Smith x = rtmp + 16 * pj[j]; 14783287d42SBarry Smith x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 14883287d42SBarry Smith x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 14983287d42SBarry Smith x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 15083287d42SBarry Smith x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 15183287d42SBarry Smith 15283287d42SBarry Smith x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 15383287d42SBarry Smith x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 15483287d42SBarry Smith x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 15583287d42SBarry Smith x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 15683287d42SBarry Smith 15783287d42SBarry Smith x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 15883287d42SBarry Smith x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 15983287d42SBarry Smith x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 16083287d42SBarry Smith x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 16183287d42SBarry Smith 16283287d42SBarry Smith x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 16383287d42SBarry Smith x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 16483287d42SBarry Smith x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 16583287d42SBarry Smith x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 16683287d42SBarry Smith 16783287d42SBarry Smith pv += 16; 16883287d42SBarry Smith } 1699566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 17083287d42SBarry Smith } 17183287d42SBarry Smith row = *ajtmp++; 17283287d42SBarry Smith } 17383287d42SBarry Smith /* finished row so stick it into b->a */ 17483287d42SBarry Smith pv = ba + 16 * bi[i]; 17583287d42SBarry Smith pj = bj + bi[i]; 17683287d42SBarry Smith nz = bi[i + 1] - bi[i]; 17783287d42SBarry Smith for (j = 0; j < nz; j++) { 17883287d42SBarry Smith x = rtmp + 16 * pj[j]; 1799371c9d4SSatish Balay pv[0] = x[0]; 1809371c9d4SSatish Balay pv[1] = x[1]; 1819371c9d4SSatish Balay pv[2] = x[2]; 1829371c9d4SSatish Balay pv[3] = x[3]; 1839371c9d4SSatish Balay pv[4] = x[4]; 1849371c9d4SSatish Balay pv[5] = x[5]; 1859371c9d4SSatish Balay pv[6] = x[6]; 1869371c9d4SSatish Balay pv[7] = x[7]; 1879371c9d4SSatish Balay pv[8] = x[8]; 1889371c9d4SSatish Balay pv[9] = x[9]; 1899371c9d4SSatish Balay pv[10] = x[10]; 1909371c9d4SSatish Balay pv[11] = x[11]; 1919371c9d4SSatish Balay pv[12] = x[12]; 1929371c9d4SSatish Balay pv[13] = x[13]; 1939371c9d4SSatish Balay pv[14] = x[14]; 1949371c9d4SSatish Balay pv[15] = x[15]; 19583287d42SBarry Smith pv += 16; 19683287d42SBarry Smith } 19783287d42SBarry Smith /* invert diagonal block */ 19883287d42SBarry Smith w = ba + 16 * diag_offset[i]; 199bcd9e38bSBarry Smith if (pivotinblocks) { 2009566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 2017b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 202bcd9e38bSBarry Smith } else { 2039566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 204bcd9e38bSBarry Smith } 20583287d42SBarry Smith } 20683287d42SBarry Smith 2079566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 2089566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 2099566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 21026fbe8dcSKarl Rupp 21106e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 21206e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 21383287d42SBarry Smith C->assembled = PETSC_TRUE; 21426fbe8dcSKarl Rupp 2159566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 216*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21783287d42SBarry Smith } 218e6580ceeSShri Abhyankar 2194dd39f65SShri Abhyankar /* MatLUFactorNumeric_SeqBAIJ_4 - 2204dd39f65SShri Abhyankar copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 22196b95a6bSBarry Smith PetscKernel_A_gets_A_times_B() 22296b95a6bSBarry Smith PetscKernel_A_gets_A_minus_B_times_C() 22396b95a6bSBarry Smith PetscKernel_A_gets_inverse_A() 224209027a4SShri Abhyankar */ 225c0c7eb62SShri Abhyankar 226d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info) 227d71ae5a4SJacob Faibussowitsch { 228209027a4SShri Abhyankar Mat C = B; 229209027a4SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 230209027a4SShri Abhyankar IS isrow = b->row, isicol = b->icol; 2315a586d82SBarry Smith const PetscInt *r, *ic; 232bbd65245SShri Abhyankar PetscInt i, j, k, nz, nzL, row; 233bbd65245SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 234bbd65245SShri Abhyankar const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 235209027a4SShri Abhyankar MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 236bbd65245SShri Abhyankar PetscInt flg; 2376ba06ab7SHong Zhang PetscReal shift; 238a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 239209027a4SShri Abhyankar 240209027a4SShri Abhyankar PetscFunctionBegin; 2410164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 2429566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 2439566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 244209027a4SShri Abhyankar 2456ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONE) { 2466ba06ab7SHong Zhang shift = 0; 2476ba06ab7SHong Zhang } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 2486ba06ab7SHong Zhang shift = info->shiftamount; 2496ba06ab7SHong Zhang } 2506ba06ab7SHong Zhang 251209027a4SShri Abhyankar /* generate work space needed by the factorization */ 2529566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 2539566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, bs2 * n)); 254209027a4SShri Abhyankar 255209027a4SShri Abhyankar for (i = 0; i < n; i++) { 256209027a4SShri Abhyankar /* zero rtmp */ 257209027a4SShri Abhyankar /* L part */ 258209027a4SShri Abhyankar nz = bi[i + 1] - bi[i]; 259209027a4SShri Abhyankar bjtmp = bj + bi[i]; 26048a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 261209027a4SShri Abhyankar 262209027a4SShri Abhyankar /* U part */ 26378bb4007SShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 26478bb4007SShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 26548a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 26678bb4007SShri Abhyankar 26778bb4007SShri Abhyankar /* load in initial (unfactored row) */ 26878bb4007SShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]]; 26978bb4007SShri Abhyankar ajtmp = aj + ai[r[i]]; 27078bb4007SShri Abhyankar v = aa + bs2 * ai[r[i]]; 27148a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 27278bb4007SShri Abhyankar 27378bb4007SShri Abhyankar /* elimination */ 27478bb4007SShri Abhyankar bjtmp = bj + bi[i]; 27578bb4007SShri Abhyankar nzL = bi[i + 1] - bi[i]; 27678bb4007SShri Abhyankar for (k = 0; k < nzL; k++) { 27778bb4007SShri Abhyankar row = bjtmp[k]; 27878bb4007SShri Abhyankar pc = rtmp + bs2 * row; 279c35f09e5SBarry Smith for (flg = 0, j = 0; j < bs2; j++) { 280c35f09e5SBarry Smith if (pc[j] != 0.0) { 281c35f09e5SBarry Smith flg = 1; 282c35f09e5SBarry Smith break; 283c35f09e5SBarry Smith } 284c35f09e5SBarry Smith } 28578bb4007SShri Abhyankar if (flg) { 28678bb4007SShri Abhyankar pv = b->a + bs2 * bdiag[row]; 28796b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 2889566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork)); 28978bb4007SShri Abhyankar 290a5b23f4aSJose E. Roman pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 29178bb4007SShri Abhyankar pv = b->a + bs2 * (bdiag[row + 1] + 1); 29278bb4007SShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 29378bb4007SShri Abhyankar for (j = 0; j < nz; j++) { 29496b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 29578bb4007SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 29678bb4007SShri Abhyankar v = rtmp + bs2 * pj[j]; 2979566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv)); 29878bb4007SShri Abhyankar pv += bs2; 29978bb4007SShri Abhyankar } 3009566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 30178bb4007SShri Abhyankar } 30278bb4007SShri Abhyankar } 30378bb4007SShri Abhyankar 30478bb4007SShri Abhyankar /* finished row so stick it into b->a */ 30578bb4007SShri Abhyankar /* L part */ 30678bb4007SShri Abhyankar pv = b->a + bs2 * bi[i]; 30778bb4007SShri Abhyankar pj = b->j + bi[i]; 30878bb4007SShri Abhyankar nz = bi[i + 1] - bi[i]; 30948a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 31078bb4007SShri Abhyankar 311a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 31278bb4007SShri Abhyankar pv = b->a + bs2 * bdiag[i]; 31378bb4007SShri Abhyankar pj = b->j + bdiag[i]; 3149566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 3159566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected)); 3167b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 31778bb4007SShri Abhyankar 31878bb4007SShri Abhyankar /* U part */ 31978bb4007SShri Abhyankar pv = b->a + bs2 * (bdiag[i + 1] + 1); 32078bb4007SShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 32178bb4007SShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; 32248a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 32378bb4007SShri Abhyankar } 32478bb4007SShri Abhyankar 3259566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, mwork)); 3269566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 3279566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 32826fbe8dcSKarl Rupp 3294dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4; 3304dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 33178bb4007SShri Abhyankar C->assembled = PETSC_TRUE; 33226fbe8dcSKarl Rupp 3339566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 334*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 33578bb4007SShri Abhyankar } 33678bb4007SShri Abhyankar 337d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 338d71ae5a4SJacob Faibussowitsch { 339e6580ceeSShri Abhyankar /* 340e6580ceeSShri Abhyankar Default Version for when blocks are 4 by 4 Using natural ordering 341e6580ceeSShri Abhyankar */ 342e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 343e6580ceeSShri Abhyankar PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 344e6580ceeSShri Abhyankar PetscInt *ajtmpold, *ajtmp, nz, row; 345e6580ceeSShri Abhyankar PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 346e6580ceeSShri Abhyankar MatScalar *pv, *v, *rtmp, *pc, *w, *x; 347e6580ceeSShri Abhyankar MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 348e6580ceeSShri Abhyankar MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 349e6580ceeSShri Abhyankar MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 350e6580ceeSShri Abhyankar MatScalar m13, m14, m15, m16; 351e6580ceeSShri Abhyankar MatScalar *ba = b->a, *aa = a->a; 352a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 353182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 3540164db54SHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 355e6580ceeSShri Abhyankar 356e6580ceeSShri Abhyankar PetscFunctionBegin; 3570164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 3589566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 359e6580ceeSShri Abhyankar 360e6580ceeSShri Abhyankar for (i = 0; i < n; i++) { 361e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 362e6580ceeSShri Abhyankar ajtmp = bj + bi[i]; 363e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 364e6580ceeSShri Abhyankar x = rtmp + 16 * ajtmp[j]; 365e6580ceeSShri Abhyankar x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 366e6580ceeSShri Abhyankar x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 367e6580ceeSShri Abhyankar } 368e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 369e6580ceeSShri Abhyankar nz = ai[i + 1] - ai[i]; 370e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 371e6580ceeSShri Abhyankar v = aa + 16 * ai[i]; 372e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 373e6580ceeSShri Abhyankar x = rtmp + 16 * ajtmpold[j]; 3749371c9d4SSatish Balay x[0] = v[0]; 3759371c9d4SSatish Balay x[1] = v[1]; 3769371c9d4SSatish Balay x[2] = v[2]; 3779371c9d4SSatish Balay x[3] = v[3]; 3789371c9d4SSatish Balay x[4] = v[4]; 3799371c9d4SSatish Balay x[5] = v[5]; 3809371c9d4SSatish Balay x[6] = v[6]; 3819371c9d4SSatish Balay x[7] = v[7]; 3829371c9d4SSatish Balay x[8] = v[8]; 3839371c9d4SSatish Balay x[9] = v[9]; 3849371c9d4SSatish Balay x[10] = v[10]; 3859371c9d4SSatish Balay x[11] = v[11]; 3869371c9d4SSatish Balay x[12] = v[12]; 3879371c9d4SSatish Balay x[13] = v[13]; 3889371c9d4SSatish Balay x[14] = v[14]; 3899371c9d4SSatish Balay x[15] = v[15]; 390e6580ceeSShri Abhyankar v += 16; 391e6580ceeSShri Abhyankar } 392e6580ceeSShri Abhyankar row = *ajtmp++; 393e6580ceeSShri Abhyankar while (row < i) { 394e6580ceeSShri Abhyankar pc = rtmp + 16 * row; 3959371c9d4SSatish Balay p1 = pc[0]; 3969371c9d4SSatish Balay p2 = pc[1]; 3979371c9d4SSatish Balay p3 = pc[2]; 3989371c9d4SSatish Balay p4 = pc[3]; 3999371c9d4SSatish Balay p5 = pc[4]; 4009371c9d4SSatish Balay p6 = pc[5]; 4019371c9d4SSatish Balay p7 = pc[6]; 4029371c9d4SSatish Balay p8 = pc[7]; 4039371c9d4SSatish Balay p9 = pc[8]; 4049371c9d4SSatish Balay p10 = pc[9]; 4059371c9d4SSatish Balay p11 = pc[10]; 4069371c9d4SSatish Balay p12 = pc[11]; 4079371c9d4SSatish Balay p13 = pc[12]; 4089371c9d4SSatish Balay p14 = pc[13]; 4099371c9d4SSatish Balay p15 = pc[14]; 4109371c9d4SSatish Balay p16 = pc[15]; 4119371c9d4SSatish Balay if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 412e6580ceeSShri Abhyankar pv = ba + 16 * diag_offset[row]; 413e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 4149371c9d4SSatish Balay x1 = pv[0]; 4159371c9d4SSatish Balay x2 = pv[1]; 4169371c9d4SSatish Balay x3 = pv[2]; 4179371c9d4SSatish Balay x4 = pv[3]; 4189371c9d4SSatish Balay x5 = pv[4]; 4199371c9d4SSatish Balay x6 = pv[5]; 4209371c9d4SSatish Balay x7 = pv[6]; 4219371c9d4SSatish Balay x8 = pv[7]; 4229371c9d4SSatish Balay x9 = pv[8]; 4239371c9d4SSatish Balay x10 = pv[9]; 4249371c9d4SSatish Balay x11 = pv[10]; 4259371c9d4SSatish Balay x12 = pv[11]; 4269371c9d4SSatish Balay x13 = pv[12]; 4279371c9d4SSatish Balay x14 = pv[13]; 4289371c9d4SSatish Balay x15 = pv[14]; 4299371c9d4SSatish Balay x16 = pv[15]; 430e6580ceeSShri Abhyankar pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 431e6580ceeSShri Abhyankar pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 432e6580ceeSShri Abhyankar pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 433e6580ceeSShri Abhyankar pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 434e6580ceeSShri Abhyankar 435e6580ceeSShri Abhyankar pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 436e6580ceeSShri Abhyankar pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 437e6580ceeSShri Abhyankar pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 438e6580ceeSShri Abhyankar pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 439e6580ceeSShri Abhyankar 440e6580ceeSShri Abhyankar pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 441e6580ceeSShri Abhyankar pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 442e6580ceeSShri Abhyankar pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 443e6580ceeSShri Abhyankar pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 444e6580ceeSShri Abhyankar 445e6580ceeSShri Abhyankar pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 446e6580ceeSShri Abhyankar pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 447e6580ceeSShri Abhyankar pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 448e6580ceeSShri Abhyankar pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 449e6580ceeSShri Abhyankar nz = bi[row + 1] - diag_offset[row] - 1; 450e6580ceeSShri Abhyankar pv += 16; 451e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 4529371c9d4SSatish Balay x1 = pv[0]; 4539371c9d4SSatish Balay x2 = pv[1]; 4549371c9d4SSatish Balay x3 = pv[2]; 4559371c9d4SSatish Balay x4 = pv[3]; 4569371c9d4SSatish Balay x5 = pv[4]; 4579371c9d4SSatish Balay x6 = pv[5]; 4589371c9d4SSatish Balay x7 = pv[6]; 4599371c9d4SSatish Balay x8 = pv[7]; 4609371c9d4SSatish Balay x9 = pv[8]; 4619371c9d4SSatish Balay x10 = pv[9]; 4629371c9d4SSatish Balay x11 = pv[10]; 4639371c9d4SSatish Balay x12 = pv[11]; 4649371c9d4SSatish Balay x13 = pv[12]; 4659371c9d4SSatish Balay x14 = pv[13]; 4669371c9d4SSatish Balay x15 = pv[14]; 4679371c9d4SSatish Balay x16 = pv[15]; 468e6580ceeSShri Abhyankar x = rtmp + 16 * pj[j]; 469e6580ceeSShri Abhyankar x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 470e6580ceeSShri Abhyankar x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 471e6580ceeSShri Abhyankar x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 472e6580ceeSShri Abhyankar x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 473e6580ceeSShri Abhyankar 474e6580ceeSShri Abhyankar x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 475e6580ceeSShri Abhyankar x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 476e6580ceeSShri Abhyankar x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 477e6580ceeSShri Abhyankar x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 478e6580ceeSShri Abhyankar 479e6580ceeSShri Abhyankar x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 480e6580ceeSShri Abhyankar x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 481e6580ceeSShri Abhyankar x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 482e6580ceeSShri Abhyankar x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 483e6580ceeSShri Abhyankar 484e6580ceeSShri Abhyankar x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 485e6580ceeSShri Abhyankar x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 486e6580ceeSShri Abhyankar x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 487e6580ceeSShri Abhyankar x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 488e6580ceeSShri Abhyankar 489e6580ceeSShri Abhyankar pv += 16; 490e6580ceeSShri Abhyankar } 4919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 492e6580ceeSShri Abhyankar } 493e6580ceeSShri Abhyankar row = *ajtmp++; 494e6580ceeSShri Abhyankar } 495e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 496e6580ceeSShri Abhyankar pv = ba + 16 * bi[i]; 497e6580ceeSShri Abhyankar pj = bj + bi[i]; 498e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 499e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 500e6580ceeSShri Abhyankar x = rtmp + 16 * pj[j]; 5019371c9d4SSatish Balay pv[0] = x[0]; 5029371c9d4SSatish Balay pv[1] = x[1]; 5039371c9d4SSatish Balay pv[2] = x[2]; 5049371c9d4SSatish Balay pv[3] = x[3]; 5059371c9d4SSatish Balay pv[4] = x[4]; 5069371c9d4SSatish Balay pv[5] = x[5]; 5079371c9d4SSatish Balay pv[6] = x[6]; 5089371c9d4SSatish Balay pv[7] = x[7]; 5099371c9d4SSatish Balay pv[8] = x[8]; 5109371c9d4SSatish Balay pv[9] = x[9]; 5119371c9d4SSatish Balay pv[10] = x[10]; 5129371c9d4SSatish Balay pv[11] = x[11]; 5139371c9d4SSatish Balay pv[12] = x[12]; 5149371c9d4SSatish Balay pv[13] = x[13]; 5159371c9d4SSatish Balay pv[14] = x[14]; 5169371c9d4SSatish Balay pv[15] = x[15]; 517e6580ceeSShri Abhyankar pv += 16; 518e6580ceeSShri Abhyankar } 519e6580ceeSShri Abhyankar /* invert diagonal block */ 520e6580ceeSShri Abhyankar w = ba + 16 * diag_offset[i]; 521e6580ceeSShri Abhyankar if (pivotinblocks) { 5229566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 5237b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 524e6580ceeSShri Abhyankar } else { 5259566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 526e6580ceeSShri Abhyankar } 527e6580ceeSShri Abhyankar } 528e6580ceeSShri Abhyankar 5299566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 53026fbe8dcSKarl Rupp 53106e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 53206e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 533e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 53426fbe8dcSKarl Rupp 5359566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 536*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 537e6580ceeSShri Abhyankar } 538c0c7eb62SShri Abhyankar 539209027a4SShri Abhyankar /* 5404dd39f65SShri Abhyankar MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 5414dd39f65SShri Abhyankar copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 542209027a4SShri Abhyankar */ 543d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 544d71ae5a4SJacob Faibussowitsch { 545209027a4SShri Abhyankar Mat C = B; 546209027a4SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 547bbd65245SShri Abhyankar PetscInt i, j, k, nz, nzL, row; 548bbd65245SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 549bbd65245SShri Abhyankar const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 550209027a4SShri Abhyankar MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 551bbd65245SShri Abhyankar PetscInt flg; 5526ba06ab7SHong Zhang PetscReal shift; 553a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 554e6580ceeSShri Abhyankar 555209027a4SShri Abhyankar PetscFunctionBegin; 5560164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 5570164db54SHong Zhang 558209027a4SShri Abhyankar /* generate work space needed by the factorization */ 5599566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 5609566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, bs2 * n)); 561209027a4SShri Abhyankar 5626ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONE) { 5636ba06ab7SHong Zhang shift = 0; 5646ba06ab7SHong Zhang } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 5656ba06ab7SHong Zhang shift = info->shiftamount; 5666ba06ab7SHong Zhang } 5676ba06ab7SHong Zhang 568209027a4SShri Abhyankar for (i = 0; i < n; i++) { 569209027a4SShri Abhyankar /* zero rtmp */ 570209027a4SShri Abhyankar /* L part */ 571209027a4SShri Abhyankar nz = bi[i + 1] - bi[i]; 572209027a4SShri Abhyankar bjtmp = bj + bi[i]; 57348a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 574209027a4SShri Abhyankar 575209027a4SShri Abhyankar /* U part */ 576b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 577b2b2dd24SShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 57848a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 579b2b2dd24SShri Abhyankar 580b2b2dd24SShri Abhyankar /* load in initial (unfactored row) */ 581b2b2dd24SShri Abhyankar nz = ai[i + 1] - ai[i]; 582b2b2dd24SShri Abhyankar ajtmp = aj + ai[i]; 583b2b2dd24SShri Abhyankar v = aa + bs2 * ai[i]; 58448a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 585b2b2dd24SShri Abhyankar 586b2b2dd24SShri Abhyankar /* elimination */ 587b2b2dd24SShri Abhyankar bjtmp = bj + bi[i]; 588b2b2dd24SShri Abhyankar nzL = bi[i + 1] - bi[i]; 589b2b2dd24SShri Abhyankar for (k = 0; k < nzL; k++) { 590b2b2dd24SShri Abhyankar row = bjtmp[k]; 591b2b2dd24SShri Abhyankar pc = rtmp + bs2 * row; 592c35f09e5SBarry Smith for (flg = 0, j = 0; j < bs2; j++) { 593c35f09e5SBarry Smith if (pc[j] != 0.0) { 594c35f09e5SBarry Smith flg = 1; 595c35f09e5SBarry Smith break; 596c35f09e5SBarry Smith } 597c35f09e5SBarry Smith } 598b2b2dd24SShri Abhyankar if (flg) { 599b2b2dd24SShri Abhyankar pv = b->a + bs2 * bdiag[row]; 60096b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 6019566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork)); 602b2b2dd24SShri Abhyankar 603a5b23f4aSJose E. Roman pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 604b2b2dd24SShri Abhyankar pv = b->a + bs2 * (bdiag[row + 1] + 1); 605b2b2dd24SShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 606b2b2dd24SShri Abhyankar for (j = 0; j < nz; j++) { 60796b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 608b2b2dd24SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 609b2b2dd24SShri Abhyankar v = rtmp + bs2 * pj[j]; 6109566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv)); 611b2b2dd24SShri Abhyankar pv += bs2; 612b2b2dd24SShri Abhyankar } 6139566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 614b2b2dd24SShri Abhyankar } 615b2b2dd24SShri Abhyankar } 616b2b2dd24SShri Abhyankar 617b2b2dd24SShri Abhyankar /* finished row so stick it into b->a */ 618b2b2dd24SShri Abhyankar /* L part */ 619b2b2dd24SShri Abhyankar pv = b->a + bs2 * bi[i]; 620b2b2dd24SShri Abhyankar pj = b->j + bi[i]; 621b2b2dd24SShri Abhyankar nz = bi[i + 1] - bi[i]; 62248a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 623b2b2dd24SShri Abhyankar 624a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 625b2b2dd24SShri Abhyankar pv = b->a + bs2 * bdiag[i]; 626b2b2dd24SShri Abhyankar pj = b->j + bdiag[i]; 6279566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 6289566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected)); 6297b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 630b2b2dd24SShri Abhyankar 631b2b2dd24SShri Abhyankar /* U part */ 632b2b2dd24SShri Abhyankar pv = b->a + bs2 * (bdiag[i + 1] + 1); 633b2b2dd24SShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 634b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; 63548a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 636b2b2dd24SShri Abhyankar } 6379566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, mwork)); 63826fbe8dcSKarl Rupp 6394dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 6404dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 641b2b2dd24SShri Abhyankar C->assembled = PETSC_TRUE; 64226fbe8dcSKarl Rupp 6439566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 644*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 645b2b2dd24SShri Abhyankar } 646b2b2dd24SShri Abhyankar 647e6580ceeSShri Abhyankar #if defined(PETSC_HAVE_SSE) 648e6580ceeSShri Abhyankar 649e6580ceeSShri Abhyankar #include PETSC_HAVE_SSE 650e6580ceeSShri Abhyankar 651e6580ceeSShri Abhyankar /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 652d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const MatFactorInfo *info) 653d71ae5a4SJacob Faibussowitsch { 654e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 655e6580ceeSShri Abhyankar int i, j, n = a->mbs; 656e6580ceeSShri Abhyankar int *bj = b->j, *bjtmp, *pj; 657e6580ceeSShri Abhyankar int row; 658e6580ceeSShri Abhyankar int *ajtmpold, nz, *bi = b->i; 659e6580ceeSShri Abhyankar int *diag_offset = b->diag, *ai = a->i, *aj = a->j; 660e6580ceeSShri Abhyankar MatScalar *pv, *v, *rtmp, *pc, *w, *x; 661e6580ceeSShri Abhyankar MatScalar *ba = b->a, *aa = a->a; 662e6580ceeSShri Abhyankar int nonzero = 0; 663a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 664182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 6650164db54SHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 666e6580ceeSShri Abhyankar 667e6580ceeSShri Abhyankar PetscFunctionBegin; 6680164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 669e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 670e6580ceeSShri Abhyankar 671aed4548fSBarry Smith PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 672aed4548fSBarry Smith PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 6739566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 674aed4548fSBarry Smith PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 675e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 676e6580ceeSShri Abhyankar /* colscale = 4; */ 677e6580ceeSShri Abhyankar /* } */ 678e6580ceeSShri Abhyankar for (i = 0; i < n; i++) { 679e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 680e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 681e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 682e6580ceeSShri Abhyankar /* zero out one register */ 683e6580ceeSShri Abhyankar XOR_PS(XMM7, XMM7); 684e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 685e6580ceeSShri Abhyankar x = rtmp + 16 * bjtmp[j]; 686e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 687e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 688e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 689e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 690e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 691e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 692e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 693e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 694e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 695e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 696e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 697e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 698e6580ceeSShri Abhyankar SSE_INLINE_END_1; 699e6580ceeSShri Abhyankar } 700e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 701e6580ceeSShri Abhyankar nz = ai[i + 1] - ai[i]; 702e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 703e6580ceeSShri Abhyankar v = aa + 16 * ai[i]; 704e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 705e6580ceeSShri Abhyankar x = rtmp + 16 * ajtmpold[j]; 706e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmpold[j]; */ 707e6580ceeSShri Abhyankar /* Copy v block into x block */ 708e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v, x) 709e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 710e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 711e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 712e6580ceeSShri Abhyankar 713e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 714e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 715e6580ceeSShri Abhyankar 716e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 717e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 718e6580ceeSShri Abhyankar 719e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 720e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 721e6580ceeSShri Abhyankar 722e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 723e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 724e6580ceeSShri Abhyankar 725e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 726e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 727e6580ceeSShri Abhyankar 728e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 729e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 730e6580ceeSShri Abhyankar 731e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 732e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 733e6580ceeSShri Abhyankar SSE_INLINE_END_2; 734e6580ceeSShri Abhyankar 735e6580ceeSShri Abhyankar v += 16; 736e6580ceeSShri Abhyankar } 737e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 738e6580ceeSShri Abhyankar row = *bjtmp++; 739e6580ceeSShri Abhyankar while (row < i) { 740e6580ceeSShri Abhyankar pc = rtmp + 16 * row; 741e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 742e6580ceeSShri Abhyankar /* Load block from lower triangle */ 743e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 744e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 745e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 746e6580ceeSShri Abhyankar 747e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 748e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 749e6580ceeSShri Abhyankar 750e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 751e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 752e6580ceeSShri Abhyankar 753e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 754e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 755e6580ceeSShri Abhyankar 756e6580ceeSShri Abhyankar /* Compare block to zero block */ 757e6580ceeSShri Abhyankar 758e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4, XMM7) 759e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4, XMM0) 760e6580ceeSShri Abhyankar 761e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5, XMM7) 762e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5, XMM1) 763e6580ceeSShri Abhyankar 764e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6, XMM7) 765e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6, XMM2) 766e6580ceeSShri Abhyankar 767e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7, XMM3) 768e6580ceeSShri Abhyankar 769e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 770e6580ceeSShri Abhyankar SSE_OR_PS(XMM6, XMM7) 771e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM4) 772e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM6) 773e6580ceeSShri Abhyankar SSE_INLINE_END_1; 774e6580ceeSShri Abhyankar 775e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 776e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 777e6580ceeSShri Abhyankar MOVEMASK(nonzero, XMM5); 778e6580ceeSShri Abhyankar 779e6580ceeSShri Abhyankar /* If block is nonzero ... */ 780e6580ceeSShri Abhyankar if (nonzero) { 781e6580ceeSShri Abhyankar pv = ba + 16 * diag_offset[row]; 782e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 783e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 784e6580ceeSShri Abhyankar 785e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 786e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 787e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 788e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 789e6580ceeSShri Abhyankar 790e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv, pc) 791e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 792e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 793e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 794e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM0) 795e6580ceeSShri Abhyankar 796e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 797e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 798e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM1) 799e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM5) 800e6580ceeSShri Abhyankar 801e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 802e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 803e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM2) 804e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM6) 805e6580ceeSShri Abhyankar 806e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 807e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 808e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 809e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM7) 810e6580ceeSShri Abhyankar 811e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 812e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 813e6580ceeSShri Abhyankar 814e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 815e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 816e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 817e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 818e6580ceeSShri Abhyankar 819e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 820e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 821e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 822e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 823e6580ceeSShri Abhyankar 824e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 825e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 826e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 827e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM7) 828e6580ceeSShri Abhyankar 829e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 830e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 831e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 832e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 833e6580ceeSShri Abhyankar 834e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 835e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 836e6580ceeSShri Abhyankar 837e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 838e6580ceeSShri Abhyankar 839e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 840e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 841e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 842e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 843e6580ceeSShri Abhyankar 844e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 845e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 846e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 847e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 848e6580ceeSShri Abhyankar 849e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 850e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 851e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 852e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 853e6580ceeSShri Abhyankar 854e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 855e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 856e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 857e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 858e6580ceeSShri Abhyankar 859e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 860e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 861e6580ceeSShri Abhyankar 862e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 863e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 864e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 865e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 866e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0, XMM7) 867e6580ceeSShri Abhyankar 868e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 869e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 870e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM7) 871e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 872e6580ceeSShri Abhyankar 873e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 874e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1, XMM1, 0x00) 875e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM2) 876e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 877e6580ceeSShri Abhyankar 878e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 879e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 880e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3, XMM7) 881e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM3) 882e6580ceeSShri Abhyankar 883e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 884e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 885e6580ceeSShri Abhyankar 886e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 887e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans after all. */ 888e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 889e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3, XMM0) 890e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 891e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2, XMM6) 892e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 893e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1, XMM5) 894e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 895e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0, XMM4) 896e6580ceeSShri Abhyankar SSE_INLINE_END_2; 897e6580ceeSShri Abhyankar 898e6580ceeSShri Abhyankar /* Update the row: */ 899e6580ceeSShri Abhyankar nz = bi[row + 1] - diag_offset[row] - 1; 900e6580ceeSShri Abhyankar pv += 16; 901e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 902e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 903e6580ceeSShri Abhyankar x = rtmp + 16 * pj[j]; 904e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 905e6580ceeSShri Abhyankar 906e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 907e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 908e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 909e6580ceeSShri Abhyankar /* Load First Column of X*/ 910e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 911e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 912e6580ceeSShri Abhyankar 913e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 914e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 915e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 916e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 917e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 918e6580ceeSShri Abhyankar 919e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 920e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 921e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 922e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 923e6580ceeSShri Abhyankar 924e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 925e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 926e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 927e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 928e6580ceeSShri Abhyankar 929e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 930e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 931e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 932e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 933e6580ceeSShri Abhyankar 934e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 935e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 936e6580ceeSShri Abhyankar 937e6580ceeSShri Abhyankar /* Second Column */ 938e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 939e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 940e6580ceeSShri Abhyankar 941e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 942e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 943e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 944e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 945e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 946e6580ceeSShri Abhyankar 947e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 948e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 949e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 950e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM7) 951e6580ceeSShri Abhyankar 952e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 953e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 954e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM2) 955e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM4) 956e6580ceeSShri Abhyankar 957e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 958e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 959e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 960e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 961e6580ceeSShri Abhyankar 962e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 963e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 964e6580ceeSShri Abhyankar 965e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 966e6580ceeSShri Abhyankar 967e6580ceeSShri Abhyankar /* Third Column */ 968e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 969e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 970e6580ceeSShri Abhyankar 971e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 972e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 973e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 974e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM0) 975e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 976e6580ceeSShri Abhyankar 977e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 978e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 979e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM1) 980e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM4) 981e6580ceeSShri Abhyankar 982e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 983e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 984e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM2) 985e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM5) 986e6580ceeSShri Abhyankar 987e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 988e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 989e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 990e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 991e6580ceeSShri Abhyankar 992e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 993e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 994e6580ceeSShri Abhyankar 995e6580ceeSShri Abhyankar /* Fourth Column */ 996e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 997e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 998e6580ceeSShri Abhyankar 999e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1000e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 1001e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1002e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1003e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1004e6580ceeSShri Abhyankar 1005e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1006e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1007e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1008e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 1009e6580ceeSShri Abhyankar 1010e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1011e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1012e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1013e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 1014e6580ceeSShri Abhyankar 1015e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1016e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1017e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 1018e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1019e6580ceeSShri Abhyankar 1020e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1021e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1022e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1023e6580ceeSShri Abhyankar pv += 16; 1024e6580ceeSShri Abhyankar } 10259566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1026e6580ceeSShri Abhyankar } 1027e6580ceeSShri Abhyankar row = *bjtmp++; 1028e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1029e6580ceeSShri Abhyankar } 1030e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1031e6580ceeSShri Abhyankar pv = ba + 16 * bi[i]; 1032e6580ceeSShri Abhyankar pj = bj + bi[i]; 1033e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 1034e6580ceeSShri Abhyankar 1035e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1036e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1037e6580ceeSShri Abhyankar x = rtmp + 16 * pj[j]; 1038e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1039e6580ceeSShri Abhyankar 1040e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 1041e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1042e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1043e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1044e6580ceeSShri Abhyankar 1045e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1046e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1047e6580ceeSShri Abhyankar 1048e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1049e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1050e6580ceeSShri Abhyankar 1051e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1052e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1053e6580ceeSShri Abhyankar 1054e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1055e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1056e6580ceeSShri Abhyankar 1057e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1058e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1059e6580ceeSShri Abhyankar 1060e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1061e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1062e6580ceeSShri Abhyankar 1063e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1064e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1065e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1066e6580ceeSShri Abhyankar pv += 16; 1067e6580ceeSShri Abhyankar } 1068e6580ceeSShri Abhyankar /* invert diagonal block */ 1069e6580ceeSShri Abhyankar w = ba + 16 * diag_offset[i]; 1070e6580ceeSShri Abhyankar if (pivotinblocks) { 10719566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 1072603e8f96SBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1073e6580ceeSShri Abhyankar } else { 10749566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1075e6580ceeSShri Abhyankar } 10769566063dSJacob Faibussowitsch /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */ 1077e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1078e6580ceeSShri Abhyankar } 1079e6580ceeSShri Abhyankar 10809566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 108126fbe8dcSKarl Rupp 1082e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1083e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1084e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 108526fbe8dcSKarl Rupp 10869566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); 1087e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1088e6580ceeSShri Abhyankar SSE_SCOPE_END; 1089*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1090e6580ceeSShri Abhyankar } 1091e6580ceeSShri Abhyankar 1092d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 1093d71ae5a4SJacob Faibussowitsch { 1094e6580ceeSShri Abhyankar Mat A = C; 1095e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1096e6580ceeSShri Abhyankar int i, j, n = a->mbs; 1097e6580ceeSShri Abhyankar unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj; 1098e6580ceeSShri Abhyankar unsigned short *aj = (unsigned short *)(a->j), *ajtmp; 1099e6580ceeSShri Abhyankar unsigned int row; 1100e6580ceeSShri Abhyankar int nz, *bi = b->i; 1101e6580ceeSShri Abhyankar int *diag_offset = b->diag, *ai = a->i; 1102e6580ceeSShri Abhyankar MatScalar *pv, *v, *rtmp, *pc, *w, *x; 1103e6580ceeSShri Abhyankar MatScalar *ba = b->a, *aa = a->a; 1104e6580ceeSShri Abhyankar int nonzero = 0; 1105a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 1106182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 11070164db54SHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1108e6580ceeSShri Abhyankar 1109e6580ceeSShri Abhyankar PetscFunctionBegin; 11100164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 1111e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 1112e6580ceeSShri Abhyankar 1113aed4548fSBarry Smith PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 1114aed4548fSBarry Smith PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 11159566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 1116aed4548fSBarry Smith PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 1117e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 1118e6580ceeSShri Abhyankar /* colscale = 4; */ 1119e6580ceeSShri Abhyankar /* } */ 1120e6580ceeSShri Abhyankar 1121e6580ceeSShri Abhyankar for (i = 0; i < n; i++) { 1122e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 1123e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 1124e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 1125e6580ceeSShri Abhyankar /* zero out one register */ 1126e6580ceeSShri Abhyankar XOR_PS(XMM7, XMM7); 1127e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1128e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)bjtmp[j]); 1129e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 1130e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 1131e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 1132e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1133e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 1134e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 1135e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 1136e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 1137e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 1138e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 1139e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1140e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 1141e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1142e6580ceeSShri Abhyankar } 1143e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 1144e6580ceeSShri Abhyankar nz = ai[i + 1] - ai[i]; 1145e6580ceeSShri Abhyankar ajtmp = aj + ai[i]; 1146e6580ceeSShri Abhyankar v = aa + 16 * ai[i]; 1147e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1148e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)ajtmp[j]); 1149e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmp[j]; */ 1150e6580ceeSShri Abhyankar /* Copy v block into x block */ 1151e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v, x) 1152e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1153e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1154e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 1155e6580ceeSShri Abhyankar 1156e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 1157e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 1158e6580ceeSShri Abhyankar 1159e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 1160e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 1161e6580ceeSShri Abhyankar 1162e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 1163e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 1164e6580ceeSShri Abhyankar 1165e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 1166e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 1167e6580ceeSShri Abhyankar 1168e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 1169e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 1170e6580ceeSShri Abhyankar 1171e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 1172e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 1173e6580ceeSShri Abhyankar 1174e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1175e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1176e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1177e6580ceeSShri Abhyankar 1178e6580ceeSShri Abhyankar v += 16; 1179e6580ceeSShri Abhyankar } 1180e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1181e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1182e6580ceeSShri Abhyankar while (row < i) { 1183e6580ceeSShri Abhyankar pc = rtmp + 16 * row; 1184e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 1185e6580ceeSShri Abhyankar /* Load block from lower triangle */ 1186e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1187e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1188e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 1189e6580ceeSShri Abhyankar 1190e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 1191e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 1192e6580ceeSShri Abhyankar 1193e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 1194e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 1195e6580ceeSShri Abhyankar 1196e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 1197e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 1198e6580ceeSShri Abhyankar 1199e6580ceeSShri Abhyankar /* Compare block to zero block */ 1200e6580ceeSShri Abhyankar 1201e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4, XMM7) 1202e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4, XMM0) 1203e6580ceeSShri Abhyankar 1204e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5, XMM7) 1205e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5, XMM1) 1206e6580ceeSShri Abhyankar 1207e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6, XMM7) 1208e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6, XMM2) 1209e6580ceeSShri Abhyankar 1210e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7, XMM3) 1211e6580ceeSShri Abhyankar 1212e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 1213e6580ceeSShri Abhyankar SSE_OR_PS(XMM6, XMM7) 1214e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM4) 1215e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM6) 1216e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1217e6580ceeSShri Abhyankar 1218e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 1219e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1220e6580ceeSShri Abhyankar MOVEMASK(nonzero, XMM5); 1221e6580ceeSShri Abhyankar 1222e6580ceeSShri Abhyankar /* If block is nonzero ... */ 1223e6580ceeSShri Abhyankar if (nonzero) { 1224e6580ceeSShri Abhyankar pv = ba + 16 * diag_offset[row]; 1225e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1226e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 1227e6580ceeSShri Abhyankar 1228e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1229e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1230e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 1231e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1232e6580ceeSShri Abhyankar 1233e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv, pc) 1234e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 1235e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 1236e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1237e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM0) 1238e6580ceeSShri Abhyankar 1239e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 1240e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1241e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM1) 1242e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM5) 1243e6580ceeSShri Abhyankar 1244e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 1245e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1246e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM2) 1247e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM6) 1248e6580ceeSShri Abhyankar 1249e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 1250e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1251e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1252e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM7) 1253e6580ceeSShri Abhyankar 1254e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 1255e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 1256e6580ceeSShri Abhyankar 1257e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 1258e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 1259e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1260e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1261e6580ceeSShri Abhyankar 1262e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 1263e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1264e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1265e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 1266e6580ceeSShri Abhyankar 1267e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 1268e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1269e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1270e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM7) 1271e6580ceeSShri Abhyankar 1272e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 1273e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1274e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 1275e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 1276e6580ceeSShri Abhyankar 1277e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 1278e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 1279e6580ceeSShri Abhyankar 1280e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 1281e6580ceeSShri Abhyankar 1282e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 1283e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 1284e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1285e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 1286e6580ceeSShri Abhyankar 1287e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 1288e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1289e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 1290e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1291e6580ceeSShri Abhyankar 1292e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 1293e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1294e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1295e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1296e6580ceeSShri Abhyankar 1297e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 1298e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1299e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1300e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1301e6580ceeSShri Abhyankar 1302e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 1303e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1304e6580ceeSShri Abhyankar 1305e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1306e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 1307e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 1308e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1309e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0, XMM7) 1310e6580ceeSShri Abhyankar 1311e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 1312e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1313e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM7) 1314e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 1315e6580ceeSShri Abhyankar 1316e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 1317e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1, XMM1, 0x00) 1318e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM2) 1319e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 1320e6580ceeSShri Abhyankar 1321e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 1322e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1323e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3, XMM7) 1324e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM3) 1325e6580ceeSShri Abhyankar 1326e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 1327e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1328e6580ceeSShri Abhyankar 1329e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1330e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans after all. */ 1331e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 1332e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3, XMM0) 1333e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 1334e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2, XMM6) 1335e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 1336e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1, XMM5) 1337e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 1338e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0, XMM4) 1339e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1340e6580ceeSShri Abhyankar 1341e6580ceeSShri Abhyankar /* Update the row: */ 1342e6580ceeSShri Abhyankar nz = bi[row + 1] - diag_offset[row] - 1; 1343e6580ceeSShri Abhyankar pv += 16; 1344e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1345e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1346e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)pj[j]); 1347e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 1348e6580ceeSShri Abhyankar 1349e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 1350e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1351e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 1352e6580ceeSShri Abhyankar /* Load First Column of X*/ 1353e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1354e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1355e6580ceeSShri Abhyankar 1356e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1357e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 1358e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1359e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1360e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1361e6580ceeSShri Abhyankar 1362e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 1363e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1364e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1365e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 1366e6580ceeSShri Abhyankar 1367e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 1368e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1369e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1370e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 1371e6580ceeSShri Abhyankar 1372e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 1373e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1374e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 1375e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1376e6580ceeSShri Abhyankar 1377e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1378e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1379e6580ceeSShri Abhyankar 1380e6580ceeSShri Abhyankar /* Second Column */ 1381e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1382e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1383e6580ceeSShri Abhyankar 1384e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1385e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 1386e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1387e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 1388e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 1389e6580ceeSShri Abhyankar 1390e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 1391e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1392e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 1393e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM7) 1394e6580ceeSShri Abhyankar 1395e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 1396e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1397e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM2) 1398e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM4) 1399e6580ceeSShri Abhyankar 1400e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 1401e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1402e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 1403e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 1404e6580ceeSShri Abhyankar 1405e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1406e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1407e6580ceeSShri Abhyankar 1408e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 1409e6580ceeSShri Abhyankar 1410e6580ceeSShri Abhyankar /* Third Column */ 1411e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1412e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1413e6580ceeSShri Abhyankar 1414e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1415e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 1416e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1417e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM0) 1418e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 1419e6580ceeSShri Abhyankar 1420e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 1421e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1422e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM1) 1423e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM4) 1424e6580ceeSShri Abhyankar 1425e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 1426e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1427e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM2) 1428e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM5) 1429e6580ceeSShri Abhyankar 1430e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 1431e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1432e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1433e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 1434e6580ceeSShri Abhyankar 1435e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1436e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1437e6580ceeSShri Abhyankar 1438e6580ceeSShri Abhyankar /* Fourth Column */ 1439e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1440e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1441e6580ceeSShri Abhyankar 1442e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1443e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 1444e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1445e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1446e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1447e6580ceeSShri Abhyankar 1448e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1449e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1450e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1451e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 1452e6580ceeSShri Abhyankar 1453e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1454e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1455e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1456e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 1457e6580ceeSShri Abhyankar 1458e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1459e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1460e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 1461e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1462e6580ceeSShri Abhyankar 1463e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1464e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1465e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1466e6580ceeSShri Abhyankar pv += 16; 1467e6580ceeSShri Abhyankar } 14689566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1469e6580ceeSShri Abhyankar } 1470e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1471e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1472e6580ceeSShri Abhyankar /* bjtmp++; */ 1473e6580ceeSShri Abhyankar } 1474e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1475e6580ceeSShri Abhyankar pv = ba + 16 * bi[i]; 1476e6580ceeSShri Abhyankar pj = bj + bi[i]; 1477e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 1478e6580ceeSShri Abhyankar 1479e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1480e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1481e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)pj[j]); 1482e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1483e6580ceeSShri Abhyankar 1484e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 1485e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1486e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1487e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1488e6580ceeSShri Abhyankar 1489e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1490e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1491e6580ceeSShri Abhyankar 1492e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1493e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1494e6580ceeSShri Abhyankar 1495e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1496e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1497e6580ceeSShri Abhyankar 1498e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1499e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1500e6580ceeSShri Abhyankar 1501e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1502e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1503e6580ceeSShri Abhyankar 1504e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1505e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1506e6580ceeSShri Abhyankar 1507e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1508e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1509e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1510e6580ceeSShri Abhyankar pv += 16; 1511e6580ceeSShri Abhyankar } 1512e6580ceeSShri Abhyankar /* invert diagonal block */ 1513e6580ceeSShri Abhyankar w = ba + 16 * diag_offset[i]; 1514e6580ceeSShri Abhyankar if (pivotinblocks) { 15159566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 1516603e8f96SBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1517e6580ceeSShri Abhyankar } else { 15189566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1519e6580ceeSShri Abhyankar } 1520e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1521e6580ceeSShri Abhyankar } 1522e6580ceeSShri Abhyankar 15239566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 152426fbe8dcSKarl Rupp 1525e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1526e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1527e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 152826fbe8dcSKarl Rupp 15299566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); 1530e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1531e6580ceeSShri Abhyankar SSE_SCOPE_END; 1532*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1533e6580ceeSShri Abhyankar } 1534e6580ceeSShri Abhyankar 1535d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info) 1536d71ae5a4SJacob Faibussowitsch { 1537e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1538e6580ceeSShri Abhyankar int i, j, n = a->mbs; 1539e6580ceeSShri Abhyankar unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj; 1540e6580ceeSShri Abhyankar unsigned int row; 1541e6580ceeSShri Abhyankar int *ajtmpold, nz, *bi = b->i; 1542e6580ceeSShri Abhyankar int *diag_offset = b->diag, *ai = a->i, *aj = a->j; 1543e6580ceeSShri Abhyankar MatScalar *pv, *v, *rtmp, *pc, *w, *x; 1544e6580ceeSShri Abhyankar MatScalar *ba = b->a, *aa = a->a; 1545e6580ceeSShri Abhyankar int nonzero = 0; 1546a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 1547182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 15480164db54SHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1549e6580ceeSShri Abhyankar 1550e6580ceeSShri Abhyankar PetscFunctionBegin; 15510164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 1552e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 1553e6580ceeSShri Abhyankar 1554aed4548fSBarry Smith PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 1555aed4548fSBarry Smith PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 15569566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 1557aed4548fSBarry Smith PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 1558e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 1559e6580ceeSShri Abhyankar /* colscale = 4; */ 1560e6580ceeSShri Abhyankar /* } */ 1561ad540459SPierre Jolivet if ((unsigned long)bj == (unsigned long)aj) return (MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1562e6580ceeSShri Abhyankar 1563e6580ceeSShri Abhyankar for (i = 0; i < n; i++) { 1564e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 1565e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 1566e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 1567e6580ceeSShri Abhyankar /* zero out one register */ 1568e6580ceeSShri Abhyankar XOR_PS(XMM7, XMM7); 1569e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1570e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)bjtmp[j]); 1571e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 1572e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 1573e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 1574e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1575e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 1576e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 1577e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 1578e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 1579e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 1580e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 1581e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1582e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 1583e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1584e6580ceeSShri Abhyankar } 1585e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 1586e6580ceeSShri Abhyankar nz = ai[i + 1] - ai[i]; 1587e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 1588e6580ceeSShri Abhyankar v = aa + 16 * ai[i]; 1589e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1590e6580ceeSShri Abhyankar x = rtmp + 16 * ajtmpold[j]; 1591e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmpold[j]; */ 1592e6580ceeSShri Abhyankar /* Copy v block into x block */ 1593e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v, x) 1594e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1595e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1596e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 1597e6580ceeSShri Abhyankar 1598e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 1599e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 1600e6580ceeSShri Abhyankar 1601e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 1602e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 1603e6580ceeSShri Abhyankar 1604e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 1605e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 1606e6580ceeSShri Abhyankar 1607e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 1608e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 1609e6580ceeSShri Abhyankar 1610e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 1611e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 1612e6580ceeSShri Abhyankar 1613e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 1614e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 1615e6580ceeSShri Abhyankar 1616e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1617e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1618e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1619e6580ceeSShri Abhyankar 1620e6580ceeSShri Abhyankar v += 16; 1621e6580ceeSShri Abhyankar } 1622e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1623e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1624e6580ceeSShri Abhyankar while (row < i) { 1625e6580ceeSShri Abhyankar pc = rtmp + 16 * row; 1626e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 1627e6580ceeSShri Abhyankar /* Load block from lower triangle */ 1628e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1629e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1630e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 1631e6580ceeSShri Abhyankar 1632e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 1633e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 1634e6580ceeSShri Abhyankar 1635e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 1636e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 1637e6580ceeSShri Abhyankar 1638e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 1639e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 1640e6580ceeSShri Abhyankar 1641e6580ceeSShri Abhyankar /* Compare block to zero block */ 1642e6580ceeSShri Abhyankar 1643e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4, XMM7) 1644e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4, XMM0) 1645e6580ceeSShri Abhyankar 1646e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5, XMM7) 1647e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5, XMM1) 1648e6580ceeSShri Abhyankar 1649e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6, XMM7) 1650e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6, XMM2) 1651e6580ceeSShri Abhyankar 1652e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7, XMM3) 1653e6580ceeSShri Abhyankar 1654e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 1655e6580ceeSShri Abhyankar SSE_OR_PS(XMM6, XMM7) 1656e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM4) 1657e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM6) 1658e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1659e6580ceeSShri Abhyankar 1660e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 1661e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1662e6580ceeSShri Abhyankar MOVEMASK(nonzero, XMM5); 1663e6580ceeSShri Abhyankar 1664e6580ceeSShri Abhyankar /* If block is nonzero ... */ 1665e6580ceeSShri Abhyankar if (nonzero) { 1666e6580ceeSShri Abhyankar pv = ba + 16 * diag_offset[row]; 1667e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1668e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 1669e6580ceeSShri Abhyankar 1670e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1671e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1672e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 1673e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1674e6580ceeSShri Abhyankar 1675e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv, pc) 1676e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 1677e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 1678e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1679e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM0) 1680e6580ceeSShri Abhyankar 1681e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 1682e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1683e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM1) 1684e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM5) 1685e6580ceeSShri Abhyankar 1686e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 1687e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1688e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM2) 1689e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM6) 1690e6580ceeSShri Abhyankar 1691e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 1692e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1693e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1694e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM7) 1695e6580ceeSShri Abhyankar 1696e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 1697e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 1698e6580ceeSShri Abhyankar 1699e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 1700e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 1701e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1702e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1703e6580ceeSShri Abhyankar 1704e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 1705e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1706e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1707e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 1708e6580ceeSShri Abhyankar 1709e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 1710e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1711e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1712e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM7) 1713e6580ceeSShri Abhyankar 1714e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 1715e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1716e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 1717e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 1718e6580ceeSShri Abhyankar 1719e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 1720e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 1721e6580ceeSShri Abhyankar 1722e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 1723e6580ceeSShri Abhyankar 1724e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 1725e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 1726e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1727e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 1728e6580ceeSShri Abhyankar 1729e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 1730e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1731e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 1732e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1733e6580ceeSShri Abhyankar 1734e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 1735e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1736e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1737e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1738e6580ceeSShri Abhyankar 1739e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 1740e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1741e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1742e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1743e6580ceeSShri Abhyankar 1744e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 1745e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1746e6580ceeSShri Abhyankar 1747e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1748e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 1749e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 1750e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1751e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0, XMM7) 1752e6580ceeSShri Abhyankar 1753e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 1754e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1755e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM7) 1756e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 1757e6580ceeSShri Abhyankar 1758e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 1759e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1, XMM1, 0x00) 1760e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM2) 1761e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 1762e6580ceeSShri Abhyankar 1763e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 1764e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1765e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3, XMM7) 1766e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM3) 1767e6580ceeSShri Abhyankar 1768e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 1769e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1770e6580ceeSShri Abhyankar 1771e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1772e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans after all. */ 1773e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 1774e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3, XMM0) 1775e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 1776e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2, XMM6) 1777e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 1778e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1, XMM5) 1779e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 1780e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0, XMM4) 1781e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1782e6580ceeSShri Abhyankar 1783e6580ceeSShri Abhyankar /* Update the row: */ 1784e6580ceeSShri Abhyankar nz = bi[row + 1] - diag_offset[row] - 1; 1785e6580ceeSShri Abhyankar pv += 16; 1786e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1787e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1788e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)pj[j]); 1789e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 1790e6580ceeSShri Abhyankar 1791e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 1792e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1793e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 1794e6580ceeSShri Abhyankar /* Load First Column of X*/ 1795e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1796e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1797e6580ceeSShri Abhyankar 1798e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1799e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 1800e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1801e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1802e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1803e6580ceeSShri Abhyankar 1804e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 1805e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1806e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1807e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 1808e6580ceeSShri Abhyankar 1809e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 1810e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1811e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1812e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 1813e6580ceeSShri Abhyankar 1814e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 1815e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1816e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 1817e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1818e6580ceeSShri Abhyankar 1819e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1820e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1821e6580ceeSShri Abhyankar 1822e6580ceeSShri Abhyankar /* Second Column */ 1823e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1824e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1825e6580ceeSShri Abhyankar 1826e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1827e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 1828e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1829e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 1830e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 1831e6580ceeSShri Abhyankar 1832e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 1833e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1834e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 1835e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM7) 1836e6580ceeSShri Abhyankar 1837e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 1838e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1839e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM2) 1840e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM4) 1841e6580ceeSShri Abhyankar 1842e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 1843e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1844e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 1845e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 1846e6580ceeSShri Abhyankar 1847e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1848e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1849e6580ceeSShri Abhyankar 1850e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 1851e6580ceeSShri Abhyankar 1852e6580ceeSShri Abhyankar /* Third Column */ 1853e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1854e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1855e6580ceeSShri Abhyankar 1856e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1857e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 1858e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1859e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM0) 1860e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 1861e6580ceeSShri Abhyankar 1862e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 1863e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1864e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM1) 1865e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM4) 1866e6580ceeSShri Abhyankar 1867e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 1868e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1869e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM2) 1870e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM5) 1871e6580ceeSShri Abhyankar 1872e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 1873e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1874e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1875e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 1876e6580ceeSShri Abhyankar 1877e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1878e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1879e6580ceeSShri Abhyankar 1880e6580ceeSShri Abhyankar /* Fourth Column */ 1881e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1882e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1883e6580ceeSShri Abhyankar 1884e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1885e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 1886e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1887e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1888e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1889e6580ceeSShri Abhyankar 1890e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1891e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1892e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1893e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 1894e6580ceeSShri Abhyankar 1895e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1896e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1897e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1898e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 1899e6580ceeSShri Abhyankar 1900e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1901e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1902e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 1903e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1904e6580ceeSShri Abhyankar 1905e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1906e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1907e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1908e6580ceeSShri Abhyankar pv += 16; 1909e6580ceeSShri Abhyankar } 19109566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1911e6580ceeSShri Abhyankar } 1912e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1913e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1914e6580ceeSShri Abhyankar /* bjtmp++; */ 1915e6580ceeSShri Abhyankar } 1916e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1917e6580ceeSShri Abhyankar pv = ba + 16 * bi[i]; 1918e6580ceeSShri Abhyankar pj = bj + bi[i]; 1919e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 1920e6580ceeSShri Abhyankar 1921e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1922e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1923e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)pj[j]); 1924e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1925e6580ceeSShri Abhyankar 1926e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 1927e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1928e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1929e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1930e6580ceeSShri Abhyankar 1931e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1932e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1933e6580ceeSShri Abhyankar 1934e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1935e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1936e6580ceeSShri Abhyankar 1937e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1938e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1939e6580ceeSShri Abhyankar 1940e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1941e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1942e6580ceeSShri Abhyankar 1943e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1944e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1945e6580ceeSShri Abhyankar 1946e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1947e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1948e6580ceeSShri Abhyankar 1949e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1950e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1951e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1952e6580ceeSShri Abhyankar pv += 16; 1953e6580ceeSShri Abhyankar } 1954e6580ceeSShri Abhyankar /* invert diagonal block */ 1955e6580ceeSShri Abhyankar w = ba + 16 * diag_offset[i]; 1956e6580ceeSShri Abhyankar if (pivotinblocks) { 19579566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, , &zeropivotdetected)); 1958603e8f96SBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1959e6580ceeSShri Abhyankar } else { 19609566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1961e6580ceeSShri Abhyankar } 19629566063dSJacob Faibussowitsch /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */ 1963e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1964e6580ceeSShri Abhyankar } 1965e6580ceeSShri Abhyankar 19669566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 196726fbe8dcSKarl Rupp 1968e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1969e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1970e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 197126fbe8dcSKarl Rupp 19729566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); 1973e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1974e6580ceeSShri Abhyankar SSE_SCOPE_END; 1975*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1976e6580ceeSShri Abhyankar } 1977e6580ceeSShri Abhyankar 1978e6580ceeSShri Abhyankar #endif 1979