1be1d678aSKris Buschelman 283287d42SBarry Smith /* 383287d42SBarry Smith Factorization code for BAIJ format. 483287d42SBarry Smith */ 5c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 6af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 783287d42SBarry Smith 883287d42SBarry Smith /* ------------------------------------------------------------*/ 983287d42SBarry Smith /* 1083287d42SBarry Smith Version for when blocks are 4 by 4 1183287d42SBarry Smith */ 129371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info) { 1383287d42SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1483287d42SBarry Smith IS isrow = b->row, isicol = b->icol; 155d0c19d7SBarry Smith const PetscInt *r, *ic; 165d0c19d7SBarry Smith PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 17690b6cddSBarry Smith PetscInt *ajtmpold, *ajtmp, nz, row; 18690b6cddSBarry Smith PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 1983287d42SBarry Smith MatScalar *pv, *v, *rtmp, *pc, *w, *x; 2083287d42SBarry Smith MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 2183287d42SBarry Smith MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 2283287d42SBarry Smith MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 2383287d42SBarry Smith MatScalar m13, m14, m15, m16; 2483287d42SBarry Smith MatScalar *ba = b->a, *aa = a->a; 25a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 26182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 270164db54SHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 2883287d42SBarry Smith 2983287d42SBarry Smith PetscFunctionBegin; 309566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 319566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 329566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 330164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 3483287d42SBarry Smith 3583287d42SBarry Smith for (i = 0; i < n; i++) { 3683287d42SBarry Smith nz = bi[i + 1] - bi[i]; 3783287d42SBarry Smith ajtmp = bj + bi[i]; 3883287d42SBarry Smith for (j = 0; j < nz; j++) { 3983287d42SBarry Smith x = rtmp + 16 * ajtmp[j]; 4083287d42SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 4183287d42SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 4283287d42SBarry Smith } 4383287d42SBarry Smith /* load in initial (unfactored row) */ 4483287d42SBarry Smith idx = r[i]; 4583287d42SBarry Smith nz = ai[idx + 1] - ai[idx]; 4683287d42SBarry Smith ajtmpold = aj + ai[idx]; 4783287d42SBarry Smith v = aa + 16 * ai[idx]; 4883287d42SBarry Smith for (j = 0; j < nz; j++) { 4983287d42SBarry Smith x = rtmp + 16 * ic[ajtmpold[j]]; 509371c9d4SSatish Balay x[0] = v[0]; 519371c9d4SSatish Balay x[1] = v[1]; 529371c9d4SSatish Balay x[2] = v[2]; 539371c9d4SSatish Balay x[3] = v[3]; 549371c9d4SSatish Balay x[4] = v[4]; 559371c9d4SSatish Balay x[5] = v[5]; 569371c9d4SSatish Balay x[6] = v[6]; 579371c9d4SSatish Balay x[7] = v[7]; 589371c9d4SSatish Balay x[8] = v[8]; 599371c9d4SSatish Balay x[9] = v[9]; 609371c9d4SSatish Balay x[10] = v[10]; 619371c9d4SSatish Balay x[11] = v[11]; 629371c9d4SSatish Balay x[12] = v[12]; 639371c9d4SSatish Balay x[13] = v[13]; 649371c9d4SSatish Balay x[14] = v[14]; 659371c9d4SSatish Balay x[15] = v[15]; 6683287d42SBarry Smith v += 16; 6783287d42SBarry Smith } 6883287d42SBarry Smith row = *ajtmp++; 6983287d42SBarry Smith while (row < i) { 7083287d42SBarry Smith pc = rtmp + 16 * row; 719371c9d4SSatish Balay p1 = pc[0]; 729371c9d4SSatish Balay p2 = pc[1]; 739371c9d4SSatish Balay p3 = pc[2]; 749371c9d4SSatish Balay p4 = pc[3]; 759371c9d4SSatish Balay p5 = pc[4]; 769371c9d4SSatish Balay p6 = pc[5]; 779371c9d4SSatish Balay p7 = pc[6]; 789371c9d4SSatish Balay p8 = pc[7]; 799371c9d4SSatish Balay p9 = pc[8]; 809371c9d4SSatish Balay p10 = pc[9]; 819371c9d4SSatish Balay p11 = pc[10]; 829371c9d4SSatish Balay p12 = pc[11]; 839371c9d4SSatish Balay p13 = pc[12]; 849371c9d4SSatish Balay p14 = pc[13]; 859371c9d4SSatish Balay p15 = pc[14]; 869371c9d4SSatish Balay p16 = pc[15]; 879371c9d4SSatish Balay if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 8883287d42SBarry Smith pv = ba + 16 * diag_offset[row]; 8983287d42SBarry Smith pj = bj + diag_offset[row] + 1; 909371c9d4SSatish Balay x1 = pv[0]; 919371c9d4SSatish Balay x2 = pv[1]; 929371c9d4SSatish Balay x3 = pv[2]; 939371c9d4SSatish Balay x4 = pv[3]; 949371c9d4SSatish Balay x5 = pv[4]; 959371c9d4SSatish Balay x6 = pv[5]; 969371c9d4SSatish Balay x7 = pv[6]; 979371c9d4SSatish Balay x8 = pv[7]; 989371c9d4SSatish Balay x9 = pv[8]; 999371c9d4SSatish Balay x10 = pv[9]; 1009371c9d4SSatish Balay x11 = pv[10]; 1019371c9d4SSatish Balay x12 = pv[11]; 1029371c9d4SSatish Balay x13 = pv[12]; 1039371c9d4SSatish Balay x14 = pv[13]; 1049371c9d4SSatish Balay x15 = pv[14]; 1059371c9d4SSatish Balay x16 = pv[15]; 10683287d42SBarry Smith pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 10783287d42SBarry Smith pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 10883287d42SBarry Smith pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 10983287d42SBarry Smith pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 11083287d42SBarry Smith 11183287d42SBarry Smith pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 11283287d42SBarry Smith pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 11383287d42SBarry Smith pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 11483287d42SBarry Smith pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 11583287d42SBarry Smith 11683287d42SBarry Smith pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 11783287d42SBarry Smith pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 11883287d42SBarry Smith pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 11983287d42SBarry Smith pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 12083287d42SBarry Smith 12183287d42SBarry Smith pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 12283287d42SBarry Smith pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 12383287d42SBarry Smith pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 12483287d42SBarry Smith pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 12583287d42SBarry Smith 12683287d42SBarry Smith nz = bi[row + 1] - diag_offset[row] - 1; 12783287d42SBarry Smith pv += 16; 12883287d42SBarry Smith for (j = 0; j < nz; j++) { 1299371c9d4SSatish Balay x1 = pv[0]; 1309371c9d4SSatish Balay x2 = pv[1]; 1319371c9d4SSatish Balay x3 = pv[2]; 1329371c9d4SSatish Balay x4 = pv[3]; 1339371c9d4SSatish Balay x5 = pv[4]; 1349371c9d4SSatish Balay x6 = pv[5]; 1359371c9d4SSatish Balay x7 = pv[6]; 1369371c9d4SSatish Balay x8 = pv[7]; 1379371c9d4SSatish Balay x9 = pv[8]; 1389371c9d4SSatish Balay x10 = pv[9]; 1399371c9d4SSatish Balay x11 = pv[10]; 1409371c9d4SSatish Balay x12 = pv[11]; 1419371c9d4SSatish Balay x13 = pv[12]; 1429371c9d4SSatish Balay x14 = pv[13]; 1439371c9d4SSatish Balay x15 = pv[14]; 1449371c9d4SSatish Balay x16 = pv[15]; 14583287d42SBarry Smith x = rtmp + 16 * pj[j]; 14683287d42SBarry Smith x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 14783287d42SBarry Smith x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 14883287d42SBarry Smith x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 14983287d42SBarry Smith x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 15083287d42SBarry Smith 15183287d42SBarry Smith x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 15283287d42SBarry Smith x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 15383287d42SBarry Smith x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 15483287d42SBarry Smith x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 15583287d42SBarry Smith 15683287d42SBarry Smith x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 15783287d42SBarry Smith x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 15883287d42SBarry Smith x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 15983287d42SBarry Smith x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 16083287d42SBarry Smith 16183287d42SBarry Smith x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 16283287d42SBarry Smith x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 16383287d42SBarry Smith x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 16483287d42SBarry Smith x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 16583287d42SBarry Smith 16683287d42SBarry Smith pv += 16; 16783287d42SBarry Smith } 1689566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 16983287d42SBarry Smith } 17083287d42SBarry Smith row = *ajtmp++; 17183287d42SBarry Smith } 17283287d42SBarry Smith /* finished row so stick it into b->a */ 17383287d42SBarry Smith pv = ba + 16 * bi[i]; 17483287d42SBarry Smith pj = bj + bi[i]; 17583287d42SBarry Smith nz = bi[i + 1] - bi[i]; 17683287d42SBarry Smith for (j = 0; j < nz; j++) { 17783287d42SBarry Smith x = rtmp + 16 * pj[j]; 1789371c9d4SSatish Balay pv[0] = x[0]; 1799371c9d4SSatish Balay pv[1] = x[1]; 1809371c9d4SSatish Balay pv[2] = x[2]; 1819371c9d4SSatish Balay pv[3] = x[3]; 1829371c9d4SSatish Balay pv[4] = x[4]; 1839371c9d4SSatish Balay pv[5] = x[5]; 1849371c9d4SSatish Balay pv[6] = x[6]; 1859371c9d4SSatish Balay pv[7] = x[7]; 1869371c9d4SSatish Balay pv[8] = x[8]; 1879371c9d4SSatish Balay pv[9] = x[9]; 1889371c9d4SSatish Balay pv[10] = x[10]; 1899371c9d4SSatish Balay pv[11] = x[11]; 1909371c9d4SSatish Balay pv[12] = x[12]; 1919371c9d4SSatish Balay pv[13] = x[13]; 1929371c9d4SSatish Balay pv[14] = x[14]; 1939371c9d4SSatish Balay pv[15] = x[15]; 19483287d42SBarry Smith pv += 16; 19583287d42SBarry Smith } 19683287d42SBarry Smith /* invert diagonal block */ 19783287d42SBarry Smith w = ba + 16 * diag_offset[i]; 198bcd9e38bSBarry Smith if (pivotinblocks) { 1999566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 2007b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 201bcd9e38bSBarry Smith } else { 2029566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 203bcd9e38bSBarry Smith } 20483287d42SBarry Smith } 20583287d42SBarry Smith 2069566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 2079566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 2089566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 20926fbe8dcSKarl Rupp 21006e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 21106e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 21283287d42SBarry Smith C->assembled = PETSC_TRUE; 21326fbe8dcSKarl Rupp 2149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 21583287d42SBarry Smith PetscFunctionReturn(0); 21683287d42SBarry Smith } 217e6580ceeSShri Abhyankar 2184dd39f65SShri Abhyankar /* MatLUFactorNumeric_SeqBAIJ_4 - 2194dd39f65SShri Abhyankar copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 22096b95a6bSBarry Smith PetscKernel_A_gets_A_times_B() 22196b95a6bSBarry Smith PetscKernel_A_gets_A_minus_B_times_C() 22296b95a6bSBarry Smith PetscKernel_A_gets_inverse_A() 223209027a4SShri Abhyankar */ 224c0c7eb62SShri Abhyankar 2259371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info) { 226209027a4SShri Abhyankar Mat C = B; 227209027a4SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 228209027a4SShri Abhyankar IS isrow = b->row, isicol = b->icol; 2295a586d82SBarry Smith const PetscInt *r, *ic; 230bbd65245SShri Abhyankar PetscInt i, j, k, nz, nzL, row; 231bbd65245SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 232bbd65245SShri Abhyankar const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 233209027a4SShri Abhyankar MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 234bbd65245SShri Abhyankar PetscInt flg; 2356ba06ab7SHong Zhang PetscReal shift; 236a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 237209027a4SShri Abhyankar 238209027a4SShri Abhyankar PetscFunctionBegin; 2390164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 2409566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 2419566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 242209027a4SShri Abhyankar 2436ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONE) { 2446ba06ab7SHong Zhang shift = 0; 2456ba06ab7SHong Zhang } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 2466ba06ab7SHong Zhang shift = info->shiftamount; 2476ba06ab7SHong Zhang } 2486ba06ab7SHong Zhang 249209027a4SShri Abhyankar /* generate work space needed by the factorization */ 2509566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 2519566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, bs2 * n)); 252209027a4SShri Abhyankar 253209027a4SShri Abhyankar for (i = 0; i < n; i++) { 254209027a4SShri Abhyankar /* zero rtmp */ 255209027a4SShri Abhyankar /* L part */ 256209027a4SShri Abhyankar nz = bi[i + 1] - bi[i]; 257209027a4SShri Abhyankar bjtmp = bj + bi[i]; 258*48a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 259209027a4SShri Abhyankar 260209027a4SShri Abhyankar /* U part */ 26178bb4007SShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 26278bb4007SShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 263*48a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 26478bb4007SShri Abhyankar 26578bb4007SShri Abhyankar /* load in initial (unfactored row) */ 26678bb4007SShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]]; 26778bb4007SShri Abhyankar ajtmp = aj + ai[r[i]]; 26878bb4007SShri Abhyankar v = aa + bs2 * ai[r[i]]; 269*48a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 27078bb4007SShri Abhyankar 27178bb4007SShri Abhyankar /* elimination */ 27278bb4007SShri Abhyankar bjtmp = bj + bi[i]; 27378bb4007SShri Abhyankar nzL = bi[i + 1] - bi[i]; 27478bb4007SShri Abhyankar for (k = 0; k < nzL; k++) { 27578bb4007SShri Abhyankar row = bjtmp[k]; 27678bb4007SShri Abhyankar pc = rtmp + bs2 * row; 277c35f09e5SBarry Smith for (flg = 0, j = 0; j < bs2; j++) { 278c35f09e5SBarry Smith if (pc[j] != 0.0) { 279c35f09e5SBarry Smith flg = 1; 280c35f09e5SBarry Smith break; 281c35f09e5SBarry Smith } 282c35f09e5SBarry Smith } 28378bb4007SShri Abhyankar if (flg) { 28478bb4007SShri Abhyankar pv = b->a + bs2 * bdiag[row]; 28596b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 2869566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork)); 28778bb4007SShri Abhyankar 288a5b23f4aSJose E. Roman pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 28978bb4007SShri Abhyankar pv = b->a + bs2 * (bdiag[row + 1] + 1); 29078bb4007SShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 29178bb4007SShri Abhyankar for (j = 0; j < nz; j++) { 29296b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 29378bb4007SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 29478bb4007SShri Abhyankar v = rtmp + bs2 * pj[j]; 2959566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv)); 29678bb4007SShri Abhyankar pv += bs2; 29778bb4007SShri Abhyankar } 2989566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 29978bb4007SShri Abhyankar } 30078bb4007SShri Abhyankar } 30178bb4007SShri Abhyankar 30278bb4007SShri Abhyankar /* finished row so stick it into b->a */ 30378bb4007SShri Abhyankar /* L part */ 30478bb4007SShri Abhyankar pv = b->a + bs2 * bi[i]; 30578bb4007SShri Abhyankar pj = b->j + bi[i]; 30678bb4007SShri Abhyankar nz = bi[i + 1] - bi[i]; 307*48a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 30878bb4007SShri Abhyankar 309a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 31078bb4007SShri Abhyankar pv = b->a + bs2 * bdiag[i]; 31178bb4007SShri Abhyankar pj = b->j + bdiag[i]; 3129566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 3139566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected)); 3147b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 31578bb4007SShri Abhyankar 31678bb4007SShri Abhyankar /* U part */ 31778bb4007SShri Abhyankar pv = b->a + bs2 * (bdiag[i + 1] + 1); 31878bb4007SShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 31978bb4007SShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; 320*48a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 32178bb4007SShri Abhyankar } 32278bb4007SShri Abhyankar 3239566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, mwork)); 3249566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 3259566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 32626fbe8dcSKarl Rupp 3274dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4; 3284dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 32978bb4007SShri Abhyankar C->assembled = PETSC_TRUE; 33026fbe8dcSKarl Rupp 3319566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 33278bb4007SShri Abhyankar PetscFunctionReturn(0); 33378bb4007SShri Abhyankar } 33478bb4007SShri Abhyankar 3359371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) { 336e6580ceeSShri Abhyankar /* 337e6580ceeSShri Abhyankar Default Version for when blocks are 4 by 4 Using natural ordering 338e6580ceeSShri Abhyankar */ 339e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 340e6580ceeSShri Abhyankar PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 341e6580ceeSShri Abhyankar PetscInt *ajtmpold, *ajtmp, nz, row; 342e6580ceeSShri Abhyankar PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 343e6580ceeSShri Abhyankar MatScalar *pv, *v, *rtmp, *pc, *w, *x; 344e6580ceeSShri Abhyankar MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 345e6580ceeSShri Abhyankar MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 346e6580ceeSShri Abhyankar MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 347e6580ceeSShri Abhyankar MatScalar m13, m14, m15, m16; 348e6580ceeSShri Abhyankar MatScalar *ba = b->a, *aa = a->a; 349a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 350182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 3510164db54SHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 352e6580ceeSShri Abhyankar 353e6580ceeSShri Abhyankar PetscFunctionBegin; 3540164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 3559566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 356e6580ceeSShri Abhyankar 357e6580ceeSShri Abhyankar for (i = 0; i < n; i++) { 358e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 359e6580ceeSShri Abhyankar ajtmp = bj + bi[i]; 360e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 361e6580ceeSShri Abhyankar x = rtmp + 16 * ajtmp[j]; 362e6580ceeSShri Abhyankar x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 363e6580ceeSShri Abhyankar x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 364e6580ceeSShri Abhyankar } 365e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 366e6580ceeSShri Abhyankar nz = ai[i + 1] - ai[i]; 367e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 368e6580ceeSShri Abhyankar v = aa + 16 * ai[i]; 369e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 370e6580ceeSShri Abhyankar x = rtmp + 16 * ajtmpold[j]; 3719371c9d4SSatish Balay x[0] = v[0]; 3729371c9d4SSatish Balay x[1] = v[1]; 3739371c9d4SSatish Balay x[2] = v[2]; 3749371c9d4SSatish Balay x[3] = v[3]; 3759371c9d4SSatish Balay x[4] = v[4]; 3769371c9d4SSatish Balay x[5] = v[5]; 3779371c9d4SSatish Balay x[6] = v[6]; 3789371c9d4SSatish Balay x[7] = v[7]; 3799371c9d4SSatish Balay x[8] = v[8]; 3809371c9d4SSatish Balay x[9] = v[9]; 3819371c9d4SSatish Balay x[10] = v[10]; 3829371c9d4SSatish Balay x[11] = v[11]; 3839371c9d4SSatish Balay x[12] = v[12]; 3849371c9d4SSatish Balay x[13] = v[13]; 3859371c9d4SSatish Balay x[14] = v[14]; 3869371c9d4SSatish Balay x[15] = v[15]; 387e6580ceeSShri Abhyankar v += 16; 388e6580ceeSShri Abhyankar } 389e6580ceeSShri Abhyankar row = *ajtmp++; 390e6580ceeSShri Abhyankar while (row < i) { 391e6580ceeSShri Abhyankar pc = rtmp + 16 * row; 3929371c9d4SSatish Balay p1 = pc[0]; 3939371c9d4SSatish Balay p2 = pc[1]; 3949371c9d4SSatish Balay p3 = pc[2]; 3959371c9d4SSatish Balay p4 = pc[3]; 3969371c9d4SSatish Balay p5 = pc[4]; 3979371c9d4SSatish Balay p6 = pc[5]; 3989371c9d4SSatish Balay p7 = pc[6]; 3999371c9d4SSatish Balay p8 = pc[7]; 4009371c9d4SSatish Balay p9 = pc[8]; 4019371c9d4SSatish Balay p10 = pc[9]; 4029371c9d4SSatish Balay p11 = pc[10]; 4039371c9d4SSatish Balay p12 = pc[11]; 4049371c9d4SSatish Balay p13 = pc[12]; 4059371c9d4SSatish Balay p14 = pc[13]; 4069371c9d4SSatish Balay p15 = pc[14]; 4079371c9d4SSatish Balay p16 = pc[15]; 4089371c9d4SSatish Balay if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 409e6580ceeSShri Abhyankar pv = ba + 16 * diag_offset[row]; 410e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 4119371c9d4SSatish Balay x1 = pv[0]; 4129371c9d4SSatish Balay x2 = pv[1]; 4139371c9d4SSatish Balay x3 = pv[2]; 4149371c9d4SSatish Balay x4 = pv[3]; 4159371c9d4SSatish Balay x5 = pv[4]; 4169371c9d4SSatish Balay x6 = pv[5]; 4179371c9d4SSatish Balay x7 = pv[6]; 4189371c9d4SSatish Balay x8 = pv[7]; 4199371c9d4SSatish Balay x9 = pv[8]; 4209371c9d4SSatish Balay x10 = pv[9]; 4219371c9d4SSatish Balay x11 = pv[10]; 4229371c9d4SSatish Balay x12 = pv[11]; 4239371c9d4SSatish Balay x13 = pv[12]; 4249371c9d4SSatish Balay x14 = pv[13]; 4259371c9d4SSatish Balay x15 = pv[14]; 4269371c9d4SSatish Balay x16 = pv[15]; 427e6580ceeSShri Abhyankar pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 428e6580ceeSShri Abhyankar pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 429e6580ceeSShri Abhyankar pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 430e6580ceeSShri Abhyankar pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 431e6580ceeSShri Abhyankar 432e6580ceeSShri Abhyankar pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 433e6580ceeSShri Abhyankar pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 434e6580ceeSShri Abhyankar pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 435e6580ceeSShri Abhyankar pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 436e6580ceeSShri Abhyankar 437e6580ceeSShri Abhyankar pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 438e6580ceeSShri Abhyankar pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 439e6580ceeSShri Abhyankar pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 440e6580ceeSShri Abhyankar pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 441e6580ceeSShri Abhyankar 442e6580ceeSShri Abhyankar pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 443e6580ceeSShri Abhyankar pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 444e6580ceeSShri Abhyankar pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 445e6580ceeSShri Abhyankar pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 446e6580ceeSShri Abhyankar nz = bi[row + 1] - diag_offset[row] - 1; 447e6580ceeSShri Abhyankar pv += 16; 448e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 4499371c9d4SSatish Balay x1 = pv[0]; 4509371c9d4SSatish Balay x2 = pv[1]; 4519371c9d4SSatish Balay x3 = pv[2]; 4529371c9d4SSatish Balay x4 = pv[3]; 4539371c9d4SSatish Balay x5 = pv[4]; 4549371c9d4SSatish Balay x6 = pv[5]; 4559371c9d4SSatish Balay x7 = pv[6]; 4569371c9d4SSatish Balay x8 = pv[7]; 4579371c9d4SSatish Balay x9 = pv[8]; 4589371c9d4SSatish Balay x10 = pv[9]; 4599371c9d4SSatish Balay x11 = pv[10]; 4609371c9d4SSatish Balay x12 = pv[11]; 4619371c9d4SSatish Balay x13 = pv[12]; 4629371c9d4SSatish Balay x14 = pv[13]; 4639371c9d4SSatish Balay x15 = pv[14]; 4649371c9d4SSatish Balay x16 = pv[15]; 465e6580ceeSShri Abhyankar x = rtmp + 16 * pj[j]; 466e6580ceeSShri Abhyankar x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 467e6580ceeSShri Abhyankar x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 468e6580ceeSShri Abhyankar x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 469e6580ceeSShri Abhyankar x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 470e6580ceeSShri Abhyankar 471e6580ceeSShri Abhyankar x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 472e6580ceeSShri Abhyankar x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 473e6580ceeSShri Abhyankar x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 474e6580ceeSShri Abhyankar x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 475e6580ceeSShri Abhyankar 476e6580ceeSShri Abhyankar x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 477e6580ceeSShri Abhyankar x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 478e6580ceeSShri Abhyankar x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 479e6580ceeSShri Abhyankar x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 480e6580ceeSShri Abhyankar 481e6580ceeSShri Abhyankar x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 482e6580ceeSShri Abhyankar x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 483e6580ceeSShri Abhyankar x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 484e6580ceeSShri Abhyankar x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 485e6580ceeSShri Abhyankar 486e6580ceeSShri Abhyankar pv += 16; 487e6580ceeSShri Abhyankar } 4889566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 489e6580ceeSShri Abhyankar } 490e6580ceeSShri Abhyankar row = *ajtmp++; 491e6580ceeSShri Abhyankar } 492e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 493e6580ceeSShri Abhyankar pv = ba + 16 * bi[i]; 494e6580ceeSShri Abhyankar pj = bj + bi[i]; 495e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 496e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 497e6580ceeSShri Abhyankar x = rtmp + 16 * pj[j]; 4989371c9d4SSatish Balay pv[0] = x[0]; 4999371c9d4SSatish Balay pv[1] = x[1]; 5009371c9d4SSatish Balay pv[2] = x[2]; 5019371c9d4SSatish Balay pv[3] = x[3]; 5029371c9d4SSatish Balay pv[4] = x[4]; 5039371c9d4SSatish Balay pv[5] = x[5]; 5049371c9d4SSatish Balay pv[6] = x[6]; 5059371c9d4SSatish Balay pv[7] = x[7]; 5069371c9d4SSatish Balay pv[8] = x[8]; 5079371c9d4SSatish Balay pv[9] = x[9]; 5089371c9d4SSatish Balay pv[10] = x[10]; 5099371c9d4SSatish Balay pv[11] = x[11]; 5109371c9d4SSatish Balay pv[12] = x[12]; 5119371c9d4SSatish Balay pv[13] = x[13]; 5129371c9d4SSatish Balay pv[14] = x[14]; 5139371c9d4SSatish Balay pv[15] = x[15]; 514e6580ceeSShri Abhyankar pv += 16; 515e6580ceeSShri Abhyankar } 516e6580ceeSShri Abhyankar /* invert diagonal block */ 517e6580ceeSShri Abhyankar w = ba + 16 * diag_offset[i]; 518e6580ceeSShri Abhyankar if (pivotinblocks) { 5199566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 5207b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 521e6580ceeSShri Abhyankar } else { 5229566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 523e6580ceeSShri Abhyankar } 524e6580ceeSShri Abhyankar } 525e6580ceeSShri Abhyankar 5269566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 52726fbe8dcSKarl Rupp 52806e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 52906e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 530e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 53126fbe8dcSKarl Rupp 5329566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 533e6580ceeSShri Abhyankar PetscFunctionReturn(0); 534e6580ceeSShri Abhyankar } 535c0c7eb62SShri Abhyankar 536209027a4SShri Abhyankar /* 5374dd39f65SShri Abhyankar MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 5384dd39f65SShri Abhyankar copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 539209027a4SShri Abhyankar */ 5409371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) { 541209027a4SShri Abhyankar Mat C = B; 542209027a4SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 543bbd65245SShri Abhyankar PetscInt i, j, k, nz, nzL, row; 544bbd65245SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 545bbd65245SShri Abhyankar const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 546209027a4SShri Abhyankar MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 547bbd65245SShri Abhyankar PetscInt flg; 5486ba06ab7SHong Zhang PetscReal shift; 549a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 550e6580ceeSShri Abhyankar 551209027a4SShri Abhyankar PetscFunctionBegin; 5520164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 5530164db54SHong Zhang 554209027a4SShri Abhyankar /* generate work space needed by the factorization */ 5559566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 5569566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, bs2 * n)); 557209027a4SShri Abhyankar 5586ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONE) { 5596ba06ab7SHong Zhang shift = 0; 5606ba06ab7SHong Zhang } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 5616ba06ab7SHong Zhang shift = info->shiftamount; 5626ba06ab7SHong Zhang } 5636ba06ab7SHong Zhang 564209027a4SShri Abhyankar for (i = 0; i < n; i++) { 565209027a4SShri Abhyankar /* zero rtmp */ 566209027a4SShri Abhyankar /* L part */ 567209027a4SShri Abhyankar nz = bi[i + 1] - bi[i]; 568209027a4SShri Abhyankar bjtmp = bj + bi[i]; 569*48a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 570209027a4SShri Abhyankar 571209027a4SShri Abhyankar /* U part */ 572b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 573b2b2dd24SShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 574*48a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 575b2b2dd24SShri Abhyankar 576b2b2dd24SShri Abhyankar /* load in initial (unfactored row) */ 577b2b2dd24SShri Abhyankar nz = ai[i + 1] - ai[i]; 578b2b2dd24SShri Abhyankar ajtmp = aj + ai[i]; 579b2b2dd24SShri Abhyankar v = aa + bs2 * ai[i]; 580*48a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 581b2b2dd24SShri Abhyankar 582b2b2dd24SShri Abhyankar /* elimination */ 583b2b2dd24SShri Abhyankar bjtmp = bj + bi[i]; 584b2b2dd24SShri Abhyankar nzL = bi[i + 1] - bi[i]; 585b2b2dd24SShri Abhyankar for (k = 0; k < nzL; k++) { 586b2b2dd24SShri Abhyankar row = bjtmp[k]; 587b2b2dd24SShri Abhyankar pc = rtmp + bs2 * row; 588c35f09e5SBarry Smith for (flg = 0, j = 0; j < bs2; j++) { 589c35f09e5SBarry Smith if (pc[j] != 0.0) { 590c35f09e5SBarry Smith flg = 1; 591c35f09e5SBarry Smith break; 592c35f09e5SBarry Smith } 593c35f09e5SBarry Smith } 594b2b2dd24SShri Abhyankar if (flg) { 595b2b2dd24SShri Abhyankar pv = b->a + bs2 * bdiag[row]; 59696b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 5979566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork)); 598b2b2dd24SShri Abhyankar 599a5b23f4aSJose E. Roman pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 600b2b2dd24SShri Abhyankar pv = b->a + bs2 * (bdiag[row + 1] + 1); 601b2b2dd24SShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 602b2b2dd24SShri Abhyankar for (j = 0; j < nz; j++) { 60396b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 604b2b2dd24SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 605b2b2dd24SShri Abhyankar v = rtmp + bs2 * pj[j]; 6069566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv)); 607b2b2dd24SShri Abhyankar pv += bs2; 608b2b2dd24SShri Abhyankar } 6099566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 610b2b2dd24SShri Abhyankar } 611b2b2dd24SShri Abhyankar } 612b2b2dd24SShri Abhyankar 613b2b2dd24SShri Abhyankar /* finished row so stick it into b->a */ 614b2b2dd24SShri Abhyankar /* L part */ 615b2b2dd24SShri Abhyankar pv = b->a + bs2 * bi[i]; 616b2b2dd24SShri Abhyankar pj = b->j + bi[i]; 617b2b2dd24SShri Abhyankar nz = bi[i + 1] - bi[i]; 618*48a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 619b2b2dd24SShri Abhyankar 620a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 621b2b2dd24SShri Abhyankar pv = b->a + bs2 * bdiag[i]; 622b2b2dd24SShri Abhyankar pj = b->j + bdiag[i]; 6239566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 6249566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected)); 6257b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 626b2b2dd24SShri Abhyankar 627b2b2dd24SShri Abhyankar /* U part */ 628b2b2dd24SShri Abhyankar pv = b->a + bs2 * (bdiag[i + 1] + 1); 629b2b2dd24SShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 630b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; 631*48a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 632b2b2dd24SShri Abhyankar } 6339566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, mwork)); 63426fbe8dcSKarl Rupp 6354dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 6364dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 637b2b2dd24SShri Abhyankar C->assembled = PETSC_TRUE; 63826fbe8dcSKarl Rupp 6399566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 640b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 641b2b2dd24SShri Abhyankar } 642b2b2dd24SShri Abhyankar 643e6580ceeSShri Abhyankar #if defined(PETSC_HAVE_SSE) 644e6580ceeSShri Abhyankar 645e6580ceeSShri Abhyankar #include PETSC_HAVE_SSE 646e6580ceeSShri Abhyankar 647e6580ceeSShri Abhyankar /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 6489371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const MatFactorInfo *info) { 649e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 650e6580ceeSShri Abhyankar int i, j, n = a->mbs; 651e6580ceeSShri Abhyankar int *bj = b->j, *bjtmp, *pj; 652e6580ceeSShri Abhyankar int row; 653e6580ceeSShri Abhyankar int *ajtmpold, nz, *bi = b->i; 654e6580ceeSShri Abhyankar int *diag_offset = b->diag, *ai = a->i, *aj = a->j; 655e6580ceeSShri Abhyankar MatScalar *pv, *v, *rtmp, *pc, *w, *x; 656e6580ceeSShri Abhyankar MatScalar *ba = b->a, *aa = a->a; 657e6580ceeSShri Abhyankar int nonzero = 0; 658a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 659182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 6600164db54SHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 661e6580ceeSShri Abhyankar 662e6580ceeSShri Abhyankar PetscFunctionBegin; 6630164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 664e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 665e6580ceeSShri Abhyankar 666aed4548fSBarry Smith PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 667aed4548fSBarry Smith PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 6689566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 669aed4548fSBarry Smith PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 670e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 671e6580ceeSShri Abhyankar /* colscale = 4; */ 672e6580ceeSShri Abhyankar /* } */ 673e6580ceeSShri Abhyankar for (i = 0; i < n; i++) { 674e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 675e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 676e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 677e6580ceeSShri Abhyankar /* zero out one register */ 678e6580ceeSShri Abhyankar XOR_PS(XMM7, XMM7); 679e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 680e6580ceeSShri Abhyankar x = rtmp + 16 * bjtmp[j]; 681e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 682e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 683e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 684e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 685e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 686e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 687e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 688e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 689e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 690e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 691e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 692e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 693e6580ceeSShri Abhyankar SSE_INLINE_END_1; 694e6580ceeSShri Abhyankar } 695e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 696e6580ceeSShri Abhyankar nz = ai[i + 1] - ai[i]; 697e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 698e6580ceeSShri Abhyankar v = aa + 16 * ai[i]; 699e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 700e6580ceeSShri Abhyankar x = rtmp + 16 * ajtmpold[j]; 701e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmpold[j]; */ 702e6580ceeSShri Abhyankar /* Copy v block into x block */ 703e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v, x) 704e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 705e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 706e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 707e6580ceeSShri Abhyankar 708e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 709e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 710e6580ceeSShri Abhyankar 711e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 712e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 713e6580ceeSShri Abhyankar 714e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 715e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 716e6580ceeSShri Abhyankar 717e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 718e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 719e6580ceeSShri Abhyankar 720e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 721e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 722e6580ceeSShri Abhyankar 723e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 724e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 725e6580ceeSShri Abhyankar 726e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 727e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 728e6580ceeSShri Abhyankar SSE_INLINE_END_2; 729e6580ceeSShri Abhyankar 730e6580ceeSShri Abhyankar v += 16; 731e6580ceeSShri Abhyankar } 732e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 733e6580ceeSShri Abhyankar row = *bjtmp++; 734e6580ceeSShri Abhyankar while (row < i) { 735e6580ceeSShri Abhyankar pc = rtmp + 16 * row; 736e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 737e6580ceeSShri Abhyankar /* Load block from lower triangle */ 738e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 739e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 740e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 741e6580ceeSShri Abhyankar 742e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 743e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 744e6580ceeSShri Abhyankar 745e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 746e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 747e6580ceeSShri Abhyankar 748e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 749e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 750e6580ceeSShri Abhyankar 751e6580ceeSShri Abhyankar /* Compare block to zero block */ 752e6580ceeSShri Abhyankar 753e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4, XMM7) 754e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4, XMM0) 755e6580ceeSShri Abhyankar 756e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5, XMM7) 757e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5, XMM1) 758e6580ceeSShri Abhyankar 759e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6, XMM7) 760e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6, XMM2) 761e6580ceeSShri Abhyankar 762e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7, XMM3) 763e6580ceeSShri Abhyankar 764e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 765e6580ceeSShri Abhyankar SSE_OR_PS(XMM6, XMM7) 766e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM4) 767e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM6) 768e6580ceeSShri Abhyankar SSE_INLINE_END_1; 769e6580ceeSShri Abhyankar 770e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 771e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 772e6580ceeSShri Abhyankar MOVEMASK(nonzero, XMM5); 773e6580ceeSShri Abhyankar 774e6580ceeSShri Abhyankar /* If block is nonzero ... */ 775e6580ceeSShri Abhyankar if (nonzero) { 776e6580ceeSShri Abhyankar pv = ba + 16 * diag_offset[row]; 777e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 778e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 779e6580ceeSShri Abhyankar 780e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 781e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 782e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 783e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 784e6580ceeSShri Abhyankar 785e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv, pc) 786e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 787e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 788e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 789e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM0) 790e6580ceeSShri Abhyankar 791e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 792e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 793e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM1) 794e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM5) 795e6580ceeSShri Abhyankar 796e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 797e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 798e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM2) 799e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM6) 800e6580ceeSShri Abhyankar 801e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 802e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 803e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 804e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM7) 805e6580ceeSShri Abhyankar 806e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 807e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 808e6580ceeSShri Abhyankar 809e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 810e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 811e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 812e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 813e6580ceeSShri Abhyankar 814e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 815e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 816e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 817e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 818e6580ceeSShri Abhyankar 819e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 820e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 821e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 822e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM7) 823e6580ceeSShri Abhyankar 824e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 825e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 826e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 827e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 828e6580ceeSShri Abhyankar 829e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 830e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 831e6580ceeSShri Abhyankar 832e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 833e6580ceeSShri Abhyankar 834e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 835e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 836e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 837e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 838e6580ceeSShri Abhyankar 839e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 840e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 841e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 842e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 843e6580ceeSShri Abhyankar 844e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 845e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 846e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 847e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 848e6580ceeSShri Abhyankar 849e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 850e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 851e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 852e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 853e6580ceeSShri Abhyankar 854e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 855e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 856e6580ceeSShri Abhyankar 857e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 858e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 859e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 860e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 861e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0, XMM7) 862e6580ceeSShri Abhyankar 863e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 864e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 865e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM7) 866e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 867e6580ceeSShri Abhyankar 868e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 869e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1, XMM1, 0x00) 870e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM2) 871e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 872e6580ceeSShri Abhyankar 873e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 874e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 875e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3, XMM7) 876e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM3) 877e6580ceeSShri Abhyankar 878e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 879e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 880e6580ceeSShri Abhyankar 881e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 882e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans after all. */ 883e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 884e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3, XMM0) 885e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 886e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2, XMM6) 887e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 888e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1, XMM5) 889e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 890e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0, XMM4) 891e6580ceeSShri Abhyankar SSE_INLINE_END_2; 892e6580ceeSShri Abhyankar 893e6580ceeSShri Abhyankar /* Update the row: */ 894e6580ceeSShri Abhyankar nz = bi[row + 1] - diag_offset[row] - 1; 895e6580ceeSShri Abhyankar pv += 16; 896e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 897e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 898e6580ceeSShri Abhyankar x = rtmp + 16 * pj[j]; 899e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 900e6580ceeSShri Abhyankar 901e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 902e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 903e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 904e6580ceeSShri Abhyankar /* Load First Column of X*/ 905e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 906e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 907e6580ceeSShri Abhyankar 908e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 909e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 910e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 911e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 912e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 913e6580ceeSShri Abhyankar 914e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 915e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 916e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 917e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 918e6580ceeSShri Abhyankar 919e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 920e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 921e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 922e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 923e6580ceeSShri Abhyankar 924e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 925e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 926e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 927e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 928e6580ceeSShri Abhyankar 929e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 930e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 931e6580ceeSShri Abhyankar 932e6580ceeSShri Abhyankar /* Second Column */ 933e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 934e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 935e6580ceeSShri Abhyankar 936e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 937e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 938e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 939e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 940e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 941e6580ceeSShri Abhyankar 942e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 943e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 944e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 945e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM7) 946e6580ceeSShri Abhyankar 947e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 948e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 949e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM2) 950e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM4) 951e6580ceeSShri Abhyankar 952e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 953e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 954e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 955e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 956e6580ceeSShri Abhyankar 957e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 958e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 959e6580ceeSShri Abhyankar 960e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 961e6580ceeSShri Abhyankar 962e6580ceeSShri Abhyankar /* Third Column */ 963e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 964e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 965e6580ceeSShri Abhyankar 966e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 967e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 968e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 969e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM0) 970e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 971e6580ceeSShri Abhyankar 972e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 973e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 974e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM1) 975e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM4) 976e6580ceeSShri Abhyankar 977e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 978e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 979e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM2) 980e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM5) 981e6580ceeSShri Abhyankar 982e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 983e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 984e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 985e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 986e6580ceeSShri Abhyankar 987e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 988e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 989e6580ceeSShri Abhyankar 990e6580ceeSShri Abhyankar /* Fourth Column */ 991e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 992e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 993e6580ceeSShri Abhyankar 994e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 995e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 996e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 997e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 998e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 999e6580ceeSShri Abhyankar 1000e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1001e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1002e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1003e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 1004e6580ceeSShri Abhyankar 1005e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1006e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1007e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1008e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 1009e6580ceeSShri Abhyankar 1010e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1011e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1012e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 1013e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1014e6580ceeSShri Abhyankar 1015e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1016e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1017e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1018e6580ceeSShri Abhyankar pv += 16; 1019e6580ceeSShri Abhyankar } 10209566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1021e6580ceeSShri Abhyankar } 1022e6580ceeSShri Abhyankar row = *bjtmp++; 1023e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1024e6580ceeSShri Abhyankar } 1025e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1026e6580ceeSShri Abhyankar pv = ba + 16 * bi[i]; 1027e6580ceeSShri Abhyankar pj = bj + bi[i]; 1028e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 1029e6580ceeSShri Abhyankar 1030e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1031e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1032e6580ceeSShri Abhyankar x = rtmp + 16 * pj[j]; 1033e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1034e6580ceeSShri Abhyankar 1035e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 1036e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1037e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1038e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1039e6580ceeSShri Abhyankar 1040e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1041e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1042e6580ceeSShri Abhyankar 1043e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1044e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1045e6580ceeSShri Abhyankar 1046e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1047e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1048e6580ceeSShri Abhyankar 1049e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1050e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1051e6580ceeSShri Abhyankar 1052e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1053e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1054e6580ceeSShri Abhyankar 1055e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1056e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1057e6580ceeSShri Abhyankar 1058e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1059e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1060e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1061e6580ceeSShri Abhyankar pv += 16; 1062e6580ceeSShri Abhyankar } 1063e6580ceeSShri Abhyankar /* invert diagonal block */ 1064e6580ceeSShri Abhyankar w = ba + 16 * diag_offset[i]; 1065e6580ceeSShri Abhyankar if (pivotinblocks) { 10669566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 1067603e8f96SBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1068e6580ceeSShri Abhyankar } else { 10699566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1070e6580ceeSShri Abhyankar } 10719566063dSJacob Faibussowitsch /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */ 1072e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1073e6580ceeSShri Abhyankar } 1074e6580ceeSShri Abhyankar 10759566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 107626fbe8dcSKarl Rupp 1077e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1078e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1079e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 108026fbe8dcSKarl Rupp 10819566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); 1082e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1083e6580ceeSShri Abhyankar SSE_SCOPE_END; 1084e6580ceeSShri Abhyankar PetscFunctionReturn(0); 1085e6580ceeSShri Abhyankar } 1086e6580ceeSShri Abhyankar 10879371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) { 1088e6580ceeSShri Abhyankar Mat A = C; 1089e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1090e6580ceeSShri Abhyankar int i, j, n = a->mbs; 1091e6580ceeSShri Abhyankar unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj; 1092e6580ceeSShri Abhyankar unsigned short *aj = (unsigned short *)(a->j), *ajtmp; 1093e6580ceeSShri Abhyankar unsigned int row; 1094e6580ceeSShri Abhyankar int nz, *bi = b->i; 1095e6580ceeSShri Abhyankar int *diag_offset = b->diag, *ai = a->i; 1096e6580ceeSShri Abhyankar MatScalar *pv, *v, *rtmp, *pc, *w, *x; 1097e6580ceeSShri Abhyankar MatScalar *ba = b->a, *aa = a->a; 1098e6580ceeSShri Abhyankar int nonzero = 0; 1099a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 1100182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 11010164db54SHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1102e6580ceeSShri Abhyankar 1103e6580ceeSShri Abhyankar PetscFunctionBegin; 11040164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 1105e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 1106e6580ceeSShri Abhyankar 1107aed4548fSBarry Smith PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 1108aed4548fSBarry Smith PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 11099566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 1110aed4548fSBarry Smith PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 1111e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 1112e6580ceeSShri Abhyankar /* colscale = 4; */ 1113e6580ceeSShri Abhyankar /* } */ 1114e6580ceeSShri Abhyankar 1115e6580ceeSShri Abhyankar for (i = 0; i < n; i++) { 1116e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 1117e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 1118e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 1119e6580ceeSShri Abhyankar /* zero out one register */ 1120e6580ceeSShri Abhyankar XOR_PS(XMM7, XMM7); 1121e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1122e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)bjtmp[j]); 1123e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 1124e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 1125e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 1126e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1127e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 1128e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 1129e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 1130e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 1131e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 1132e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 1133e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1134e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 1135e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1136e6580ceeSShri Abhyankar } 1137e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 1138e6580ceeSShri Abhyankar nz = ai[i + 1] - ai[i]; 1139e6580ceeSShri Abhyankar ajtmp = aj + ai[i]; 1140e6580ceeSShri Abhyankar v = aa + 16 * ai[i]; 1141e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1142e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)ajtmp[j]); 1143e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmp[j]; */ 1144e6580ceeSShri Abhyankar /* Copy v block into x block */ 1145e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v, x) 1146e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1147e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1148e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 1149e6580ceeSShri Abhyankar 1150e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 1151e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 1152e6580ceeSShri Abhyankar 1153e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 1154e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 1155e6580ceeSShri Abhyankar 1156e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 1157e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 1158e6580ceeSShri Abhyankar 1159e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 1160e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 1161e6580ceeSShri Abhyankar 1162e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 1163e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 1164e6580ceeSShri Abhyankar 1165e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 1166e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 1167e6580ceeSShri Abhyankar 1168e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1169e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1170e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1171e6580ceeSShri Abhyankar 1172e6580ceeSShri Abhyankar v += 16; 1173e6580ceeSShri Abhyankar } 1174e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1175e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1176e6580ceeSShri Abhyankar while (row < i) { 1177e6580ceeSShri Abhyankar pc = rtmp + 16 * row; 1178e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 1179e6580ceeSShri Abhyankar /* Load block from lower triangle */ 1180e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1181e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1182e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 1183e6580ceeSShri Abhyankar 1184e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 1185e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 1186e6580ceeSShri Abhyankar 1187e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 1188e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 1189e6580ceeSShri Abhyankar 1190e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 1191e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 1192e6580ceeSShri Abhyankar 1193e6580ceeSShri Abhyankar /* Compare block to zero block */ 1194e6580ceeSShri Abhyankar 1195e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4, XMM7) 1196e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4, XMM0) 1197e6580ceeSShri Abhyankar 1198e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5, XMM7) 1199e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5, XMM1) 1200e6580ceeSShri Abhyankar 1201e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6, XMM7) 1202e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6, XMM2) 1203e6580ceeSShri Abhyankar 1204e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7, XMM3) 1205e6580ceeSShri Abhyankar 1206e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 1207e6580ceeSShri Abhyankar SSE_OR_PS(XMM6, XMM7) 1208e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM4) 1209e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM6) 1210e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1211e6580ceeSShri Abhyankar 1212e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 1213e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1214e6580ceeSShri Abhyankar MOVEMASK(nonzero, XMM5); 1215e6580ceeSShri Abhyankar 1216e6580ceeSShri Abhyankar /* If block is nonzero ... */ 1217e6580ceeSShri Abhyankar if (nonzero) { 1218e6580ceeSShri Abhyankar pv = ba + 16 * diag_offset[row]; 1219e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1220e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 1221e6580ceeSShri Abhyankar 1222e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1223e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1224e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 1225e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1226e6580ceeSShri Abhyankar 1227e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv, pc) 1228e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 1229e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 1230e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1231e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM0) 1232e6580ceeSShri Abhyankar 1233e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 1234e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1235e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM1) 1236e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM5) 1237e6580ceeSShri Abhyankar 1238e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 1239e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1240e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM2) 1241e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM6) 1242e6580ceeSShri Abhyankar 1243e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 1244e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1245e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1246e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM7) 1247e6580ceeSShri Abhyankar 1248e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 1249e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 1250e6580ceeSShri Abhyankar 1251e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 1252e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 1253e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1254e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1255e6580ceeSShri Abhyankar 1256e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 1257e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1258e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1259e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 1260e6580ceeSShri Abhyankar 1261e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 1262e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1263e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1264e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM7) 1265e6580ceeSShri Abhyankar 1266e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 1267e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1268e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 1269e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 1270e6580ceeSShri Abhyankar 1271e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 1272e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 1273e6580ceeSShri Abhyankar 1274e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 1275e6580ceeSShri Abhyankar 1276e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 1277e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 1278e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1279e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 1280e6580ceeSShri Abhyankar 1281e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 1282e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1283e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 1284e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1285e6580ceeSShri Abhyankar 1286e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 1287e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1288e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1289e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1290e6580ceeSShri Abhyankar 1291e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 1292e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1293e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1294e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1295e6580ceeSShri Abhyankar 1296e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 1297e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1298e6580ceeSShri Abhyankar 1299e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1300e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 1301e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 1302e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1303e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0, XMM7) 1304e6580ceeSShri Abhyankar 1305e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 1306e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1307e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM7) 1308e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 1309e6580ceeSShri Abhyankar 1310e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 1311e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1, XMM1, 0x00) 1312e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM2) 1313e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 1314e6580ceeSShri Abhyankar 1315e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 1316e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1317e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3, XMM7) 1318e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM3) 1319e6580ceeSShri Abhyankar 1320e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 1321e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1322e6580ceeSShri Abhyankar 1323e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1324e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans after all. */ 1325e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 1326e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3, XMM0) 1327e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 1328e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2, XMM6) 1329e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 1330e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1, XMM5) 1331e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 1332e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0, XMM4) 1333e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1334e6580ceeSShri Abhyankar 1335e6580ceeSShri Abhyankar /* Update the row: */ 1336e6580ceeSShri Abhyankar nz = bi[row + 1] - diag_offset[row] - 1; 1337e6580ceeSShri Abhyankar pv += 16; 1338e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1339e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1340e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)pj[j]); 1341e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 1342e6580ceeSShri Abhyankar 1343e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 1344e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1345e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 1346e6580ceeSShri Abhyankar /* Load First Column of X*/ 1347e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1348e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1349e6580ceeSShri Abhyankar 1350e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1351e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 1352e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1353e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1354e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1355e6580ceeSShri Abhyankar 1356e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 1357e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1358e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1359e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 1360e6580ceeSShri Abhyankar 1361e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 1362e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1363e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1364e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 1365e6580ceeSShri Abhyankar 1366e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 1367e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1368e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 1369e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1370e6580ceeSShri Abhyankar 1371e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1372e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1373e6580ceeSShri Abhyankar 1374e6580ceeSShri Abhyankar /* Second Column */ 1375e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1376e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1377e6580ceeSShri Abhyankar 1378e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1379e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 1380e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1381e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 1382e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 1383e6580ceeSShri Abhyankar 1384e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 1385e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1386e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 1387e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM7) 1388e6580ceeSShri Abhyankar 1389e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 1390e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1391e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM2) 1392e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM4) 1393e6580ceeSShri Abhyankar 1394e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 1395e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1396e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 1397e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 1398e6580ceeSShri Abhyankar 1399e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1400e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1401e6580ceeSShri Abhyankar 1402e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 1403e6580ceeSShri Abhyankar 1404e6580ceeSShri Abhyankar /* Third Column */ 1405e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1406e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1407e6580ceeSShri Abhyankar 1408e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1409e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 1410e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1411e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM0) 1412e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 1413e6580ceeSShri Abhyankar 1414e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 1415e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1416e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM1) 1417e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM4) 1418e6580ceeSShri Abhyankar 1419e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 1420e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1421e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM2) 1422e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM5) 1423e6580ceeSShri Abhyankar 1424e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 1425e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1426e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1427e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 1428e6580ceeSShri Abhyankar 1429e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1430e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1431e6580ceeSShri Abhyankar 1432e6580ceeSShri Abhyankar /* Fourth Column */ 1433e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1434e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1435e6580ceeSShri Abhyankar 1436e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1437e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 1438e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1439e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1440e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1441e6580ceeSShri Abhyankar 1442e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1443e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1444e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1445e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 1446e6580ceeSShri Abhyankar 1447e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1448e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1449e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1450e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 1451e6580ceeSShri Abhyankar 1452e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1453e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1454e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 1455e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1456e6580ceeSShri Abhyankar 1457e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1458e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1459e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1460e6580ceeSShri Abhyankar pv += 16; 1461e6580ceeSShri Abhyankar } 14629566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1463e6580ceeSShri Abhyankar } 1464e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1465e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1466e6580ceeSShri Abhyankar /* bjtmp++; */ 1467e6580ceeSShri Abhyankar } 1468e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1469e6580ceeSShri Abhyankar pv = ba + 16 * bi[i]; 1470e6580ceeSShri Abhyankar pj = bj + bi[i]; 1471e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 1472e6580ceeSShri Abhyankar 1473e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1474e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1475e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)pj[j]); 1476e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1477e6580ceeSShri Abhyankar 1478e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 1479e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1480e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1481e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1482e6580ceeSShri Abhyankar 1483e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1484e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1485e6580ceeSShri Abhyankar 1486e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1487e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1488e6580ceeSShri Abhyankar 1489e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1490e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1491e6580ceeSShri Abhyankar 1492e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1493e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1494e6580ceeSShri Abhyankar 1495e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1496e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1497e6580ceeSShri Abhyankar 1498e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1499e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1500e6580ceeSShri Abhyankar 1501e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1502e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1503e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1504e6580ceeSShri Abhyankar pv += 16; 1505e6580ceeSShri Abhyankar } 1506e6580ceeSShri Abhyankar /* invert diagonal block */ 1507e6580ceeSShri Abhyankar w = ba + 16 * diag_offset[i]; 1508e6580ceeSShri Abhyankar if (pivotinblocks) { 15099566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 1510603e8f96SBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1511e6580ceeSShri Abhyankar } else { 15129566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1513e6580ceeSShri Abhyankar } 1514e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1515e6580ceeSShri Abhyankar } 1516e6580ceeSShri Abhyankar 15179566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 151826fbe8dcSKarl Rupp 1519e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1520e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1521e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 152226fbe8dcSKarl Rupp 15239566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); 1524e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1525e6580ceeSShri Abhyankar SSE_SCOPE_END; 1526e6580ceeSShri Abhyankar PetscFunctionReturn(0); 1527e6580ceeSShri Abhyankar } 1528e6580ceeSShri Abhyankar 15299371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info) { 1530e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1531e6580ceeSShri Abhyankar int i, j, n = a->mbs; 1532e6580ceeSShri Abhyankar unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj; 1533e6580ceeSShri Abhyankar unsigned int row; 1534e6580ceeSShri Abhyankar int *ajtmpold, nz, *bi = b->i; 1535e6580ceeSShri Abhyankar int *diag_offset = b->diag, *ai = a->i, *aj = a->j; 1536e6580ceeSShri Abhyankar MatScalar *pv, *v, *rtmp, *pc, *w, *x; 1537e6580ceeSShri Abhyankar MatScalar *ba = b->a, *aa = a->a; 1538e6580ceeSShri Abhyankar int nonzero = 0; 1539a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 1540182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 15410164db54SHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1542e6580ceeSShri Abhyankar 1543e6580ceeSShri Abhyankar PetscFunctionBegin; 15440164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 1545e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 1546e6580ceeSShri Abhyankar 1547aed4548fSBarry Smith PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 1548aed4548fSBarry Smith PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 15499566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 1550aed4548fSBarry Smith PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 1551e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 1552e6580ceeSShri Abhyankar /* colscale = 4; */ 1553e6580ceeSShri Abhyankar /* } */ 15549371c9d4SSatish Balay if ((unsigned long)bj == (unsigned long)aj) { return (MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); } 1555e6580ceeSShri Abhyankar 1556e6580ceeSShri Abhyankar for (i = 0; i < n; i++) { 1557e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 1558e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 1559e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 1560e6580ceeSShri Abhyankar /* zero out one register */ 1561e6580ceeSShri Abhyankar XOR_PS(XMM7, XMM7); 1562e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1563e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)bjtmp[j]); 1564e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 1565e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 1566e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 1567e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1568e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 1569e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 1570e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 1571e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 1572e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 1573e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 1574e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1575e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 1576e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1577e6580ceeSShri Abhyankar } 1578e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 1579e6580ceeSShri Abhyankar nz = ai[i + 1] - ai[i]; 1580e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 1581e6580ceeSShri Abhyankar v = aa + 16 * ai[i]; 1582e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1583e6580ceeSShri Abhyankar x = rtmp + 16 * ajtmpold[j]; 1584e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmpold[j]; */ 1585e6580ceeSShri Abhyankar /* Copy v block into x block */ 1586e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v, x) 1587e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1588e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1589e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 1590e6580ceeSShri Abhyankar 1591e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 1592e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 1593e6580ceeSShri Abhyankar 1594e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 1595e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 1596e6580ceeSShri Abhyankar 1597e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 1598e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 1599e6580ceeSShri Abhyankar 1600e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 1601e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 1602e6580ceeSShri Abhyankar 1603e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 1604e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 1605e6580ceeSShri Abhyankar 1606e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 1607e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 1608e6580ceeSShri Abhyankar 1609e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1610e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1611e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1612e6580ceeSShri Abhyankar 1613e6580ceeSShri Abhyankar v += 16; 1614e6580ceeSShri Abhyankar } 1615e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1616e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1617e6580ceeSShri Abhyankar while (row < i) { 1618e6580ceeSShri Abhyankar pc = rtmp + 16 * row; 1619e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 1620e6580ceeSShri Abhyankar /* Load block from lower triangle */ 1621e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1622e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1623e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 1624e6580ceeSShri Abhyankar 1625e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 1626e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 1627e6580ceeSShri Abhyankar 1628e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 1629e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 1630e6580ceeSShri Abhyankar 1631e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 1632e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 1633e6580ceeSShri Abhyankar 1634e6580ceeSShri Abhyankar /* Compare block to zero block */ 1635e6580ceeSShri Abhyankar 1636e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4, XMM7) 1637e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4, XMM0) 1638e6580ceeSShri Abhyankar 1639e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5, XMM7) 1640e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5, XMM1) 1641e6580ceeSShri Abhyankar 1642e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6, XMM7) 1643e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6, XMM2) 1644e6580ceeSShri Abhyankar 1645e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7, XMM3) 1646e6580ceeSShri Abhyankar 1647e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 1648e6580ceeSShri Abhyankar SSE_OR_PS(XMM6, XMM7) 1649e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM4) 1650e6580ceeSShri Abhyankar SSE_OR_PS(XMM5, XMM6) 1651e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1652e6580ceeSShri Abhyankar 1653e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 1654e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1655e6580ceeSShri Abhyankar MOVEMASK(nonzero, XMM5); 1656e6580ceeSShri Abhyankar 1657e6580ceeSShri Abhyankar /* If block is nonzero ... */ 1658e6580ceeSShri Abhyankar if (nonzero) { 1659e6580ceeSShri Abhyankar pv = ba + 16 * diag_offset[row]; 1660e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1661e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 1662e6580ceeSShri Abhyankar 1663e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1664e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1665e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 1666e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1667e6580ceeSShri Abhyankar 1668e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv, pc) 1669e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 1670e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 1671e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1672e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM0) 1673e6580ceeSShri Abhyankar 1674e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 1675e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1676e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM1) 1677e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM5) 1678e6580ceeSShri Abhyankar 1679e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 1680e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1681e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM2) 1682e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM6) 1683e6580ceeSShri Abhyankar 1684e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 1685e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1686e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1687e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4, XMM7) 1688e6580ceeSShri Abhyankar 1689e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 1690e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 1691e6580ceeSShri Abhyankar 1692e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 1693e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 1694e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1695e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1696e6580ceeSShri Abhyankar 1697e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 1698e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1699e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1700e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 1701e6580ceeSShri Abhyankar 1702e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 1703e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1704e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1705e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM7) 1706e6580ceeSShri Abhyankar 1707e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 1708e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1709e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 1710e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5, XMM6) 1711e6580ceeSShri Abhyankar 1712e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 1713e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 1714e6580ceeSShri Abhyankar 1715e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 1716e6580ceeSShri Abhyankar 1717e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 1718e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 1719e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1720e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 1721e6580ceeSShri Abhyankar 1722e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 1723e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1724e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 1725e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1726e6580ceeSShri Abhyankar 1727e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 1728e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1729e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1730e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1731e6580ceeSShri Abhyankar 1732e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 1733e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1734e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1735e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6, XMM7) 1736e6580ceeSShri Abhyankar 1737e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 1738e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1739e6580ceeSShri Abhyankar 1740e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1741e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 1742e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 1743e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1744e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0, XMM7) 1745e6580ceeSShri Abhyankar 1746e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 1747e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1748e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM7) 1749e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 1750e6580ceeSShri Abhyankar 1751e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 1752e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1, XMM1, 0x00) 1753e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1, XMM2) 1754e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM1) 1755e6580ceeSShri Abhyankar 1756e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 1757e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1758e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3, XMM7) 1759e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0, XMM3) 1760e6580ceeSShri Abhyankar 1761e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 1762e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1763e6580ceeSShri Abhyankar 1764e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1765e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans after all. */ 1766e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 1767e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3, XMM0) 1768e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 1769e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2, XMM6) 1770e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 1771e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1, XMM5) 1772e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 1773e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0, XMM4) 1774e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1775e6580ceeSShri Abhyankar 1776e6580ceeSShri Abhyankar /* Update the row: */ 1777e6580ceeSShri Abhyankar nz = bi[row + 1] - diag_offset[row] - 1; 1778e6580ceeSShri Abhyankar pv += 16; 1779e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1780e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1781e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)pj[j]); 1782e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 1783e6580ceeSShri Abhyankar 1784e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 1785e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1786e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 1787e6580ceeSShri Abhyankar /* Load First Column of X*/ 1788e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1789e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1790e6580ceeSShri Abhyankar 1791e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1792e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 1793e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1794e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1795e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1796e6580ceeSShri Abhyankar 1797e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 1798e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1799e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1800e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 1801e6580ceeSShri Abhyankar 1802e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 1803e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1804e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1805e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 1806e6580ceeSShri Abhyankar 1807e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 1808e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1809e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 1810e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1811e6580ceeSShri Abhyankar 1812e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1813e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1814e6580ceeSShri Abhyankar 1815e6580ceeSShri Abhyankar /* Second Column */ 1816e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1817e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1818e6580ceeSShri Abhyankar 1819e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1820e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 1821e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1822e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM0) 1823e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 1824e6580ceeSShri Abhyankar 1825e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 1826e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1827e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM1) 1828e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM7) 1829e6580ceeSShri Abhyankar 1830e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 1831e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1832e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM2) 1833e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM4) 1834e6580ceeSShri Abhyankar 1835e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 1836e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1837e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM3) 1838e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5, XMM6) 1839e6580ceeSShri Abhyankar 1840e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1841e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1842e6580ceeSShri Abhyankar 1843e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 1844e6580ceeSShri Abhyankar 1845e6580ceeSShri Abhyankar /* Third Column */ 1846e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1847e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1848e6580ceeSShri Abhyankar 1849e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1850e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 1851e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1852e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM0) 1853e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 1854e6580ceeSShri Abhyankar 1855e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 1856e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4, XMM4, 0x00) 1857e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4, XMM1) 1858e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM4) 1859e6580ceeSShri Abhyankar 1860e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 1861e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1862e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM2) 1863e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM5) 1864e6580ceeSShri Abhyankar 1865e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 1866e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1867e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM3) 1868e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6, XMM7) 1869e6580ceeSShri Abhyankar 1870e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1871e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1872e6580ceeSShri Abhyankar 1873e6580ceeSShri Abhyankar /* Fourth Column */ 1874e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1875e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1876e6580ceeSShri Abhyankar 1877e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1878e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 1879e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1880e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM0) 1881e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1882e6580ceeSShri Abhyankar 1883e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1884e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6, XMM6, 0x00) 1885e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6, XMM1) 1886e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM6) 1887e6580ceeSShri Abhyankar 1888e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1889e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7, XMM7, 0x00) 1890e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7, XMM2) 1891e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM7) 1892e6580ceeSShri Abhyankar 1893e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1894e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5, XMM5, 0x00) 1895e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5, XMM3) 1896e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4, XMM5) 1897e6580ceeSShri Abhyankar 1898e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1899e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1900e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1901e6580ceeSShri Abhyankar pv += 16; 1902e6580ceeSShri Abhyankar } 19039566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1904e6580ceeSShri Abhyankar } 1905e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1906e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1907e6580ceeSShri Abhyankar /* bjtmp++; */ 1908e6580ceeSShri Abhyankar } 1909e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1910e6580ceeSShri Abhyankar pv = ba + 16 * bi[i]; 1911e6580ceeSShri Abhyankar pj = bj + bi[i]; 1912e6580ceeSShri Abhyankar nz = bi[i + 1] - bi[i]; 1913e6580ceeSShri Abhyankar 1914e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1915e6580ceeSShri Abhyankar for (j = 0; j < nz; j++) { 1916e6580ceeSShri Abhyankar x = rtmp + 16 * ((unsigned int)pj[j]); 1917e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1918e6580ceeSShri Abhyankar 1919e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x, pv) 1920e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1921e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1922e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1923e6580ceeSShri Abhyankar 1924e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1925e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1926e6580ceeSShri Abhyankar 1927e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1928e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1929e6580ceeSShri Abhyankar 1930e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1931e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1932e6580ceeSShri Abhyankar 1933e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1934e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1935e6580ceeSShri Abhyankar 1936e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1937e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1938e6580ceeSShri Abhyankar 1939e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1940e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1941e6580ceeSShri Abhyankar 1942e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1943e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1944e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1945e6580ceeSShri Abhyankar pv += 16; 1946e6580ceeSShri Abhyankar } 1947e6580ceeSShri Abhyankar /* invert diagonal block */ 1948e6580ceeSShri Abhyankar w = ba + 16 * diag_offset[i]; 1949e6580ceeSShri Abhyankar if (pivotinblocks) { 19509566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, , &zeropivotdetected)); 1951603e8f96SBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1952e6580ceeSShri Abhyankar } else { 19539566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1954e6580ceeSShri Abhyankar } 19559566063dSJacob Faibussowitsch /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */ 1956e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1957e6580ceeSShri Abhyankar } 1958e6580ceeSShri Abhyankar 19599566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 196026fbe8dcSKarl Rupp 1961e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1962e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1963e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 196426fbe8dcSKarl Rupp 19659566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); 1966e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1967e6580ceeSShri Abhyankar SSE_SCOPE_END; 1968e6580ceeSShri Abhyankar PetscFunctionReturn(0); 1969e6580ceeSShri Abhyankar } 1970e6580ceeSShri Abhyankar 1971e6580ceeSShri Abhyankar #endif 1972