xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision 13bcc0bde560b8b9ee0f6de38aa9e5134a784726)
1be1d678aSKris Buschelman 
283287d42SBarry Smith /*
383287d42SBarry Smith     Factorization code for BAIJ format.
483287d42SBarry Smith */
5c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
6af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
783287d42SBarry Smith 
883287d42SBarry Smith /*
983287d42SBarry Smith       Version for when blocks are 4 by 4
1083287d42SBarry Smith */
11d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
12d71ae5a4SJacob Faibussowitsch {
1383287d42SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1483287d42SBarry Smith   IS              isrow = b->row, isicol = b->icol;
155d0c19d7SBarry Smith   const PetscInt *r, *ic;
165d0c19d7SBarry Smith   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
17690b6cddSBarry Smith   PetscInt       *ajtmpold, *ajtmp, nz, row;
18690b6cddSBarry Smith   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
1983287d42SBarry Smith   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
2083287d42SBarry Smith   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
2183287d42SBarry Smith   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
2283287d42SBarry Smith   MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
2383287d42SBarry Smith   MatScalar       m13, m14, m15, m16;
2483287d42SBarry Smith   MatScalar      *ba = b->a, *aa = a->a;
25a455e926SHong Zhang   PetscBool       pivotinblocks = b->pivotinblocks;
26182b8fbaSHong Zhang   PetscReal       shift         = info->shiftamount;
270164db54SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
2883287d42SBarry Smith 
2983287d42SBarry Smith   PetscFunctionBegin;
309566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
319566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
329566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
330164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
3483287d42SBarry Smith 
3583287d42SBarry Smith   for (i = 0; i < n; i++) {
3683287d42SBarry Smith     nz    = bi[i + 1] - bi[i];
3783287d42SBarry Smith     ajtmp = bj + bi[i];
3883287d42SBarry Smith     for (j = 0; j < nz; j++) {
3983287d42SBarry Smith       x    = rtmp + 16 * ajtmp[j];
4083287d42SBarry Smith       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
4183287d42SBarry Smith       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
4283287d42SBarry Smith     }
4383287d42SBarry Smith     /* load in initial (unfactored row) */
4483287d42SBarry Smith     idx      = r[i];
4583287d42SBarry Smith     nz       = ai[idx + 1] - ai[idx];
4683287d42SBarry Smith     ajtmpold = aj + ai[idx];
4783287d42SBarry Smith     v        = aa + 16 * ai[idx];
4883287d42SBarry Smith     for (j = 0; j < nz; j++) {
4983287d42SBarry Smith       x     = rtmp + 16 * ic[ajtmpold[j]];
509371c9d4SSatish Balay       x[0]  = v[0];
519371c9d4SSatish Balay       x[1]  = v[1];
529371c9d4SSatish Balay       x[2]  = v[2];
539371c9d4SSatish Balay       x[3]  = v[3];
549371c9d4SSatish Balay       x[4]  = v[4];
559371c9d4SSatish Balay       x[5]  = v[5];
569371c9d4SSatish Balay       x[6]  = v[6];
579371c9d4SSatish Balay       x[7]  = v[7];
589371c9d4SSatish Balay       x[8]  = v[8];
599371c9d4SSatish Balay       x[9]  = v[9];
609371c9d4SSatish Balay       x[10] = v[10];
619371c9d4SSatish Balay       x[11] = v[11];
629371c9d4SSatish Balay       x[12] = v[12];
639371c9d4SSatish Balay       x[13] = v[13];
649371c9d4SSatish Balay       x[14] = v[14];
659371c9d4SSatish Balay       x[15] = v[15];
6683287d42SBarry Smith       v += 16;
6783287d42SBarry Smith     }
6883287d42SBarry Smith     row = *ajtmp++;
6983287d42SBarry Smith     while (row < i) {
7083287d42SBarry Smith       pc  = rtmp + 16 * row;
719371c9d4SSatish Balay       p1  = pc[0];
729371c9d4SSatish Balay       p2  = pc[1];
739371c9d4SSatish Balay       p3  = pc[2];
749371c9d4SSatish Balay       p4  = pc[3];
759371c9d4SSatish Balay       p5  = pc[4];
769371c9d4SSatish Balay       p6  = pc[5];
779371c9d4SSatish Balay       p7  = pc[6];
789371c9d4SSatish Balay       p8  = pc[7];
799371c9d4SSatish Balay       p9  = pc[8];
809371c9d4SSatish Balay       p10 = pc[9];
819371c9d4SSatish Balay       p11 = pc[10];
829371c9d4SSatish Balay       p12 = pc[11];
839371c9d4SSatish Balay       p13 = pc[12];
849371c9d4SSatish Balay       p14 = pc[13];
859371c9d4SSatish Balay       p15 = pc[14];
869371c9d4SSatish Balay       p16 = pc[15];
879371c9d4SSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
8883287d42SBarry Smith         pv    = ba + 16 * diag_offset[row];
8983287d42SBarry Smith         pj    = bj + diag_offset[row] + 1;
909371c9d4SSatish Balay         x1    = pv[0];
919371c9d4SSatish Balay         x2    = pv[1];
929371c9d4SSatish Balay         x3    = pv[2];
939371c9d4SSatish Balay         x4    = pv[3];
949371c9d4SSatish Balay         x5    = pv[4];
959371c9d4SSatish Balay         x6    = pv[5];
969371c9d4SSatish Balay         x7    = pv[6];
979371c9d4SSatish Balay         x8    = pv[7];
989371c9d4SSatish Balay         x9    = pv[8];
999371c9d4SSatish Balay         x10   = pv[9];
1009371c9d4SSatish Balay         x11   = pv[10];
1019371c9d4SSatish Balay         x12   = pv[11];
1029371c9d4SSatish Balay         x13   = pv[12];
1039371c9d4SSatish Balay         x14   = pv[13];
1049371c9d4SSatish Balay         x15   = pv[14];
1059371c9d4SSatish Balay         x16   = pv[15];
10683287d42SBarry Smith         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
10783287d42SBarry Smith         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
10883287d42SBarry Smith         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
10983287d42SBarry Smith         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
11083287d42SBarry Smith 
11183287d42SBarry Smith         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
11283287d42SBarry Smith         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
11383287d42SBarry Smith         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
11483287d42SBarry Smith         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
11583287d42SBarry Smith 
11683287d42SBarry Smith         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
11783287d42SBarry Smith         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
11883287d42SBarry Smith         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
11983287d42SBarry Smith         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
12083287d42SBarry Smith 
12183287d42SBarry Smith         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
12283287d42SBarry Smith         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
12383287d42SBarry Smith         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
12483287d42SBarry Smith         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
12583287d42SBarry Smith 
12683287d42SBarry Smith         nz = bi[row + 1] - diag_offset[row] - 1;
12783287d42SBarry Smith         pv += 16;
12883287d42SBarry Smith         for (j = 0; j < nz; j++) {
1299371c9d4SSatish Balay           x1  = pv[0];
1309371c9d4SSatish Balay           x2  = pv[1];
1319371c9d4SSatish Balay           x3  = pv[2];
1329371c9d4SSatish Balay           x4  = pv[3];
1339371c9d4SSatish Balay           x5  = pv[4];
1349371c9d4SSatish Balay           x6  = pv[5];
1359371c9d4SSatish Balay           x7  = pv[6];
1369371c9d4SSatish Balay           x8  = pv[7];
1379371c9d4SSatish Balay           x9  = pv[8];
1389371c9d4SSatish Balay           x10 = pv[9];
1399371c9d4SSatish Balay           x11 = pv[10];
1409371c9d4SSatish Balay           x12 = pv[11];
1419371c9d4SSatish Balay           x13 = pv[12];
1429371c9d4SSatish Balay           x14 = pv[13];
1439371c9d4SSatish Balay           x15 = pv[14];
1449371c9d4SSatish Balay           x16 = pv[15];
14583287d42SBarry Smith           x   = rtmp + 16 * pj[j];
14683287d42SBarry Smith           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
14783287d42SBarry Smith           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
14883287d42SBarry Smith           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
14983287d42SBarry Smith           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
15083287d42SBarry Smith 
15183287d42SBarry Smith           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
15283287d42SBarry Smith           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
15383287d42SBarry Smith           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
15483287d42SBarry Smith           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
15583287d42SBarry Smith 
15683287d42SBarry Smith           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
15783287d42SBarry Smith           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
15883287d42SBarry Smith           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
15983287d42SBarry Smith           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
16083287d42SBarry Smith 
16183287d42SBarry Smith           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
16283287d42SBarry Smith           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
16383287d42SBarry Smith           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
16483287d42SBarry Smith           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
16583287d42SBarry Smith 
16683287d42SBarry Smith           pv += 16;
16783287d42SBarry Smith         }
1689566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
16983287d42SBarry Smith       }
17083287d42SBarry Smith       row = *ajtmp++;
17183287d42SBarry Smith     }
17283287d42SBarry Smith     /* finished row so stick it into b->a */
17383287d42SBarry Smith     pv = ba + 16 * bi[i];
17483287d42SBarry Smith     pj = bj + bi[i];
17583287d42SBarry Smith     nz = bi[i + 1] - bi[i];
17683287d42SBarry Smith     for (j = 0; j < nz; j++) {
17783287d42SBarry Smith       x      = rtmp + 16 * pj[j];
1789371c9d4SSatish Balay       pv[0]  = x[0];
1799371c9d4SSatish Balay       pv[1]  = x[1];
1809371c9d4SSatish Balay       pv[2]  = x[2];
1819371c9d4SSatish Balay       pv[3]  = x[3];
1829371c9d4SSatish Balay       pv[4]  = x[4];
1839371c9d4SSatish Balay       pv[5]  = x[5];
1849371c9d4SSatish Balay       pv[6]  = x[6];
1859371c9d4SSatish Balay       pv[7]  = x[7];
1869371c9d4SSatish Balay       pv[8]  = x[8];
1879371c9d4SSatish Balay       pv[9]  = x[9];
1889371c9d4SSatish Balay       pv[10] = x[10];
1899371c9d4SSatish Balay       pv[11] = x[11];
1909371c9d4SSatish Balay       pv[12] = x[12];
1919371c9d4SSatish Balay       pv[13] = x[13];
1929371c9d4SSatish Balay       pv[14] = x[14];
1939371c9d4SSatish Balay       pv[15] = x[15];
19483287d42SBarry Smith       pv += 16;
19583287d42SBarry Smith     }
19683287d42SBarry Smith     /* invert diagonal block */
19783287d42SBarry Smith     w = ba + 16 * diag_offset[i];
198bcd9e38bSBarry Smith     if (pivotinblocks) {
1999566063dSJacob Faibussowitsch       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
2007b6c816cSBarry Smith       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
201bcd9e38bSBarry Smith     } else {
2029566063dSJacob Faibussowitsch       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
203bcd9e38bSBarry Smith     }
20483287d42SBarry Smith   }
20583287d42SBarry Smith 
2069566063dSJacob Faibussowitsch   PetscCall(PetscFree(rtmp));
2079566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
2089566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
20926fbe8dcSKarl Rupp 
21006e38f1dSHong Zhang   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
21106e38f1dSHong Zhang   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
21283287d42SBarry Smith   C->assembled           = PETSC_TRUE;
21326fbe8dcSKarl Rupp 
2149566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
2153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21683287d42SBarry Smith }
217e6580ceeSShri Abhyankar 
2184dd39f65SShri Abhyankar /* MatLUFactorNumeric_SeqBAIJ_4 -
2194dd39f65SShri Abhyankar      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
22096b95a6bSBarry Smith        PetscKernel_A_gets_A_times_B()
22196b95a6bSBarry Smith        PetscKernel_A_gets_A_minus_B_times_C()
22296b95a6bSBarry Smith        PetscKernel_A_gets_inverse_A()
223209027a4SShri Abhyankar */
224c0c7eb62SShri Abhyankar 
225d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
226d71ae5a4SJacob Faibussowitsch {
227209027a4SShri Abhyankar   Mat             C = B;
228209027a4SShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
229209027a4SShri Abhyankar   IS              isrow = b->row, isicol = b->icol;
2305a586d82SBarry Smith   const PetscInt *r, *ic;
231bbd65245SShri Abhyankar   PetscInt        i, j, k, nz, nzL, row;
232bbd65245SShri Abhyankar   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
233bbd65245SShri Abhyankar   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
234209027a4SShri Abhyankar   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
235bbd65245SShri Abhyankar   PetscInt        flg;
2366ba06ab7SHong Zhang   PetscReal       shift;
237a455e926SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected;
238209027a4SShri Abhyankar 
239209027a4SShri Abhyankar   PetscFunctionBegin;
2400164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
2419566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
2429566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
243209027a4SShri Abhyankar 
244*13bcc0bdSJacob Faibussowitsch   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
2456ba06ab7SHong Zhang     shift = 0;
2466ba06ab7SHong Zhang   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
2476ba06ab7SHong Zhang     shift = info->shiftamount;
2486ba06ab7SHong Zhang   }
2496ba06ab7SHong Zhang 
250209027a4SShri Abhyankar   /* generate work space needed by the factorization */
2519566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
2529566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(rtmp, bs2 * n));
253209027a4SShri Abhyankar 
254209027a4SShri Abhyankar   for (i = 0; i < n; i++) {
255209027a4SShri Abhyankar     /* zero rtmp */
256209027a4SShri Abhyankar     /* L part */
257209027a4SShri Abhyankar     nz    = bi[i + 1] - bi[i];
258209027a4SShri Abhyankar     bjtmp = bj + bi[i];
25948a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
260209027a4SShri Abhyankar 
261209027a4SShri Abhyankar     /* U part */
26278bb4007SShri Abhyankar     nz    = bdiag[i] - bdiag[i + 1];
26378bb4007SShri Abhyankar     bjtmp = bj + bdiag[i + 1] + 1;
26448a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
26578bb4007SShri Abhyankar 
26678bb4007SShri Abhyankar     /* load in initial (unfactored row) */
26778bb4007SShri Abhyankar     nz    = ai[r[i] + 1] - ai[r[i]];
26878bb4007SShri Abhyankar     ajtmp = aj + ai[r[i]];
26978bb4007SShri Abhyankar     v     = aa + bs2 * ai[r[i]];
27048a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
27178bb4007SShri Abhyankar 
27278bb4007SShri Abhyankar     /* elimination */
27378bb4007SShri Abhyankar     bjtmp = bj + bi[i];
27478bb4007SShri Abhyankar     nzL   = bi[i + 1] - bi[i];
27578bb4007SShri Abhyankar     for (k = 0; k < nzL; k++) {
27678bb4007SShri Abhyankar       row = bjtmp[k];
27778bb4007SShri Abhyankar       pc  = rtmp + bs2 * row;
278c35f09e5SBarry Smith       for (flg = 0, j = 0; j < bs2; j++) {
279c35f09e5SBarry Smith         if (pc[j] != 0.0) {
280c35f09e5SBarry Smith           flg = 1;
281c35f09e5SBarry Smith           break;
282c35f09e5SBarry Smith         }
283c35f09e5SBarry Smith       }
28478bb4007SShri Abhyankar       if (flg) {
28578bb4007SShri Abhyankar         pv = b->a + bs2 * bdiag[row];
28696b95a6bSBarry Smith         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
2879566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
28878bb4007SShri Abhyankar 
289a5b23f4aSJose E. Roman         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
29078bb4007SShri Abhyankar         pv = b->a + bs2 * (bdiag[row + 1] + 1);
29178bb4007SShri Abhyankar         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
29278bb4007SShri Abhyankar         for (j = 0; j < nz; j++) {
29396b95a6bSBarry Smith           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
29478bb4007SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
29578bb4007SShri Abhyankar           v = rtmp + bs2 * pj[j];
2969566063dSJacob Faibussowitsch           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
29778bb4007SShri Abhyankar           pv += bs2;
29878bb4007SShri Abhyankar         }
2999566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
30078bb4007SShri Abhyankar       }
30178bb4007SShri Abhyankar     }
30278bb4007SShri Abhyankar 
30378bb4007SShri Abhyankar     /* finished row so stick it into b->a */
30478bb4007SShri Abhyankar     /* L part */
30578bb4007SShri Abhyankar     pv = b->a + bs2 * bi[i];
30678bb4007SShri Abhyankar     pj = b->j + bi[i];
30778bb4007SShri Abhyankar     nz = bi[i + 1] - bi[i];
30848a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
30978bb4007SShri Abhyankar 
310a5b23f4aSJose E. Roman     /* Mark diagonal and invert diagonal for simpler triangular solves */
31178bb4007SShri Abhyankar     pv = b->a + bs2 * bdiag[i];
31278bb4007SShri Abhyankar     pj = b->j + bdiag[i];
3139566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
3149566063dSJacob Faibussowitsch     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
3157b6c816cSBarry Smith     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
31678bb4007SShri Abhyankar 
31778bb4007SShri Abhyankar     /* U part */
31878bb4007SShri Abhyankar     pv = b->a + bs2 * (bdiag[i + 1] + 1);
31978bb4007SShri Abhyankar     pj = b->j + bdiag[i + 1] + 1;
32078bb4007SShri Abhyankar     nz = bdiag[i] - bdiag[i + 1] - 1;
32148a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
32278bb4007SShri Abhyankar   }
32378bb4007SShri Abhyankar 
3249566063dSJacob Faibussowitsch   PetscCall(PetscFree2(rtmp, mwork));
3259566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
3269566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
32726fbe8dcSKarl Rupp 
3284dd39f65SShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4;
3294dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
33078bb4007SShri Abhyankar   C->assembled           = PETSC_TRUE;
33126fbe8dcSKarl Rupp 
3329566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
3333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
33478bb4007SShri Abhyankar }
33578bb4007SShri Abhyankar 
336d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
337d71ae5a4SJacob Faibussowitsch {
338e6580ceeSShri Abhyankar   /*
339e6580ceeSShri Abhyankar     Default Version for when blocks are 4 by 4 Using natural ordering
340e6580ceeSShri Abhyankar */
341e6580ceeSShri Abhyankar   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
342e6580ceeSShri Abhyankar   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
343e6580ceeSShri Abhyankar   PetscInt    *ajtmpold, *ajtmp, nz, row;
344e6580ceeSShri Abhyankar   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
345e6580ceeSShri Abhyankar   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
346e6580ceeSShri Abhyankar   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
347e6580ceeSShri Abhyankar   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
348e6580ceeSShri Abhyankar   MatScalar    p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
349e6580ceeSShri Abhyankar   MatScalar    m13, m14, m15, m16;
350e6580ceeSShri Abhyankar   MatScalar   *ba = b->a, *aa = a->a;
351a455e926SHong Zhang   PetscBool    pivotinblocks = b->pivotinblocks;
352182b8fbaSHong Zhang   PetscReal    shift         = info->shiftamount;
3530164db54SHong Zhang   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;
354e6580ceeSShri Abhyankar 
355e6580ceeSShri Abhyankar   PetscFunctionBegin;
3560164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
3579566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
358e6580ceeSShri Abhyankar 
359e6580ceeSShri Abhyankar   for (i = 0; i < n; i++) {
360e6580ceeSShri Abhyankar     nz    = bi[i + 1] - bi[i];
361e6580ceeSShri Abhyankar     ajtmp = bj + bi[i];
362e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
363e6580ceeSShri Abhyankar       x    = rtmp + 16 * ajtmp[j];
364e6580ceeSShri Abhyankar       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
365e6580ceeSShri Abhyankar       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
366e6580ceeSShri Abhyankar     }
367e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
368e6580ceeSShri Abhyankar     nz       = ai[i + 1] - ai[i];
369e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
370e6580ceeSShri Abhyankar     v        = aa + 16 * ai[i];
371e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
372e6580ceeSShri Abhyankar       x     = rtmp + 16 * ajtmpold[j];
3739371c9d4SSatish Balay       x[0]  = v[0];
3749371c9d4SSatish Balay       x[1]  = v[1];
3759371c9d4SSatish Balay       x[2]  = v[2];
3769371c9d4SSatish Balay       x[3]  = v[3];
3779371c9d4SSatish Balay       x[4]  = v[4];
3789371c9d4SSatish Balay       x[5]  = v[5];
3799371c9d4SSatish Balay       x[6]  = v[6];
3809371c9d4SSatish Balay       x[7]  = v[7];
3819371c9d4SSatish Balay       x[8]  = v[8];
3829371c9d4SSatish Balay       x[9]  = v[9];
3839371c9d4SSatish Balay       x[10] = v[10];
3849371c9d4SSatish Balay       x[11] = v[11];
3859371c9d4SSatish Balay       x[12] = v[12];
3869371c9d4SSatish Balay       x[13] = v[13];
3879371c9d4SSatish Balay       x[14] = v[14];
3889371c9d4SSatish Balay       x[15] = v[15];
389e6580ceeSShri Abhyankar       v += 16;
390e6580ceeSShri Abhyankar     }
391e6580ceeSShri Abhyankar     row = *ajtmp++;
392e6580ceeSShri Abhyankar     while (row < i) {
393e6580ceeSShri Abhyankar       pc  = rtmp + 16 * row;
3949371c9d4SSatish Balay       p1  = pc[0];
3959371c9d4SSatish Balay       p2  = pc[1];
3969371c9d4SSatish Balay       p3  = pc[2];
3979371c9d4SSatish Balay       p4  = pc[3];
3989371c9d4SSatish Balay       p5  = pc[4];
3999371c9d4SSatish Balay       p6  = pc[5];
4009371c9d4SSatish Balay       p7  = pc[6];
4019371c9d4SSatish Balay       p8  = pc[7];
4029371c9d4SSatish Balay       p9  = pc[8];
4039371c9d4SSatish Balay       p10 = pc[9];
4049371c9d4SSatish Balay       p11 = pc[10];
4059371c9d4SSatish Balay       p12 = pc[11];
4069371c9d4SSatish Balay       p13 = pc[12];
4079371c9d4SSatish Balay       p14 = pc[13];
4089371c9d4SSatish Balay       p15 = pc[14];
4099371c9d4SSatish Balay       p16 = pc[15];
4109371c9d4SSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
411e6580ceeSShri Abhyankar         pv    = ba + 16 * diag_offset[row];
412e6580ceeSShri Abhyankar         pj    = bj + diag_offset[row] + 1;
4139371c9d4SSatish Balay         x1    = pv[0];
4149371c9d4SSatish Balay         x2    = pv[1];
4159371c9d4SSatish Balay         x3    = pv[2];
4169371c9d4SSatish Balay         x4    = pv[3];
4179371c9d4SSatish Balay         x5    = pv[4];
4189371c9d4SSatish Balay         x6    = pv[5];
4199371c9d4SSatish Balay         x7    = pv[6];
4209371c9d4SSatish Balay         x8    = pv[7];
4219371c9d4SSatish Balay         x9    = pv[8];
4229371c9d4SSatish Balay         x10   = pv[9];
4239371c9d4SSatish Balay         x11   = pv[10];
4249371c9d4SSatish Balay         x12   = pv[11];
4259371c9d4SSatish Balay         x13   = pv[12];
4269371c9d4SSatish Balay         x14   = pv[13];
4279371c9d4SSatish Balay         x15   = pv[14];
4289371c9d4SSatish Balay         x16   = pv[15];
429e6580ceeSShri Abhyankar         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
430e6580ceeSShri Abhyankar         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
431e6580ceeSShri Abhyankar         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
432e6580ceeSShri Abhyankar         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
433e6580ceeSShri Abhyankar 
434e6580ceeSShri Abhyankar         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
435e6580ceeSShri Abhyankar         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
436e6580ceeSShri Abhyankar         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
437e6580ceeSShri Abhyankar         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
438e6580ceeSShri Abhyankar 
439e6580ceeSShri Abhyankar         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
440e6580ceeSShri Abhyankar         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
441e6580ceeSShri Abhyankar         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
442e6580ceeSShri Abhyankar         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
443e6580ceeSShri Abhyankar 
444e6580ceeSShri Abhyankar         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
445e6580ceeSShri Abhyankar         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
446e6580ceeSShri Abhyankar         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
447e6580ceeSShri Abhyankar         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
448e6580ceeSShri Abhyankar         nz           = bi[row + 1] - diag_offset[row] - 1;
449e6580ceeSShri Abhyankar         pv += 16;
450e6580ceeSShri Abhyankar         for (j = 0; j < nz; j++) {
4519371c9d4SSatish Balay           x1  = pv[0];
4529371c9d4SSatish Balay           x2  = pv[1];
4539371c9d4SSatish Balay           x3  = pv[2];
4549371c9d4SSatish Balay           x4  = pv[3];
4559371c9d4SSatish Balay           x5  = pv[4];
4569371c9d4SSatish Balay           x6  = pv[5];
4579371c9d4SSatish Balay           x7  = pv[6];
4589371c9d4SSatish Balay           x8  = pv[7];
4599371c9d4SSatish Balay           x9  = pv[8];
4609371c9d4SSatish Balay           x10 = pv[9];
4619371c9d4SSatish Balay           x11 = pv[10];
4629371c9d4SSatish Balay           x12 = pv[11];
4639371c9d4SSatish Balay           x13 = pv[12];
4649371c9d4SSatish Balay           x14 = pv[13];
4659371c9d4SSatish Balay           x15 = pv[14];
4669371c9d4SSatish Balay           x16 = pv[15];
467e6580ceeSShri Abhyankar           x   = rtmp + 16 * pj[j];
468e6580ceeSShri Abhyankar           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
469e6580ceeSShri Abhyankar           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
470e6580ceeSShri Abhyankar           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
471e6580ceeSShri Abhyankar           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
472e6580ceeSShri Abhyankar 
473e6580ceeSShri Abhyankar           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
474e6580ceeSShri Abhyankar           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
475e6580ceeSShri Abhyankar           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
476e6580ceeSShri Abhyankar           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
477e6580ceeSShri Abhyankar 
478e6580ceeSShri Abhyankar           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
479e6580ceeSShri Abhyankar           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
480e6580ceeSShri Abhyankar           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
481e6580ceeSShri Abhyankar           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
482e6580ceeSShri Abhyankar 
483e6580ceeSShri Abhyankar           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
484e6580ceeSShri Abhyankar           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
485e6580ceeSShri Abhyankar           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
486e6580ceeSShri Abhyankar           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
487e6580ceeSShri Abhyankar 
488e6580ceeSShri Abhyankar           pv += 16;
489e6580ceeSShri Abhyankar         }
4909566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
491e6580ceeSShri Abhyankar       }
492e6580ceeSShri Abhyankar       row = *ajtmp++;
493e6580ceeSShri Abhyankar     }
494e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
495e6580ceeSShri Abhyankar     pv = ba + 16 * bi[i];
496e6580ceeSShri Abhyankar     pj = bj + bi[i];
497e6580ceeSShri Abhyankar     nz = bi[i + 1] - bi[i];
498e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
499e6580ceeSShri Abhyankar       x      = rtmp + 16 * pj[j];
5009371c9d4SSatish Balay       pv[0]  = x[0];
5019371c9d4SSatish Balay       pv[1]  = x[1];
5029371c9d4SSatish Balay       pv[2]  = x[2];
5039371c9d4SSatish Balay       pv[3]  = x[3];
5049371c9d4SSatish Balay       pv[4]  = x[4];
5059371c9d4SSatish Balay       pv[5]  = x[5];
5069371c9d4SSatish Balay       pv[6]  = x[6];
5079371c9d4SSatish Balay       pv[7]  = x[7];
5089371c9d4SSatish Balay       pv[8]  = x[8];
5099371c9d4SSatish Balay       pv[9]  = x[9];
5109371c9d4SSatish Balay       pv[10] = x[10];
5119371c9d4SSatish Balay       pv[11] = x[11];
5129371c9d4SSatish Balay       pv[12] = x[12];
5139371c9d4SSatish Balay       pv[13] = x[13];
5149371c9d4SSatish Balay       pv[14] = x[14];
5159371c9d4SSatish Balay       pv[15] = x[15];
516e6580ceeSShri Abhyankar       pv += 16;
517e6580ceeSShri Abhyankar     }
518e6580ceeSShri Abhyankar     /* invert diagonal block */
519e6580ceeSShri Abhyankar     w = ba + 16 * diag_offset[i];
520e6580ceeSShri Abhyankar     if (pivotinblocks) {
5219566063dSJacob Faibussowitsch       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
5227b6c816cSBarry Smith       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
523e6580ceeSShri Abhyankar     } else {
5249566063dSJacob Faibussowitsch       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
525e6580ceeSShri Abhyankar     }
526e6580ceeSShri Abhyankar   }
527e6580ceeSShri Abhyankar 
5289566063dSJacob Faibussowitsch   PetscCall(PetscFree(rtmp));
52926fbe8dcSKarl Rupp 
53006e38f1dSHong Zhang   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
53106e38f1dSHong Zhang   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
532e6580ceeSShri Abhyankar   C->assembled           = PETSC_TRUE;
53326fbe8dcSKarl Rupp 
5349566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
5353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
536e6580ceeSShri Abhyankar }
537c0c7eb62SShri Abhyankar 
538209027a4SShri Abhyankar /*
5394dd39f65SShri Abhyankar   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
5404dd39f65SShri Abhyankar     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
541209027a4SShri Abhyankar */
542d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
543d71ae5a4SJacob Faibussowitsch {
544209027a4SShri Abhyankar   Mat             C = B;
545209027a4SShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
546bbd65245SShri Abhyankar   PetscInt        i, j, k, nz, nzL, row;
547bbd65245SShri Abhyankar   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
548bbd65245SShri Abhyankar   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
549209027a4SShri Abhyankar   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
550bbd65245SShri Abhyankar   PetscInt        flg;
5516ba06ab7SHong Zhang   PetscReal       shift;
552a455e926SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected;
553e6580ceeSShri Abhyankar 
554209027a4SShri Abhyankar   PetscFunctionBegin;
5550164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
5560164db54SHong Zhang 
557209027a4SShri Abhyankar   /* generate work space needed by the factorization */
5589566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
5599566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(rtmp, bs2 * n));
560209027a4SShri Abhyankar 
561*13bcc0bdSJacob Faibussowitsch   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
5626ba06ab7SHong Zhang     shift = 0;
5636ba06ab7SHong Zhang   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
5646ba06ab7SHong Zhang     shift = info->shiftamount;
5656ba06ab7SHong Zhang   }
5666ba06ab7SHong Zhang 
567209027a4SShri Abhyankar   for (i = 0; i < n; i++) {
568209027a4SShri Abhyankar     /* zero rtmp */
569209027a4SShri Abhyankar     /* L part */
570209027a4SShri Abhyankar     nz    = bi[i + 1] - bi[i];
571209027a4SShri Abhyankar     bjtmp = bj + bi[i];
57248a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
573209027a4SShri Abhyankar 
574209027a4SShri Abhyankar     /* U part */
575b2b2dd24SShri Abhyankar     nz    = bdiag[i] - bdiag[i + 1];
576b2b2dd24SShri Abhyankar     bjtmp = bj + bdiag[i + 1] + 1;
57748a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
578b2b2dd24SShri Abhyankar 
579b2b2dd24SShri Abhyankar     /* load in initial (unfactored row) */
580b2b2dd24SShri Abhyankar     nz    = ai[i + 1] - ai[i];
581b2b2dd24SShri Abhyankar     ajtmp = aj + ai[i];
582b2b2dd24SShri Abhyankar     v     = aa + bs2 * ai[i];
58348a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
584b2b2dd24SShri Abhyankar 
585b2b2dd24SShri Abhyankar     /* elimination */
586b2b2dd24SShri Abhyankar     bjtmp = bj + bi[i];
587b2b2dd24SShri Abhyankar     nzL   = bi[i + 1] - bi[i];
588b2b2dd24SShri Abhyankar     for (k = 0; k < nzL; k++) {
589b2b2dd24SShri Abhyankar       row = bjtmp[k];
590b2b2dd24SShri Abhyankar       pc  = rtmp + bs2 * row;
591c35f09e5SBarry Smith       for (flg = 0, j = 0; j < bs2; j++) {
592c35f09e5SBarry Smith         if (pc[j] != 0.0) {
593c35f09e5SBarry Smith           flg = 1;
594c35f09e5SBarry Smith           break;
595c35f09e5SBarry Smith         }
596c35f09e5SBarry Smith       }
597b2b2dd24SShri Abhyankar       if (flg) {
598b2b2dd24SShri Abhyankar         pv = b->a + bs2 * bdiag[row];
59996b95a6bSBarry Smith         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
6009566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
601b2b2dd24SShri Abhyankar 
602a5b23f4aSJose E. Roman         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
603b2b2dd24SShri Abhyankar         pv = b->a + bs2 * (bdiag[row + 1] + 1);
604b2b2dd24SShri Abhyankar         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
605b2b2dd24SShri Abhyankar         for (j = 0; j < nz; j++) {
60696b95a6bSBarry Smith           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
607b2b2dd24SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
608b2b2dd24SShri Abhyankar           v = rtmp + bs2 * pj[j];
6099566063dSJacob Faibussowitsch           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
610b2b2dd24SShri Abhyankar           pv += bs2;
611b2b2dd24SShri Abhyankar         }
6129566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
613b2b2dd24SShri Abhyankar       }
614b2b2dd24SShri Abhyankar     }
615b2b2dd24SShri Abhyankar 
616b2b2dd24SShri Abhyankar     /* finished row so stick it into b->a */
617b2b2dd24SShri Abhyankar     /* L part */
618b2b2dd24SShri Abhyankar     pv = b->a + bs2 * bi[i];
619b2b2dd24SShri Abhyankar     pj = b->j + bi[i];
620b2b2dd24SShri Abhyankar     nz = bi[i + 1] - bi[i];
62148a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
622b2b2dd24SShri Abhyankar 
623a5b23f4aSJose E. Roman     /* Mark diagonal and invert diagonal for simpler triangular solves */
624b2b2dd24SShri Abhyankar     pv = b->a + bs2 * bdiag[i];
625b2b2dd24SShri Abhyankar     pj = b->j + bdiag[i];
6269566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
6279566063dSJacob Faibussowitsch     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
6287b6c816cSBarry Smith     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
629b2b2dd24SShri Abhyankar 
630b2b2dd24SShri Abhyankar     /* U part */
631b2b2dd24SShri Abhyankar     pv = b->a + bs2 * (bdiag[i + 1] + 1);
632b2b2dd24SShri Abhyankar     pj = b->j + bdiag[i + 1] + 1;
633b2b2dd24SShri Abhyankar     nz = bdiag[i] - bdiag[i + 1] - 1;
63448a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
635b2b2dd24SShri Abhyankar   }
6369566063dSJacob Faibussowitsch   PetscCall(PetscFree2(rtmp, mwork));
63726fbe8dcSKarl Rupp 
6384dd39f65SShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
6394dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
640b2b2dd24SShri Abhyankar   C->assembled           = PETSC_TRUE;
64126fbe8dcSKarl Rupp 
6429566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
6433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
644b2b2dd24SShri Abhyankar }
645b2b2dd24SShri Abhyankar 
646e6580ceeSShri Abhyankar #if defined(PETSC_HAVE_SSE)
647e6580ceeSShri Abhyankar 
648e6580ceeSShri Abhyankar   #include PETSC_HAVE_SSE
649e6580ceeSShri Abhyankar 
650e6580ceeSShri Abhyankar /* SSE Version for when blocks are 4 by 4 Using natural ordering */
651d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const MatFactorInfo *info)
652d71ae5a4SJacob Faibussowitsch {
653e6580ceeSShri Abhyankar   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
654e6580ceeSShri Abhyankar   int          i, j, n = a->mbs;
655e6580ceeSShri Abhyankar   int         *bj = b->j, *bjtmp, *pj;
656e6580ceeSShri Abhyankar   int          row;
657e6580ceeSShri Abhyankar   int         *ajtmpold, nz, *bi = b->i;
658e6580ceeSShri Abhyankar   int         *diag_offset = b->diag, *ai = a->i, *aj = a->j;
659e6580ceeSShri Abhyankar   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
660e6580ceeSShri Abhyankar   MatScalar   *ba = b->a, *aa = a->a;
661e6580ceeSShri Abhyankar   int          nonzero       = 0;
662a455e926SHong Zhang   PetscBool    pivotinblocks = b->pivotinblocks;
663182b8fbaSHong Zhang   PetscReal    shift         = info->shiftamount;
6640164db54SHong Zhang   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;
665e6580ceeSShri Abhyankar 
666e6580ceeSShri Abhyankar   PetscFunctionBegin;
6670164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
668e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
669e6580ceeSShri Abhyankar 
670aed4548fSBarry Smith   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
671aed4548fSBarry Smith   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
6729566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
673aed4548fSBarry Smith   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
674e6580ceeSShri Abhyankar   /*    if ((unsigned long)bj==(unsigned long)aj) { */
675e6580ceeSShri Abhyankar   /*      colscale = 4; */
676e6580ceeSShri Abhyankar   /*    } */
677e6580ceeSShri Abhyankar   for (i = 0; i < n; i++) {
678e6580ceeSShri Abhyankar     nz    = bi[i + 1] - bi[i];
679e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
680e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
681e6580ceeSShri Abhyankar     /* zero out one register */
682e6580ceeSShri Abhyankar     XOR_PS(XMM7, XMM7);
683e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
684e6580ceeSShri Abhyankar       x = rtmp + 16 * bjtmp[j];
685e6580ceeSShri Abhyankar       /*        x = rtmp+4*bjtmp[j]; */
686e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
687e6580ceeSShri Abhyankar       /* Copy zero register to memory locations */
688e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
689e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
690e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
691e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
692e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
693e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
694e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
695e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
696e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
697e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
698e6580ceeSShri Abhyankar     }
699e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
700e6580ceeSShri Abhyankar     nz       = ai[i + 1] - ai[i];
701e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
702e6580ceeSShri Abhyankar     v        = aa + 16 * ai[i];
703e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
704e6580ceeSShri Abhyankar       x = rtmp + 16 * ajtmpold[j];
705e6580ceeSShri Abhyankar       /*        x = rtmp+colscale*ajtmpold[j]; */
706e6580ceeSShri Abhyankar       /* Copy v block into x block */
707e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v, x)
708e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
709e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
710e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
711e6580ceeSShri Abhyankar 
712e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
713e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
714e6580ceeSShri Abhyankar 
715e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
716e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
717e6580ceeSShri Abhyankar 
718e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
719e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
720e6580ceeSShri Abhyankar 
721e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
722e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
723e6580ceeSShri Abhyankar 
724e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
725e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
726e6580ceeSShri Abhyankar 
727e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
728e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
729e6580ceeSShri Abhyankar 
730e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
731e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
732e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
733e6580ceeSShri Abhyankar 
734e6580ceeSShri Abhyankar       v += 16;
735e6580ceeSShri Abhyankar     }
736e6580ceeSShri Abhyankar     /*      row = (*bjtmp++)/4; */
737e6580ceeSShri Abhyankar     row = *bjtmp++;
738e6580ceeSShri Abhyankar     while (row < i) {
739e6580ceeSShri Abhyankar       pc = rtmp + 16 * row;
740e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
741e6580ceeSShri Abhyankar       /* Load block from lower triangle */
742e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
743e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
744e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
745e6580ceeSShri Abhyankar 
746e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
747e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
748e6580ceeSShri Abhyankar 
749e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
750e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
751e6580ceeSShri Abhyankar 
752e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
753e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
754e6580ceeSShri Abhyankar 
755e6580ceeSShri Abhyankar       /* Compare block to zero block */
756e6580ceeSShri Abhyankar 
757e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM4, XMM7)
758e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM4, XMM0)
759e6580ceeSShri Abhyankar 
760e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM5, XMM7)
761e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM5, XMM1)
762e6580ceeSShri Abhyankar 
763e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM6, XMM7)
764e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM6, XMM2)
765e6580ceeSShri Abhyankar 
766e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM7, XMM3)
767e6580ceeSShri Abhyankar 
768e6580ceeSShri Abhyankar       /* Reduce the comparisons to one SSE register */
769e6580ceeSShri Abhyankar       SSE_OR_PS(XMM6, XMM7)
770e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5, XMM4)
771e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5, XMM6)
772e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
773e6580ceeSShri Abhyankar 
774e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
775e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
776e6580ceeSShri Abhyankar       MOVEMASK(nonzero, XMM5);
777e6580ceeSShri Abhyankar 
778e6580ceeSShri Abhyankar       /* If block is nonzero ... */
779e6580ceeSShri Abhyankar       if (nonzero) {
780e6580ceeSShri Abhyankar         pv = ba + 16 * diag_offset[row];
781e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
782e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
783e6580ceeSShri Abhyankar 
784e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
785e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
786e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
787e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
788e6580ceeSShri Abhyankar 
789e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv, pc)
790e6580ceeSShri Abhyankar         /* Column 0, product is accumulated in XMM4 */
791e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
792e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM4, XMM4, 0x00)
793e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM4, XMM0)
794e6580ceeSShri Abhyankar 
795e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
796e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5, XMM5, 0x00)
797e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5, XMM1)
798e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4, XMM5)
799e6580ceeSShri Abhyankar 
800e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
801e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
802e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM2)
803e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4, XMM6)
804e6580ceeSShri Abhyankar 
805e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
806e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
807e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM3)
808e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4, XMM7)
809e6580ceeSShri Abhyankar 
810e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
811e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
812e6580ceeSShri Abhyankar 
813e6580ceeSShri Abhyankar         /* Column 1, product is accumulated in XMM5 */
814e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
815e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5, XMM5, 0x00)
816e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5, XMM0)
817e6580ceeSShri Abhyankar 
818e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
819e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
820e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM1)
821e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5, XMM6)
822e6580ceeSShri Abhyankar 
823e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
824e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
825e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM2)
826e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5, XMM7)
827e6580ceeSShri Abhyankar 
828e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
829e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
830e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM3)
831e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5, XMM6)
832e6580ceeSShri Abhyankar 
833e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
834e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
835e6580ceeSShri Abhyankar 
836e6580ceeSShri Abhyankar         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
837e6580ceeSShri Abhyankar 
838e6580ceeSShri Abhyankar         /* Column 2, product is accumulated in XMM6 */
839e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
840e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
841e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM0)
842e6580ceeSShri Abhyankar 
843e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
844e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
845e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM1)
846e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6, XMM7)
847e6580ceeSShri Abhyankar 
848e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
849e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
850e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM2)
851e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6, XMM7)
852e6580ceeSShri Abhyankar 
853e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
854e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
855e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM3)
856e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6, XMM7)
857e6580ceeSShri Abhyankar 
858e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
859e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
860e6580ceeSShri Abhyankar 
861e6580ceeSShri Abhyankar         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
862e6580ceeSShri Abhyankar         /* Column 3, product is accumulated in XMM0 */
863e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
864e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
865e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM0, XMM7)
866e6580ceeSShri Abhyankar 
867e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
868e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
869e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1, XMM7)
870e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0, XMM1)
871e6580ceeSShri Abhyankar 
872e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
873e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM1, XMM1, 0x00)
874e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1, XMM2)
875e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0, XMM1)
876e6580ceeSShri Abhyankar 
877e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
878e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
879e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM3, XMM7)
880e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0, XMM3)
881e6580ceeSShri Abhyankar 
882e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
883e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
884e6580ceeSShri Abhyankar 
885e6580ceeSShri Abhyankar         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
886e6580ceeSShri Abhyankar         /* This is code to be maintained and read by humans after all. */
887e6580ceeSShri Abhyankar         /* Copy Multiplier Col 3 into XMM3 */
888e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM3, XMM0)
889e6580ceeSShri Abhyankar         /* Copy Multiplier Col 2 into XMM2 */
890e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM2, XMM6)
891e6580ceeSShri Abhyankar         /* Copy Multiplier Col 1 into XMM1 */
892e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM1, XMM5)
893e6580ceeSShri Abhyankar         /* Copy Multiplier Col 0 into XMM0 */
894e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM0, XMM4)
895e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
896e6580ceeSShri Abhyankar 
897e6580ceeSShri Abhyankar         /* Update the row: */
898e6580ceeSShri Abhyankar         nz = bi[row + 1] - diag_offset[row] - 1;
899e6580ceeSShri Abhyankar         pv += 16;
900e6580ceeSShri Abhyankar         for (j = 0; j < nz; j++) {
901e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
902e6580ceeSShri Abhyankar           x = rtmp + 16 * pj[j];
903e6580ceeSShri Abhyankar           /*            x = rtmp + 4*pj[j]; */
904e6580ceeSShri Abhyankar 
905e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
906e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
907e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x, pv)
908e6580ceeSShri Abhyankar           /* Load First Column of X*/
909e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
910e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
911e6580ceeSShri Abhyankar 
912e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
913e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
914e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
915e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM0)
916e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
917e6580ceeSShri Abhyankar 
918e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
919e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
920e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM1)
921e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM6)
922e6580ceeSShri Abhyankar 
923e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
924e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
925e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM2)
926e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM7)
927e6580ceeSShri Abhyankar 
928e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
929e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
930e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM3)
931e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
932e6580ceeSShri Abhyankar 
933e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
934e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
935e6580ceeSShri Abhyankar 
936e6580ceeSShri Abhyankar           /* Second Column */
937e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
938e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
939e6580ceeSShri Abhyankar 
940e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
941e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
942e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
943e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM0)
944e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM6)
945e6580ceeSShri Abhyankar 
946e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
947e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
948e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM1)
949e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM7)
950e6580ceeSShri Abhyankar 
951e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
952e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4, XMM4, 0x00)
953e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4, XMM2)
954e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM4)
955e6580ceeSShri Abhyankar 
956e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
957e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
958e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM3)
959e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM6)
960e6580ceeSShri Abhyankar 
961e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
962e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
963e6580ceeSShri Abhyankar 
964e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
965e6580ceeSShri Abhyankar 
966e6580ceeSShri Abhyankar           /* Third Column */
967e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
968e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
969e6580ceeSShri Abhyankar 
970e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
971e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
972e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
973e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM0)
974e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM7)
975e6580ceeSShri Abhyankar 
976e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
977e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4, XMM4, 0x00)
978e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4, XMM1)
979e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM4)
980e6580ceeSShri Abhyankar 
981e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
982e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
983e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM2)
984e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM5)
985e6580ceeSShri Abhyankar 
986e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
987e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
988e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM3)
989e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM7)
990e6580ceeSShri Abhyankar 
991e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
992e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
993e6580ceeSShri Abhyankar 
994e6580ceeSShri Abhyankar           /* Fourth Column */
995e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
996e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
997e6580ceeSShri Abhyankar 
998e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
999e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1000e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1001e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM0)
1002e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
1003e6580ceeSShri Abhyankar 
1004e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1005e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1006e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM1)
1007e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM6)
1008e6580ceeSShri Abhyankar 
1009e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1010e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1011e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM2)
1012e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM7)
1013e6580ceeSShri Abhyankar 
1014e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1015e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1016e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM3)
1017e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
1018e6580ceeSShri Abhyankar 
1019e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1020e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1021e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
1022e6580ceeSShri Abhyankar           pv += 16;
1023e6580ceeSShri Abhyankar         }
10249566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1025e6580ceeSShri Abhyankar       }
1026e6580ceeSShri Abhyankar       row = *bjtmp++;
1027e6580ceeSShri Abhyankar       /*        row = (*bjtmp++)/4; */
1028e6580ceeSShri Abhyankar     }
1029e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
1030e6580ceeSShri Abhyankar     pv = ba + 16 * bi[i];
1031e6580ceeSShri Abhyankar     pj = bj + bi[i];
1032e6580ceeSShri Abhyankar     nz = bi[i + 1] - bi[i];
1033e6580ceeSShri Abhyankar 
1034e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
1035e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
1036e6580ceeSShri Abhyankar       x = rtmp + 16 * pj[j];
1037e6580ceeSShri Abhyankar       /*        x  = rtmp+4*pj[j]; */
1038e6580ceeSShri Abhyankar 
1039e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x, pv)
1040e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1042e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1043e6580ceeSShri Abhyankar 
1044e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1045e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1046e6580ceeSShri Abhyankar 
1047e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1048e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1049e6580ceeSShri Abhyankar 
1050e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1051e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1052e6580ceeSShri Abhyankar 
1053e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1054e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1055e6580ceeSShri Abhyankar 
1056e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1057e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1058e6580ceeSShri Abhyankar 
1059e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1060e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1061e6580ceeSShri Abhyankar 
1062e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1063e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1064e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1065e6580ceeSShri Abhyankar       pv += 16;
1066e6580ceeSShri Abhyankar     }
1067e6580ceeSShri Abhyankar     /* invert diagonal block */
1068e6580ceeSShri Abhyankar     w = ba + 16 * diag_offset[i];
1069e6580ceeSShri Abhyankar     if (pivotinblocks) {
10709566063dSJacob Faibussowitsch       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1071603e8f96SBarry Smith       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1072e6580ceeSShri Abhyankar     } else {
10739566063dSJacob Faibussowitsch       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1074e6580ceeSShri Abhyankar     }
10759566063dSJacob Faibussowitsch     /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1076e6580ceeSShri Abhyankar     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1077e6580ceeSShri Abhyankar   }
1078e6580ceeSShri Abhyankar 
10799566063dSJacob Faibussowitsch   PetscCall(PetscFree(rtmp));
108026fbe8dcSKarl Rupp 
1081e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1082e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1083e6580ceeSShri Abhyankar   C->assembled           = PETSC_TRUE;
108426fbe8dcSKarl Rupp 
10859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs));
1086e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
1087e6580ceeSShri Abhyankar   SSE_SCOPE_END;
10883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1089e6580ceeSShri Abhyankar }
1090e6580ceeSShri Abhyankar 
1091d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
1092d71ae5a4SJacob Faibussowitsch {
1093e6580ceeSShri Abhyankar   Mat             A = C;
1094e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1095e6580ceeSShri Abhyankar   int             i, j, n = a->mbs;
1096e6580ceeSShri Abhyankar   unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1097e6580ceeSShri Abhyankar   unsigned short *aj = (unsigned short *)(a->j), *ajtmp;
1098e6580ceeSShri Abhyankar   unsigned int    row;
1099e6580ceeSShri Abhyankar   int             nz, *bi = b->i;
1100e6580ceeSShri Abhyankar   int            *diag_offset = b->diag, *ai = a->i;
1101e6580ceeSShri Abhyankar   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1102e6580ceeSShri Abhyankar   MatScalar      *ba = b->a, *aa = a->a;
1103e6580ceeSShri Abhyankar   int             nonzero       = 0;
1104a455e926SHong Zhang   PetscBool       pivotinblocks = b->pivotinblocks;
1105182b8fbaSHong Zhang   PetscReal       shift         = info->shiftamount;
11060164db54SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
1107e6580ceeSShri Abhyankar 
1108e6580ceeSShri Abhyankar   PetscFunctionBegin;
11090164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
1110e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
1111e6580ceeSShri Abhyankar 
1112aed4548fSBarry Smith   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1113aed4548fSBarry Smith   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
11149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1115aed4548fSBarry Smith   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1116e6580ceeSShri Abhyankar   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1117e6580ceeSShri Abhyankar   /*      colscale = 4; */
1118e6580ceeSShri Abhyankar   /*    } */
1119e6580ceeSShri Abhyankar 
1120e6580ceeSShri Abhyankar   for (i = 0; i < n; i++) {
1121e6580ceeSShri Abhyankar     nz    = bi[i + 1] - bi[i];
1122e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
1123e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
1124e6580ceeSShri Abhyankar     /* zero out one register */
1125e6580ceeSShri Abhyankar     XOR_PS(XMM7, XMM7);
1126e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
1127e6580ceeSShri Abhyankar       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1128e6580ceeSShri Abhyankar       /*        x = rtmp+4*bjtmp[j]; */
1129e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
1130e6580ceeSShri Abhyankar       /* Copy zero register to memory locations */
1131e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1132e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1133e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1134e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1135e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1136e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1137e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1138e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1139e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1140e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1141e6580ceeSShri Abhyankar     }
1142e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
1143e6580ceeSShri Abhyankar     nz    = ai[i + 1] - ai[i];
1144e6580ceeSShri Abhyankar     ajtmp = aj + ai[i];
1145e6580ceeSShri Abhyankar     v     = aa + 16 * ai[i];
1146e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
1147e6580ceeSShri Abhyankar       x = rtmp + 16 * ((unsigned int)ajtmp[j]);
1148e6580ceeSShri Abhyankar       /*        x = rtmp+colscale*ajtmp[j]; */
1149e6580ceeSShri Abhyankar       /* Copy v block into x block */
1150e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v, x)
1151e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1152e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1153e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1154e6580ceeSShri Abhyankar 
1155e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1156e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1157e6580ceeSShri Abhyankar 
1158e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1159e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1160e6580ceeSShri Abhyankar 
1161e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1162e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1163e6580ceeSShri Abhyankar 
1164e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1165e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1166e6580ceeSShri Abhyankar 
1167e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1168e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1169e6580ceeSShri Abhyankar 
1170e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1171e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1172e6580ceeSShri Abhyankar 
1173e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1174e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1175e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1176e6580ceeSShri Abhyankar 
1177e6580ceeSShri Abhyankar       v += 16;
1178e6580ceeSShri Abhyankar     }
1179e6580ceeSShri Abhyankar     /*      row = (*bjtmp++)/4; */
1180e6580ceeSShri Abhyankar     row = (unsigned int)(*bjtmp++);
1181e6580ceeSShri Abhyankar     while (row < i) {
1182e6580ceeSShri Abhyankar       pc = rtmp + 16 * row;
1183e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
1184e6580ceeSShri Abhyankar       /* Load block from lower triangle */
1185e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1186e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1187e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1188e6580ceeSShri Abhyankar 
1189e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1190e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1191e6580ceeSShri Abhyankar 
1192e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1193e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1194e6580ceeSShri Abhyankar 
1195e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1196e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1197e6580ceeSShri Abhyankar 
1198e6580ceeSShri Abhyankar       /* Compare block to zero block */
1199e6580ceeSShri Abhyankar 
1200e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM4, XMM7)
1201e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM4, XMM0)
1202e6580ceeSShri Abhyankar 
1203e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM5, XMM7)
1204e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM5, XMM1)
1205e6580ceeSShri Abhyankar 
1206e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM6, XMM7)
1207e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM6, XMM2)
1208e6580ceeSShri Abhyankar 
1209e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM7, XMM3)
1210e6580ceeSShri Abhyankar 
1211e6580ceeSShri Abhyankar       /* Reduce the comparisons to one SSE register */
1212e6580ceeSShri Abhyankar       SSE_OR_PS(XMM6, XMM7)
1213e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5, XMM4)
1214e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5, XMM6)
1215e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1216e6580ceeSShri Abhyankar 
1217e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
1218e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1219e6580ceeSShri Abhyankar       MOVEMASK(nonzero, XMM5);
1220e6580ceeSShri Abhyankar 
1221e6580ceeSShri Abhyankar       /* If block is nonzero ... */
1222e6580ceeSShri Abhyankar       if (nonzero) {
1223e6580ceeSShri Abhyankar         pv = ba + 16 * diag_offset[row];
1224e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
1225e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
1226e6580ceeSShri Abhyankar 
1227e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1228e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1229e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
1230e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1231e6580ceeSShri Abhyankar 
1232e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv, pc)
1233e6580ceeSShri Abhyankar         /* Column 0, product is accumulated in XMM4 */
1234e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1235e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1236e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM4, XMM0)
1237e6580ceeSShri Abhyankar 
1238e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1239e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1240e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5, XMM1)
1241e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4, XMM5)
1242e6580ceeSShri Abhyankar 
1243e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1244e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1245e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM2)
1246e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4, XMM6)
1247e6580ceeSShri Abhyankar 
1248e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1249e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1250e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM3)
1251e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4, XMM7)
1252e6580ceeSShri Abhyankar 
1253e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1254e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1255e6580ceeSShri Abhyankar 
1256e6580ceeSShri Abhyankar         /* Column 1, product is accumulated in XMM5 */
1257e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1258e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1259e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5, XMM0)
1260e6580ceeSShri Abhyankar 
1261e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1262e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1263e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM1)
1264e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5, XMM6)
1265e6580ceeSShri Abhyankar 
1266e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1267e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1268e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM2)
1269e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5, XMM7)
1270e6580ceeSShri Abhyankar 
1271e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1272e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1273e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM3)
1274e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5, XMM6)
1275e6580ceeSShri Abhyankar 
1276e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1277e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1278e6580ceeSShri Abhyankar 
1279e6580ceeSShri Abhyankar         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1280e6580ceeSShri Abhyankar 
1281e6580ceeSShri Abhyankar         /* Column 2, product is accumulated in XMM6 */
1282e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1283e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1284e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM0)
1285e6580ceeSShri Abhyankar 
1286e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1287e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1288e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM1)
1289e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6, XMM7)
1290e6580ceeSShri Abhyankar 
1291e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1292e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1293e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM2)
1294e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6, XMM7)
1295e6580ceeSShri Abhyankar 
1296e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1297e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1298e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM3)
1299e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6, XMM7)
1300e6580ceeSShri Abhyankar 
1301e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1302e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1303e6580ceeSShri Abhyankar 
1304e6580ceeSShri Abhyankar         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1305e6580ceeSShri Abhyankar         /* Column 3, product is accumulated in XMM0 */
1306e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1307e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1308e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM0, XMM7)
1309e6580ceeSShri Abhyankar 
1310e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1311e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1312e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1, XMM7)
1313e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0, XMM1)
1314e6580ceeSShri Abhyankar 
1315e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1316e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1317e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1, XMM2)
1318e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0, XMM1)
1319e6580ceeSShri Abhyankar 
1320e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1321e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1322e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM3, XMM7)
1323e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0, XMM3)
1324e6580ceeSShri Abhyankar 
1325e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1326e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1327e6580ceeSShri Abhyankar 
1328e6580ceeSShri Abhyankar         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1329e6580ceeSShri Abhyankar         /* This is code to be maintained and read by humans after all. */
1330e6580ceeSShri Abhyankar         /* Copy Multiplier Col 3 into XMM3 */
1331e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM3, XMM0)
1332e6580ceeSShri Abhyankar         /* Copy Multiplier Col 2 into XMM2 */
1333e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM2, XMM6)
1334e6580ceeSShri Abhyankar         /* Copy Multiplier Col 1 into XMM1 */
1335e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM1, XMM5)
1336e6580ceeSShri Abhyankar         /* Copy Multiplier Col 0 into XMM0 */
1337e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM0, XMM4)
1338e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
1339e6580ceeSShri Abhyankar 
1340e6580ceeSShri Abhyankar         /* Update the row: */
1341e6580ceeSShri Abhyankar         nz = bi[row + 1] - diag_offset[row] - 1;
1342e6580ceeSShri Abhyankar         pv += 16;
1343e6580ceeSShri Abhyankar         for (j = 0; j < nz; j++) {
1344e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
1345e6580ceeSShri Abhyankar           x = rtmp + 16 * ((unsigned int)pj[j]);
1346e6580ceeSShri Abhyankar           /*            x = rtmp + 4*pj[j]; */
1347e6580ceeSShri Abhyankar 
1348e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
1349e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1350e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x, pv)
1351e6580ceeSShri Abhyankar           /* Load First Column of X*/
1352e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1353e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1354e6580ceeSShri Abhyankar 
1355e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1356e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1357e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1358e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM0)
1359e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
1360e6580ceeSShri Abhyankar 
1361e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1362e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1363e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM1)
1364e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM6)
1365e6580ceeSShri Abhyankar 
1366e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1367e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1368e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM2)
1369e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM7)
1370e6580ceeSShri Abhyankar 
1371e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1372e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1373e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM3)
1374e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
1375e6580ceeSShri Abhyankar 
1376e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1377e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1378e6580ceeSShri Abhyankar 
1379e6580ceeSShri Abhyankar           /* Second Column */
1380e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1381e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1382e6580ceeSShri Abhyankar 
1383e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1384e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1385e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1386e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM0)
1387e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM6)
1388e6580ceeSShri Abhyankar 
1389e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1390e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1391e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM1)
1392e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM7)
1393e6580ceeSShri Abhyankar 
1394e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1395e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1396e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4, XMM2)
1397e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM4)
1398e6580ceeSShri Abhyankar 
1399e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1400e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1401e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM3)
1402e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM6)
1403e6580ceeSShri Abhyankar 
1404e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1405e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1406e6580ceeSShri Abhyankar 
1407e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1408e6580ceeSShri Abhyankar 
1409e6580ceeSShri Abhyankar           /* Third Column */
1410e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1411e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1412e6580ceeSShri Abhyankar 
1413e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1414e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1415e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1416e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM0)
1417e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM7)
1418e6580ceeSShri Abhyankar 
1419e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1420e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1421e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4, XMM1)
1422e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM4)
1423e6580ceeSShri Abhyankar 
1424e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1425e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1426e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM2)
1427e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM5)
1428e6580ceeSShri Abhyankar 
1429e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1430e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1431e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM3)
1432e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM7)
1433e6580ceeSShri Abhyankar 
1434e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1435e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1436e6580ceeSShri Abhyankar 
1437e6580ceeSShri Abhyankar           /* Fourth Column */
1438e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1439e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1440e6580ceeSShri Abhyankar 
1441e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1442e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1443e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1444e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM0)
1445e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
1446e6580ceeSShri Abhyankar 
1447e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1448e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1449e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM1)
1450e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM6)
1451e6580ceeSShri Abhyankar 
1452e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1453e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1454e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM2)
1455e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM7)
1456e6580ceeSShri Abhyankar 
1457e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1458e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1459e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM3)
1460e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
1461e6580ceeSShri Abhyankar 
1462e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1463e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1464e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
1465e6580ceeSShri Abhyankar           pv += 16;
1466e6580ceeSShri Abhyankar         }
14679566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1468e6580ceeSShri Abhyankar       }
1469e6580ceeSShri Abhyankar       row = (unsigned int)(*bjtmp++);
1470e6580ceeSShri Abhyankar       /*        row = (*bjtmp++)/4; */
1471e6580ceeSShri Abhyankar       /*        bjtmp++; */
1472e6580ceeSShri Abhyankar     }
1473e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
1474e6580ceeSShri Abhyankar     pv = ba + 16 * bi[i];
1475e6580ceeSShri Abhyankar     pj = bj + bi[i];
1476e6580ceeSShri Abhyankar     nz = bi[i + 1] - bi[i];
1477e6580ceeSShri Abhyankar 
1478e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
1479e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
1480e6580ceeSShri Abhyankar       x = rtmp + 16 * ((unsigned int)pj[j]);
1481e6580ceeSShri Abhyankar       /*        x  = rtmp+4*pj[j]; */
1482e6580ceeSShri Abhyankar 
1483e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x, pv)
1484e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1485e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1486e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1487e6580ceeSShri Abhyankar 
1488e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1489e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1490e6580ceeSShri Abhyankar 
1491e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1492e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1493e6580ceeSShri Abhyankar 
1494e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1495e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1496e6580ceeSShri Abhyankar 
1497e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1498e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1499e6580ceeSShri Abhyankar 
1500e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1501e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1502e6580ceeSShri Abhyankar 
1503e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1504e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1505e6580ceeSShri Abhyankar 
1506e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1507e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1508e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1509e6580ceeSShri Abhyankar       pv += 16;
1510e6580ceeSShri Abhyankar     }
1511e6580ceeSShri Abhyankar     /* invert diagonal block */
1512e6580ceeSShri Abhyankar     w = ba + 16 * diag_offset[i];
1513e6580ceeSShri Abhyankar     if (pivotinblocks) {
15149566063dSJacob Faibussowitsch       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1515603e8f96SBarry Smith       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1516e6580ceeSShri Abhyankar     } else {
15179566063dSJacob Faibussowitsch       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1518e6580ceeSShri Abhyankar     }
1519e6580ceeSShri Abhyankar     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1520e6580ceeSShri Abhyankar   }
1521e6580ceeSShri Abhyankar 
15229566063dSJacob Faibussowitsch   PetscCall(PetscFree(rtmp));
152326fbe8dcSKarl Rupp 
1524e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1525e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1526e6580ceeSShri Abhyankar   C->assembled           = PETSC_TRUE;
152726fbe8dcSKarl Rupp 
15289566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs));
1529e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
1530e6580ceeSShri Abhyankar   SSE_SCOPE_END;
15313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1532e6580ceeSShri Abhyankar }
1533e6580ceeSShri Abhyankar 
1534d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
1535d71ae5a4SJacob Faibussowitsch {
1536e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1537e6580ceeSShri Abhyankar   int             i, j, n = a->mbs;
1538e6580ceeSShri Abhyankar   unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1539e6580ceeSShri Abhyankar   unsigned int    row;
1540e6580ceeSShri Abhyankar   int            *ajtmpold, nz, *bi = b->i;
1541e6580ceeSShri Abhyankar   int            *diag_offset = b->diag, *ai = a->i, *aj = a->j;
1542e6580ceeSShri Abhyankar   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1543e6580ceeSShri Abhyankar   MatScalar      *ba = b->a, *aa = a->a;
1544e6580ceeSShri Abhyankar   int             nonzero       = 0;
1545a455e926SHong Zhang   PetscBool       pivotinblocks = b->pivotinblocks;
1546182b8fbaSHong Zhang   PetscReal       shift         = info->shiftamount;
15470164db54SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
1548e6580ceeSShri Abhyankar 
1549e6580ceeSShri Abhyankar   PetscFunctionBegin;
15500164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
1551e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
1552e6580ceeSShri Abhyankar 
1553aed4548fSBarry Smith   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1554aed4548fSBarry Smith   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
15559566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1556aed4548fSBarry Smith   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1557e6580ceeSShri Abhyankar   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1558e6580ceeSShri Abhyankar   /*      colscale = 4; */
1559e6580ceeSShri Abhyankar   /*    } */
1560ad540459SPierre Jolivet   if ((unsigned long)bj == (unsigned long)aj) return (MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1561e6580ceeSShri Abhyankar 
1562e6580ceeSShri Abhyankar   for (i = 0; i < n; i++) {
1563e6580ceeSShri Abhyankar     nz    = bi[i + 1] - bi[i];
1564e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
1565e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
1566e6580ceeSShri Abhyankar     /* zero out one register */
1567e6580ceeSShri Abhyankar     XOR_PS(XMM7, XMM7);
1568e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
1569e6580ceeSShri Abhyankar       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1570e6580ceeSShri Abhyankar       /*        x = rtmp+4*bjtmp[j]; */
1571e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
1572e6580ceeSShri Abhyankar       /* Copy zero register to memory locations */
1573e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1574e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1575e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1576e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1577e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1578e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1579e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1580e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1581e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1582e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1583e6580ceeSShri Abhyankar     }
1584e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
1585e6580ceeSShri Abhyankar     nz       = ai[i + 1] - ai[i];
1586e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
1587e6580ceeSShri Abhyankar     v        = aa + 16 * ai[i];
1588e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
1589e6580ceeSShri Abhyankar       x = rtmp + 16 * ajtmpold[j];
1590e6580ceeSShri Abhyankar       /*        x = rtmp+colscale*ajtmpold[j]; */
1591e6580ceeSShri Abhyankar       /* Copy v block into x block */
1592e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v, x)
1593e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1594e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1595e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1596e6580ceeSShri Abhyankar 
1597e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1598e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1599e6580ceeSShri Abhyankar 
1600e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1601e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1602e6580ceeSShri Abhyankar 
1603e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1604e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1605e6580ceeSShri Abhyankar 
1606e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1607e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1608e6580ceeSShri Abhyankar 
1609e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1610e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1611e6580ceeSShri Abhyankar 
1612e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1613e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1614e6580ceeSShri Abhyankar 
1615e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1616e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1617e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1618e6580ceeSShri Abhyankar 
1619e6580ceeSShri Abhyankar       v += 16;
1620e6580ceeSShri Abhyankar     }
1621e6580ceeSShri Abhyankar     /*      row = (*bjtmp++)/4; */
1622e6580ceeSShri Abhyankar     row = (unsigned int)(*bjtmp++);
1623e6580ceeSShri Abhyankar     while (row < i) {
1624e6580ceeSShri Abhyankar       pc = rtmp + 16 * row;
1625e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
1626e6580ceeSShri Abhyankar       /* Load block from lower triangle */
1627e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1628e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1629e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1630e6580ceeSShri Abhyankar 
1631e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1632e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1633e6580ceeSShri Abhyankar 
1634e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1635e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1636e6580ceeSShri Abhyankar 
1637e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1638e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1639e6580ceeSShri Abhyankar 
1640e6580ceeSShri Abhyankar       /* Compare block to zero block */
1641e6580ceeSShri Abhyankar 
1642e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM4, XMM7)
1643e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM4, XMM0)
1644e6580ceeSShri Abhyankar 
1645e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM5, XMM7)
1646e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM5, XMM1)
1647e6580ceeSShri Abhyankar 
1648e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM6, XMM7)
1649e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM6, XMM2)
1650e6580ceeSShri Abhyankar 
1651e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM7, XMM3)
1652e6580ceeSShri Abhyankar 
1653e6580ceeSShri Abhyankar       /* Reduce the comparisons to one SSE register */
1654e6580ceeSShri Abhyankar       SSE_OR_PS(XMM6, XMM7)
1655e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5, XMM4)
1656e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5, XMM6)
1657e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1658e6580ceeSShri Abhyankar 
1659e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
1660e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1661e6580ceeSShri Abhyankar       MOVEMASK(nonzero, XMM5);
1662e6580ceeSShri Abhyankar 
1663e6580ceeSShri Abhyankar       /* If block is nonzero ... */
1664e6580ceeSShri Abhyankar       if (nonzero) {
1665e6580ceeSShri Abhyankar         pv = ba + 16 * diag_offset[row];
1666e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
1667e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
1668e6580ceeSShri Abhyankar 
1669e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1670e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1671e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
1672e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1673e6580ceeSShri Abhyankar 
1674e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv, pc)
1675e6580ceeSShri Abhyankar         /* Column 0, product is accumulated in XMM4 */
1676e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1677e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1678e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM4, XMM0)
1679e6580ceeSShri Abhyankar 
1680e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1681e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1682e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5, XMM1)
1683e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4, XMM5)
1684e6580ceeSShri Abhyankar 
1685e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1686e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1687e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM2)
1688e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4, XMM6)
1689e6580ceeSShri Abhyankar 
1690e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1691e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1692e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM3)
1693e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4, XMM7)
1694e6580ceeSShri Abhyankar 
1695e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1696e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1697e6580ceeSShri Abhyankar 
1698e6580ceeSShri Abhyankar         /* Column 1, product is accumulated in XMM5 */
1699e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1700e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1701e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5, XMM0)
1702e6580ceeSShri Abhyankar 
1703e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1704e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1705e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM1)
1706e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5, XMM6)
1707e6580ceeSShri Abhyankar 
1708e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1709e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1710e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM2)
1711e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5, XMM7)
1712e6580ceeSShri Abhyankar 
1713e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1714e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1715e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM3)
1716e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5, XMM6)
1717e6580ceeSShri Abhyankar 
1718e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1719e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1720e6580ceeSShri Abhyankar 
1721e6580ceeSShri Abhyankar         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1722e6580ceeSShri Abhyankar 
1723e6580ceeSShri Abhyankar         /* Column 2, product is accumulated in XMM6 */
1724e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1725e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1726e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6, XMM0)
1727e6580ceeSShri Abhyankar 
1728e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1729e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1730e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM1)
1731e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6, XMM7)
1732e6580ceeSShri Abhyankar 
1733e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1734e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1735e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM2)
1736e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6, XMM7)
1737e6580ceeSShri Abhyankar 
1738e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1739e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1740e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7, XMM3)
1741e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6, XMM7)
1742e6580ceeSShri Abhyankar 
1743e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1744e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1745e6580ceeSShri Abhyankar 
1746e6580ceeSShri Abhyankar         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1747e6580ceeSShri Abhyankar         /* Column 3, product is accumulated in XMM0 */
1748e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1749e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1750e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM0, XMM7)
1751e6580ceeSShri Abhyankar 
1752e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1753e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1754e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1, XMM7)
1755e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0, XMM1)
1756e6580ceeSShri Abhyankar 
1757e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1758e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1759e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1, XMM2)
1760e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0, XMM1)
1761e6580ceeSShri Abhyankar 
1762e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1763e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1764e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM3, XMM7)
1765e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0, XMM3)
1766e6580ceeSShri Abhyankar 
1767e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1768e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1769e6580ceeSShri Abhyankar 
1770e6580ceeSShri Abhyankar         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1771e6580ceeSShri Abhyankar         /* This is code to be maintained and read by humans after all. */
1772e6580ceeSShri Abhyankar         /* Copy Multiplier Col 3 into XMM3 */
1773e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM3, XMM0)
1774e6580ceeSShri Abhyankar         /* Copy Multiplier Col 2 into XMM2 */
1775e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM2, XMM6)
1776e6580ceeSShri Abhyankar         /* Copy Multiplier Col 1 into XMM1 */
1777e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM1, XMM5)
1778e6580ceeSShri Abhyankar         /* Copy Multiplier Col 0 into XMM0 */
1779e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM0, XMM4)
1780e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
1781e6580ceeSShri Abhyankar 
1782e6580ceeSShri Abhyankar         /* Update the row: */
1783e6580ceeSShri Abhyankar         nz = bi[row + 1] - diag_offset[row] - 1;
1784e6580ceeSShri Abhyankar         pv += 16;
1785e6580ceeSShri Abhyankar         for (j = 0; j < nz; j++) {
1786e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
1787e6580ceeSShri Abhyankar           x = rtmp + 16 * ((unsigned int)pj[j]);
1788e6580ceeSShri Abhyankar           /*            x = rtmp + 4*pj[j]; */
1789e6580ceeSShri Abhyankar 
1790e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
1791e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1792e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x, pv)
1793e6580ceeSShri Abhyankar           /* Load First Column of X*/
1794e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1795e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1796e6580ceeSShri Abhyankar 
1797e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1798e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1799e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1800e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM0)
1801e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
1802e6580ceeSShri Abhyankar 
1803e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1804e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1805e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM1)
1806e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM6)
1807e6580ceeSShri Abhyankar 
1808e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1809e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1810e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM2)
1811e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM7)
1812e6580ceeSShri Abhyankar 
1813e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1814e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1815e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM3)
1816e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
1817e6580ceeSShri Abhyankar 
1818e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1819e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1820e6580ceeSShri Abhyankar 
1821e6580ceeSShri Abhyankar           /* Second Column */
1822e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1823e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1824e6580ceeSShri Abhyankar 
1825e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1826e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1827e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1828e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM0)
1829e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM6)
1830e6580ceeSShri Abhyankar 
1831e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1832e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1833e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM1)
1834e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM7)
1835e6580ceeSShri Abhyankar 
1836e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1837e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1838e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4, XMM2)
1839e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM4)
1840e6580ceeSShri Abhyankar 
1841e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1842e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1843e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM3)
1844e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5, XMM6)
1845e6580ceeSShri Abhyankar 
1846e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1847e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1848e6580ceeSShri Abhyankar 
1849e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1850e6580ceeSShri Abhyankar 
1851e6580ceeSShri Abhyankar           /* Third Column */
1852e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1853e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1854e6580ceeSShri Abhyankar 
1855e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1856e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1857e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1858e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM0)
1859e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM7)
1860e6580ceeSShri Abhyankar 
1861e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1862e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1863e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4, XMM1)
1864e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM4)
1865e6580ceeSShri Abhyankar 
1866e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1867e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1868e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM2)
1869e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM5)
1870e6580ceeSShri Abhyankar 
1871e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1872e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1873e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM3)
1874e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6, XMM7)
1875e6580ceeSShri Abhyankar 
1876e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1877e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1878e6580ceeSShri Abhyankar 
1879e6580ceeSShri Abhyankar           /* Fourth Column */
1880e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1881e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1882e6580ceeSShri Abhyankar 
1883e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1884e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1885e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1886e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM0)
1887e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
1888e6580ceeSShri Abhyankar 
1889e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1890e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1891e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6, XMM1)
1892e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM6)
1893e6580ceeSShri Abhyankar 
1894e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1895e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1896e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7, XMM2)
1897e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM7)
1898e6580ceeSShri Abhyankar 
1899e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1900e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1901e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5, XMM3)
1902e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4, XMM5)
1903e6580ceeSShri Abhyankar 
1904e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1905e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1906e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
1907e6580ceeSShri Abhyankar           pv += 16;
1908e6580ceeSShri Abhyankar         }
19099566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1910e6580ceeSShri Abhyankar       }
1911e6580ceeSShri Abhyankar       row = (unsigned int)(*bjtmp++);
1912e6580ceeSShri Abhyankar       /*        row = (*bjtmp++)/4; */
1913e6580ceeSShri Abhyankar       /*        bjtmp++; */
1914e6580ceeSShri Abhyankar     }
1915e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
1916e6580ceeSShri Abhyankar     pv = ba + 16 * bi[i];
1917e6580ceeSShri Abhyankar     pj = bj + bi[i];
1918e6580ceeSShri Abhyankar     nz = bi[i + 1] - bi[i];
1919e6580ceeSShri Abhyankar 
1920e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
1921e6580ceeSShri Abhyankar     for (j = 0; j < nz; j++) {
1922e6580ceeSShri Abhyankar       x = rtmp + 16 * ((unsigned int)pj[j]);
1923e6580ceeSShri Abhyankar       /*        x  = rtmp+4*pj[j]; */
1924e6580ceeSShri Abhyankar 
1925e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x, pv)
1926e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1927e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1928e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1929e6580ceeSShri Abhyankar 
1930e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1931e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1932e6580ceeSShri Abhyankar 
1933e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1934e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1935e6580ceeSShri Abhyankar 
1936e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1937e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1938e6580ceeSShri Abhyankar 
1939e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1940e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1941e6580ceeSShri Abhyankar 
1942e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1943e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1944e6580ceeSShri Abhyankar 
1945e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1946e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1947e6580ceeSShri Abhyankar 
1948e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1949e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1950e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1951e6580ceeSShri Abhyankar       pv += 16;
1952e6580ceeSShri Abhyankar     }
1953e6580ceeSShri Abhyankar     /* invert diagonal block */
1954e6580ceeSShri Abhyankar     w = ba + 16 * diag_offset[i];
1955e6580ceeSShri Abhyankar     if (pivotinblocks) {
19569566063dSJacob Faibussowitsch       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, , &zeropivotdetected));
1957603e8f96SBarry Smith       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1958e6580ceeSShri Abhyankar     } else {
19599566063dSJacob Faibussowitsch       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1960e6580ceeSShri Abhyankar     }
19619566063dSJacob Faibussowitsch     /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1962e6580ceeSShri Abhyankar     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1963e6580ceeSShri Abhyankar   }
1964e6580ceeSShri Abhyankar 
19659566063dSJacob Faibussowitsch   PetscCall(PetscFree(rtmp));
196626fbe8dcSKarl Rupp 
1967e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1968e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1969e6580ceeSShri Abhyankar   C->assembled           = PETSC_TRUE;
197026fbe8dcSKarl Rupp 
19719566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs));
1972e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
1973e6580ceeSShri Abhyankar   SSE_SCOPE_END;
19743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1975e6580ceeSShri Abhyankar }
1976e6580ceeSShri Abhyankar 
1977e6580ceeSShri Abhyankar #endif
1978