xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision df4cd43f92eaa320656440c40edb1046daee8f75)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
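/*
   BAIJ is a block compressed sparse row format: ->i holds the block-row pointers, ->j the block
   column indices, and ->a the values, one dense bs x bs block stored contiguously per nonzero
   block (bs2 = bs*bs scalars, i.e. 16 scalars per block for the bs = 4 kernels in this file).
*/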
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
7 
8 /*
9       Version for when blocks are 4 by 4
10 */
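/*
   Rough outline of the routine below: the factorization proceeds block row by block row, expanding
   the current sparse row into the dense work array rtmp (one 4x4 = 16-scalar block per block column)
   and eliminating against the rows already factored:

     for each block row i (in the isrow ordering):
       scatter block row r[i] of A into rtmp, permuting block columns through ic[]
       for each block column 'row' < i whose block L = rtmp + 16*row is nonzero:
         L = L * inv(U(row,row))                      -- the multiplier; diagonal blocks are stored inverted
         rtmp[k] -= L * U(row,k)  for k in U(row,:)   -- update the trailing blocks of row i
       copy rtmp back into the factor and invert the diagonal block
*/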
11 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
14   IS              isrow = b->row, isicol = b->icol;
15   const PetscInt *r, *ic;
16   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
17   PetscInt       *ajtmpold, *ajtmp, nz, row;
18   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
19   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
20   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
21   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
22   MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
23   MatScalar       m13, m14, m15, m16;
24   MatScalar      *ba = b->a, *aa = a->a;
25   PetscBool       pivotinblocks = b->pivotinblocks;
26   PetscReal       shift         = info->shiftamount;
27   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
28 
29   PetscFunctionBegin;
30   PetscCall(ISGetIndices(isrow, &r));
31   PetscCall(ISGetIndices(isicol, &ic));
32   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
33   allowzeropivot = PetscNot(A->erroriffailure);
34 
35   for (i = 0; i < n; i++) {
36     nz    = bi[i + 1] - bi[i];
37     ajtmp = bj + bi[i];
38     for (j = 0; j < nz; j++) {
39       x    = rtmp + 16 * ajtmp[j];
40       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
41       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
42     }
43     /* load in initial (unfactored row) */
44     idx      = r[i];
45     nz       = ai[idx + 1] - ai[idx];
46     ajtmpold = aj + ai[idx];
47     v        = aa + 16 * ai[idx];
48     for (j = 0; j < nz; j++) {
49       x     = rtmp + 16 * ic[ajtmpold[j]];
50       x[0]  = v[0];
51       x[1]  = v[1];
52       x[2]  = v[2];
53       x[3]  = v[3];
54       x[4]  = v[4];
55       x[5]  = v[5];
56       x[6]  = v[6];
57       x[7]  = v[7];
58       x[8]  = v[8];
59       x[9]  = v[9];
60       x[10] = v[10];
61       x[11] = v[11];
62       x[12] = v[12];
63       x[13] = v[13];
64       x[14] = v[14];
65       x[15] = v[15];
66       v += 16;
67     }
68     row = *ajtmp++;
69     while (row < i) {
70       pc  = rtmp + 16 * row;
71       p1  = pc[0];
72       p2  = pc[1];
73       p3  = pc[2];
74       p4  = pc[3];
75       p5  = pc[4];
76       p6  = pc[5];
77       p7  = pc[6];
78       p8  = pc[7];
79       p9  = pc[8];
80       p10 = pc[9];
81       p11 = pc[10];
82       p12 = pc[11];
83       p13 = pc[12];
84       p14 = pc[13];
85       p15 = pc[14];
86       p16 = pc[15];
87       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
88         pv    = ba + 16 * diag_offset[row];
89         pj    = bj + diag_offset[row] + 1;
90         x1    = pv[0];
91         x2    = pv[1];
92         x3    = pv[2];
93         x4    = pv[3];
94         x5    = pv[4];
95         x6    = pv[5];
96         x7    = pv[6];
97         x8    = pv[7];
98         x9    = pv[8];
99         x10   = pv[9];
100         x11   = pv[10];
101         x12   = pv[11];
102         x13   = pv[12];
103         x14   = pv[13];
104         x15   = pv[14];
105         x16   = pv[15];
106         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
107         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
108         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
109         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
110 
111         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
112         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
113         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
114         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
115 
116         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
117         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
118         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
119         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
120 
121         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
122         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
123         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
124         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
125 
126         nz = bi[row + 1] - diag_offset[row] - 1;
127         pv += 16;
128         for (j = 0; j < nz; j++) {
129           x1  = pv[0];
130           x2  = pv[1];
131           x3  = pv[2];
132           x4  = pv[3];
133           x5  = pv[4];
134           x6  = pv[5];
135           x7  = pv[6];
136           x8  = pv[7];
137           x9  = pv[8];
138           x10 = pv[9];
139           x11 = pv[10];
140           x12 = pv[11];
141           x13 = pv[12];
142           x14 = pv[13];
143           x15 = pv[14];
144           x16 = pv[15];
145           x   = rtmp + 16 * pj[j];
146           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
147           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
148           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
149           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
150 
151           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
152           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
153           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
154           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
155 
156           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
157           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
158           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
159           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
160 
161           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
162           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
163           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
164           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
165 
166           pv += 16;
167         }
168         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
169       }
170       row = *ajtmp++;
171     }
172     /* finished row so stick it into b->a */
173     pv = ba + 16 * bi[i];
174     pj = bj + bi[i];
175     nz = bi[i + 1] - bi[i];
176     for (j = 0; j < nz; j++) {
177       x      = rtmp + 16 * pj[j];
178       pv[0]  = x[0];
179       pv[1]  = x[1];
180       pv[2]  = x[2];
181       pv[3]  = x[3];
182       pv[4]  = x[4];
183       pv[5]  = x[5];
184       pv[6]  = x[6];
185       pv[7]  = x[7];
186       pv[8]  = x[8];
187       pv[9]  = x[9];
188       pv[10] = x[10];
189       pv[11] = x[11];
190       pv[12] = x[12];
191       pv[13] = x[13];
192       pv[14] = x[14];
193       pv[15] = x[15];
194       pv += 16;
195     }
196     /* invert diagonal block */
197     w = ba + 16 * diag_offset[i];
198     if (pivotinblocks) {
199       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
200       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
201     } else {
202       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
203     }
204   }
205 
206   PetscCall(PetscFree(rtmp));
207   PetscCall(ISRestoreIndices(isicol, &ic));
208   PetscCall(ISRestoreIndices(isrow, &r));
209 
210   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
211   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
212   C->assembled           = PETSC_TRUE;
213 
214   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
215   PetscFunctionReturn(PETSC_SUCCESS);
216 }
217 
218 /* MatLUFactorNumeric_SeqBAIJ_4 -
219      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace(), with the following generic kernels manually re-implemented for block size 4:
220        PetscKernel_A_gets_A_times_B()
221        PetscKernel_A_gets_A_minus_B_times_C()
222        PetscKernel_A_gets_inverse_A()
223 */
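/*
   For reference, a rough scalar sketch of what the two block kernels used below compute, assuming the
   blocks are stored column-major with entry (r,c) at index r + 4*c (the real kernels live in
   petsc/private/kernels/blockinvert.h):

     PetscKernel_A_gets_A_times_B_4(A,B,W):         A <- A*B   (W is a 16-scalar scratch block)
     PetscKernel_A_gets_A_minus_B_times_C_4(A,B,C): A <- A - B*C

     for (c = 0; c < 4; c++)
       for (r = 0; r < 4; r++) {
         W[r + 4*c] = 0.0;
         for (k = 0; k < 4; k++) W[r + 4*c] += A[r + 4*k] * B[k + 4*c];
       }
     then copy W back into A
*/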
224 
225 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
226 {
227   Mat             C = B;
228   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
229   IS              isrow = b->row, isicol = b->icol;
230   const PetscInt *r, *ic;
231   PetscInt        i, j, k, nz, nzL, row;
232   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
233   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
234   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
235   PetscInt        flg;
236   PetscReal       shift;
237   PetscBool       allowzeropivot, zeropivotdetected;
238 
239   PetscFunctionBegin;
240   allowzeropivot = PetscNot(A->erroriffailure);
241   PetscCall(ISGetIndices(isrow, &r));
242   PetscCall(ISGetIndices(isicol, &ic));
243 
244   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
245     shift = 0;
246   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
247     shift = info->shiftamount;
248   }
249 
250   /* generate work space needed by the factorization */
251   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
252   PetscCall(PetscArrayzero(rtmp, bs2 * n));
253 
254   for (i = 0; i < n; i++) {
255     /* zero rtmp */
256     /* L part */
257     nz    = bi[i + 1] - bi[i];
258     bjtmp = bj + bi[i];
259     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
260 
261     /* U part */
262     nz    = bdiag[i] - bdiag[i + 1];
263     bjtmp = bj + bdiag[i + 1] + 1;
264     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
265 
266     /* load in initial (unfactored row) */
267     nz    = ai[r[i] + 1] - ai[r[i]];
268     ajtmp = aj + ai[r[i]];
269     v     = aa + bs2 * ai[r[i]];
270     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
271 
272     /* elimination */
273     bjtmp = bj + bi[i];
274     nzL   = bi[i + 1] - bi[i];
275     for (k = 0; k < nzL; k++) {
276       row = bjtmp[k];
277       pc  = rtmp + bs2 * row;
278       for (flg = 0, j = 0; j < bs2; j++) {
279         if (pc[j] != 0.0) {
280           flg = 1;
281           break;
282         }
283       }
284       if (flg) {
285         pv = b->a + bs2 * bdiag[row];
286         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
287         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
288 
289         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
290         pv = b->a + bs2 * (bdiag[row + 1] + 1);
291         nz = bdiag[row] - bdiag[row + 1] - 1; /* number of entries in U(row,:), excluding the diagonal */
292         for (j = 0; j < nz; j++) {
293           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
294           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
295           v = rtmp + bs2 * pj[j];
296           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
297           pv += bs2;
298         }
299         PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
300       }
301     }
302 
303     /* finished row so stick it into b->a */
304     /* L part */
305     pv = b->a + bs2 * bi[i];
306     pj = b->j + bi[i];
307     nz = bi[i + 1] - bi[i];
308     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
309 
310     /* Mark diagonal and invert diagonal for simpler triangular solves */
311     pv = b->a + bs2 * bdiag[i];
312     pj = b->j + bdiag[i];
313     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
314     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
315     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
316 
317     /* U part */
318     pv = b->a + bs2 * (bdiag[i + 1] + 1);
319     pj = b->j + bdiag[i + 1] + 1;
320     nz = bdiag[i] - bdiag[i + 1] - 1;
321     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
322   }
323 
324   PetscCall(PetscFree2(rtmp, mwork));
325   PetscCall(ISRestoreIndices(isicol, &ic));
326   PetscCall(ISRestoreIndices(isrow, &r));
327 
328   C->ops->solve          = MatSolve_SeqBAIJ_4;
329   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
330   C->assembled           = PETSC_TRUE;
331 
332   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
333   PetscFunctionReturn(PETSC_SUCCESS);
334 }
335 
336 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
337 {
338   /*
339      Default version for when blocks are 4 by 4, using natural ordering
340   */
341   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
342   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
343   PetscInt    *ajtmpold, *ajtmp, nz, row;
344   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
345   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
346   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
347   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
348   MatScalar    p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
349   MatScalar    m13, m14, m15, m16;
350   MatScalar   *ba = b->a, *aa = a->a;
351   PetscBool    pivotinblocks = b->pivotinblocks;
352   PetscReal    shift         = info->shiftamount;
353   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;
354 
355   PetscFunctionBegin;
356   allowzeropivot = PetscNot(A->erroriffailure);
357   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
358 
359   for (i = 0; i < n; i++) {
360     nz    = bi[i + 1] - bi[i];
361     ajtmp = bj + bi[i];
362     for (j = 0; j < nz; j++) {
363       x    = rtmp + 16 * ajtmp[j];
364       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
365       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
366     }
367     /* load in initial (unfactored row) */
368     nz       = ai[i + 1] - ai[i];
369     ajtmpold = aj + ai[i];
370     v        = aa + 16 * ai[i];
371     for (j = 0; j < nz; j++) {
372       x     = rtmp + 16 * ajtmpold[j];
373       x[0]  = v[0];
374       x[1]  = v[1];
375       x[2]  = v[2];
376       x[3]  = v[3];
377       x[4]  = v[4];
378       x[5]  = v[5];
379       x[6]  = v[6];
380       x[7]  = v[7];
381       x[8]  = v[8];
382       x[9]  = v[9];
383       x[10] = v[10];
384       x[11] = v[11];
385       x[12] = v[12];
386       x[13] = v[13];
387       x[14] = v[14];
388       x[15] = v[15];
389       v += 16;
390     }
391     row = *ajtmp++;
392     while (row < i) {
393       pc  = rtmp + 16 * row;
394       p1  = pc[0];
395       p2  = pc[1];
396       p3  = pc[2];
397       p4  = pc[3];
398       p5  = pc[4];
399       p6  = pc[5];
400       p7  = pc[6];
401       p8  = pc[7];
402       p9  = pc[8];
403       p10 = pc[9];
404       p11 = pc[10];
405       p12 = pc[11];
406       p13 = pc[12];
407       p14 = pc[13];
408       p15 = pc[14];
409       p16 = pc[15];
410       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
411         pv    = ba + 16 * diag_offset[row];
412         pj    = bj + diag_offset[row] + 1;
413         x1    = pv[0];
414         x2    = pv[1];
415         x3    = pv[2];
416         x4    = pv[3];
417         x5    = pv[4];
418         x6    = pv[5];
419         x7    = pv[6];
420         x8    = pv[7];
421         x9    = pv[8];
422         x10   = pv[9];
423         x11   = pv[10];
424         x12   = pv[11];
425         x13   = pv[12];
426         x14   = pv[13];
427         x15   = pv[14];
428         x16   = pv[15];
429         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
430         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
431         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
432         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
433 
434         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
435         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
436         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
437         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
438 
439         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
440         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
441         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
442         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
443 
444         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
445         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
446         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
447         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
448         nz           = bi[row + 1] - diag_offset[row] - 1;
449         pv += 16;
450         for (j = 0; j < nz; j++) {
451           x1  = pv[0];
452           x2  = pv[1];
453           x3  = pv[2];
454           x4  = pv[3];
455           x5  = pv[4];
456           x6  = pv[5];
457           x7  = pv[6];
458           x8  = pv[7];
459           x9  = pv[8];
460           x10 = pv[9];
461           x11 = pv[10];
462           x12 = pv[11];
463           x13 = pv[12];
464           x14 = pv[13];
465           x15 = pv[14];
466           x16 = pv[15];
467           x   = rtmp + 16 * pj[j];
468           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
469           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
470           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
471           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
472 
473           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
474           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
475           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
476           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
477 
478           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
479           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
480           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
481           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
482 
483           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
484           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
485           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
486           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
487 
488           pv += 16;
489         }
490         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
491       }
492       row = *ajtmp++;
493     }
494     /* finished row so stick it into b->a */
495     pv = ba + 16 * bi[i];
496     pj = bj + bi[i];
497     nz = bi[i + 1] - bi[i];
498     for (j = 0; j < nz; j++) {
499       x      = rtmp + 16 * pj[j];
500       pv[0]  = x[0];
501       pv[1]  = x[1];
502       pv[2]  = x[2];
503       pv[3]  = x[3];
504       pv[4]  = x[4];
505       pv[5]  = x[5];
506       pv[6]  = x[6];
507       pv[7]  = x[7];
508       pv[8]  = x[8];
509       pv[9]  = x[9];
510       pv[10] = x[10];
511       pv[11] = x[11];
512       pv[12] = x[12];
513       pv[13] = x[13];
514       pv[14] = x[14];
515       pv[15] = x[15];
516       pv += 16;
517     }
518     /* invert diagonal block */
519     w = ba + 16 * diag_offset[i];
520     if (pivotinblocks) {
521       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
522       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
523     } else {
524       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
525     }
526   }
527 
528   PetscCall(PetscFree(rtmp));
529 
530   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
531   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
532   C->assembled           = PETSC_TRUE;
533 
534   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
535   PetscFunctionReturn(PETSC_SUCCESS);
536 }
537 
538 /*
539   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
540     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
541 */
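/*
   Here "natural ordering" means no row/column permutation is applied, so the isrow/isicol index sets
   (r[], ic[]) used in MatLUFactorNumeric_SeqBAIJ_4() above disappear; for example, the load of the
   unfactored row becomes

     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

   instead of scattering through rtmp + bs2 * ic[ajtmp[j]].
*/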
542 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
543 {
544   Mat             C = B;
545   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
546   PetscInt        i, j, k, nz, nzL, row;
547   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
548   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
549   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
550   PetscInt        flg;
551   PetscReal       shift;
552   PetscBool       allowzeropivot, zeropivotdetected;
553 
554   PetscFunctionBegin;
555   allowzeropivot = PetscNot(A->erroriffailure);
556 
557   /* generate work space needed by the factorization */
558   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
559   PetscCall(PetscArrayzero(rtmp, bs2 * n));
560 
561   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
562     shift = 0;
563   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
564     shift = info->shiftamount;
565   }
566 
567   for (i = 0; i < n; i++) {
568     /* zero rtmp */
569     /* L part */
570     nz    = bi[i + 1] - bi[i];
571     bjtmp = bj + bi[i];
572     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
573 
574     /* U part */
575     nz    = bdiag[i] - bdiag[i + 1];
576     bjtmp = bj + bdiag[i + 1] + 1;
577     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
578 
579     /* load in initial (unfactored row) */
580     nz    = ai[i + 1] - ai[i];
581     ajtmp = aj + ai[i];
582     v     = aa + bs2 * ai[i];
583     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
584 
585     /* elimination */
586     bjtmp = bj + bi[i];
587     nzL   = bi[i + 1] - bi[i];
588     for (k = 0; k < nzL; k++) {
589       row = bjtmp[k];
590       pc  = rtmp + bs2 * row;
591       for (flg = 0, j = 0; j < bs2; j++) {
592         if (pc[j] != 0.0) {
593           flg = 1;
594           break;
595         }
596       }
597       if (flg) {
598         pv = b->a + bs2 * bdiag[row];
599         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
600         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
601 
602         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
603         pv = b->a + bs2 * (bdiag[row + 1] + 1);
604         nz = bdiag[row] - bdiag[row + 1] - 1; /* number of entries in U(row,:), excluding the diagonal */
605         for (j = 0; j < nz; j++) {
606           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
607           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
608           v = rtmp + bs2 * pj[j];
609           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
610           pv += bs2;
611         }
612         PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
613       }
614     }
615 
616     /* finished row so stick it into b->a */
617     /* L part */
618     pv = b->a + bs2 * bi[i];
619     pj = b->j + bi[i];
620     nz = bi[i + 1] - bi[i];
621     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
622 
623     /* Mark diagonal and invert diagonal for simpler triangular solves */
624     pv = b->a + bs2 * bdiag[i];
625     pj = b->j + bdiag[i];
626     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
627     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
628     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
629 
630     /* U part */
631     pv = b->a + bs2 * (bdiag[i + 1] + 1);
632     pj = b->j + bdiag[i + 1] + 1;
633     nz = bdiag[i] - bdiag[i + 1] - 1;
634     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
635   }
636   PetscCall(PetscFree2(rtmp, mwork));
637 
638   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
639   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
640   C->assembled           = PETSC_TRUE;
641 
642   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
643   PetscFunctionReturn(PETSC_SUCCESS);
644 }
645 
646 #if defined(PETSC_HAVE_SSE)
647 
648   #include PETSC_HAVE_SSE
649 
650 /* SSE version for when blocks are 4 by 4, using natural ordering */
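/*
   The macro blocks below operate on the 4x4 blocks one column at a time. As a hypothetical sketch in
   plain SSE intrinsics from <xmmintrin.h> (assuming single-precision MatScalar and 16-byte aligned data,
   which the checks below enforce), forming column 0 of the multiplier pc = pc * inv(U(row,row)) looks
   roughly like

     __m128 c0 = _mm_load_ps(pc + 0), c1 = _mm_load_ps(pc + 4), c2 = _mm_load_ps(pc + 8), c3 = _mm_load_ps(pc + 12);
     __m128 r0 = _mm_mul_ps(_mm_set1_ps(pv[0]), c0);
     r0 = _mm_add_ps(r0, _mm_mul_ps(_mm_set1_ps(pv[1]), c1));
     r0 = _mm_add_ps(r0, _mm_mul_ps(_mm_set1_ps(pv[2]), c2));
     r0 = _mm_add_ps(r0, _mm_mul_ps(_mm_set1_ps(pv[3]), c3));
     _mm_store_ps(pc + 0, r0);

   i.e. each scalar of a pv column is broadcast and multiplied against a whole column of pc; the
   SSE_LOADL_PS, SSE_SHUFFLE, SSE_MULT_PS and SSE_ADD_PS macros from the PETSC_HAVE_SSE header express
   the same operations.
*/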
651 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C, Mat A, const MatFactorInfo *info)
652 {
653   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
654   int          i, j, n = a->mbs;
655   int         *bj = b->j, *bjtmp, *pj;
656   int          row;
657   int         *ajtmpold, nz, *bi = b->i;
658   int         *diag_offset = b->diag, *ai = a->i, *aj = a->j;
659   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
660   MatScalar   *ba = b->a, *aa = a->a;
661   int          nonzero       = 0;
662   PetscBool    pivotinblocks = b->pivotinblocks;
663   PetscReal    shift         = info->shiftamount;
664   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;
665 
666   PetscFunctionBegin;
667   allowzeropivot = PetscNot(A->erroriffailure);
668   SSE_SCOPE_BEGIN;
669 
670   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
671   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
672   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
673   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
674   /*    if ((unsigned long)bj==(unsigned long)aj) { */
675   /*      colscale = 4; */
676   /*    } */
677   for (i = 0; i < n; i++) {
678     nz    = bi[i + 1] - bi[i];
679     bjtmp = bj + bi[i];
680     /* zero out the 4x4 block accumulators */
681     /* zero out one register */
682     XOR_PS(XMM7, XMM7);
683     for (j = 0; j < nz; j++) {
684       x = rtmp + 16 * bjtmp[j];
685       /*        x = rtmp+4*bjtmp[j]; */
686       SSE_INLINE_BEGIN_1(x)
687       /* Copy zero register to memory locations */
688       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
689       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
690       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
691       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
692       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
693       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
694       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
695       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
696       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
697       SSE_INLINE_END_1;
698     }
699     /* load in initial (unfactored row) */
700     nz       = ai[i + 1] - ai[i];
701     ajtmpold = aj + ai[i];
702     v        = aa + 16 * ai[i];
703     for (j = 0; j < nz; j++) {
704       x = rtmp + 16 * ajtmpold[j];
705       /*        x = rtmp+colscale*ajtmpold[j]; */
706       /* Copy v block into x block */
707       SSE_INLINE_BEGIN_2(v, x)
708       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
709       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
710       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
711 
712       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
713       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
714 
715       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
716       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
717 
718       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
719       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
720 
721       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
722       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
723 
724       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
725       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
726 
727       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
728       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
729 
730       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
731       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
732       SSE_INLINE_END_2;
733 
734       v += 16;
735     }
736     /*      row = (*bjtmp++)/4; */
737     row = *bjtmp++;
738     while (row < i) {
739       pc = rtmp + 16 * row;
740       SSE_INLINE_BEGIN_1(pc)
741       /* Load block from lower triangle */
742       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
743       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
744       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
745 
746       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
747       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
748 
749       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
750       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
751 
752       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
753       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
754 
755       /* Compare block to zero block */
756 
757       SSE_COPY_PS(XMM4, XMM7)
758       SSE_CMPNEQ_PS(XMM4, XMM0)
759 
760       SSE_COPY_PS(XMM5, XMM7)
761       SSE_CMPNEQ_PS(XMM5, XMM1)
762 
763       SSE_COPY_PS(XMM6, XMM7)
764       SSE_CMPNEQ_PS(XMM6, XMM2)
765 
766       SSE_CMPNEQ_PS(XMM7, XMM3)
767 
768       /* Reduce the comparisons to one SSE register */
769       SSE_OR_PS(XMM6, XMM7)
770       SSE_OR_PS(XMM5, XMM4)
771       SSE_OR_PS(XMM5, XMM6)
772       SSE_INLINE_END_1;
773 
774       /* Reduce the one SSE register to an integer register for branching */
775       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
776       MOVEMASK(nonzero, XMM5);
777 
778       /* If block is nonzero ... */
779       if (nonzero) {
780         pv = ba + 16 * diag_offset[row];
781         PREFETCH_L1(&pv[16]);
782         pj = bj + diag_offset[row] + 1;
783 
784         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
785         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
786         /* but the diagonal was inverted already */
787         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
788 
789         SSE_INLINE_BEGIN_2(pv, pc)
790         /* Column 0, product is accumulated in XMM4 */
791         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
792         SSE_SHUFFLE(XMM4, XMM4, 0x00)
793         SSE_MULT_PS(XMM4, XMM0)
794 
795         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
796         SSE_SHUFFLE(XMM5, XMM5, 0x00)
797         SSE_MULT_PS(XMM5, XMM1)
798         SSE_ADD_PS(XMM4, XMM5)
799 
800         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
801         SSE_SHUFFLE(XMM6, XMM6, 0x00)
802         SSE_MULT_PS(XMM6, XMM2)
803         SSE_ADD_PS(XMM4, XMM6)
804 
805         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
806         SSE_SHUFFLE(XMM7, XMM7, 0x00)
807         SSE_MULT_PS(XMM7, XMM3)
808         SSE_ADD_PS(XMM4, XMM7)
809 
810         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
811         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
812 
813         /* Column 1, product is accumulated in XMM5 */
814         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
815         SSE_SHUFFLE(XMM5, XMM5, 0x00)
816         SSE_MULT_PS(XMM5, XMM0)
817 
818         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
819         SSE_SHUFFLE(XMM6, XMM6, 0x00)
820         SSE_MULT_PS(XMM6, XMM1)
821         SSE_ADD_PS(XMM5, XMM6)
822 
823         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
824         SSE_SHUFFLE(XMM7, XMM7, 0x00)
825         SSE_MULT_PS(XMM7, XMM2)
826         SSE_ADD_PS(XMM5, XMM7)
827 
828         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
829         SSE_SHUFFLE(XMM6, XMM6, 0x00)
830         SSE_MULT_PS(XMM6, XMM3)
831         SSE_ADD_PS(XMM5, XMM6)
832 
833         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
834         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
835 
836         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
837 
838         /* Column 2, product is accumulated in XMM6 */
839         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
840         SSE_SHUFFLE(XMM6, XMM6, 0x00)
841         SSE_MULT_PS(XMM6, XMM0)
842 
843         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
844         SSE_SHUFFLE(XMM7, XMM7, 0x00)
845         SSE_MULT_PS(XMM7, XMM1)
846         SSE_ADD_PS(XMM6, XMM7)
847 
848         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
849         SSE_SHUFFLE(XMM7, XMM7, 0x00)
850         SSE_MULT_PS(XMM7, XMM2)
851         SSE_ADD_PS(XMM6, XMM7)
852 
853         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
854         SSE_SHUFFLE(XMM7, XMM7, 0x00)
855         SSE_MULT_PS(XMM7, XMM3)
856         SSE_ADD_PS(XMM6, XMM7)
857 
858         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
859         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
860 
861         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
862         /* Column 3, product is accumulated in XMM0 */
863         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
864         SSE_SHUFFLE(XMM7, XMM7, 0x00)
865         SSE_MULT_PS(XMM0, XMM7)
866 
867         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
868         SSE_SHUFFLE(XMM7, XMM7, 0x00)
869         SSE_MULT_PS(XMM1, XMM7)
870         SSE_ADD_PS(XMM0, XMM1)
871 
872         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
873         SSE_SHUFFLE(XMM1, XMM1, 0x00)
874         SSE_MULT_PS(XMM1, XMM2)
875         SSE_ADD_PS(XMM0, XMM1)
876 
877         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
878         SSE_SHUFFLE(XMM7, XMM7, 0x00)
879         SSE_MULT_PS(XMM3, XMM7)
880         SSE_ADD_PS(XMM0, XMM3)
881 
882         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
883         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
884 
885         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
886         /* This is code to be maintained and read by humans after all. */
887         /* Copy Multiplier Col 3 into XMM3 */
888         SSE_COPY_PS(XMM3, XMM0)
889         /* Copy Multiplier Col 2 into XMM2 */
890         SSE_COPY_PS(XMM2, XMM6)
891         /* Copy Multiplier Col 1 into XMM1 */
892         SSE_COPY_PS(XMM1, XMM5)
893         /* Copy Multiplier Col 0 into XMM0 */
894         SSE_COPY_PS(XMM0, XMM4)
895         SSE_INLINE_END_2;
896 
897         /* Update the row: */
898         nz = bi[row + 1] - diag_offset[row] - 1;
899         pv += 16;
900         for (j = 0; j < nz; j++) {
901           PREFETCH_L1(&pv[16]);
902           x = rtmp + 16 * pj[j];
903           /*            x = rtmp + 4*pj[j]; */
904 
905           /* X:=X-M*PV, One column at a time */
906           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
907           SSE_INLINE_BEGIN_2(x, pv)
908           /* Load First Column of X*/
909           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
910           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
911 
912           /* Matrix-Vector Product: */
913           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
914           SSE_SHUFFLE(XMM5, XMM5, 0x00)
915           SSE_MULT_PS(XMM5, XMM0)
916           SSE_SUB_PS(XMM4, XMM5)
917 
918           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
919           SSE_SHUFFLE(XMM6, XMM6, 0x00)
920           SSE_MULT_PS(XMM6, XMM1)
921           SSE_SUB_PS(XMM4, XMM6)
922 
923           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
924           SSE_SHUFFLE(XMM7, XMM7, 0x00)
925           SSE_MULT_PS(XMM7, XMM2)
926           SSE_SUB_PS(XMM4, XMM7)
927 
928           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
929           SSE_SHUFFLE(XMM5, XMM5, 0x00)
930           SSE_MULT_PS(XMM5, XMM3)
931           SSE_SUB_PS(XMM4, XMM5)
932 
933           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
934           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
935 
936           /* Second Column */
937           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
938           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
939 
940           /* Matrix-Vector Product: */
941           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
942           SSE_SHUFFLE(XMM6, XMM6, 0x00)
943           SSE_MULT_PS(XMM6, XMM0)
944           SSE_SUB_PS(XMM5, XMM6)
945 
946           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
947           SSE_SHUFFLE(XMM7, XMM7, 0x00)
948           SSE_MULT_PS(XMM7, XMM1)
949           SSE_SUB_PS(XMM5, XMM7)
950 
951           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
952           SSE_SHUFFLE(XMM4, XMM4, 0x00)
953           SSE_MULT_PS(XMM4, XMM2)
954           SSE_SUB_PS(XMM5, XMM4)
955 
956           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
957           SSE_SHUFFLE(XMM6, XMM6, 0x00)
958           SSE_MULT_PS(XMM6, XMM3)
959           SSE_SUB_PS(XMM5, XMM6)
960 
961           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
962           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
963 
964           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
965 
966           /* Third Column */
967           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
968           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
969 
970           /* Matrix-Vector Product: */
971           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
972           SSE_SHUFFLE(XMM7, XMM7, 0x00)
973           SSE_MULT_PS(XMM7, XMM0)
974           SSE_SUB_PS(XMM6, XMM7)
975 
976           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
977           SSE_SHUFFLE(XMM4, XMM4, 0x00)
978           SSE_MULT_PS(XMM4, XMM1)
979           SSE_SUB_PS(XMM6, XMM4)
980 
981           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
982           SSE_SHUFFLE(XMM5, XMM5, 0x00)
983           SSE_MULT_PS(XMM5, XMM2)
984           SSE_SUB_PS(XMM6, XMM5)
985 
986           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
987           SSE_SHUFFLE(XMM7, XMM7, 0x00)
988           SSE_MULT_PS(XMM7, XMM3)
989           SSE_SUB_PS(XMM6, XMM7)
990 
991           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
992           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
993 
994           /* Fourth Column */
995           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
996           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
997 
998           /* Matrix-Vector Product: */
999           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1000           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1001           SSE_MULT_PS(XMM5, XMM0)
1002           SSE_SUB_PS(XMM4, XMM5)
1003 
1004           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1005           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1006           SSE_MULT_PS(XMM6, XMM1)
1007           SSE_SUB_PS(XMM4, XMM6)
1008 
1009           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1010           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1011           SSE_MULT_PS(XMM7, XMM2)
1012           SSE_SUB_PS(XMM4, XMM7)
1013 
1014           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1015           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1016           SSE_MULT_PS(XMM5, XMM3)
1017           SSE_SUB_PS(XMM4, XMM5)
1018 
1019           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1020           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1021           SSE_INLINE_END_2;
1022           pv += 16;
1023         }
1024         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1025       }
1026       row = *bjtmp++;
1027       /*        row = (*bjtmp++)/4; */
1028     }
1029     /* finished row so stick it into b->a */
1030     pv = ba + 16 * bi[i];
1031     pj = bj + bi[i];
1032     nz = bi[i + 1] - bi[i];
1033 
1034     /* Copy x block back into pv block */
1035     for (j = 0; j < nz; j++) {
1036       x = rtmp + 16 * pj[j];
1037       /*        x  = rtmp+4*pj[j]; */
1038 
1039       SSE_INLINE_BEGIN_2(x, pv)
1040       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1042       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1043 
1044       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1045       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1046 
1047       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1048       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1049 
1050       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1051       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1052 
1053       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1054       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1055 
1056       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1057       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1058 
1059       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1060       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1061 
1062       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1063       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1064       SSE_INLINE_END_2;
1065       pv += 16;
1066     }
1067     /* invert diagonal block */
1068     w = ba + 16 * diag_offset[i];
1069     if (pivotinblocks) {
1070       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1071       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1072     } else {
1073       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1074     }
1075     /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1076     /* Note: since Cramer's rule is used to invert the diagonal blocks, the flop count below might be unfairly high or low */
1077   }
1078 
1079   PetscCall(PetscFree(rtmp));
1080 
1081   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1082   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1083   C->assembled           = PETSC_TRUE;
1084 
1085   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
1086   /* Flop Count from inverting diagonal blocks */
1087   SSE_SCOPE_END;
1088   PetscFunctionReturn(PETSC_SUCCESS);
1089 }
1090 
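/*
   The _usj variants below reinterpret the block column index arrays as unsigned short ("usj" presumably
   standing for unsigned short j), halving the index traffic during the factorization, e.g.

     unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;

   This is only valid when every block column index fits in an unsigned short, i.e. fewer than 65536
   block columns.
*/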
1091 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
1092 {
1093   Mat             A = C;
1094   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1095   int             i, j, n = a->mbs;
1096   unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1097   unsigned short *aj = (unsigned short *)(a->j), *ajtmp;
1098   unsigned int    row;
1099   int             nz, *bi = b->i;
1100   int            *diag_offset = b->diag, *ai = a->i;
1101   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1102   MatScalar      *ba = b->a, *aa = a->a;
1103   int             nonzero       = 0;
1104   PetscBool       pivotinblocks = b->pivotinblocks;
1105   PetscReal       shift         = 0.0; /* no MatFactorInfo is available in this in-place variant; assume no in-block shift is applied */
1106   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
1107 
1108   PetscFunctionBegin;
1109   allowzeropivot = PetscNot(A->erroriffailure);
1110   SSE_SCOPE_BEGIN;
1111 
1112   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1113   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
1114   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1115   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1116   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1117   /*      colscale = 4; */
1118   /*    } */
1119 
1120   for (i = 0; i < n; i++) {
1121     nz    = bi[i + 1] - bi[i];
1122     bjtmp = bj + bi[i];
1123     /* zero out the 4x4 block accumulators */
1124     /* zero out one register */
1125     XOR_PS(XMM7, XMM7);
1126     for (j = 0; j < nz; j++) {
1127       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1128       /*        x = rtmp+4*bjtmp[j]; */
1129       SSE_INLINE_BEGIN_1(x)
1130       /* Copy zero register to memory locations */
1131       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1132       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1133       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1134       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1135       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1136       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1137       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1138       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1139       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1140       SSE_INLINE_END_1;
1141     }
1142     /* load in initial (unfactored row) */
1143     nz    = ai[i + 1] - ai[i];
1144     ajtmp = aj + ai[i];
1145     v     = aa + 16 * ai[i];
1146     for (j = 0; j < nz; j++) {
1147       x = rtmp + 16 * ((unsigned int)ajtmp[j]);
1148       /*        x = rtmp+colscale*ajtmp[j]; */
1149       /* Copy v block into x block */
1150       SSE_INLINE_BEGIN_2(v, x)
1151       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1152       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1153       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1154 
1155       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1156       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1157 
1158       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1159       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1160 
1161       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1162       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1163 
1164       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1165       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1166 
1167       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1168       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1169 
1170       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1171       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1172 
1173       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1174       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1175       SSE_INLINE_END_2;
1176 
1177       v += 16;
1178     }
1179     /*      row = (*bjtmp++)/4; */
1180     row = (unsigned int)(*bjtmp++);
1181     while (row < i) {
1182       pc = rtmp + 16 * row;
1183       SSE_INLINE_BEGIN_1(pc)
1184       /* Load block from lower triangle */
1185       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1186       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1187       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1188 
1189       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1190       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1191 
1192       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1193       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1194 
1195       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1196       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1197 
1198       /* Compare block to zero block */
1199 
1200       SSE_COPY_PS(XMM4, XMM7)
1201       SSE_CMPNEQ_PS(XMM4, XMM0)
1202 
1203       SSE_COPY_PS(XMM5, XMM7)
1204       SSE_CMPNEQ_PS(XMM5, XMM1)
1205 
1206       SSE_COPY_PS(XMM6, XMM7)
1207       SSE_CMPNEQ_PS(XMM6, XMM2)
1208 
1209       SSE_CMPNEQ_PS(XMM7, XMM3)
1210 
1211       /* Reduce the comparisons to one SSE register */
1212       SSE_OR_PS(XMM6, XMM7)
1213       SSE_OR_PS(XMM5, XMM4)
1214       SSE_OR_PS(XMM5, XMM6)
1215       SSE_INLINE_END_1;
1216 
1217       /* Reduce the one SSE register to an integer register for branching */
1218       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1219       MOVEMASK(nonzero, XMM5);
1220 
1221       /* If block is nonzero ... */
1222       if (nonzero) {
1223         pv = ba + 16 * diag_offset[row];
1224         PREFETCH_L1(&pv[16]);
1225         pj = bj + diag_offset[row] + 1;
1226 
1227         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1228         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1229         /* but the diagonal was inverted already */
1230         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1231 
1232         SSE_INLINE_BEGIN_2(pv, pc)
1233         /* Column 0, product is accumulated in XMM4 */
1234         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1235         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1236         SSE_MULT_PS(XMM4, XMM0)
1237 
1238         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1239         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1240         SSE_MULT_PS(XMM5, XMM1)
1241         SSE_ADD_PS(XMM4, XMM5)
1242 
1243         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1244         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1245         SSE_MULT_PS(XMM6, XMM2)
1246         SSE_ADD_PS(XMM4, XMM6)
1247 
1248         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1249         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1250         SSE_MULT_PS(XMM7, XMM3)
1251         SSE_ADD_PS(XMM4, XMM7)
1252 
1253         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1254         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1255 
1256         /* Column 1, product is accumulated in XMM5 */
1257         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1258         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1259         SSE_MULT_PS(XMM5, XMM0)
1260 
1261         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1262         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1263         SSE_MULT_PS(XMM6, XMM1)
1264         SSE_ADD_PS(XMM5, XMM6)
1265 
1266         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1267         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1268         SSE_MULT_PS(XMM7, XMM2)
1269         SSE_ADD_PS(XMM5, XMM7)
1270 
1271         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1272         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1273         SSE_MULT_PS(XMM6, XMM3)
1274         SSE_ADD_PS(XMM5, XMM6)
1275 
1276         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1277         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1278 
1279         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1280 
1281         /* Column 2, product is accumulated in XMM6 */
1282         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1283         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1284         SSE_MULT_PS(XMM6, XMM0)
1285 
1286         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1287         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1288         SSE_MULT_PS(XMM7, XMM1)
1289         SSE_ADD_PS(XMM6, XMM7)
1290 
1291         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1292         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1293         SSE_MULT_PS(XMM7, XMM2)
1294         SSE_ADD_PS(XMM6, XMM7)
1295 
1296         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1297         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1298         SSE_MULT_PS(XMM7, XMM3)
1299         SSE_ADD_PS(XMM6, XMM7)
1300 
1301         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1302         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1303 
1304         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1305         /* Column 3, product is accumulated in XMM0 */
1306         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1307         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1308         SSE_MULT_PS(XMM0, XMM7)
1309 
1310         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1311         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1312         SSE_MULT_PS(XMM1, XMM7)
1313         SSE_ADD_PS(XMM0, XMM1)
1314 
1315         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1316         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1317         SSE_MULT_PS(XMM1, XMM2)
1318         SSE_ADD_PS(XMM0, XMM1)
1319 
1320         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1321         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1322         SSE_MULT_PS(XMM3, XMM7)
1323         SSE_ADD_PS(XMM0, XMM3)
1324 
1325         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1326         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1327 
1328         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1329         /* This is code to be maintained and read by humans after all. */
1330         /* Copy Multiplier Col 3 into XMM3 */
1331         SSE_COPY_PS(XMM3, XMM0)
1332         /* Copy Multiplier Col 2 into XMM2 */
1333         SSE_COPY_PS(XMM2, XMM6)
1334         /* Copy Multiplier Col 1 into XMM1 */
1335         SSE_COPY_PS(XMM1, XMM5)
1336         /* Copy Multiplier Col 0 into XMM0 */
1337         SSE_COPY_PS(XMM0, XMM4)
1338         SSE_INLINE_END_2;
1339 
1340         /* Update the row: */
1341         nz = bi[row + 1] - diag_offset[row] - 1;
1342         pv += 16;
1343         for (j = 0; j < nz; j++) {
1344           PREFETCH_L1(&pv[16]);
1345           x = rtmp + 16 * ((unsigned int)pj[j]);
1346           /*            x = rtmp + 4*pj[j]; */
1347 
1348           /* X:=X-M*PV, One column at a time */
1349           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1350           SSE_INLINE_BEGIN_2(x, pv)
1351           /* Load First Column of X*/
1352           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1353           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1354 
1355           /* Matrix-Vector Product: */
1356           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1357           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1358           SSE_MULT_PS(XMM5, XMM0)
1359           SSE_SUB_PS(XMM4, XMM5)
1360 
1361           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1362           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1363           SSE_MULT_PS(XMM6, XMM1)
1364           SSE_SUB_PS(XMM4, XMM6)
1365 
1366           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1367           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1368           SSE_MULT_PS(XMM7, XMM2)
1369           SSE_SUB_PS(XMM4, XMM7)
1370 
1371           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1372           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1373           SSE_MULT_PS(XMM5, XMM3)
1374           SSE_SUB_PS(XMM4, XMM5)
1375 
1376           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1377           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1378 
1379           /* Second Column */
1380           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1381           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1382 
1383           /* Matrix-Vector Product: */
1384           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1385           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1386           SSE_MULT_PS(XMM6, XMM0)
1387           SSE_SUB_PS(XMM5, XMM6)
1388 
1389           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1390           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1391           SSE_MULT_PS(XMM7, XMM1)
1392           SSE_SUB_PS(XMM5, XMM7)
1393 
1394           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1395           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1396           SSE_MULT_PS(XMM4, XMM2)
1397           SSE_SUB_PS(XMM5, XMM4)
1398 
1399           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1400           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1401           SSE_MULT_PS(XMM6, XMM3)
1402           SSE_SUB_PS(XMM5, XMM6)
1403 
1404           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1405           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1406 
1407           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1408 
1409           /* Third Column */
1410           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1411           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1412 
1413           /* Matrix-Vector Product: */
1414           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1415           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1416           SSE_MULT_PS(XMM7, XMM0)
1417           SSE_SUB_PS(XMM6, XMM7)
1418 
1419           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1420           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1421           SSE_MULT_PS(XMM4, XMM1)
1422           SSE_SUB_PS(XMM6, XMM4)
1423 
1424           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1425           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1426           SSE_MULT_PS(XMM5, XMM2)
1427           SSE_SUB_PS(XMM6, XMM5)
1428 
1429           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1430           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1431           SSE_MULT_PS(XMM7, XMM3)
1432           SSE_SUB_PS(XMM6, XMM7)
1433 
1434           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1435           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1436 
1437           /* Fourth Column */
1438           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1439           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1440 
1441           /* Matrix-Vector Product: */
1442           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1443           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1444           SSE_MULT_PS(XMM5, XMM0)
1445           SSE_SUB_PS(XMM4, XMM5)
1446 
1447           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1448           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1449           SSE_MULT_PS(XMM6, XMM1)
1450           SSE_SUB_PS(XMM4, XMM6)
1451 
1452           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1453           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1454           SSE_MULT_PS(XMM7, XMM2)
1455           SSE_SUB_PS(XMM4, XMM7)
1456 
1457           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1458           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1459           SSE_MULT_PS(XMM5, XMM3)
1460           SSE_SUB_PS(XMM4, XMM5)
1461 
1462           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1463           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1464           SSE_INLINE_END_2;
1465           pv += 16;
1466         }
1467         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1468       }
1469       row = (unsigned int)(*bjtmp++);
1470       /*        row = (*bjtmp++)/4; */
1471       /*        bjtmp++; */
1472     }
1473     /* finished row so stick it into b->a */
1474     pv = ba + 16 * bi[i];
1475     pj = bj + bi[i];
1476     nz = bi[i + 1] - bi[i];
1477 
1478     /* Copy x block back into pv block */
1479     for (j = 0; j < nz; j++) {
1480       x = rtmp + 16 * ((unsigned int)pj[j]);
1481       /*        x  = rtmp+4*pj[j]; */
1482 
1483       SSE_INLINE_BEGIN_2(x, pv)
1484       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1485       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1486       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1487 
1488       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1489       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1490 
1491       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1492       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1493 
1494       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1495       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1496 
1497       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1498       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1499 
1500       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1501       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1502 
1503       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1504       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1505 
1506       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1507       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1508       SSE_INLINE_END_2;
1509       pv += 16;
1510     }
1511     /* invert diagonal block */
1512     w = ba + 16 * diag_offset[i];
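    /* the inverse overwrites the diagonal block of the factor, so later rows can form their
       multipliers with a plain block multiply against this stored inverse */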
1513     if (pivotinblocks) {
1514       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1515       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1516     } else {
1517       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1518     }
1519     /* Note: Using Cramer's rule, the flop count below may be unfairly high or low */
1520   }
1521 
1522   PetscCall(PetscFree(rtmp));
1523 
1524   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1525   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1526   C->assembled           = PETSC_TRUE;
1527 
1528   PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs));
1529   /* Flop count above is from inverting the diagonal blocks */
1530   SSE_SCOPE_END;
1531   PetscFunctionReturn(PETSC_SUCCESS);
1532 }
1533 
1534 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
1535 {
1536   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1537   int             i, j, n = a->mbs;
1538   unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1539   unsigned int    row;
1540   int            *ajtmpold, nz, *bi = b->i;
1541   int            *diag_offset = b->diag, *ai = a->i, *aj = a->j;
1542   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1543   MatScalar      *ba = b->a, *aa = a->a;
1544   int             nonzero       = 0;
1545   PetscBool       pivotinblocks = b->pivotinblocks;
1546   PetscReal       shift         = info->shiftamount;
1547   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
1548 
1549   PetscFunctionBegin;
1550   allowzeropivot = PetscNot(A->erroriffailure);
1551   SSE_SCOPE_BEGIN;
1552 
1553   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1554   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
1555   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1556   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1557   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1558   /*      colscale = 4; */
1559   /*    } */
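  /* if the factor shares its column index array with A, the symbolic factorization presumably
     reused A's nonzero pattern, so hand the job to the in-place kernel */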
1560   if ((unsigned long)bj == (unsigned long)aj) {
    PetscCall(PetscFree(rtmp)); /* not needed by the in-place kernel */
    PetscCall(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
1561 
1562   for (i = 0; i < n; i++) {
1563     nz    = bi[i + 1] - bi[i];
1564     bjtmp = bj + bi[i];
1565     /* zero out the 4x4 block accumulators */
1566     /* zero out one register */
1567     XOR_PS(XMM7, XMM7);
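    /* XMM7 now holds 0.0 in all four lanes; the eight STOREL/STOREH stores below (two floats each)
       zero all 16 entries of each 4x4 block */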
1568     for (j = 0; j < nz; j++) {
1569       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1570       /*        x = rtmp+4*bjtmp[j]; */
1571       SSE_INLINE_BEGIN_1(x)
1572       /* Copy zero register to memory locations */
1573       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1574       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1575       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1576       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1577       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1578       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1579       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1580       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1581       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1582       SSE_INLINE_END_1;
1583     }
1584     /* load in initial (unfactored row) */
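    /* natural ordering: block row i of A maps directly to block row i of the factor, so no
       row/column permutation arrays are needed here */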
1585     nz       = ai[i + 1] - ai[i];
1586     ajtmpold = aj + ai[i];
1587     v        = aa + 16 * ai[i];
1588     for (j = 0; j < nz; j++) {
1589       x = rtmp + 16 * ajtmpold[j];
1590       /*        x = rtmp+colscale*ajtmpold[j]; */
1591       /* Copy v block into x block */
1592       SSE_INLINE_BEGIN_2(v, x)
1593       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1594       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1595       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1596 
1597       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1598       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1599 
1600       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1601       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1602 
1603       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1604       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1605 
1606       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1607       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1608 
1609       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1610       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1611 
1612       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1613       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1614 
1615       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1616       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1617       SSE_INLINE_END_2;
1618 
1619       v += 16;
1620     }
1621     /*      row = (*bjtmp++)/4; */
1622     row = (unsigned int)(*bjtmp++);
1623     while (row < i) {
1624       pc = rtmp + 16 * row;
1625       SSE_INLINE_BEGIN_1(pc)
1626       /* Load block from lower triangle */
1627       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1628       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1629       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1630 
1631       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1632       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1633 
1634       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1635       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1636 
1637       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1638       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1639 
1640       /* Compare block to zero block */
1641 
1642       SSE_COPY_PS(XMM4, XMM7)
1643       SSE_CMPNEQ_PS(XMM4, XMM0)
1644 
1645       SSE_COPY_PS(XMM5, XMM7)
1646       SSE_CMPNEQ_PS(XMM5, XMM1)
1647 
1648       SSE_COPY_PS(XMM6, XMM7)
1649       SSE_CMPNEQ_PS(XMM6, XMM2)
1650 
1651       SSE_CMPNEQ_PS(XMM7, XMM3)
1652 
1653       /* Reduce the comparisons to one SSE register */
1654       SSE_OR_PS(XMM6, XMM7)
1655       SSE_OR_PS(XMM5, XMM4)
1656       SSE_OR_PS(XMM5, XMM6)
1657       SSE_INLINE_END_1;
1658 
1659       /* Reduce the one SSE register to an integer register for branching */
1660       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1661       MOVEMASK(nonzero, XMM5);
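      /* nonzero receives the 4-bit sign mask of XMM5 (one bit per lane); it is set whenever the
         CMPNEQ comparisons above flagged any entry of the 4x4 block */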
1662 
1663       /* If block is nonzero ... */
1664       if (nonzero) {
1665         pv = ba + 16 * diag_offset[row];
1666         PREFETCH_L1(&pv[16]);
1667         pj = bj + diag_offset[row] + 1;
1668 
1669         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1670         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1671         /* but the diagonal was inverted already */
1672         /* and L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
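        /* Illustrative scalar equivalent of the SSE block below: with pc and pv as 16-entry blocks,
           each group pc[4c..4c+3] becomes
             pc[4c+r] = pc0[r]*pv[4c] + pc0[4+r]*pv[4c+1] + pc0[8+r]*pv[4c+2] + pc0[12+r]*pv[4c+3]
           for r = 0..3, where pc0 is the original pc (held in XMM0-XMM3, so no temporary is needed) */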
1673 
1674         SSE_INLINE_BEGIN_2(pv, pc)
1675         /* Column 0, product is accumulated in XMM4 */
1676         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1677         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1678         SSE_MULT_PS(XMM4, XMM0)
1679 
1680         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1681         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1682         SSE_MULT_PS(XMM5, XMM1)
1683         SSE_ADD_PS(XMM4, XMM5)
1684 
1685         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1686         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1687         SSE_MULT_PS(XMM6, XMM2)
1688         SSE_ADD_PS(XMM4, XMM6)
1689 
1690         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1691         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1692         SSE_MULT_PS(XMM7, XMM3)
1693         SSE_ADD_PS(XMM4, XMM7)
1694 
1695         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1696         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1697 
1698         /* Column 1, product is accumulated in XMM5 */
1699         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1700         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1701         SSE_MULT_PS(XMM5, XMM0)
1702 
1703         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1704         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1705         SSE_MULT_PS(XMM6, XMM1)
1706         SSE_ADD_PS(XMM5, XMM6)
1707 
1708         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1709         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1710         SSE_MULT_PS(XMM7, XMM2)
1711         SSE_ADD_PS(XMM5, XMM7)
1712 
1713         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1714         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1715         SSE_MULT_PS(XMM6, XMM3)
1716         SSE_ADD_PS(XMM5, XMM6)
1717 
1718         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1719         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1720 
1721         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1722 
1723         /* Column 2, product is accumulated in XMM6 */
1724         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1725         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1726         SSE_MULT_PS(XMM6, XMM0)
1727 
1728         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1729         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1730         SSE_MULT_PS(XMM7, XMM1)
1731         SSE_ADD_PS(XMM6, XMM7)
1732 
1733         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1734         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1735         SSE_MULT_PS(XMM7, XMM2)
1736         SSE_ADD_PS(XMM6, XMM7)
1737 
1738         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1739         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1740         SSE_MULT_PS(XMM7, XMM3)
1741         SSE_ADD_PS(XMM6, XMM7)
1742 
1743         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1744         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1745 
1746         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1747         /* Column 3, product is accumulated in XMM0 */
1748         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1749         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1750         SSE_MULT_PS(XMM0, XMM7)
1751 
1752         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1753         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1754         SSE_MULT_PS(XMM1, XMM7)
1755         SSE_ADD_PS(XMM0, XMM1)
1756 
1757         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1758         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1759         SSE_MULT_PS(XMM1, XMM2)
1760         SSE_ADD_PS(XMM0, XMM1)
1761 
1762         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1763         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1764         SSE_MULT_PS(XMM3, XMM7)
1765         SSE_ADD_PS(XMM0, XMM3)
1766 
1767         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1768         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1769 
1770         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1771         /* This is code to be maintained and read by humans after all. */
1772         /* Copy Multiplier Col 3 into XMM3 */
1773         SSE_COPY_PS(XMM3, XMM0)
1774         /* Copy Multiplier Col 2 into XMM2 */
1775         SSE_COPY_PS(XMM2, XMM6)
1776         /* Copy Multiplier Col 1 into XMM1 */
1777         SSE_COPY_PS(XMM1, XMM5)
1778         /* Copy Multiplier Col 0 into XMM0 */
1779         SSE_COPY_PS(XMM0, XMM4)
1780         SSE_INLINE_END_2;
1781 
1782         /* Update the row: */
1783         nz = bi[row + 1] - diag_offset[row] - 1;
1784         pv += 16;
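        /* nz is the number of blocks strictly to the right of the diagonal in row `row' of the
           factor; pv now points at the first of them, just past the (inverted) diagonal block */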
1785         for (j = 0; j < nz; j++) {
1786           PREFETCH_L1(&pv[16]);
1787           x = rtmp + 16 * ((unsigned int)pj[j]);
1788           /*            x = rtmp + 4*pj[j]; */
1789 
1790           /* X:=X-M*PV, One column at a time */
1791           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
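          /* Illustrative scalar equivalent of the SSE block below, with m[] the 16-entry multiplier
             block held columnwise in XMM0-XMM3:
               x[4c+r] -= m[r]*pv[4c] + m[4+r]*pv[4c+1] + m[8+r]*pv[4c+2] + m[12+r]*pv[4c+3]
             for r, c = 0..3 */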
1792           SSE_INLINE_BEGIN_2(x, pv)
1793           /* Load First Column of X*/
1794           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1795           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1796 
1797           /* Matrix-Vector Product: */
1798           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1799           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1800           SSE_MULT_PS(XMM5, XMM0)
1801           SSE_SUB_PS(XMM4, XMM5)
1802 
1803           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1804           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1805           SSE_MULT_PS(XMM6, XMM1)
1806           SSE_SUB_PS(XMM4, XMM6)
1807 
1808           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1809           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1810           SSE_MULT_PS(XMM7, XMM2)
1811           SSE_SUB_PS(XMM4, XMM7)
1812 
1813           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1814           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1815           SSE_MULT_PS(XMM5, XMM3)
1816           SSE_SUB_PS(XMM4, XMM5)
1817 
1818           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1819           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1820 
1821           /* Second Column */
1822           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1823           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1824 
1825           /* Matrix-Vector Product: */
1826           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1827           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1828           SSE_MULT_PS(XMM6, XMM0)
1829           SSE_SUB_PS(XMM5, XMM6)
1830 
1831           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1832           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1833           SSE_MULT_PS(XMM7, XMM1)
1834           SSE_SUB_PS(XMM5, XMM7)
1835 
1836           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1837           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1838           SSE_MULT_PS(XMM4, XMM2)
1839           SSE_SUB_PS(XMM5, XMM4)
1840 
1841           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1842           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1843           SSE_MULT_PS(XMM6, XMM3)
1844           SSE_SUB_PS(XMM5, XMM6)
1845 
1846           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1847           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1848 
1849           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1850 
1851           /* Third Column */
1852           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1853           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1854 
1855           /* Matrix-Vector Product: */
1856           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1857           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1858           SSE_MULT_PS(XMM7, XMM0)
1859           SSE_SUB_PS(XMM6, XMM7)
1860 
1861           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1862           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1863           SSE_MULT_PS(XMM4, XMM1)
1864           SSE_SUB_PS(XMM6, XMM4)
1865 
1866           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1867           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1868           SSE_MULT_PS(XMM5, XMM2)
1869           SSE_SUB_PS(XMM6, XMM5)
1870 
1871           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1872           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1873           SSE_MULT_PS(XMM7, XMM3)
1874           SSE_SUB_PS(XMM6, XMM7)
1875 
1876           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1877           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1878 
1879           /* Fourth Column */
1880           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1881           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1882 
1883           /* Matrix-Vector Product: */
1884           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1885           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1886           SSE_MULT_PS(XMM5, XMM0)
1887           SSE_SUB_PS(XMM4, XMM5)
1888 
1889           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1890           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1891           SSE_MULT_PS(XMM6, XMM1)
1892           SSE_SUB_PS(XMM4, XMM6)
1893 
1894           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1895           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1896           SSE_MULT_PS(XMM7, XMM2)
1897           SSE_SUB_PS(XMM4, XMM7)
1898 
1899           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1900           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1901           SSE_MULT_PS(XMM5, XMM3)
1902           SSE_SUB_PS(XMM4, XMM5)
1903 
1904           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1905           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1906           SSE_INLINE_END_2;
1907           pv += 16;
1908         }
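        /* Flop count: 112 flops to form the multiplier block plus 128 per trailing block updated above */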
1909         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1910       }
1911       row = (unsigned int)(*bjtmp++);
1912       /*        row = (*bjtmp++)/4; */
1913       /*        bjtmp++; */
1914     }
1915     /* finished row so stick it into b->a */
1916     pv = ba + 16 * bi[i];
1917     pj = bj + bi[i];
1918     nz = bi[i + 1] - bi[i];
1919 
1920     /* Copy x block back into pv block */
1921     for (j = 0; j < nz; j++) {
1922       x = rtmp + 16 * ((unsigned int)pj[j]);
1923       /*        x  = rtmp+4*pj[j]; */
1924 
1925       SSE_INLINE_BEGIN_2(x, pv)
1926       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1927       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1928       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1929 
1930       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1931       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1932 
1933       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1934       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1935 
1936       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1937       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1938 
1939       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1940       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1941 
1942       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1943       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1944 
1945       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1946       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1947 
1948       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1949       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1950       SSE_INLINE_END_2;
1951       pv += 16;
1952     }
1953     /* invert diagonal block */
1954     w = ba + 16 * diag_offset[i];
1955     if (pivotinblocks) {
1956       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1957       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1958     } else {
1959       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1960     }
1961     /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1962     /* Note: Using Cramer's rule, the flop count below may be unfairly high or low */
1963   }
1964 
1965   PetscCall(PetscFree(rtmp));
1966 
1967   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1968   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1969   C->assembled           = PETSC_TRUE;
1970 
1971   PetscCall(PetscLogFlops(1.333333333333 * 4.0 * 16.0 * b->mbs)); /* bs = 4, bs2 = 16 for this 4x4 kernel */
1972   /* Flop count above is from inverting the diagonal blocks */
1973   SSE_SCOPE_END;
1974   PetscFunctionReturn(PETSC_SUCCESS);
1975 }
1976 
1977 #endif
1978