
/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/* ------------------------------------------------------------*/
/*
      Version for when blocks are 4 by 4
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS              isrow = b->row, isicol = b->icol;
  const PetscInt *r, *ic;
  PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
  PetscInt       *ajtmpold, *ajtmp, nz, row;
  PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
  MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
  MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
  MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
  MatScalar       m13, m14, m15, m16;
  MatScalar      *ba = b->a, *aa = a->a;
  PetscBool       pivotinblocks = b->pivotinblocks;
  PetscReal       shift         = info->shiftamount;
  PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  allowzeropivot = PetscNot(A->erroriffailure);

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    ajtmp = bj + bi[i];
    for (j = 0; j < nz; j++) {
      x    = rtmp + 16 * ajtmp[j];
      x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx + 1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16 * ai[idx];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 16 * ic[ajtmpold[j]];
      x[0]  = v[0];
      x[1]  = v[1];
      x[2]  = v[2];
      x[3]  = v[3];
      x[4]  = v[4];
      x[5]  = v[5];
      x[6]  = v[6];
      x[7]  = v[7];
      x[8]  = v[8];
      x[9]  = v[9];
      x[10] = v[10];
      x[11] = v[11];
      x[12] = v[12];
      x[13] = v[13];
      x[14] = v[14];
      x[15] = v[15];
      v += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16 * row;
      p1  = pc[0];
      p2  = pc[1];
      p3  = pc[2];
      p4  = pc[3];
      p5  = pc[4];
      p6  = pc[5];
      p7  = pc[6];
      p8  = pc[7];
      p9  = pc[8];
      p10 = pc[9];
      p11 = pc[10];
      p12 = pc[11];
      p13 = pc[12];
      p14 = pc[13];
      p15 = pc[14];
      p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
        pv    = ba + 16 * diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];
        x2    = pv[1];
        x3    = pv[2];
        x4    = pv[3];
        x5    = pv[4];
        x6    = pv[5];
        x7    = pv[6];
        x8    = pv[7];
        x9    = pv[8];
        x10   = pv[9];
        x11   = pv[10];
        x12   = pv[11];
        x13   = pv[12];
        x14   = pv[13];
        x15   = pv[14];
        x16   = pv[15];
        pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
        pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
        pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
        pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;

        pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
        pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
        pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
        pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;

        pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
        pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
        pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
        pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;

        pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
        pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
        pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
        pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;

        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          x1  = pv[0];
          x2  = pv[1];
          x3  = pv[2];
          x4  = pv[3];
          x5  = pv[4];
          x6  = pv[5];
          x7  = pv[6];
          x8  = pv[7];
          x9  = pv[8];
          x10 = pv[9];
          x11 = pv[10];
          x12 = pv[11];
          x13 = pv[12];
          x14 = pv[13];
          x15 = pv[14];
          x16 = pv[15];
          x   = rtmp + 16 * pj[j];
          x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
          x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
          x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
          x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;

          x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
          x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
          x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
          x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;

          x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
          x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
          x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
          x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;

          x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
          x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
          x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
          x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;

          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) {
      x      = rtmp + 16 * pj[j];
      pv[0]  = x[0];
      pv[1]  = x[1];
      pv[2]  = x[2];
      pv[3]  = x[3];
      pv[4]  = x[4];
      pv[5]  = x[5];
      pv[6]  = x[6];
      pv[7]  = x[7];
      pv[8]  = x[8];
      pv[9]  = x[9];
      pv[10] = x[10];
      pv[11] = x[11];
      pv[12] = x[12];
      pv[13] = x[13];
      pv[14] = x[14];
      pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
  }

  PetscCall(PetscFree(rtmp));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       PetscKernel_A_gets_A_times_B()
       PetscKernel_A_gets_A_minus_B_times_C()
       PetscKernel_A_gets_inverse_A()
*/
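
/*
   For exposition only (a minimal sketch, not the PETSc kernels themselves):
   on the column-major 4x4 blocks used here, the kernels named above compute

     PetscKernel_A_gets_A_times_B_4(A,B,W):         A <- A*B   (W is scratch)
     PetscKernel_A_gets_A_minus_B_times_C_4(A,B,C): A <- A - B*C
*/
static inline void Sketch_A_gets_A_times_B_4(MatScalar *A, const MatScalar *B, MatScalar *W)
{
  PetscInt i, j, k;
  for (i = 0; i < 16; i++) W[i] = A[i]; /* save A; the product overwrites it */
  for (j = 0; j < 4; j++) {             /* column j of the product */
    for (i = 0; i < 4; i++) {
      A[4 * j + i] = 0.0;
      for (k = 0; k < 4; k++) A[4 * j + i] += W[4 * k + i] * B[4 * j + k]; /* (A*B)(i,j) */
    }
  }
}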

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat             C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS              isrow = b->row, isicol = b->icol;
  const PetscInt *r, *ic;
  PetscInt        i, j, k, nz, nzL, row;
  const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
  const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
  MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
  PetscInt        flg;
  PetscReal       shift;
  PetscBool       allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }
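  /*
     For illustration (hypothetical caller, not part of this file): the shift
     consumed here is normally configured through MatFactorInfo before the
     factorization is requested, e.g.

       MatFactorInfo info;
       PetscCall(MatFactorInfoInitialize(&info));
       info.shifttype   = (PetscReal)MAT_SHIFT_INBLOCKS;
       info.shiftamount = 1.e-12;

     or on the command line via -pc_factor_shift_type inblocks
     -pc_factor_shift_amount 1.e-12.
  */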

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
  PetscCall(PetscArrayzero(rtmp, bs2 * n));

  for (i = 0; i < n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* U part */
    nz    = bdiag[i] - bdiag[i + 1];
    bjtmp = bj + bdiag[i + 1] + 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* load in initial (unfactored row) */
    nz    = ai[r[i] + 1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2 * ai[r[i]];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i + 1] - bi[i];
    for (k = 0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2 * row;
      for (flg = 0, j = 0; j < bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2 * bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));

        pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
        pv = b->a + bs2 * (bdiag[row + 1] + 1);
        nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j = 0; j < nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2 * pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
          pv += bs2;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 for bs = 4 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2 * bi[i];
    pj = b->j + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

    /* Mark diagonal and invert diagonal for simpler triangular solves */
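    /* With the diagonal stored inverted, the triangular solves in
       MatSolve_SeqBAIJ_4 reduce each diagonal-block solve to a 4x4
       matrix-vector multiply instead of a small triangular back-solve. */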
    pv = b->a + bs2 * bdiag[i];
    pj = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2 * (bdiag[i + 1] + 1);
    pj = b->j + bdiag[i + 1] + 1;
    nz = bdiag[i] - bdiag[i + 1] - 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
  }

  PetscCall(PetscFree2(rtmp, mwork));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    Default version for when blocks are 4 by 4 using natural ordering
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
  PetscInt    *ajtmpold, *ajtmp, nz, row;
  PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
  MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
  MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
  MatScalar    p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
  MatScalar    m13, m14, m15, m16;
  MatScalar   *ba = b->a, *aa = a->a;
  PetscBool    pivotinblocks = b->pivotinblocks;
  PetscReal    shift         = info->shiftamount;
  PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    ajtmp = bj + bi[i];
    for (j = 0; j < nz; j++) {
      x    = rtmp + 16 * ajtmp[j];
      x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i + 1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 16 * ajtmpold[j];
      x[0]  = v[0];
      x[1]  = v[1];
      x[2]  = v[2];
      x[3]  = v[3];
      x[4]  = v[4];
      x[5]  = v[5];
      x[6]  = v[6];
      x[7]  = v[7];
      x[8]  = v[8];
      x[9]  = v[9];
      x[10] = v[10];
      x[11] = v[11];
      x[12] = v[12];
      x[13] = v[13];
      x[14] = v[14];
      x[15] = v[15];
      v += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16 * row;
      p1  = pc[0];
      p2  = pc[1];
      p3  = pc[2];
      p4  = pc[3];
      p5  = pc[4];
      p6  = pc[5];
      p7  = pc[6];
      p8  = pc[7];
      p9  = pc[8];
      p10 = pc[9];
      p11 = pc[10];
      p12 = pc[11];
      p13 = pc[12];
      p14 = pc[13];
      p15 = pc[14];
      p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
        pv    = ba + 16 * diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];
        x2    = pv[1];
        x3    = pv[2];
        x4    = pv[3];
        x5    = pv[4];
        x6    = pv[5];
        x7    = pv[6];
        x8    = pv[7];
        x9    = pv[8];
        x10   = pv[9];
        x11   = pv[10];
        x12   = pv[11];
        x13   = pv[12];
        x14   = pv[13];
        x15   = pv[14];
        x16   = pv[15];
        pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
        pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
        pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
        pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;

        pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
        pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
        pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
        pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;

        pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
        pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
        pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
        pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;

        pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
        pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
        pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
        pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
        nz           = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          x1  = pv[0];
          x2  = pv[1];
          x3  = pv[2];
          x4  = pv[3];
          x5  = pv[4];
          x6  = pv[5];
          x7  = pv[6];
          x8  = pv[7];
          x9  = pv[8];
          x10 = pv[9];
          x11 = pv[10];
          x12 = pv[11];
          x13 = pv[12];
          x14 = pv[13];
          x15 = pv[14];
          x16 = pv[15];
          x   = rtmp + 16 * pj[j];
          x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
          x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
          x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
          x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;

          x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
          x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
          x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
          x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;

          x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
          x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
          x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
          x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;

          x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
          x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
          x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
          x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;

          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) {
      x      = rtmp + 16 * pj[j];
      pv[0]  = x[0];
      pv[1]  = x[1];
      pv[2]  = x[2];
      pv[3]  = x[3];
      pv[4]  = x[4];
      pv[5]  = x[5];
      pv[6]  = x[6];
      pv[7]  = x[7];
      pv[8]  = x[8];
      pv[9]  = x[9];
      pv[10] = x[10];
      pv[11] = x[11];
      pv[12] = x[12];
      pv[13] = x[13];
      pv[14] = x[14];
      pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat             C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  PetscInt        i, j, k, nz, nzL, row;
  const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
  const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
  MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
  PetscInt        flg;
  PetscReal       shift;
  PetscBool       allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
  PetscCall(PetscArrayzero(rtmp, bs2 * n));

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i = 0; i < n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* U part */
    nz    = bdiag[i] - bdiag[i + 1];
    bjtmp = bj + bdiag[i + 1] + 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* load in initial (unfactored row) */
    nz    = ai[i + 1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2 * ai[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i + 1] - bi[i];
    for (k = 0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2 * row;
      for (flg = 0, j = 0; j < bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2 * bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));

        pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
        pv = b->a + bs2 * (bdiag[row + 1] + 1);
        nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j = 0; j < nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2 * pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
          pv += bs2;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 for bs = 4 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2 * bi[i];
    pj = b->j + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2 * bdiag[i];
    pj = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2 * (bdiag[i + 1] + 1);
    pj = b->j + bdiag[i + 1] + 1;
    nz = bdiag[i] - bdiag[i + 1] - 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
  }
  PetscCall(PetscFree2(rtmp, mwork));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_SSE)

  #include PETSC_HAVE_SSE

/* SSE version for when blocks are 4 by 4 using natural ordering */
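/*
   Data-layout assumptions behind the SSE kernels below: this path expects a
   single-precision real MatScalar, so each 4x4 block is 16 consecutive floats
   stored column-major, and one block column fills an XMM register through a
   LOADL_PS/LOADH_PS (or STOREL/STOREH) pair of 64-bit halves. The aa, ba, and
   rtmp arrays must be 16-byte aligned, which the PetscChecks below enforce.
*/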
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat          C = B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int          i, j, n = a->mbs;
  int         *bj = b->j, *bjtmp, *pj;
  int          row;
  int         *ajtmpold, nz, *bi = b->i;
  int         *diag_offset = b->diag, *ai = a->i, *aj = a->j;
  MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar   *ba = b->a, *aa = a->a;
  int          nonzero       = 0;
  PetscBool    pivotinblocks = b->pivotinblocks;
  PetscReal    shift         = info->shiftamount;
  PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
  PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
  /*    if ((unsigned long)bj==(unsigned long)aj) { */
  /*      colscale = 4; */
  /*    } */
  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7, XMM7);
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * bjtmp[j];
      /*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i + 1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ajtmpold[j];
      /*        x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v, x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /*      row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16 * row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4, XMM7)
      SSE_CMPNEQ_PS(XMM4, XMM0)

      SSE_COPY_PS(XMM5, XMM7)
      SSE_CMPNEQ_PS(XMM5, XMM1)

      SSE_COPY_PS(XMM6, XMM7)
      SSE_CMPNEQ_PS(XMM6, XMM2)

      SSE_CMPNEQ_PS(XMM7, XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6, XMM7)
      SSE_OR_PS(XMM5, XMM4)
      SSE_OR_PS(XMM5, XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero, XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16 * diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
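        /* Concretely (column-major blocks): column c of the multiplier is
           M(:,c) = sum_k pv[4*c+k] * L(:,k), so each scalar pv[4*c+k] is
           broadcast (LOAD_SS + SHUFFLE) and multiplied against a block column
           held in XMM0-XMM3, accumulating the four partial products. */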

        SSE_INLINE_BEGIN_2(pv, pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
        SSE_SHUFFLE(XMM4, XMM4, 0x00)
        SSE_MULT_PS(XMM4, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM1)
        SSE_ADD_PS(XMM4, XMM5)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM2)
        SSE_ADD_PS(XMM4, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM4, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM1)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM5, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM3)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM1)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM0, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM1, XMM7)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
        SSE_SHUFFLE(XMM1, XMM1, 0x00)
        SSE_MULT_PS(XMM1, XMM2)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM3, XMM7)
        SSE_ADD_PS(XMM0, XMM3)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3, XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2, XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1, XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0, XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16 * pj[j];
          /*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x, pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM0)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM1)
          SSE_SUB_PS(XMM5, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM2)
          SSE_SUB_PS(XMM5, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM3)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM0)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM1)
          SSE_SUB_PS(XMM6, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM2)
          SSE_SUB_PS(XMM6, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM3)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = *bjtmp++;
      /*        row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];

    /* Copy x block back into pv block */
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * pj[j];
      /*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x, pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
    /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C, const MatFactorInfo *info)
{
  Mat             A = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int             i, j, n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
  unsigned short *aj = (unsigned short *)(a->j), *ajtmp;
  unsigned int    row;
  int             nz, *bi = b->i;
  int            *diag_offset = b->diag, *ai = a->i;
  MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar      *ba = b->a, *aa = a->a;
  int             nonzero       = 0;
  PetscBool       pivotinblocks = b->pivotinblocks;
  PetscReal       shift         = info->shiftamount;
  PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
  PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
  /*    if ((unsigned long)bj==(unsigned long)aj) { */
  /*      colscale = 4; */
  /*    } */

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7, XMM7);
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)bjtmp[j]);
      /*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i + 1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)ajtmp[j]);
      /*        x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v, x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /*      row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16 * row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4, XMM7)
      SSE_CMPNEQ_PS(XMM4, XMM0)

      SSE_COPY_PS(XMM5, XMM7)
      SSE_CMPNEQ_PS(XMM5, XMM1)

      SSE_COPY_PS(XMM6, XMM7)
      SSE_CMPNEQ_PS(XMM6, XMM2)

      SSE_CMPNEQ_PS(XMM7, XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6, XMM7)
      SSE_OR_PS(XMM5, XMM4)
      SSE_OR_PS(XMM5, XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero, XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16 * diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv, pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
        SSE_SHUFFLE(XMM4, XMM4, 0x00)
        SSE_MULT_PS(XMM4, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM1)
        SSE_ADD_PS(XMM4, XMM5)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM2)
        SSE_ADD_PS(XMM4, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM4, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM1)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM5, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM3)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM1)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM0, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM1, XMM7)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
        SSE_SHUFFLE(XMM1, XMM1, 0x00)
        SSE_MULT_PS(XMM1, XMM2)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM3, XMM7)
        SSE_ADD_PS(XMM0, XMM3)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3, XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2, XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1, XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0, XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16 * ((unsigned int)pj[j]);
          /*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x, pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM0)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM1)
          SSE_SUB_PS(XMM5, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM2)
          SSE_SUB_PS(XMM5, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM3)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM0)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM1)
          SSE_SUB_PS(XMM6, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM2)
          SSE_SUB_PS(XMM6, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM3)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = (unsigned int)(*bjtmp++);
      /*        row = (*bjtmp++)/4; */
      /*        bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];

    /* Copy x block back into pv block */
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)pj[j]);
      /*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x, pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
1520     /* Note: the 4x4 block inverses use Cramer's rule, so the flop count logged below may be somewhat high or low */
1521   }
1522 
1523   PetscCall(PetscFree(rtmp));
1524 
1525   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1526   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1527   C->assembled           = PETSC_TRUE;
1528 
1529   /* Flop count from inverting the diagonal blocks: roughly (4/3)*bs^3 flops per block */
1530   PetscCall(PetscLogFlops(1.333333333333 * 4 * 16 * b->mbs)); /* bs = 4 and bs2 = 16 for this 4x4 kernel */
1531   SSE_SCOPE_END;
1532   PetscFunctionReturn(PETSC_SUCCESS);
1533 }
1534 
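/*
   Numeric LU factorization for 4x4 blocks with natural ordering, using SSE. The "usj"
   suffix refers to the unsigned short column indices: b->j is reinterpreted as
   unsigned short below, presumably to halve the memory traffic for index data.
   Each 4x4 block of MatScalars spans four 128-bit SSE registers.
*/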
1535 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
1536 {
1537   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1538   int             i, j, n = a->mbs;
1539   unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1540   unsigned int    row;
1541   int            *ajtmpold, nz, *bi = b->i;
1542   int            *diag_offset = b->diag, *ai = a->i, *aj = a->j;
1543   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1544   MatScalar      *ba = b->a, *aa = a->a;
1545   int             nonzero       = 0;
1546   PetscBool       pivotinblocks = b->pivotinblocks;
1547   PetscReal       shift         = info->shiftamount;
1548   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
1549 
1550   PetscFunctionBegin;
1551   allowzeropivot = PetscNot(A->erroriffailure);
1552   SSE_SCOPE_BEGIN;
1553 
1554   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1555   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
1556   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1557   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1558   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1559   /*      colscale = 4; */
1560   /*    } */
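  /* If b shares its column index set with A, the factorization is in place; delegate to the in-place kernel */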
1561   if ((unsigned long)bj == (unsigned long)aj) {
    PetscCall(PetscFree(rtmp)); /* rtmp is not needed by the in-place kernel; avoid leaking it */
    PetscCall(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
1562 
1563   for (i = 0; i < n; i++) {
1564     nz    = bi[i + 1] - bi[i];
1565     bjtmp = bj + bi[i];
1566     /* zero out the 4x4 block accumulators */
1567     /* zero out one register */
1568     XOR_PS(XMM7, XMM7);
1569     for (j = 0; j < nz; j++) {
1570       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1571       /*        x = rtmp+4*bjtmp[j]; */
1572       SSE_INLINE_BEGIN_1(x)
1573       /* Copy zero register to memory locations */
1574       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1575       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1576       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1577       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1578       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1579       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1580       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1581       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1582       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1583       SSE_INLINE_END_1;
1584     }
1585     /* load in initial (unfactored row) */
1586     nz       = ai[i + 1] - ai[i];
1587     ajtmpold = aj + ai[i];
1588     v        = aa + 16 * ai[i];
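    /* Natural ordering: no row/column permutation is applied, so block row i of A is copied directly */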
1589     for (j = 0; j < nz; j++) {
1590       x = rtmp + 16 * ajtmpold[j];
1591       /*        x = rtmp+colscale*ajtmpold[j]; */
1592       /* Copy v block into x block */
1593       SSE_INLINE_BEGIN_2(v, x)
1594       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1595       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1596       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1597 
1598       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1599       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1600 
1601       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1602       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1603 
1604       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1605       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1606 
1607       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1608       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1609 
1610       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1611       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1612 
1613       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1614       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1615 
1616       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1617       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1618       SSE_INLINE_END_2;
1619 
1620       v += 16;
1621     }
1622     /*      row = (*bjtmp++)/4; */
1623     row = (unsigned int)(*bjtmp++);
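    /* Eliminate using each previously factored block row that appears in row i: form the
       multiplier from the already-inverted diagonal block, then update the trailing blocks of row i */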
1624     while (row < i) {
1625       pc = rtmp + 16 * row;
1626       SSE_INLINE_BEGIN_1(pc)
1627       /* Load block from lower triangle */
1628       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1629       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1630       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1631 
1632       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1633       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1634 
1635       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1636       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1637 
1638       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1639       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1640 
1641       /* Compare block to zero block */
1642 
1643       SSE_COPY_PS(XMM4, XMM7)
1644       SSE_CMPNEQ_PS(XMM4, XMM0)
1645 
1646       SSE_COPY_PS(XMM5, XMM7)
1647       SSE_CMPNEQ_PS(XMM5, XMM1)
1648 
1649       SSE_COPY_PS(XMM6, XMM7)
1650       SSE_CMPNEQ_PS(XMM6, XMM2)
1651 
1652       SSE_CMPNEQ_PS(XMM7, XMM3)
1653 
1654       /* Reduce the comparisons to one SSE register */
1655       SSE_OR_PS(XMM6, XMM7)
1656       SSE_OR_PS(XMM5, XMM4)
1657       SSE_OR_PS(XMM5, XMM6)
1658       SSE_INLINE_END_1;
1659 
1660       /* Reduce the one SSE register to an integer register for branching */
1661       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1662       MOVEMASK(nonzero, XMM5);
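      /* The mask is nonzero iff any of the 16 entries of the block differs from zero */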
1663 
1664       /* If block is nonzero ... */
1665       if (nonzero) {
1666         pv = ba + 16 * diag_offset[row];
1667         PREFETCH_L1(&pv[16]);
1668         pj = bj + diag_offset[row] + 1;
1669 
1670         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1671         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1672         /* but the diagonal was inverted already */
1673         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1674 
1675         SSE_INLINE_BEGIN_2(pv, pc)
1676         /* Column 0, product is accumulated in XMM4 */
1677         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1678         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1679         SSE_MULT_PS(XMM4, XMM0)
1680 
1681         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1682         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1683         SSE_MULT_PS(XMM5, XMM1)
1684         SSE_ADD_PS(XMM4, XMM5)
1685 
1686         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1687         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1688         SSE_MULT_PS(XMM6, XMM2)
1689         SSE_ADD_PS(XMM4, XMM6)
1690 
1691         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1692         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1693         SSE_MULT_PS(XMM7, XMM3)
1694         SSE_ADD_PS(XMM4, XMM7)
1695 
1696         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1697         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1698 
1699         /* Column 1, product is accumulated in XMM5 */
1700         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1701         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1702         SSE_MULT_PS(XMM5, XMM0)
1703 
1704         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1705         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1706         SSE_MULT_PS(XMM6, XMM1)
1707         SSE_ADD_PS(XMM5, XMM6)
1708 
1709         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1710         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1711         SSE_MULT_PS(XMM7, XMM2)
1712         SSE_ADD_PS(XMM5, XMM7)
1713 
1714         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1715         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1716         SSE_MULT_PS(XMM6, XMM3)
1717         SSE_ADD_PS(XMM5, XMM6)
1718 
1719         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1720         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1721 
1722         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1723 
1724         /* Column 2, product is accumulated in XMM6 */
1725         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1726         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1727         SSE_MULT_PS(XMM6, XMM0)
1728 
1729         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1730         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1731         SSE_MULT_PS(XMM7, XMM1)
1732         SSE_ADD_PS(XMM6, XMM7)
1733 
1734         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1735         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1736         SSE_MULT_PS(XMM7, XMM2)
1737         SSE_ADD_PS(XMM6, XMM7)
1738 
1739         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1740         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1741         SSE_MULT_PS(XMM7, XMM3)
1742         SSE_ADD_PS(XMM6, XMM7)
1743 
1744         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1745         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1746 
1747         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1748         /* Column 3, product is accumulated in XMM0 */
1749         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1750         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1751         SSE_MULT_PS(XMM0, XMM7)
1752 
1753         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1754         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1755         SSE_MULT_PS(XMM1, XMM7)
1756         SSE_ADD_PS(XMM0, XMM1)
1757 
1758         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1759         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1760         SSE_MULT_PS(XMM1, XMM2)
1761         SSE_ADD_PS(XMM0, XMM1)
1762 
1763         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1764         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1765         SSE_MULT_PS(XMM3, XMM7)
1766         SSE_ADD_PS(XMM0, XMM3)
1767 
1768         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1769         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1770 
1771         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1772         /* This is code to be maintained and read by humans after all. */
1773         /* Copy Multiplier Col 3 into XMM3 */
1774         SSE_COPY_PS(XMM3, XMM0)
1775         /* Copy Multiplier Col 2 into XMM2 */
1776         SSE_COPY_PS(XMM2, XMM6)
1777         /* Copy Multiplier Col 1 into XMM1 */
1778         SSE_COPY_PS(XMM1, XMM5)
1779         /* Copy Multiplier Col 0 into XMM0 */
1780         SSE_COPY_PS(XMM0, XMM4)
1781         SSE_INLINE_END_2;
1782 
1783         /* Update the row: */
1784         nz = bi[row + 1] - diag_offset[row] - 1;
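        /* nz = number of blocks to the right of the diagonal block in factored row 'row' (the U part) */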
1785         pv += 16;
1786         for (j = 0; j < nz; j++) {
1787           PREFETCH_L1(&pv[16]);
1788           x = rtmp + 16 * ((unsigned int)pj[j]);
1789           /*            x = rtmp + 4*pj[j]; */
1790 
1791           /* X:=X-M*PV, One column at a time */
1792           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1793           SSE_INLINE_BEGIN_2(x, pv)
1794           /* Load First Column of X*/
1795           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1796           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1797 
1798           /* Matrix-Vector Product: */
1799           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1800           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1801           SSE_MULT_PS(XMM5, XMM0)
1802           SSE_SUB_PS(XMM4, XMM5)
1803 
1804           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1805           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1806           SSE_MULT_PS(XMM6, XMM1)
1807           SSE_SUB_PS(XMM4, XMM6)
1808 
1809           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1810           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1811           SSE_MULT_PS(XMM7, XMM2)
1812           SSE_SUB_PS(XMM4, XMM7)
1813 
1814           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1815           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1816           SSE_MULT_PS(XMM5, XMM3)
1817           SSE_SUB_PS(XMM4, XMM5)
1818 
1819           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1820           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1821 
1822           /* Second Column */
1823           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1824           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1825 
1826           /* Matrix-Vector Product: */
1827           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1828           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1829           SSE_MULT_PS(XMM6, XMM0)
1830           SSE_SUB_PS(XMM5, XMM6)
1831 
1832           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1833           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1834           SSE_MULT_PS(XMM7, XMM1)
1835           SSE_SUB_PS(XMM5, XMM7)
1836 
1837           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1838           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1839           SSE_MULT_PS(XMM4, XMM2)
1840           SSE_SUB_PS(XMM5, XMM4)
1841 
1842           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1843           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1844           SSE_MULT_PS(XMM6, XMM3)
1845           SSE_SUB_PS(XMM5, XMM6)
1846 
1847           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1848           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1849 
1850           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1851 
1852           /* Third Column */
1853           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1854           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1855 
1856           /* Matrix-Vector Product: */
1857           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1858           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1859           SSE_MULT_PS(XMM7, XMM0)
1860           SSE_SUB_PS(XMM6, XMM7)
1861 
1862           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1863           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1864           SSE_MULT_PS(XMM4, XMM1)
1865           SSE_SUB_PS(XMM6, XMM4)
1866 
1867           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1868           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1869           SSE_MULT_PS(XMM5, XMM2)
1870           SSE_SUB_PS(XMM6, XMM5)
1871 
1872           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1873           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1874           SSE_MULT_PS(XMM7, XMM3)
1875           SSE_SUB_PS(XMM6, XMM7)
1876 
1877           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1878           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1879 
1880           /* Fourth Column */
1881           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1882           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1883 
1884           /* Matrix-Vector Product: */
1885           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1886           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1887           SSE_MULT_PS(XMM5, XMM0)
1888           SSE_SUB_PS(XMM4, XMM5)
1889 
1890           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1891           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1892           SSE_MULT_PS(XMM6, XMM1)
1893           SSE_SUB_PS(XMM4, XMM6)
1894 
1895           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1896           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1897           SSE_MULT_PS(XMM7, XMM2)
1898           SSE_SUB_PS(XMM4, XMM7)
1899 
1900           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1901           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1902           SSE_MULT_PS(XMM5, XMM3)
1903           SSE_SUB_PS(XMM4, XMM5)
1904 
1905           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1906           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1907           SSE_INLINE_END_2;
1908           pv += 16;
1909         }
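        /* Same flop count as the non-usj kernel above: 112 flops for the multiplier plus 128 per updated block */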
1910         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1911       }
1912       row = (unsigned int)(*bjtmp++);
1913       /*        row = (*bjtmp++)/4; */
1914       /*        bjtmp++; */
1915     }
1916     /* finished row so stick it into b->a */
1917     pv = ba + 16 * bi[i];
1918     pj = bj + bi[i];
1919     nz = bi[i + 1] - bi[i];
1920 
1921     /* Copy x block back into pv block */
1922     for (j = 0; j < nz; j++) {
1923       x = rtmp + 16 * ((unsigned int)pj[j]);
1924       /*        x  = rtmp+4*pj[j]; */
1925 
1926       SSE_INLINE_BEGIN_2(x, pv)
1927       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1928       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1929       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1930 
1931       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1932       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1933 
1934       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1935       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1936 
1937       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1938       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1939 
1940       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1941       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1942 
1943       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1944       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1945 
1946       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1947       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1948 
1949       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1950       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1951       SSE_INLINE_END_2;
1952       pv += 16;
1953     }
1954     /* invert diagonal block */
1955     w = ba + 16 * diag_offset[i];
1956     if (pivotinblocks) {
1957       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1958       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1959     } else {
1960       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1961     }
1962     /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1963     /* Note: the 4x4 block inverses use Cramer's rule, so the flop count logged below may be somewhat high or low */
1964   }
1965 
1966   PetscCall(PetscFree(rtmp));
1967 
1968   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1969   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1970   C->assembled           = PETSC_TRUE;
1971 
1972   /* Flop count from inverting the diagonal blocks: roughly (4/3)*bs^3 flops per block */
1973   PetscCall(PetscLogFlops(1.333333333333 * 4 * 16 * b->mbs)); /* bs = 4 and bs2 = 16 for this 4x4 kernel */
1974   SSE_SCOPE_END;
1975   PetscFunctionReturn(PETSC_SUCCESS);
1976 }
1977 
1978 #endif
1979