xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision 503c0ea9b45bcfbcebbb1ea5341243bbc69f0bea)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
7 
8 /* ------------------------------------------------------------*/
9 /*
10       Version for when blocks are 4 by 4
11 */
12 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
13 {
14   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
15   IS             isrow = b->row,isicol = b->icol;
16   const PetscInt *r,*ic;
17   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
18   PetscInt       *ajtmpold,*ajtmp,nz,row;
19   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
20   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
21   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
22   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
23   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
24   MatScalar      m13,m14,m15,m16;
25   MatScalar      *ba           = b->a,*aa = a->a;
26   PetscBool      pivotinblocks = b->pivotinblocks;
27   PetscReal      shift         = info->shiftamount;
28   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;
29 
30   PetscFunctionBegin;
31   PetscCall(ISGetIndices(isrow,&r));
32   PetscCall(ISGetIndices(isicol,&ic));
33   PetscCall(PetscMalloc1(16*(n+1),&rtmp));
34   allowzeropivot = PetscNot(A->erroriffailure);
35 
36   for (i=0; i<n; i++) {
37     nz    = bi[i+1] - bi[i];
38     ajtmp = bj + bi[i];
39     for  (j=0; j<nz; j++) {
40       x     = rtmp+16*ajtmp[j];
41       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
42       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
43     }
44     /* load in initial (unfactored row) */
45     idx      = r[i];
46     nz       = ai[idx+1] - ai[idx];
47     ajtmpold = aj + ai[idx];
48     v        = aa + 16*ai[idx];
49     for (j=0; j<nz; j++) {
50       x     = rtmp+16*ic[ajtmpold[j]];
51       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
52       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
53       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
54       x[14] = v[14]; x[15] = v[15];
55       v    += 16;
56     }
57     row = *ajtmp++;
58     while (row < i) {
59       pc  = rtmp + 16*row;
60       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
61       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
62       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
63       p15 = pc[14]; p16 = pc[15];
64       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
65           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
66           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
67           || p16 != 0.0) {
68         pv    = ba + 16*diag_offset[row];
69         pj    = bj + diag_offset[row] + 1;
70         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
71         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
72         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
73         x15   = pv[14]; x16 = pv[15];
74         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
75         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
76         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
77         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
78 
79         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
80         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
81         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
82         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
83 
84         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
85         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
86         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
87         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
88 
89         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
90         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
91         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
92         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
93 
94         nz  = bi[row+1] - diag_offset[row] - 1;
95         pv += 16;
96         for (j=0; j<nz; j++) {
97           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
98           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
99           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
100           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
101           x     = rtmp + 16*pj[j];
102           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
103           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
104           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
105           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
106 
107           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
108           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
109           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
110           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
111 
112           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
113           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
114           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
115           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
116 
117           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
118           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
119           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
120           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
121 
122           pv += 16;
123         }
124         PetscCall(PetscLogFlops(128.0*nz+112.0));
125       }
126       row = *ajtmp++;
127     }
128     /* finished row so stick it into b->a */
129     pv = ba + 16*bi[i];
130     pj = bj + bi[i];
131     nz = bi[i+1] - bi[i];
132     for (j=0; j<nz; j++) {
133       x      = rtmp+16*pj[j];
134       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
135       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
136       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
137       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
138       pv    += 16;
139     }
140     /* invert diagonal block */
141     w = ba + 16*diag_offset[i];
142     if (pivotinblocks) {
143       PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected));
144       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
145     } else {
146       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
147     }
148   }
149 
150   PetscCall(PetscFree(rtmp));
151   PetscCall(ISRestoreIndices(isicol,&ic));
152   PetscCall(ISRestoreIndices(isrow,&r));
153 
154   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
155   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
156   C->assembled           = PETSC_TRUE;
157 
158   PetscCall(PetscLogFlops(1.333333333333*4*4*4*b->mbs)); /* from inverting diagonal blocks */
159   PetscFunctionReturn(0);
160 }
161 
162 /* MatLUFactorNumeric_SeqBAIJ_4 -
163      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
164        PetscKernel_A_gets_A_times_B()
165        PetscKernel_A_gets_A_minus_B_times_C()
166        PetscKernel_A_gets_inverse_A()
167 */
168 
169 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
170 {
171   Mat            C     = B;
172   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
173   IS             isrow = b->row,isicol = b->icol;
174   const PetscInt *r,*ic;
175   PetscInt       i,j,k,nz,nzL,row;
176   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
177   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
178   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
179   PetscInt       flg;
180   PetscReal      shift;
181   PetscBool      allowzeropivot,zeropivotdetected;
182 
183   PetscFunctionBegin;
184   allowzeropivot = PetscNot(A->erroriffailure);
185   PetscCall(ISGetIndices(isrow,&r));
186   PetscCall(ISGetIndices(isicol,&ic));
187 
188   if (info->shifttype == MAT_SHIFT_NONE) {
189     shift = 0;
190   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
191     shift = info->shiftamount;
192   }
193 
194   /* generate work space needed by the factorization */
195   PetscCall(PetscMalloc2(bs2*n,&rtmp,bs2,&mwork));
196   PetscCall(PetscArrayzero(rtmp,bs2*n));
197 
198   for (i=0; i<n; i++) {
199     /* zero rtmp */
200     /* L part */
201     nz    = bi[i+1] - bi[i];
202     bjtmp = bj + bi[i];
203     for  (j=0; j<nz; j++) {
204       PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2));
205     }
206 
207     /* U part */
208     nz    = bdiag[i]-bdiag[i+1];
209     bjtmp = bj + bdiag[i+1]+1;
210     for  (j=0; j<nz; j++) {
211       PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2));
212     }
213 
214     /* load in initial (unfactored row) */
215     nz    = ai[r[i]+1] - ai[r[i]];
216     ajtmp = aj + ai[r[i]];
217     v     = aa + bs2*ai[r[i]];
218     for (j=0; j<nz; j++) {
219       PetscCall(PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2));
220     }
221 
222     /* elimination */
223     bjtmp = bj + bi[i];
224     nzL   = bi[i+1] - bi[i];
225     for (k=0; k < nzL; k++) {
226       row = bjtmp[k];
227       pc  = rtmp + bs2*row;
228       for (flg=0,j=0; j<bs2; j++) {
229         if (pc[j]!=0.0) {
230           flg = 1;
231           break;
232         }
233       }
234       if (flg) {
235         pv = b->a + bs2*bdiag[row];
236         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
237         PetscCall(PetscKernel_A_gets_A_times_B_4(pc,pv,mwork));
238 
239         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
240         pv = b->a + bs2*(bdiag[row+1]+1);
241         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
242         for (j=0; j<nz; j++) {
243           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
244           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
245           v    = rtmp + bs2*pj[j];
246           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv));
247           pv  += bs2;
248         }
249         PetscCall(PetscLogFlops(128.0*nz+112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
250       }
251     }
252 
253     /* finished row so stick it into b->a */
254     /* L part */
255     pv = b->a + bs2*bi[i];
256     pj = b->j + bi[i];
257     nz = bi[i+1] - bi[i];
258     for (j=0; j<nz; j++) {
259       PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2));
260     }
261 
262     /* Mark diagonal and invert diagonal for simpler triangular solves */
263     pv   = b->a + bs2*bdiag[i];
264     pj   = b->j + bdiag[i];
265     PetscCall(PetscArraycpy(pv,rtmp+bs2*pj[0],bs2));
266     PetscCall(PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected));
267     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
268 
269     /* U part */
270     pv = b->a + bs2*(bdiag[i+1]+1);
271     pj = b->j + bdiag[i+1]+1;
272     nz = bdiag[i] - bdiag[i+1] - 1;
273     for (j=0; j<nz; j++) {
274       PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2));
275     }
276   }
277 
278   PetscCall(PetscFree2(rtmp,mwork));
279   PetscCall(ISRestoreIndices(isicol,&ic));
280   PetscCall(ISRestoreIndices(isrow,&r));
281 
282   C->ops->solve          = MatSolve_SeqBAIJ_4;
283   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
284   C->assembled           = PETSC_TRUE;
285 
286   PetscCall(PetscLogFlops(1.333333333333*4*4*4*n)); /* from inverting diagonal blocks */
287   PetscFunctionReturn(0);
288 }
289 
290 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
291 {
292 /*
293     Default Version for when blocks are 4 by 4 Using natural ordering
294 */
295   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
296   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
297   PetscInt       *ajtmpold,*ajtmp,nz,row;
298   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
299   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
300   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
301   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
302   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
303   MatScalar      m13,m14,m15,m16;
304   MatScalar      *ba           = b->a,*aa = a->a;
305   PetscBool      pivotinblocks = b->pivotinblocks;
306   PetscReal      shift         = info->shiftamount;
307   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;
308 
309   PetscFunctionBegin;
310   allowzeropivot = PetscNot(A->erroriffailure);
311   PetscCall(PetscMalloc1(16*(n+1),&rtmp));
312 
313   for (i=0; i<n; i++) {
314     nz    = bi[i+1] - bi[i];
315     ajtmp = bj + bi[i];
316     for  (j=0; j<nz; j++) {
317       x     = rtmp+16*ajtmp[j];
318       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
319       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
320     }
321     /* load in initial (unfactored row) */
322     nz       = ai[i+1] - ai[i];
323     ajtmpold = aj + ai[i];
324     v        = aa + 16*ai[i];
325     for (j=0; j<nz; j++) {
326       x     = rtmp+16*ajtmpold[j];
327       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
328       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
329       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
330       x[14] = v[14]; x[15] = v[15];
331       v    += 16;
332     }
333     row = *ajtmp++;
334     while (row < i) {
335       pc  = rtmp + 16*row;
336       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
337       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
338       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
339       p15 = pc[14]; p16 = pc[15];
340       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
341           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
342           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
343           || p16 != 0.0) {
344         pv    = ba + 16*diag_offset[row];
345         pj    = bj + diag_offset[row] + 1;
346         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
347         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
348         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
349         x15   = pv[14]; x16 = pv[15];
350         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
351         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
352         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
353         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
354 
355         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
356         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
357         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
358         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
359 
360         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
361         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
362         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
363         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
364 
365         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
366         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
367         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
368         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
369         nz     = bi[row+1] - diag_offset[row] - 1;
370         pv    += 16;
371         for (j=0; j<nz; j++) {
372           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
373           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
374           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
375           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
376           x     = rtmp + 16*pj[j];
377           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
378           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
379           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
380           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
381 
382           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
383           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
384           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
385           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
386 
387           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
388           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
389           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
390           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
391 
392           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
393           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
394           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
395           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
396 
397           pv += 16;
398         }
399         PetscCall(PetscLogFlops(128.0*nz+112.0));
400       }
401       row = *ajtmp++;
402     }
403     /* finished row so stick it into b->a */
404     pv = ba + 16*bi[i];
405     pj = bj + bi[i];
406     nz = bi[i+1] - bi[i];
407     for (j=0; j<nz; j++) {
408       x      = rtmp+16*pj[j];
409       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
410       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
411       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
412       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
413       pv    += 16;
414     }
415     /* invert diagonal block */
416     w = ba + 16*diag_offset[i];
417     if (pivotinblocks) {
418       PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected));
419       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
420     } else {
421       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
422     }
423   }
424 
425   PetscCall(PetscFree(rtmp));
426 
427   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
428   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
429   C->assembled           = PETSC_TRUE;
430 
431   PetscCall(PetscLogFlops(1.333333333333*4*4*4*b->mbs)); /* from inverting diagonal blocks */
432   PetscFunctionReturn(0);
433 }
434 
435 /*
436   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
437     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
438 */
439 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
440 {
441   Mat            C =B;
442   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
443   PetscInt       i,j,k,nz,nzL,row;
444   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
445   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
446   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
447   PetscInt       flg;
448   PetscReal      shift;
449   PetscBool      allowzeropivot,zeropivotdetected;
450 
451   PetscFunctionBegin;
452   allowzeropivot = PetscNot(A->erroriffailure);
453 
454   /* generate work space needed by the factorization */
455   PetscCall(PetscMalloc2(bs2*n,&rtmp,bs2,&mwork));
456   PetscCall(PetscArrayzero(rtmp,bs2*n));
457 
458   if (info->shifttype == MAT_SHIFT_NONE) {
459     shift = 0;
460   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
461     shift = info->shiftamount;
462   }
463 
464   for (i=0; i<n; i++) {
465     /* zero rtmp */
466     /* L part */
467     nz    = bi[i+1] - bi[i];
468     bjtmp = bj + bi[i];
469     for  (j=0; j<nz; j++) {
470       PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2));
471     }
472 
473     /* U part */
474     nz    = bdiag[i] - bdiag[i+1];
475     bjtmp = bj + bdiag[i+1]+1;
476     for  (j=0; j<nz; j++) {
477       PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2));
478     }
479 
480     /* load in initial (unfactored row) */
481     nz    = ai[i+1] - ai[i];
482     ajtmp = aj + ai[i];
483     v     = aa + bs2*ai[i];
484     for (j=0; j<nz; j++) {
485       PetscCall(PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2));
486     }
487 
488     /* elimination */
489     bjtmp = bj + bi[i];
490     nzL   = bi[i+1] - bi[i];
491     for (k=0; k < nzL; k++) {
492       row = bjtmp[k];
493       pc  = rtmp + bs2*row;
494       for (flg=0,j=0; j<bs2; j++) {
495         if (pc[j]!=0.0) {
496           flg = 1;
497           break;
498         }
499       }
500       if (flg) {
501         pv = b->a + bs2*bdiag[row];
502         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
503         PetscCall(PetscKernel_A_gets_A_times_B_4(pc,pv,mwork));
504 
505         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
506         pv = b->a + bs2*(bdiag[row+1]+1);
507         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
508         for (j=0; j<nz; j++) {
509           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
510           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
511           v    = rtmp + bs2*pj[j];
512           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv));
513           pv  += bs2;
514         }
515         PetscCall(PetscLogFlops(128.0*nz+112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
516       }
517     }
518 
519     /* finished row so stick it into b->a */
520     /* L part */
521     pv = b->a + bs2*bi[i];
522     pj = b->j + bi[i];
523     nz = bi[i+1] - bi[i];
524     for (j=0; j<nz; j++) {
525       PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2));
526     }
527 
528     /* Mark diagonal and invert diagonal for simpler triangular solves */
529     pv   = b->a + bs2*bdiag[i];
530     pj   = b->j + bdiag[i];
531     PetscCall(PetscArraycpy(pv,rtmp+bs2*pj[0],bs2));
532     PetscCall(PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected));
533     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
534 
535     /* U part */
536     pv = b->a + bs2*(bdiag[i+1]+1);
537     pj = b->j + bdiag[i+1]+1;
538     nz = bdiag[i] - bdiag[i+1] - 1;
539     for (j=0; j<nz; j++) {
540       PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2));
541     }
542   }
543   PetscCall(PetscFree2(rtmp,mwork));
544 
545   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
546   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
547   C->assembled           = PETSC_TRUE;
548 
549   PetscCall(PetscLogFlops(1.333333333333*4*4*4*n)); /* from inverting diagonal blocks */
550   PetscFunctionReturn(0);
551 }
552 
553 #if defined(PETSC_HAVE_SSE)
554 
555 #include PETSC_HAVE_SSE
556 
557 /* SSE Version for when blocks are 4 by 4 Using natural ordering */
558 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
559 {
560   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
561   PetscErrorCode ierr;
562   int            i,j,n = a->mbs;
563   int            *bj = b->j,*bjtmp,*pj;
564   int            row;
565   int            *ajtmpold,nz,*bi=b->i;
566   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
567   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
568   MatScalar      *ba    = b->a,*aa = a->a;
569   int            nonzero=0;
570   PetscBool      pivotinblocks = b->pivotinblocks;
571   PetscReal      shift         = info->shiftamount;
572   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;
573 
574   PetscFunctionBegin;
575   allowzeropivot = PetscNot(A->erroriffailure);
576   SSE_SCOPE_BEGIN;
577 
578   PetscCheckFalse((unsigned long)aa%16!=0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
579   PetscCheckFalse((unsigned long)ba%16!=0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
580   PetscCall(PetscMalloc1(16*(n+1),&rtmp));
581   PetscCheckFalse((unsigned long)rtmp%16!=0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
582 /*    if ((unsigned long)bj==(unsigned long)aj) { */
583 /*      colscale = 4; */
584 /*    } */
585   for (i=0; i<n; i++) {
586     nz    = bi[i+1] - bi[i];
587     bjtmp = bj + bi[i];
588     /* zero out the 4x4 block accumulators */
589     /* zero out one register */
590     XOR_PS(XMM7,XMM7);
591     for  (j=0; j<nz; j++) {
592       x = rtmp+16*bjtmp[j];
593 /*        x = rtmp+4*bjtmp[j]; */
594       SSE_INLINE_BEGIN_1(x)
595       /* Copy zero register to memory locations */
596       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
597       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
598       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
599       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
600       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
601       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
602       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
603       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
604       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
605       SSE_INLINE_END_1;
606     }
607     /* load in initial (unfactored row) */
608     nz       = ai[i+1] - ai[i];
609     ajtmpold = aj + ai[i];
610     v        = aa + 16*ai[i];
611     for (j=0; j<nz; j++) {
612       x = rtmp+16*ajtmpold[j];
613 /*        x = rtmp+colscale*ajtmpold[j]; */
614       /* Copy v block into x block */
615       SSE_INLINE_BEGIN_2(v,x)
616       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
617       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
618       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
619 
620       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
621       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
622 
623       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
624       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
625 
626       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
627       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
628 
629       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
630       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
631 
632       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
633       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
634 
635       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
636       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
637 
638       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
639       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
640       SSE_INLINE_END_2;
641 
642       v += 16;
643     }
644 /*      row = (*bjtmp++)/4; */
645     row = *bjtmp++;
646     while (row < i) {
647       pc = rtmp + 16*row;
648       SSE_INLINE_BEGIN_1(pc)
649       /* Load block from lower triangle */
650       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
651       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
652       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
653 
654       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
655       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
656 
657       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
658       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
659 
660       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
661       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
662 
663       /* Compare block to zero block */
664 
665       SSE_COPY_PS(XMM4,XMM7)
666       SSE_CMPNEQ_PS(XMM4,XMM0)
667 
668       SSE_COPY_PS(XMM5,XMM7)
669       SSE_CMPNEQ_PS(XMM5,XMM1)
670 
671       SSE_COPY_PS(XMM6,XMM7)
672       SSE_CMPNEQ_PS(XMM6,XMM2)
673 
674       SSE_CMPNEQ_PS(XMM7,XMM3)
675 
676       /* Reduce the comparisons to one SSE register */
677       SSE_OR_PS(XMM6,XMM7)
678       SSE_OR_PS(XMM5,XMM4)
679       SSE_OR_PS(XMM5,XMM6)
680       SSE_INLINE_END_1;
681 
682       /* Reduce the one SSE register to an integer register for branching */
683       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
684       MOVEMASK(nonzero,XMM5);
685 
686       /* If block is nonzero ... */
687       if (nonzero) {
688         pv = ba + 16*diag_offset[row];
689         PREFETCH_L1(&pv[16]);
690         pj = bj + diag_offset[row] + 1;
691 
692         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
693         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
694         /* but the diagonal was inverted already */
695         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
696 
697         SSE_INLINE_BEGIN_2(pv,pc)
698         /* Column 0, product is accumulated in XMM4 */
699         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
700         SSE_SHUFFLE(XMM4,XMM4,0x00)
701         SSE_MULT_PS(XMM4,XMM0)
702 
703         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
704         SSE_SHUFFLE(XMM5,XMM5,0x00)
705         SSE_MULT_PS(XMM5,XMM1)
706         SSE_ADD_PS(XMM4,XMM5)
707 
708         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
709         SSE_SHUFFLE(XMM6,XMM6,0x00)
710         SSE_MULT_PS(XMM6,XMM2)
711         SSE_ADD_PS(XMM4,XMM6)
712 
713         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
714         SSE_SHUFFLE(XMM7,XMM7,0x00)
715         SSE_MULT_PS(XMM7,XMM3)
716         SSE_ADD_PS(XMM4,XMM7)
717 
718         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
719         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
720 
721         /* Column 1, product is accumulated in XMM5 */
722         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
723         SSE_SHUFFLE(XMM5,XMM5,0x00)
724         SSE_MULT_PS(XMM5,XMM0)
725 
726         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
727         SSE_SHUFFLE(XMM6,XMM6,0x00)
728         SSE_MULT_PS(XMM6,XMM1)
729         SSE_ADD_PS(XMM5,XMM6)
730 
731         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
732         SSE_SHUFFLE(XMM7,XMM7,0x00)
733         SSE_MULT_PS(XMM7,XMM2)
734         SSE_ADD_PS(XMM5,XMM7)
735 
736         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
737         SSE_SHUFFLE(XMM6,XMM6,0x00)
738         SSE_MULT_PS(XMM6,XMM3)
739         SSE_ADD_PS(XMM5,XMM6)
740 
741         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
742         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
743 
744         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
745 
746         /* Column 2, product is accumulated in XMM6 */
747         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
748         SSE_SHUFFLE(XMM6,XMM6,0x00)
749         SSE_MULT_PS(XMM6,XMM0)
750 
751         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
752         SSE_SHUFFLE(XMM7,XMM7,0x00)
753         SSE_MULT_PS(XMM7,XMM1)
754         SSE_ADD_PS(XMM6,XMM7)
755 
756         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
757         SSE_SHUFFLE(XMM7,XMM7,0x00)
758         SSE_MULT_PS(XMM7,XMM2)
759         SSE_ADD_PS(XMM6,XMM7)
760 
761         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
762         SSE_SHUFFLE(XMM7,XMM7,0x00)
763         SSE_MULT_PS(XMM7,XMM3)
764         SSE_ADD_PS(XMM6,XMM7)
765 
766         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
767         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
768 
769         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
770         /* Column 3, product is accumulated in XMM0 */
771         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
772         SSE_SHUFFLE(XMM7,XMM7,0x00)
773         SSE_MULT_PS(XMM0,XMM7)
774 
775         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
776         SSE_SHUFFLE(XMM7,XMM7,0x00)
777         SSE_MULT_PS(XMM1,XMM7)
778         SSE_ADD_PS(XMM0,XMM1)
779 
780         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
781         SSE_SHUFFLE(XMM1,XMM1,0x00)
782         SSE_MULT_PS(XMM1,XMM2)
783         SSE_ADD_PS(XMM0,XMM1)
784 
785         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
786         SSE_SHUFFLE(XMM7,XMM7,0x00)
787         SSE_MULT_PS(XMM3,XMM7)
788         SSE_ADD_PS(XMM0,XMM3)
789 
790         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
791         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
792 
793         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
794         /* This is code to be maintained and read by humans afterall. */
795         /* Copy Multiplier Col 3 into XMM3 */
796         SSE_COPY_PS(XMM3,XMM0)
797         /* Copy Multiplier Col 2 into XMM2 */
798         SSE_COPY_PS(XMM2,XMM6)
799         /* Copy Multiplier Col 1 into XMM1 */
800         SSE_COPY_PS(XMM1,XMM5)
801         /* Copy Multiplier Col 0 into XMM0 */
802         SSE_COPY_PS(XMM0,XMM4)
803         SSE_INLINE_END_2;
804 
805         /* Update the row: */
806         nz  = bi[row+1] - diag_offset[row] - 1;
807         pv += 16;
808         for (j=0; j<nz; j++) {
809           PREFETCH_L1(&pv[16]);
810           x = rtmp + 16*pj[j];
811 /*            x = rtmp + 4*pj[j]; */
812 
813           /* X:=X-M*PV, One column at a time */
814           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
815           SSE_INLINE_BEGIN_2(x,pv)
816           /* Load First Column of X*/
817           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
818           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
819 
820           /* Matrix-Vector Product: */
821           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
822           SSE_SHUFFLE(XMM5,XMM5,0x00)
823           SSE_MULT_PS(XMM5,XMM0)
824           SSE_SUB_PS(XMM4,XMM5)
825 
826           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
827           SSE_SHUFFLE(XMM6,XMM6,0x00)
828           SSE_MULT_PS(XMM6,XMM1)
829           SSE_SUB_PS(XMM4,XMM6)
830 
831           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
832           SSE_SHUFFLE(XMM7,XMM7,0x00)
833           SSE_MULT_PS(XMM7,XMM2)
834           SSE_SUB_PS(XMM4,XMM7)
835 
836           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
837           SSE_SHUFFLE(XMM5,XMM5,0x00)
838           SSE_MULT_PS(XMM5,XMM3)
839           SSE_SUB_PS(XMM4,XMM5)
840 
841           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
842           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
843 
844           /* Second Column */
845           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
846           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
847 
848           /* Matrix-Vector Product: */
849           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
850           SSE_SHUFFLE(XMM6,XMM6,0x00)
851           SSE_MULT_PS(XMM6,XMM0)
852           SSE_SUB_PS(XMM5,XMM6)
853 
854           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
855           SSE_SHUFFLE(XMM7,XMM7,0x00)
856           SSE_MULT_PS(XMM7,XMM1)
857           SSE_SUB_PS(XMM5,XMM7)
858 
859           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
860           SSE_SHUFFLE(XMM4,XMM4,0x00)
861           SSE_MULT_PS(XMM4,XMM2)
862           SSE_SUB_PS(XMM5,XMM4)
863 
864           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
865           SSE_SHUFFLE(XMM6,XMM6,0x00)
866           SSE_MULT_PS(XMM6,XMM3)
867           SSE_SUB_PS(XMM5,XMM6)
868 
869           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
870           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
871 
872           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
873 
874           /* Third Column */
875           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
876           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
877 
878           /* Matrix-Vector Product: */
879           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
880           SSE_SHUFFLE(XMM7,XMM7,0x00)
881           SSE_MULT_PS(XMM7,XMM0)
882           SSE_SUB_PS(XMM6,XMM7)
883 
884           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
885           SSE_SHUFFLE(XMM4,XMM4,0x00)
886           SSE_MULT_PS(XMM4,XMM1)
887           SSE_SUB_PS(XMM6,XMM4)
888 
889           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
890           SSE_SHUFFLE(XMM5,XMM5,0x00)
891           SSE_MULT_PS(XMM5,XMM2)
892           SSE_SUB_PS(XMM6,XMM5)
893 
894           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
895           SSE_SHUFFLE(XMM7,XMM7,0x00)
896           SSE_MULT_PS(XMM7,XMM3)
897           SSE_SUB_PS(XMM6,XMM7)
898 
899           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
900           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
901 
902           /* Fourth Column */
903           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
904           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
905 
906           /* Matrix-Vector Product: */
907           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
908           SSE_SHUFFLE(XMM5,XMM5,0x00)
909           SSE_MULT_PS(XMM5,XMM0)
910           SSE_SUB_PS(XMM4,XMM5)
911 
912           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
913           SSE_SHUFFLE(XMM6,XMM6,0x00)
914           SSE_MULT_PS(XMM6,XMM1)
915           SSE_SUB_PS(XMM4,XMM6)
916 
917           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
918           SSE_SHUFFLE(XMM7,XMM7,0x00)
919           SSE_MULT_PS(XMM7,XMM2)
920           SSE_SUB_PS(XMM4,XMM7)
921 
922           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
923           SSE_SHUFFLE(XMM5,XMM5,0x00)
924           SSE_MULT_PS(XMM5,XMM3)
925           SSE_SUB_PS(XMM4,XMM5)
926 
927           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
928           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
929           SSE_INLINE_END_2;
930           pv += 16;
931         }
932         PetscCall(PetscLogFlops(128.0*nz+112.0));
933       }
934       row = *bjtmp++;
935 /*        row = (*bjtmp++)/4; */
936     }
937     /* finished row so stick it into b->a */
938     pv = ba + 16*bi[i];
939     pj = bj + bi[i];
940     nz = bi[i+1] - bi[i];
941 
942     /* Copy x block back into pv block */
943     for (j=0; j<nz; j++) {
944       x = rtmp+16*pj[j];
945 /*        x  = rtmp+4*pj[j]; */
946 
947       SSE_INLINE_BEGIN_2(x,pv)
948       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
949       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
950       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
951 
952       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
953       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
954 
955       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
956       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
957 
958       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
959       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
960 
961       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
962       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
963 
964       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
965       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
966 
967       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
968       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
969 
970       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
971       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
972       SSE_INLINE_END_2;
973       pv += 16;
974     }
975     /* invert diagonal block */
976     w = ba + 16*diag_offset[i];
977     if (pivotinblocks) {
978       PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected));
979       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
980     } else {
981       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
982     }
983 /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
984     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
985   }
986 
987   PetscCall(PetscFree(rtmp));
988 
989   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
990   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
991   C->assembled           = PETSC_TRUE;
992 
993   PetscCall(PetscLogFlops(1.333333333333*bs*bs2*b->mbs));
994   /* Flop Count from inverting diagonal blocks */
995   SSE_SCOPE_END;
996   PetscFunctionReturn(0);
997 }
998 
999 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
1000 {
1001   Mat            A  =C;
1002   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1003   PetscErrorCode ierr;
1004   int            i,j,n = a->mbs;
1005   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1006   unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1007   unsigned int   row;
1008   int            nz,*bi=b->i;
1009   int            *diag_offset = b->diag,*ai=a->i;
1010   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1011   MatScalar      *ba    = b->a,*aa = a->a;
1012   int            nonzero=0;
1013   PetscBool      pivotinblocks = b->pivotinblocks;
1014   PetscReal      shift = info->shiftamount;
1015   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;
1016 
1017   PetscFunctionBegin;
1018   allowzeropivot = PetscNot(A->erroriffailure);
1019   SSE_SCOPE_BEGIN;
1020 
1021   PetscCheckFalse((unsigned long)aa%16!=0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1022   PetscCheckFalse((unsigned long)ba%16!=0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1023   PetscCall(PetscMalloc1(16*(n+1),&rtmp));
1024   PetscCheckFalse((unsigned long)rtmp%16!=0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1025 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1026 /*      colscale = 4; */
1027 /*    } */
1028 
1029   for (i=0; i<n; i++) {
1030     nz    = bi[i+1] - bi[i];
1031     bjtmp = bj + bi[i];
1032     /* zero out the 4x4 block accumulators */
1033     /* zero out one register */
1034     XOR_PS(XMM7,XMM7);
1035     for  (j=0; j<nz; j++) {
1036       x = rtmp+16*((unsigned int)bjtmp[j]);
1037 /*        x = rtmp+4*bjtmp[j]; */
1038       SSE_INLINE_BEGIN_1(x)
1039       /* Copy zero register to memory locations */
1040       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1042       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1043       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1044       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1045       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1046       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1047       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1048       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1049       SSE_INLINE_END_1;
1050     }
1051     /* load in initial (unfactored row) */
1052     nz    = ai[i+1] - ai[i];
1053     ajtmp = aj + ai[i];
1054     v     = aa + 16*ai[i];
1055     for (j=0; j<nz; j++) {
1056       x = rtmp+16*((unsigned int)ajtmp[j]);
1057 /*        x = rtmp+colscale*ajtmp[j]; */
1058       /* Copy v block into x block */
1059       SSE_INLINE_BEGIN_2(v,x)
1060       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1061       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1062       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1063 
1064       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1065       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1066 
1067       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1068       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1069 
1070       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1071       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1072 
1073       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1074       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1075 
1076       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1077       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1078 
1079       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1080       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1081 
1082       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1083       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1084       SSE_INLINE_END_2;
1085 
1086       v += 16;
1087     }
1088 /*      row = (*bjtmp++)/4; */
1089     row = (unsigned int)(*bjtmp++);
1090     while (row < i) {
1091       pc = rtmp + 16*row;
1092       SSE_INLINE_BEGIN_1(pc)
1093       /* Load block from lower triangle */
1094       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1095       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1096       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1097 
1098       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1099       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1100 
1101       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1102       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1103 
1104       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1105       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1106 
1107       /* Compare block to zero block */
1108 
1109       SSE_COPY_PS(XMM4,XMM7)
1110       SSE_CMPNEQ_PS(XMM4,XMM0)
1111 
1112       SSE_COPY_PS(XMM5,XMM7)
1113       SSE_CMPNEQ_PS(XMM5,XMM1)
1114 
1115       SSE_COPY_PS(XMM6,XMM7)
1116       SSE_CMPNEQ_PS(XMM6,XMM2)
1117 
1118       SSE_CMPNEQ_PS(XMM7,XMM3)
1119 
1120       /* Reduce the comparisons to one SSE register */
1121       SSE_OR_PS(XMM6,XMM7)
1122       SSE_OR_PS(XMM5,XMM4)
1123       SSE_OR_PS(XMM5,XMM6)
1124       SSE_INLINE_END_1;
1125 
1126       /* Reduce the one SSE register to an integer register for branching */
1127       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1128       MOVEMASK(nonzero,XMM5);
1129 
1130       /* If block is nonzero ... */
1131       if (nonzero) {
1132         pv = ba + 16*diag_offset[row];
1133         PREFETCH_L1(&pv[16]);
1134         pj = bj + diag_offset[row] + 1;
1135 
1136         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1137         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1138         /* but the diagonal was inverted already */
1139         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1140 
1141         SSE_INLINE_BEGIN_2(pv,pc)
1142         /* Column 0, product is accumulated in XMM4 */
1143         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1144         SSE_SHUFFLE(XMM4,XMM4,0x00)
1145         SSE_MULT_PS(XMM4,XMM0)
1146 
1147         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1148         SSE_SHUFFLE(XMM5,XMM5,0x00)
1149         SSE_MULT_PS(XMM5,XMM1)
1150         SSE_ADD_PS(XMM4,XMM5)
1151 
1152         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1153         SSE_SHUFFLE(XMM6,XMM6,0x00)
1154         SSE_MULT_PS(XMM6,XMM2)
1155         SSE_ADD_PS(XMM4,XMM6)
1156 
1157         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1158         SSE_SHUFFLE(XMM7,XMM7,0x00)
1159         SSE_MULT_PS(XMM7,XMM3)
1160         SSE_ADD_PS(XMM4,XMM7)
1161 
1162         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1163         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1164 
1165         /* Column 1, product is accumulated in XMM5 */
1166         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1167         SSE_SHUFFLE(XMM5,XMM5,0x00)
1168         SSE_MULT_PS(XMM5,XMM0)
1169 
1170         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1171         SSE_SHUFFLE(XMM6,XMM6,0x00)
1172         SSE_MULT_PS(XMM6,XMM1)
1173         SSE_ADD_PS(XMM5,XMM6)
1174 
1175         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1176         SSE_SHUFFLE(XMM7,XMM7,0x00)
1177         SSE_MULT_PS(XMM7,XMM2)
1178         SSE_ADD_PS(XMM5,XMM7)
1179 
1180         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1181         SSE_SHUFFLE(XMM6,XMM6,0x00)
1182         SSE_MULT_PS(XMM6,XMM3)
1183         SSE_ADD_PS(XMM5,XMM6)
1184 
1185         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1186         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1187 
1188         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1189 
1190         /* Column 2, product is accumulated in XMM6 */
1191         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1192         SSE_SHUFFLE(XMM6,XMM6,0x00)
1193         SSE_MULT_PS(XMM6,XMM0)
1194 
1195         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1196         SSE_SHUFFLE(XMM7,XMM7,0x00)
1197         SSE_MULT_PS(XMM7,XMM1)
1198         SSE_ADD_PS(XMM6,XMM7)
1199 
1200         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1201         SSE_SHUFFLE(XMM7,XMM7,0x00)
1202         SSE_MULT_PS(XMM7,XMM2)
1203         SSE_ADD_PS(XMM6,XMM7)
1204 
1205         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1206         SSE_SHUFFLE(XMM7,XMM7,0x00)
1207         SSE_MULT_PS(XMM7,XMM3)
1208         SSE_ADD_PS(XMM6,XMM7)
1209 
1210         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1211         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1212 
1213         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1214         /* Column 3, product is accumulated in XMM0 */
1215         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1216         SSE_SHUFFLE(XMM7,XMM7,0x00)
1217         SSE_MULT_PS(XMM0,XMM7)
1218 
1219         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1220         SSE_SHUFFLE(XMM7,XMM7,0x00)
1221         SSE_MULT_PS(XMM1,XMM7)
1222         SSE_ADD_PS(XMM0,XMM1)
1223 
1224         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1225         SSE_SHUFFLE(XMM1,XMM1,0x00)
1226         SSE_MULT_PS(XMM1,XMM2)
1227         SSE_ADD_PS(XMM0,XMM1)
1228 
1229         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1230         SSE_SHUFFLE(XMM7,XMM7,0x00)
1231         SSE_MULT_PS(XMM3,XMM7)
1232         SSE_ADD_PS(XMM0,XMM3)
1233 
1234         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1235         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1236 
1237         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1238         /* This is code to be maintained and read by humans afterall. */
1239         /* Copy Multiplier Col 3 into XMM3 */
1240         SSE_COPY_PS(XMM3,XMM0)
1241         /* Copy Multiplier Col 2 into XMM2 */
1242         SSE_COPY_PS(XMM2,XMM6)
1243         /* Copy Multiplier Col 1 into XMM1 */
1244         SSE_COPY_PS(XMM1,XMM5)
1245         /* Copy Multiplier Col 0 into XMM0 */
1246         SSE_COPY_PS(XMM0,XMM4)
1247         SSE_INLINE_END_2;
1248 
1249         /* Update the row: */
1250         nz  = bi[row+1] - diag_offset[row] - 1;
1251         pv += 16;
1252         for (j=0; j<nz; j++) {
1253           PREFETCH_L1(&pv[16]);
1254           x = rtmp + 16*((unsigned int)pj[j]);
1255 /*            x = rtmp + 4*pj[j]; */
1256 
1257           /* X:=X-M*PV, One column at a time */
1258           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1259           SSE_INLINE_BEGIN_2(x,pv)
1260           /* Load First Column of X*/
1261           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1262           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1263 
1264           /* Matrix-Vector Product: */
1265           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1266           SSE_SHUFFLE(XMM5,XMM5,0x00)
1267           SSE_MULT_PS(XMM5,XMM0)
1268           SSE_SUB_PS(XMM4,XMM5)
1269 
1270           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1271           SSE_SHUFFLE(XMM6,XMM6,0x00)
1272           SSE_MULT_PS(XMM6,XMM1)
1273           SSE_SUB_PS(XMM4,XMM6)
1274 
1275           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1276           SSE_SHUFFLE(XMM7,XMM7,0x00)
1277           SSE_MULT_PS(XMM7,XMM2)
1278           SSE_SUB_PS(XMM4,XMM7)
1279 
1280           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1281           SSE_SHUFFLE(XMM5,XMM5,0x00)
1282           SSE_MULT_PS(XMM5,XMM3)
1283           SSE_SUB_PS(XMM4,XMM5)
1284 
1285           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1286           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1287 
1288           /* Second Column */
1289           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1290           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1291 
1292           /* Matrix-Vector Product: */
1293           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1294           SSE_SHUFFLE(XMM6,XMM6,0x00)
1295           SSE_MULT_PS(XMM6,XMM0)
1296           SSE_SUB_PS(XMM5,XMM6)
1297 
1298           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1299           SSE_SHUFFLE(XMM7,XMM7,0x00)
1300           SSE_MULT_PS(XMM7,XMM1)
1301           SSE_SUB_PS(XMM5,XMM7)
1302 
1303           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1304           SSE_SHUFFLE(XMM4,XMM4,0x00)
1305           SSE_MULT_PS(XMM4,XMM2)
1306           SSE_SUB_PS(XMM5,XMM4)
1307 
1308           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1309           SSE_SHUFFLE(XMM6,XMM6,0x00)
1310           SSE_MULT_PS(XMM6,XMM3)
1311           SSE_SUB_PS(XMM5,XMM6)
1312 
1313           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1314           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1315 
1316           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1317 
1318           /* Third Column */
1319           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1320           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1321 
1322           /* Matrix-Vector Product: */
1323           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1324           SSE_SHUFFLE(XMM7,XMM7,0x00)
1325           SSE_MULT_PS(XMM7,XMM0)
1326           SSE_SUB_PS(XMM6,XMM7)
1327 
1328           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1329           SSE_SHUFFLE(XMM4,XMM4,0x00)
1330           SSE_MULT_PS(XMM4,XMM1)
1331           SSE_SUB_PS(XMM6,XMM4)
1332 
1333           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1334           SSE_SHUFFLE(XMM5,XMM5,0x00)
1335           SSE_MULT_PS(XMM5,XMM2)
1336           SSE_SUB_PS(XMM6,XMM5)
1337 
1338           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1339           SSE_SHUFFLE(XMM7,XMM7,0x00)
1340           SSE_MULT_PS(XMM7,XMM3)
1341           SSE_SUB_PS(XMM6,XMM7)
1342 
1343           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1344           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1345 
1346           /* Fourth Column */
1347           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1348           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1349 
1350           /* Matrix-Vector Product: */
1351           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1352           SSE_SHUFFLE(XMM5,XMM5,0x00)
1353           SSE_MULT_PS(XMM5,XMM0)
1354           SSE_SUB_PS(XMM4,XMM5)
1355 
1356           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1357           SSE_SHUFFLE(XMM6,XMM6,0x00)
1358           SSE_MULT_PS(XMM6,XMM1)
1359           SSE_SUB_PS(XMM4,XMM6)
1360 
1361           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1362           SSE_SHUFFLE(XMM7,XMM7,0x00)
1363           SSE_MULT_PS(XMM7,XMM2)
1364           SSE_SUB_PS(XMM4,XMM7)
1365 
1366           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1367           SSE_SHUFFLE(XMM5,XMM5,0x00)
1368           SSE_MULT_PS(XMM5,XMM3)
1369           SSE_SUB_PS(XMM4,XMM5)
1370 
1371           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1372           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1373           SSE_INLINE_END_2;
1374           pv += 16;
1375         }
1376         PetscCall(PetscLogFlops(128.0*nz+112.0));
1377       }
1378       row = (unsigned int)(*bjtmp++);
1379 /*        row = (*bjtmp++)/4; */
1380 /*        bjtmp++; */
1381     }
1382     /* finished row so stick it into b->a */
1383     pv = ba + 16*bi[i];
1384     pj = bj + bi[i];
1385     nz = bi[i+1] - bi[i];
1386 
1387     /* Copy x block back into pv block */
1388     for (j=0; j<nz; j++) {
1389       x = rtmp+16*((unsigned int)pj[j]);
1390 /*        x  = rtmp+4*pj[j]; */
1391 
1392       SSE_INLINE_BEGIN_2(x,pv)
1393       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1394       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1395       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1396 
1397       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1398       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1399 
1400       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1401       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1402 
1403       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1404       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1405 
1406       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1407       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1408 
1409       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1410       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1411 
1412       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1413       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1414 
1415       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1416       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1417       SSE_INLINE_END_2;
1418       pv += 16;
1419     }
1420     /* invert diagonal block */
1421     w = ba + 16*diag_offset[i];
1422     if (pivotinblocks) {
1423       PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected));
1424       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1425     } else {
1426       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1427     }
1428     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1429   }
1430 
1431   PetscCall(PetscFree(rtmp));
1432 
1433   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1434   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1435   C->assembled           = PETSC_TRUE;
1436 
1437   PetscCall(PetscLogFlops(1.333333333333*bs*bs2*b->mbs));
1438   /* Flop Count from inverting diagonal blocks */
1439   SSE_SCOPE_END;
1440   PetscFunctionReturn(0);
1441 }
1442 
1443 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1444 {
1445   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1446   PetscErrorCode ierr;
1447   int            i,j,n = a->mbs;
1448   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1449   unsigned int   row;
1450   int            *ajtmpold,nz,*bi=b->i;
1451   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1452   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1453   MatScalar      *ba    = b->a,*aa = a->a;
1454   int            nonzero=0;
1455   PetscBool      pivotinblocks = b->pivotinblocks;
1456   PetscReal      shift = info->shiftamount;
1457   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;
1458 
1459   PetscFunctionBegin;
1460   allowzeropivot = PetscNot(A->erroriffailure);
1461   SSE_SCOPE_BEGIN;
1462 
1463   PetscCheckFalse((unsigned long)aa%16!=0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1464   PetscCheckFalse((unsigned long)ba%16!=0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1465   PetscCall(PetscMalloc1(16*(n+1),&rtmp));
1466   PetscCheckFalse((unsigned long)rtmp%16!=0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1467 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1468 /*      colscale = 4; */
1469 /*    } */
1470   if ((unsigned long)bj==(unsigned long)aj) {
1471     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1472   }
1473 
1474   for (i=0; i<n; i++) {
1475     nz    = bi[i+1] - bi[i];
1476     bjtmp = bj + bi[i];
1477     /* zero out the 4x4 block accumulators */
1478     /* zero out one register */
1479     XOR_PS(XMM7,XMM7);
1480     for  (j=0; j<nz; j++) {
1481       x = rtmp+16*((unsigned int)bjtmp[j]);
1482 /*        x = rtmp+4*bjtmp[j]; */
1483       SSE_INLINE_BEGIN_1(x)
1484       /* Copy zero register to memory locations */
1485       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1486       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1487       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1488       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1489       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1490       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1491       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1492       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1493       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1494       SSE_INLINE_END_1;
1495     }
1496     /* load in initial (unfactored row) */
1497     nz       = ai[i+1] - ai[i];
1498     ajtmpold = aj + ai[i];
1499     v        = aa + 16*ai[i];
1500     for (j=0; j<nz; j++) {
1501       x = rtmp+16*ajtmpold[j];
1502 /*        x = rtmp+colscale*ajtmpold[j]; */
1503       /* Copy v block into x block */
1504       SSE_INLINE_BEGIN_2(v,x)
1505       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1506       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1507       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1508 
1509       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1510       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1511 
1512       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1513       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1514 
1515       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1516       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1517 
1518       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1519       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1520 
1521       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1522       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1523 
1524       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1525       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1526 
1527       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1528       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1529       SSE_INLINE_END_2;
1530 
1531       v += 16;
1532     }
1533 /*      row = (*bjtmp++)/4; */
1534     row = (unsigned int)(*bjtmp++);
1535     while (row < i) {
1536       pc = rtmp + 16*row;
1537       SSE_INLINE_BEGIN_1(pc)
1538       /* Load block from lower triangle */
1539       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1540       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1541       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1542 
1543       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1544       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1545 
1546       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1547       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1548 
1549       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1550       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1551 
1552       /* Compare block to zero block */
1553 
1554       SSE_COPY_PS(XMM4,XMM7)
1555       SSE_CMPNEQ_PS(XMM4,XMM0)
1556 
1557       SSE_COPY_PS(XMM5,XMM7)
1558       SSE_CMPNEQ_PS(XMM5,XMM1)
1559 
1560       SSE_COPY_PS(XMM6,XMM7)
1561       SSE_CMPNEQ_PS(XMM6,XMM2)
1562 
1563       SSE_CMPNEQ_PS(XMM7,XMM3)
1564 
1565       /* Reduce the comparisons to one SSE register */
1566       SSE_OR_PS(XMM6,XMM7)
1567       SSE_OR_PS(XMM5,XMM4)
1568       SSE_OR_PS(XMM5,XMM6)
1569       SSE_INLINE_END_1;
1570 
1571       /* Reduce the one SSE register to an integer register for branching */
1572       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1573       MOVEMASK(nonzero,XMM5);
1574 
1575       /* If block is nonzero ... */
1576       if (nonzero) {
1577         pv = ba + 16*diag_offset[row];
1578         PREFETCH_L1(&pv[16]);
1579         pj = bj + diag_offset[row] + 1;
1580 
1581         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1582         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1583         /* but the diagonal was inverted already */
1584         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1585 
1586         SSE_INLINE_BEGIN_2(pv,pc)
1587         /* Column 0, product is accumulated in XMM4 */
1588         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1589         SSE_SHUFFLE(XMM4,XMM4,0x00)
1590         SSE_MULT_PS(XMM4,XMM0)
1591 
1592         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1593         SSE_SHUFFLE(XMM5,XMM5,0x00)
1594         SSE_MULT_PS(XMM5,XMM1)
1595         SSE_ADD_PS(XMM4,XMM5)
1596 
1597         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1598         SSE_SHUFFLE(XMM6,XMM6,0x00)
1599         SSE_MULT_PS(XMM6,XMM2)
1600         SSE_ADD_PS(XMM4,XMM6)
1601 
1602         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1603         SSE_SHUFFLE(XMM7,XMM7,0x00)
1604         SSE_MULT_PS(XMM7,XMM3)
1605         SSE_ADD_PS(XMM4,XMM7)
1606 
1607         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1608         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1609 
1610         /* Column 1, product is accumulated in XMM5 */
1611         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1612         SSE_SHUFFLE(XMM5,XMM5,0x00)
1613         SSE_MULT_PS(XMM5,XMM0)
1614 
1615         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1616         SSE_SHUFFLE(XMM6,XMM6,0x00)
1617         SSE_MULT_PS(XMM6,XMM1)
1618         SSE_ADD_PS(XMM5,XMM6)
1619 
1620         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1621         SSE_SHUFFLE(XMM7,XMM7,0x00)
1622         SSE_MULT_PS(XMM7,XMM2)
1623         SSE_ADD_PS(XMM5,XMM7)
1624 
1625         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1626         SSE_SHUFFLE(XMM6,XMM6,0x00)
1627         SSE_MULT_PS(XMM6,XMM3)
1628         SSE_ADD_PS(XMM5,XMM6)
1629 
1630         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1631         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1632 
1633         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1634 
1635         /* Column 2, product is accumulated in XMM6 */
1636         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1637         SSE_SHUFFLE(XMM6,XMM6,0x00)
1638         SSE_MULT_PS(XMM6,XMM0)
1639 
1640         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1641         SSE_SHUFFLE(XMM7,XMM7,0x00)
1642         SSE_MULT_PS(XMM7,XMM1)
1643         SSE_ADD_PS(XMM6,XMM7)
1644 
1645         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1646         SSE_SHUFFLE(XMM7,XMM7,0x00)
1647         SSE_MULT_PS(XMM7,XMM2)
1648         SSE_ADD_PS(XMM6,XMM7)
1649 
1650         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1651         SSE_SHUFFLE(XMM7,XMM7,0x00)
1652         SSE_MULT_PS(XMM7,XMM3)
1653         SSE_ADD_PS(XMM6,XMM7)
1654 
1655         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1656         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1657 
1658         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1659         /* Column 3, product is accumulated in XMM0 */
1660         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1661         SSE_SHUFFLE(XMM7,XMM7,0x00)
1662         SSE_MULT_PS(XMM0,XMM7)
1663 
1664         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1665         SSE_SHUFFLE(XMM7,XMM7,0x00)
1666         SSE_MULT_PS(XMM1,XMM7)
1667         SSE_ADD_PS(XMM0,XMM1)
1668 
1669         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1670         SSE_SHUFFLE(XMM1,XMM1,0x00)
1671         SSE_MULT_PS(XMM1,XMM2)
1672         SSE_ADD_PS(XMM0,XMM1)
1673 
1674         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1675         SSE_SHUFFLE(XMM7,XMM7,0x00)
1676         SSE_MULT_PS(XMM3,XMM7)
1677         SSE_ADD_PS(XMM0,XMM3)
1678 
1679         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1680         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1681 
1682         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1683         /* This is code to be maintained and read by humans afterall. */
1684         /* Copy Multiplier Col 3 into XMM3 */
1685         SSE_COPY_PS(XMM3,XMM0)
1686         /* Copy Multiplier Col 2 into XMM2 */
1687         SSE_COPY_PS(XMM2,XMM6)
1688         /* Copy Multiplier Col 1 into XMM1 */
1689         SSE_COPY_PS(XMM1,XMM5)
1690         /* Copy Multiplier Col 0 into XMM0 */
1691         SSE_COPY_PS(XMM0,XMM4)
1692         SSE_INLINE_END_2;
1693 
1694         /* Update the row: */
1695         nz  = bi[row+1] - diag_offset[row] - 1;
1696         pv += 16;
1697         for (j=0; j<nz; j++) {
1698           PREFETCH_L1(&pv[16]);
1699           x = rtmp + 16*((unsigned int)pj[j]);
1700 /*            x = rtmp + 4*pj[j]; */
1701 
1702           /* X:=X-M*PV, One column at a time */
1703           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1704           SSE_INLINE_BEGIN_2(x,pv)
1705           /* Load First Column of X*/
1706           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1707           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1708 
1709           /* Matrix-Vector Product: */
1710           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1711           SSE_SHUFFLE(XMM5,XMM5,0x00)
1712           SSE_MULT_PS(XMM5,XMM0)
1713           SSE_SUB_PS(XMM4,XMM5)
1714 
1715           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1716           SSE_SHUFFLE(XMM6,XMM6,0x00)
1717           SSE_MULT_PS(XMM6,XMM1)
1718           SSE_SUB_PS(XMM4,XMM6)
1719 
1720           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1721           SSE_SHUFFLE(XMM7,XMM7,0x00)
1722           SSE_MULT_PS(XMM7,XMM2)
1723           SSE_SUB_PS(XMM4,XMM7)
1724 
1725           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1726           SSE_SHUFFLE(XMM5,XMM5,0x00)
1727           SSE_MULT_PS(XMM5,XMM3)
1728           SSE_SUB_PS(XMM4,XMM5)
1729 
1730           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1731           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1732 
1733           /* Second Column */
1734           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1735           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1736 
1737           /* Matrix-Vector Product: */
1738           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1739           SSE_SHUFFLE(XMM6,XMM6,0x00)
1740           SSE_MULT_PS(XMM6,XMM0)
1741           SSE_SUB_PS(XMM5,XMM6)
1742 
1743           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1744           SSE_SHUFFLE(XMM7,XMM7,0x00)
1745           SSE_MULT_PS(XMM7,XMM1)
1746           SSE_SUB_PS(XMM5,XMM7)
1747 
1748           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1749           SSE_SHUFFLE(XMM4,XMM4,0x00)
1750           SSE_MULT_PS(XMM4,XMM2)
1751           SSE_SUB_PS(XMM5,XMM4)
1752 
1753           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1754           SSE_SHUFFLE(XMM6,XMM6,0x00)
1755           SSE_MULT_PS(XMM6,XMM3)
1756           SSE_SUB_PS(XMM5,XMM6)
1757 
1758           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1759           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1760 
1761           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1762 
1763           /* Third Column */
1764           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1765           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1766 
1767           /* Matrix-Vector Product: */
1768           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1769           SSE_SHUFFLE(XMM7,XMM7,0x00)
1770           SSE_MULT_PS(XMM7,XMM0)
1771           SSE_SUB_PS(XMM6,XMM7)
1772 
1773           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1774           SSE_SHUFFLE(XMM4,XMM4,0x00)
1775           SSE_MULT_PS(XMM4,XMM1)
1776           SSE_SUB_PS(XMM6,XMM4)
1777 
1778           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1779           SSE_SHUFFLE(XMM5,XMM5,0x00)
1780           SSE_MULT_PS(XMM5,XMM2)
1781           SSE_SUB_PS(XMM6,XMM5)
1782 
1783           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1784           SSE_SHUFFLE(XMM7,XMM7,0x00)
1785           SSE_MULT_PS(XMM7,XMM3)
1786           SSE_SUB_PS(XMM6,XMM7)
1787 
1788           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1789           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1790 
1791           /* Fourth Column */
1792           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1793           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1794 
1795           /* Matrix-Vector Product: */
1796           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1797           SSE_SHUFFLE(XMM5,XMM5,0x00)
1798           SSE_MULT_PS(XMM5,XMM0)
1799           SSE_SUB_PS(XMM4,XMM5)
1800 
1801           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1802           SSE_SHUFFLE(XMM6,XMM6,0x00)
1803           SSE_MULT_PS(XMM6,XMM1)
1804           SSE_SUB_PS(XMM4,XMM6)
1805 
1806           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1807           SSE_SHUFFLE(XMM7,XMM7,0x00)
1808           SSE_MULT_PS(XMM7,XMM2)
1809           SSE_SUB_PS(XMM4,XMM7)
1810 
1811           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1812           SSE_SHUFFLE(XMM5,XMM5,0x00)
1813           SSE_MULT_PS(XMM5,XMM3)
1814           SSE_SUB_PS(XMM4,XMM5)
1815 
1816           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1817           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1818           SSE_INLINE_END_2;
1819           pv += 16;
1820         }
1821         PetscCall(PetscLogFlops(128.0*nz+112.0));
1822       }
1823       row = (unsigned int)(*bjtmp++);
1824 /*        row = (*bjtmp++)/4; */
1825 /*        bjtmp++; */
1826     }
1827     /* finished row so stick it into b->a */
1828     pv = ba + 16*bi[i];
1829     pj = bj + bi[i];
1830     nz = bi[i+1] - bi[i];
1831 
1832     /* Copy x block back into pv block */
1833     for (j=0; j<nz; j++) {
1834       x = rtmp+16*((unsigned int)pj[j]);
1835 /*        x  = rtmp+4*pj[j]; */
1836 
1837       SSE_INLINE_BEGIN_2(x,pv)
1838       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1839       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1840       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1841 
1842       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1843       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1844 
1845       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1846       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1847 
1848       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1849       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1850 
1851       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1852       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1853 
1854       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1855       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1856 
1857       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1858       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1859 
1860       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1861       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1862       SSE_INLINE_END_2;
1863       pv += 16;
1864     }
1865     /* invert diagonal block */
1866     w = ba + 16*diag_offset[i];
1867     if (pivotinblocks) {
1868       PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,,&zeropivotdetected));
1869       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1870     } else {
1871       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1872     }
1873 /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1874     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1875   }
1876 
1877   PetscCall(PetscFree(rtmp));
1878 
1879   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1880   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1881   C->assembled           = PETSC_TRUE;
1882 
1883   PetscCall(PetscLogFlops(1.333333333333*bs*bs2*b->mbs));
1884   /* Flop Count from inverting diagonal blocks */
1885   SSE_SCOPE_END;
1886   PetscFunctionReturn(0);
1887 }
1888 
1889 #endif
1890