xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision b2b2dd246975d7f9c8a1571def503d28e659d8b1)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 /* ------------------------------------------------------------*/
10 /*
11       Version for when blocks are 4 by 4
12 */
13 #undef __FUNCT__
14 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
16 {
17   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
18   IS             isrow = b->row,isicol = b->icol;
19   PetscErrorCode ierr;
20   const PetscInt *r,*ic;
21   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
22   PetscInt       *ajtmpold,*ajtmp,nz,row;
23   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
24   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
25   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
26   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
27   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
28   MatScalar      m13,m14,m15,m16;
29   MatScalar      *ba = b->a,*aa = a->a;
30   PetscTruth     pivotinblocks = b->pivotinblocks;
31   PetscReal      shift = info->shiftinblocks;
32 
33   PetscFunctionBegin;
34   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
35   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
36   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
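  /* rtmp is a dense work row: one 4x4 block (16 MatScalars, stored column-major)
     for every block column, addressed directly by block column number */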
37 
38   for (i=0; i<n; i++) {
39     nz    = bi[i+1] - bi[i];
40     ajtmp = bj + bi[i];
41     for  (j=0; j<nz; j++) {
42       x = rtmp+16*ajtmp[j];
43       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
44       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
45     }
46     /* load in the initial (unfactored) row */
47     idx      = r[i];
48     nz       = ai[idx+1] - ai[idx];
49     ajtmpold = aj + ai[idx];
50     v        = aa + 16*ai[idx];
51     for (j=0; j<nz; j++) {
52       x    = rtmp+16*ic[ajtmpold[j]];
53       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
54       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
55       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
56       x[14] = v[14]; x[15] = v[15];
57       v    += 16;
58     }
59     row = *ajtmp++;
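    /* eliminate with each already-factored pivot row that appears in the L part
       of row i (block column indices smaller than i) */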
60     while (row < i) {
61       pc  = rtmp + 16*row;
62       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
63       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
64       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
65       p15 = pc[14]; p16 = pc[15];
66       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
67           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
68           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
69           || p16 != 0.0) {
70         pv = ba + 16*diag_offset[row];
71         pj = bj + diag_offset[row] + 1;
72         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
73         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
74         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
75         x15 = pv[14]; x16 = pv[15];
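        /* form the multiplier block in place: pc = pc*pv, where pv holds the already
           inverted diagonal block of the pivot row; entry (r,c) of a 4x4 block is
           stored column-major at offset r + 4*c */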
76         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
77         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
78         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
79         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
80 
81         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
82         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
83         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
84         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
85 
86         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
87         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
88         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
89         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
90 
91         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
92         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
93         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
94         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
95 
96         nz = bi[row+1] - diag_offset[row] - 1;
97         pv += 16;
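        /* subtract multiplier*U(row,:) from the corresponding blocks of the work row */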
98         for (j=0; j<nz; j++) {
99           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
100           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
101           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
102           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
103           x    = rtmp + 16*pj[j];
104           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
105           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
106           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
107           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
108 
109           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
110           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
111           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
112           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
113 
114           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
115           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
116           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
117           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
118 
119           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
120           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
121           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
122           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
123 
124           pv   += 16;
125         }
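        /* 112 flops to form the 4x4 multiplier plus 2*4^3 = 128 flops per updated block */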
126         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
127       }
128       row = *ajtmp++;
129     }
130     /* finished row so stick it into b->a */
131     pv = ba + 16*bi[i];
132     pj = bj + bi[i];
133     nz = bi[i+1] - bi[i];
134     for (j=0; j<nz; j++) {
135       x      = rtmp+16*pj[j];
136       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
137       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
138       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
139       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
140       pv   += 16;
141     }
142     /* invert diagonal block */
143     w    = ba + 16*diag_offset[i];
144     if (pivotinblocks) {
145       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
146     } else {
147       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148     }
149   }
150 
151   ierr = PetscFree(rtmp);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
153   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154   C->ops->solve          = MatSolve_SeqBAIJ_4;
155   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
156   C->assembled = PETSC_TRUE;
157   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
158   PetscFunctionReturn(0);
159 }
160 
161 /* MatLUFactorNumeric_SeqBAIJ_4_newdatastruct -
162      copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(), with hand-written block-size-4 versions of
163        Kernel_A_gets_A_times_B()
164        Kernel_A_gets_A_minus_B_times_C()
165        Kernel_A_gets_inverse_A()
166 */
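/*
   Storage convention used below (and in the _NaturalOrdering_newdatastruct variant):
   the L part of block row i occupies bj/ba positions bi[i] .. bi[i+1]-1, the U part
   occupies bi[2*n-i] .. bi[2*n-i+1]-1, and bdiag[i] locates the diagonal block,
   which is stored already inverted.
*/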
167 #undef __FUNCT__
168 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct"
169 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
170 {
171   Mat            C=B;
172   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
173   IS             isrow = b->row,isicol = b->icol;
174   PetscErrorCode ierr;
175   const PetscInt *r,*ic,*ics;
176   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
177   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
178   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
179   PetscInt       bs2 = a->bs2,flg;
180   PetscReal      shift = info->shiftinblocks;
181 
182   PetscFunctionBegin;
183   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
184   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
185 
186   /* generate work space needed by the factorization */
187   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
188   mwork = rtmp + bs2*n;
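  /* rtmp holds one bs2-sized block per block column of the work row; mwork is one
     extra bs2 block of scratch space for Kernel_A_gets_A_times_B_4() */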
189   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
190   ics  = ic;
191 
192   for (i=0; i<n; i++){
193     /* zero rtmp */
194     /* L part */
195     nz    = bi[i+1] - bi[i];
196     bjtmp = bj + bi[i];
197     for  (j=0; j<nz; j++){
198       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
199     }
200 
201     /* U part */
202     nz = bi[2*n-i+1] - bi[2*n-i];
203     bjtmp = bj + bi[2*n-i];
204     for  (j=0; j<nz; j++){
205       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
206     }
207 
208     /* load in the initial (unfactored) row */
209     nz    = ai[r[i]+1] - ai[r[i]];
210     ajtmp = aj + ai[r[i]];
211     v     = aa + bs2*ai[r[i]];
212     for (j=0; j<nz; j++) {
213       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
214     }
215 
216     /* elimination */
217     bjtmp = bj + bi[i];
218     nzL   = bi[i+1] - bi[i];
219     for(k=0;k < nzL;k++) {
220       row = bjtmp[k];
221       pc = rtmp + bs2*row;
222       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
223       if (flg) {
224         pv = b->a + bs2*bdiag[row];
225         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
226         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
227 
228         pj = b->j + bi[2*n-row]; /* beginning of U(row,:) */
229         pv = b->a + bs2*bi[2*n-row];
230         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries in U(row,:), excluding diag */
231         for (j=0; j<nz; j++) {
232           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
233           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
234           v    = rtmp + bs2*pj[j];
235           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
236           pv  += bs2;
237         }
238         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
239       }
240     }
241 
242     /* finished row so stick it into b->a */
243     /* L part */
244     pv   = b->a + bs2*bi[i] ;
245     pj   = b->j + bi[i] ;
246     nz   = bi[i+1] - bi[i];
247     for (j=0; j<nz; j++) {
248       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
249     }
250 
251     /* Mark diagonal and invert diagonal for simpler triangular solves */
252     pv   = b->a + bs2*bdiag[i];
253     pj   = b->j + bdiag[i];
254     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
255     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
256     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
257 
258     /* U part */
259     pv = b->a + bs2*bi[2*n-i];
260     pj = b->j + bi[2*n-i];
261     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
262     for (j=0; j<nz; j++){
263       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
264     }
265   }
266 
267   ierr = PetscFree(rtmp);CHKERRQ(ierr);
268   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
269   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
270 
271   C->assembled = PETSC_TRUE;
272   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
273   PetscFunctionReturn(0);
274 }
275 
276 #undef __FUNCT__
277 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
278 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
279 {
280 /*
281     Default Version for when blocks are 4 by 4 Using natural ordering
282 */
283   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
284   PetscErrorCode ierr;
285   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
286   PetscInt       *ajtmpold,*ajtmp,nz,row;
287   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
288   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
289   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
290   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
291   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
292   MatScalar      m13,m14,m15,m16;
293   MatScalar      *ba = b->a,*aa = a->a;
294   PetscTruth     pivotinblocks = b->pivotinblocks;
295   PetscReal      shift = info->shiftinblocks;
296 
297   PetscFunctionBegin;
298   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
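  /* same algorithm as MatLUFactorNumeric_SeqBAIJ_4(), but the natural ordering makes
     the row permutation r[] and inverse column permutation ic[] unnecessary */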
299 
300   for (i=0; i<n; i++) {
301     nz    = bi[i+1] - bi[i];
302     ajtmp = bj + bi[i];
303     for  (j=0; j<nz; j++) {
304       x = rtmp+16*ajtmp[j];
305       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
306       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
307     }
308     /* load in the initial (unfactored) row */
309     nz       = ai[i+1] - ai[i];
310     ajtmpold = aj + ai[i];
311     v        = aa + 16*ai[i];
312     for (j=0; j<nz; j++) {
313       x    = rtmp+16*ajtmpold[j];
314       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
315       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
316       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
317       x[14] = v[14]; x[15] = v[15];
318       v    += 16;
319     }
320     row = *ajtmp++;
321     while (row < i) {
322       pc  = rtmp + 16*row;
323       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
324       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
325       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
326       p15 = pc[14]; p16 = pc[15];
327       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
328           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
329           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
330           || p16 != 0.0) {
331         pv = ba + 16*diag_offset[row];
332         pj = bj + diag_offset[row] + 1;
333         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
334         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
335         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
336         x15 = pv[14]; x16 = pv[15];
337         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
338         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
339         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
340         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
341 
342         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
343         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
344         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
345         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
346 
347         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
348         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
349         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
350         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
351 
352         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
353         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
354         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
355         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
356         nz = bi[row+1] - diag_offset[row] - 1;
357         pv += 16;
358         for (j=0; j<nz; j++) {
359           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
360           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
361           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
362           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
363           x    = rtmp + 16*pj[j];
364           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
365           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
366           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
367           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
368 
369           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
370           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
371           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
372           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
373 
374           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
375           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
376           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
377           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
378 
379           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
380           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
381           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
382           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
383 
384           pv   += 16;
385         }
386         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
387       }
388       row = *ajtmp++;
389     }
390     /* finished row so stick it into b->a */
391     pv = ba + 16*bi[i];
392     pj = bj + bi[i];
393     nz = bi[i+1] - bi[i];
394     for (j=0; j<nz; j++) {
395       x      = rtmp+16*pj[j];
396       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
397       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
398       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
399       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
400       pv   += 16;
401     }
402     /* invert diagonal block */
403     w = ba + 16*diag_offset[i];
404     if (pivotinblocks) {
405       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
406     } else {
407       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
408     }
409   }
410 
411   ierr = PetscFree(rtmp);CHKERRQ(ierr);
412   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
413   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
414   C->assembled = PETSC_TRUE;
415   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
416   PetscFunctionReturn(0);
417 }
418 /*
419   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct -
420     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct()
421 */
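/*
   In this _v2 layout the U part of block row i occupies bj/ba positions
   bdiag[i+1]+1 .. bdiag[i], with the (inverted) diagonal block stored at bdiag[i];
   note that bdiag[] decreases as i increases.
*/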
422 #undef __FUNCT__
423 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct"
424 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
425 {
426   Mat            C=B;
427   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
428   PetscErrorCode ierr;
429   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
430   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
431   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
432   PetscInt       bs2 = a->bs2,flg;
433   PetscReal      shift = info->shiftinblocks;
434 
435   PetscFunctionBegin;
436   /* generate work space needed by the factorization */
437   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
438   mwork = rtmp + bs2*n;
439   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
440 
441   for (i=0; i<n; i++){
442     /* zero rtmp */
443     /* L part */
444     nz    = bi[i+1] - bi[i];
445     bjtmp = bj + bi[i];
446     for  (j=0; j<nz; j++){
447       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
448     }
449 
450     /* U part */
451     nz = bi[2*n-i+1] - bi[2*n-i];
452     bjtmp = bj + bi[2*n-i];
453     for  (j=0; j<nz; j++){
454       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
455     }
456 
457     /* load in the initial (unfactored) row */
458     nz    = ai[i+1] - ai[i];
459     ajtmp = aj + ai[i];
460     v     = aa + bs2*ai[i];
461     for (j=0; j<nz; j++) {
462       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
463     }
464 
465     /* elimination */
466     bjtmp = bj + bi[i];
467     nzL   = bi[i+1] - bi[i];
468     for(k=0;k < nzL;k++) {
469       row = bjtmp[k];
470       pc = rtmp + bs2*row;
471       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
472       if (flg) {
473         pv = b->a + bs2*bdiag[row];
474         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
475         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
476 
477         pj = b->j + bi[2*n-row]; /* beginning of U(row,:) */
478         pv = b->a + bs2*bi[2*n-row];
479         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries in U(row,:), excluding diag */
480         for (j=0; j<nz; j++) {
481           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
482           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
483           v    = rtmp + bs2*pj[j];
484           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
485           pv  += bs2;
486         }
487         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
488       }
489     }
490 
491     /* finished row so stick it into b->a */
492     /* L part */
493     pv   = b->a + bs2*bi[i] ;
494     pj   = b->j + bi[i] ;
495     nz   = bi[i+1] - bi[i];
496     for (j=0; j<nz; j++) {
497       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
498     }
499 
500     /* Mark diagonal and invert diagonal for simpler triangular solves */
501     pv   = b->a + bs2*bdiag[i];
502     pj   = b->j + bdiag[i];
503     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
504     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
505     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
506 
507     /* U part */
508     pv = b->a + bs2*bi[2*n-i];
509     pj = b->j + bi[2*n-i];
510     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
511     for (j=0; j<nz; j++){
512       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
513     }
514   }
515 
516   ierr = PetscFree(rtmp);CHKERRQ(ierr);
517   C->assembled = PETSC_TRUE;
518   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
519   PetscFunctionReturn(0);
520 }
521 
522 #undef __FUNCT__
523 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
524 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
525 {
526   Mat            C=B;
527   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
528   PetscErrorCode ierr;
529   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
530   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
531   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
532   PetscInt       bs2 = a->bs2,flg;
533   PetscReal      shift = info->shiftinblocks;
534 
535   PetscFunctionBegin;
536   /* generate work space needed by the factorization */
537   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
538   mwork = rtmp + bs2*n;
539   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
540 
541   for (i=0; i<n; i++){
542     /* zero rtmp */
543     /* L part */
544     nz    = bi[i+1] - bi[i];
545     bjtmp = bj + bi[i];
546     for  (j=0; j<nz; j++){
547       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
548     }
549 
550     /* U part */
551     nz = bdiag[i] - bdiag[i+1];
552     bjtmp = bj + bdiag[i+1]+1;
553     for  (j=0; j<nz; j++){
554       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
555     }
556 
557     /* load in the initial (unfactored) row */
558     nz    = ai[i+1] - ai[i];
559     ajtmp = aj + ai[i];
560     v     = aa + bs2*ai[i];
561     for (j=0; j<nz; j++) {
562       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
563     }
564 
565     /* elimination */
566     bjtmp = bj + bi[i];
567     nzL   = bi[i+1] - bi[i];
568     for(k=0;k < nzL;k++) {
569       row = bjtmp[k];
570       pc = rtmp + bs2*row;
571       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
572       if (flg) {
573         pv = b->a + bs2*bdiag[row];
574         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
575         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
576 
577         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
578         pv = b->a + bs2*(bdiag[row+1]+1);
579         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
580         for (j=0; j<nz; j++) {
581           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
582           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
583           v    = rtmp + bs2*pj[j];
584           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
585           pv  += bs2;
586         }
587         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
588       }
589     }
590 
591     /* finished row so stick it into b->a */
592     /* L part */
593     pv   = b->a + bs2*bi[i] ;
594     pj   = b->j + bi[i] ;
595     nz   = bi[i+1] - bi[i];
596     for (j=0; j<nz; j++) {
597       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
598     }
599 
600     /* Mark diagonal and invert diagonal for simpler triangular solves */
601     pv   = b->a + bs2*bdiag[i];
602     pj   = b->j + bdiag[i];
603     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
604     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
605     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
606 
607     /* U part */
608     pv = b->a + bs2*(bdiag[i+1]+1);
609     pj = b->j + bdiag[i+1]+1;
610     nz = bdiag[i] - bdiag[i+1] - 1;
611     for (j=0; j<nz; j++){
612       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
613     }
614   }
615 
616   ierr = PetscFree(rtmp);CHKERRQ(ierr);
617   C->assembled = PETSC_TRUE;
618   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
619   PetscFunctionReturn(0);
620 }
621 
622 #if defined(PETSC_HAVE_SSE)
623 
624 #include PETSC_HAVE_SSE
625 
626 /* SSE Version for when blocks are 4 by 4 Using natural ordering */
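/*
   Note: the SSE kernels below use packed single-precision operations (the _PS forms),
   so this path assumes MatScalar is a 4-byte float and that all 4x4 blocks are
   16-byte aligned, as checked at the top of each routine.
*/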
627 #undef __FUNCT__
628 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
629 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
630 {
631   Mat            C = B;  /* C was referenced below but never declared; alias the factor B as in the other variants */
      Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
632   PetscErrorCode ierr;
633   int i,j,n = a->mbs;
634   int         *bj = b->j,*bjtmp,*pj;
635   int         row;
636   int         *ajtmpold,nz,*bi=b->i;
637   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
638   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
639   MatScalar   *ba = b->a,*aa = a->a;
640   int         nonzero=0;
641 /*    int            nonzero=0,colscale = 16; */
642   PetscTruth  pivotinblocks = b->pivotinblocks;
643   PetscReal      shift = info->shiftinblocks;
644 
645   PetscFunctionBegin;
646   SSE_SCOPE_BEGIN;
647 
648   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
649   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
650   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
651   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
652 /*    if ((unsigned long)bj==(unsigned long)aj) { */
653 /*      colscale = 4; */
654 /*    } */
655   for (i=0; i<n; i++) {
656     nz    = bi[i+1] - bi[i];
657     bjtmp = bj + bi[i];
658     /* zero out the 4x4 block accumulators */
659     /* zero out one register */
660     XOR_PS(XMM7,XMM7);
661     for  (j=0; j<nz; j++) {
662       x = rtmp+16*bjtmp[j];
663 /*        x = rtmp+4*bjtmp[j]; */
664       SSE_INLINE_BEGIN_1(x)
665         /* Copy zero register to memory locations */
666         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
667         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
668         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
669         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
670         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
671         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
672         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
673         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
674         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
675       SSE_INLINE_END_1;
676     }
677     /* load in the initial (unfactored) row */
678     nz       = ai[i+1] - ai[i];
679     ajtmpold = aj + ai[i];
680     v        = aa + 16*ai[i];
681     for (j=0; j<nz; j++) {
682       x = rtmp+16*ajtmpold[j];
683 /*        x = rtmp+colscale*ajtmpold[j]; */
684       /* Copy v block into x block */
685       SSE_INLINE_BEGIN_2(v,x)
686         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
687         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
688         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
689 
690         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
691         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
692 
693         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
694         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
695 
696         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
697         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
698 
699         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
700         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
701 
702         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
703         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
704 
705         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
706         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
707 
708         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
709         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
710       SSE_INLINE_END_2;
711 
712       v += 16;
713     }
714 /*      row = (*bjtmp++)/4; */
715     row = *bjtmp++;
716     while (row < i) {
717       pc  = rtmp + 16*row;
718       SSE_INLINE_BEGIN_1(pc)
719         /* Load block from lower triangle */
720         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
721         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
722         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
723 
724         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
725         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
726 
727         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
728         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
729 
730         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
731         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
732 
733         /* Compare block to zero block */
734 
735         SSE_COPY_PS(XMM4,XMM7)
736         SSE_CMPNEQ_PS(XMM4,XMM0)
737 
738         SSE_COPY_PS(XMM5,XMM7)
739         SSE_CMPNEQ_PS(XMM5,XMM1)
740 
741         SSE_COPY_PS(XMM6,XMM7)
742         SSE_CMPNEQ_PS(XMM6,XMM2)
743 
744         SSE_CMPNEQ_PS(XMM7,XMM3)
745 
746         /* Reduce the comparisons to one SSE register */
747         SSE_OR_PS(XMM6,XMM7)
748         SSE_OR_PS(XMM5,XMM4)
749         SSE_OR_PS(XMM5,XMM6)
750       SSE_INLINE_END_1;
751 
752       /* Reduce the one SSE register to an integer register for branching */
753       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
754       MOVEMASK(nonzero,XMM5);
755 
756       /* If block is nonzero ... */
757       if (nonzero) {
758         pv = ba + 16*diag_offset[row];
759         PREFETCH_L1(&pv[16]);
760         pj = bj + diag_offset[row] + 1;
761 
762         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
763         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
764         /* but the diagonal was inverted already */
765         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
766 
767         SSE_INLINE_BEGIN_2(pv,pc)
768           /* Column 0, product is accumulated in XMM4 */
769           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
770           SSE_SHUFFLE(XMM4,XMM4,0x00)
771           SSE_MULT_PS(XMM4,XMM0)
772 
773           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
774           SSE_SHUFFLE(XMM5,XMM5,0x00)
775           SSE_MULT_PS(XMM5,XMM1)
776           SSE_ADD_PS(XMM4,XMM5)
777 
778           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
779           SSE_SHUFFLE(XMM6,XMM6,0x00)
780           SSE_MULT_PS(XMM6,XMM2)
781           SSE_ADD_PS(XMM4,XMM6)
782 
783           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
784           SSE_SHUFFLE(XMM7,XMM7,0x00)
785           SSE_MULT_PS(XMM7,XMM3)
786           SSE_ADD_PS(XMM4,XMM7)
787 
788           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
789           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
790 
791           /* Column 1, product is accumulated in XMM5 */
792           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
793           SSE_SHUFFLE(XMM5,XMM5,0x00)
794           SSE_MULT_PS(XMM5,XMM0)
795 
796           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
797           SSE_SHUFFLE(XMM6,XMM6,0x00)
798           SSE_MULT_PS(XMM6,XMM1)
799           SSE_ADD_PS(XMM5,XMM6)
800 
801           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
802           SSE_SHUFFLE(XMM7,XMM7,0x00)
803           SSE_MULT_PS(XMM7,XMM2)
804           SSE_ADD_PS(XMM5,XMM7)
805 
806           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
807           SSE_SHUFFLE(XMM6,XMM6,0x00)
808           SSE_MULT_PS(XMM6,XMM3)
809           SSE_ADD_PS(XMM5,XMM6)
810 
811           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
812           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
813 
814           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
815 
816           /* Column 2, product is accumulated in XMM6 */
817           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
818           SSE_SHUFFLE(XMM6,XMM6,0x00)
819           SSE_MULT_PS(XMM6,XMM0)
820 
821           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
822           SSE_SHUFFLE(XMM7,XMM7,0x00)
823           SSE_MULT_PS(XMM7,XMM1)
824           SSE_ADD_PS(XMM6,XMM7)
825 
826           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
827           SSE_SHUFFLE(XMM7,XMM7,0x00)
828           SSE_MULT_PS(XMM7,XMM2)
829           SSE_ADD_PS(XMM6,XMM7)
830 
831           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
832           SSE_SHUFFLE(XMM7,XMM7,0x00)
833           SSE_MULT_PS(XMM7,XMM3)
834           SSE_ADD_PS(XMM6,XMM7)
835 
836           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
837           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
838 
839           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
840           /* Column 3, product is accumulated in XMM0 */
841           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
842           SSE_SHUFFLE(XMM7,XMM7,0x00)
843           SSE_MULT_PS(XMM0,XMM7)
844 
845           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
846           SSE_SHUFFLE(XMM7,XMM7,0x00)
847           SSE_MULT_PS(XMM1,XMM7)
848           SSE_ADD_PS(XMM0,XMM1)
849 
850           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
851           SSE_SHUFFLE(XMM1,XMM1,0x00)
852           SSE_MULT_PS(XMM1,XMM2)
853           SSE_ADD_PS(XMM0,XMM1)
854 
855           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
856           SSE_SHUFFLE(XMM7,XMM7,0x00)
857           SSE_MULT_PS(XMM3,XMM7)
858           SSE_ADD_PS(XMM0,XMM3)
859 
860           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
861           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
862 
863           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
864           /* This is code to be maintained and read by humans after all. */
865           /* Copy Multiplier Col 3 into XMM3 */
866           SSE_COPY_PS(XMM3,XMM0)
867           /* Copy Multiplier Col 2 into XMM2 */
868           SSE_COPY_PS(XMM2,XMM6)
869           /* Copy Multiplier Col 1 into XMM1 */
870           SSE_COPY_PS(XMM1,XMM5)
871           /* Copy Multiplier Col 0 into XMM0 */
872           SSE_COPY_PS(XMM0,XMM4)
873         SSE_INLINE_END_2;
874 
875         /* Update the row: */
876         nz = bi[row+1] - diag_offset[row] - 1;
877         pv += 16;
878         for (j=0; j<nz; j++) {
879           PREFETCH_L1(&pv[16]);
880           x = rtmp + 16*pj[j];
881 /*            x = rtmp + 4*pj[j]; */
882 
883           /* X:=X-M*PV, One column at a time */
884           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
885           SSE_INLINE_BEGIN_2(x,pv)
886             /* Load First Column of X*/
887             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
888             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
889 
890             /* Matrix-Vector Product: */
891             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
892             SSE_SHUFFLE(XMM5,XMM5,0x00)
893             SSE_MULT_PS(XMM5,XMM0)
894             SSE_SUB_PS(XMM4,XMM5)
895 
896             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
897             SSE_SHUFFLE(XMM6,XMM6,0x00)
898             SSE_MULT_PS(XMM6,XMM1)
899             SSE_SUB_PS(XMM4,XMM6)
900 
901             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
902             SSE_SHUFFLE(XMM7,XMM7,0x00)
903             SSE_MULT_PS(XMM7,XMM2)
904             SSE_SUB_PS(XMM4,XMM7)
905 
906             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
907             SSE_SHUFFLE(XMM5,XMM5,0x00)
908             SSE_MULT_PS(XMM5,XMM3)
909             SSE_SUB_PS(XMM4,XMM5)
910 
911             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
912             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
913 
914             /* Second Column */
915             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
916             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
917 
918             /* Matrix-Vector Product: */
919             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
920             SSE_SHUFFLE(XMM6,XMM6,0x00)
921             SSE_MULT_PS(XMM6,XMM0)
922             SSE_SUB_PS(XMM5,XMM6)
923 
924             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
925             SSE_SHUFFLE(XMM7,XMM7,0x00)
926             SSE_MULT_PS(XMM7,XMM1)
927             SSE_SUB_PS(XMM5,XMM7)
928 
929             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
930             SSE_SHUFFLE(XMM4,XMM4,0x00)
931             SSE_MULT_PS(XMM4,XMM2)
932             SSE_SUB_PS(XMM5,XMM4)
933 
934             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
935             SSE_SHUFFLE(XMM6,XMM6,0x00)
936             SSE_MULT_PS(XMM6,XMM3)
937             SSE_SUB_PS(XMM5,XMM6)
938 
939             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
940             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
941 
942             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
943 
944             /* Third Column */
945             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
946             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
947 
948             /* Matrix-Vector Product: */
949             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
950             SSE_SHUFFLE(XMM7,XMM7,0x00)
951             SSE_MULT_PS(XMM7,XMM0)
952             SSE_SUB_PS(XMM6,XMM7)
953 
954             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
955             SSE_SHUFFLE(XMM4,XMM4,0x00)
956             SSE_MULT_PS(XMM4,XMM1)
957             SSE_SUB_PS(XMM6,XMM4)
958 
959             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
960             SSE_SHUFFLE(XMM5,XMM5,0x00)
961             SSE_MULT_PS(XMM5,XMM2)
962             SSE_SUB_PS(XMM6,XMM5)
963 
964             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
965             SSE_SHUFFLE(XMM7,XMM7,0x00)
966             SSE_MULT_PS(XMM7,XMM3)
967             SSE_SUB_PS(XMM6,XMM7)
968 
969             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
970             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
971 
972             /* Fourth Column */
973             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
974             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
975 
976             /* Matrix-Vector Product: */
977             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
978             SSE_SHUFFLE(XMM5,XMM5,0x00)
979             SSE_MULT_PS(XMM5,XMM0)
980             SSE_SUB_PS(XMM4,XMM5)
981 
982             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
983             SSE_SHUFFLE(XMM6,XMM6,0x00)
984             SSE_MULT_PS(XMM6,XMM1)
985             SSE_SUB_PS(XMM4,XMM6)
986 
987             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
988             SSE_SHUFFLE(XMM7,XMM7,0x00)
989             SSE_MULT_PS(XMM7,XMM2)
990             SSE_SUB_PS(XMM4,XMM7)
991 
992             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
993             SSE_SHUFFLE(XMM5,XMM5,0x00)
994             SSE_MULT_PS(XMM5,XMM3)
995             SSE_SUB_PS(XMM4,XMM5)
996 
997             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
998             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
999           SSE_INLINE_END_2;
1000           pv   += 16;
1001         }
1002         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1003       }
1004       row = *bjtmp++;
1005 /*        row = (*bjtmp++)/4; */
1006     }
1007     /* finished row so stick it into b->a */
1008     pv = ba + 16*bi[i];
1009     pj = bj + bi[i];
1010     nz = bi[i+1] - bi[i];
1011 
1012     /* Copy x block back into pv block */
1013     for (j=0; j<nz; j++) {
1014       x  = rtmp+16*pj[j];
1015 /*        x  = rtmp+4*pj[j]; */
1016 
1017       SSE_INLINE_BEGIN_2(x,pv)
1018         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1019         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1020         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1021 
1022         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1023         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1024 
1025         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1026         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1027 
1028         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1029         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1030 
1031         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1032         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1033 
1034         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1035         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1036 
1037         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1038         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1039 
1040         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1041         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1042       SSE_INLINE_END_2;
1043       pv += 16;
1044     }
1045     /* invert diagonal block */
1046     w = ba + 16*diag_offset[i];
1047     if (pivotinblocks) {
1048       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1049     } else {
1050       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1051     }
1052 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1053     /* Note: since Cramer's rule is used, the flop count below may be unfairly high or low */
1054   }
1055 
1056   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1057   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1058   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1059   C->assembled = PETSC_TRUE;
1060   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1061   /* Flop Count from inverting diagonal blocks */
1062   SSE_SCOPE_END;
1063   PetscFunctionReturn(0);
1064 }
1065 
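/*
   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace -
     in-place variant ("usj" = unsigned short j) that reads the column index arrays
     through unsigned short pointers, which assumes they were stored as 16-bit
     indices, i.e. the number of block rows fits in 16 bits.
*/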
1066 #undef __FUNCT__
1067 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
1068 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
1069 {
1070   Mat            A=C;
1071   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1072   PetscErrorCode ierr;
1073   int i,j,n = a->mbs;
1074   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1075   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
1076   unsigned int   row;
1077   int            nz,*bi=b->i;
1078   int            *diag_offset = b->diag,*ai=a->i;
1079   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1080   MatScalar      *ba = b->a,*aa = a->a;
1081   int            nonzero=0;
1082 /*    int            nonzero=0,colscale = 16; */
1083   PetscTruth     pivotinblocks = b->pivotinblocks;
1084   PetscReal      shift = 0.0;  /* no MatFactorInfo is passed to this in-place variant, so no shift is applied */
1085 
1086   PetscFunctionBegin;
1087   SSE_SCOPE_BEGIN;
1088 
1089   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1090   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1091   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1092   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1093 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1094 /*      colscale = 4; */
1095 /*    } */
1096 
1097   for (i=0; i<n; i++) {
1098     nz    = bi[i+1] - bi[i];
1099     bjtmp = bj + bi[i];
1100     /* zero out the 4x4 block accumulators */
1101     /* zero out one register */
1102     XOR_PS(XMM7,XMM7);
1103     for  (j=0; j<nz; j++) {
1104       x = rtmp+16*((unsigned int)bjtmp[j]);
1105 /*        x = rtmp+4*bjtmp[j]; */
1106       SSE_INLINE_BEGIN_1(x)
1107         /* Copy zero register to memory locations */
1108         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1109         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1110         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1111         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1112         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1113         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1114         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1115         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1116         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1117       SSE_INLINE_END_1;
1118     }
1119     /* load in the initial (unfactored) row */
1120     nz    = ai[i+1] - ai[i];
1121     ajtmp = aj + ai[i];
1122     v     = aa + 16*ai[i];
1123     for (j=0; j<nz; j++) {
1124       x = rtmp+16*((unsigned int)ajtmp[j]);
1125 /*        x = rtmp+colscale*ajtmp[j]; */
1126       /* Copy v block into x block */
1127       SSE_INLINE_BEGIN_2(v,x)
1128         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1129         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1130         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1131 
1132         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1133         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1134 
1135         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1136         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1137 
1138         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1139         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1140 
1141         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1142         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1143 
1144         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1145         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1146 
1147         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1148         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1149 
1150         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1151         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1152       SSE_INLINE_END_2;
1153 
1154       v += 16;
1155     }
1156 /*      row = (*bjtmp++)/4; */
1157     row = (unsigned int)(*bjtmp++);
1158     while (row < i) {
1159       pc  = rtmp + 16*row;
1160       SSE_INLINE_BEGIN_1(pc)
1161         /* Load block from lower triangle */
1162         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1163         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1164         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1165 
1166         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1167         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1168 
1169         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1170         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1171 
1172         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1173         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1174 
1175         /* Compare block to zero block */
1176 
1177         SSE_COPY_PS(XMM4,XMM7)
1178         SSE_CMPNEQ_PS(XMM4,XMM0)
1179 
1180         SSE_COPY_PS(XMM5,XMM7)
1181         SSE_CMPNEQ_PS(XMM5,XMM1)
1182 
1183         SSE_COPY_PS(XMM6,XMM7)
1184         SSE_CMPNEQ_PS(XMM6,XMM2)
1185 
1186         SSE_CMPNEQ_PS(XMM7,XMM3)
1187 
1188         /* Reduce the comparisons to one SSE register */
1189         SSE_OR_PS(XMM6,XMM7)
1190         SSE_OR_PS(XMM5,XMM4)
1191         SSE_OR_PS(XMM5,XMM6)
1192       SSE_INLINE_END_1;
1193 
1194       /* Reduce the one SSE register to an integer register for branching */
1195       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1196       MOVEMASK(nonzero,XMM5);
1197 
1198       /* If block is nonzero ... */
1199       if (nonzero) {
1200         pv = ba + 16*diag_offset[row];
1201         PREFETCH_L1(&pv[16]);
1202         pj = bj + diag_offset[row] + 1;
1203 
1204         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1205         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1206         /* but the diagonal was inverted already */
1207         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1208 
1209         SSE_INLINE_BEGIN_2(pv,pc)
1210           /* Column 0, product is accumulated in XMM4 */
1211           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1212           SSE_SHUFFLE(XMM4,XMM4,0x00)
1213           SSE_MULT_PS(XMM4,XMM0)
1214 
1215           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1216           SSE_SHUFFLE(XMM5,XMM5,0x00)
1217           SSE_MULT_PS(XMM5,XMM1)
1218           SSE_ADD_PS(XMM4,XMM5)
1219 
1220           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1221           SSE_SHUFFLE(XMM6,XMM6,0x00)
1222           SSE_MULT_PS(XMM6,XMM2)
1223           SSE_ADD_PS(XMM4,XMM6)
1224 
1225           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1226           SSE_SHUFFLE(XMM7,XMM7,0x00)
1227           SSE_MULT_PS(XMM7,XMM3)
1228           SSE_ADD_PS(XMM4,XMM7)
1229 
1230           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1231           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1232 
1233           /* Column 1, product is accumulated in XMM5 */
1234           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1235           SSE_SHUFFLE(XMM5,XMM5,0x00)
1236           SSE_MULT_PS(XMM5,XMM0)
1237 
1238           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1239           SSE_SHUFFLE(XMM6,XMM6,0x00)
1240           SSE_MULT_PS(XMM6,XMM1)
1241           SSE_ADD_PS(XMM5,XMM6)
1242 
1243           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1244           SSE_SHUFFLE(XMM7,XMM7,0x00)
1245           SSE_MULT_PS(XMM7,XMM2)
1246           SSE_ADD_PS(XMM5,XMM7)
1247 
1248           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1249           SSE_SHUFFLE(XMM6,XMM6,0x00)
1250           SSE_MULT_PS(XMM6,XMM3)
1251           SSE_ADD_PS(XMM5,XMM6)
1252 
1253           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1254           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1255 
1256           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1257 
1258           /* Column 2, product is accumulated in XMM6 */
1259           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1260           SSE_SHUFFLE(XMM6,XMM6,0x00)
1261           SSE_MULT_PS(XMM6,XMM0)
1262 
1263           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1264           SSE_SHUFFLE(XMM7,XMM7,0x00)
1265           SSE_MULT_PS(XMM7,XMM1)
1266           SSE_ADD_PS(XMM6,XMM7)
1267 
1268           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1269           SSE_SHUFFLE(XMM7,XMM7,0x00)
1270           SSE_MULT_PS(XMM7,XMM2)
1271           SSE_ADD_PS(XMM6,XMM7)
1272 
1273           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1274           SSE_SHUFFLE(XMM7,XMM7,0x00)
1275           SSE_MULT_PS(XMM7,XMM3)
1276           SSE_ADD_PS(XMM6,XMM7)
1277 
1278           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1279           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1280 
1281           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1282           /* Column 3, product is accumulated in XMM0 */
1283           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1284           SSE_SHUFFLE(XMM7,XMM7,0x00)
1285           SSE_MULT_PS(XMM0,XMM7)
1286 
1287           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1288           SSE_SHUFFLE(XMM7,XMM7,0x00)
1289           SSE_MULT_PS(XMM1,XMM7)
1290           SSE_ADD_PS(XMM0,XMM1)
1291 
1292           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1293           SSE_SHUFFLE(XMM1,XMM1,0x00)
1294           SSE_MULT_PS(XMM1,XMM2)
1295           SSE_ADD_PS(XMM0,XMM1)
1296 
1297           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1298           SSE_SHUFFLE(XMM7,XMM7,0x00)
1299           SSE_MULT_PS(XMM3,XMM7)
1300           SSE_ADD_PS(XMM0,XMM3)
1301 
1302           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1303           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1304 
1305           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1306           /* This is code to be maintained and read by humans after all. */
1307           /* Copy Multiplier Col 3 into XMM3 */
1308           SSE_COPY_PS(XMM3,XMM0)
1309           /* Copy Multiplier Col 2 into XMM2 */
1310           SSE_COPY_PS(XMM2,XMM6)
1311           /* Copy Multiplier Col 1 into XMM1 */
1312           SSE_COPY_PS(XMM1,XMM5)
1313           /* Copy Multiplier Col 0 into XMM0 */
1314           SSE_COPY_PS(XMM0,XMM4)
1315         SSE_INLINE_END_2;
1316 
1317         /* Update the row: */
1318         nz = bi[row+1] - diag_offset[row] - 1;
1319         pv += 16;
1320         for (j=0; j<nz; j++) {
1321           PREFETCH_L1(&pv[16]);
1322           x = rtmp + 16*((unsigned int)pj[j]);
1323 /*            x = rtmp + 4*pj[j]; */
1324 
1325           /* X:=X-M*PV, One column at a time */
1326           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1327           SSE_INLINE_BEGIN_2(x,pv)
1328             /* Load First Column of X*/
1329             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1330             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1331 
1332             /* Matrix-Vector Product: */
1333             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1334             SSE_SHUFFLE(XMM5,XMM5,0x00)
1335             SSE_MULT_PS(XMM5,XMM0)
1336             SSE_SUB_PS(XMM4,XMM5)
1337 
1338             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1339             SSE_SHUFFLE(XMM6,XMM6,0x00)
1340             SSE_MULT_PS(XMM6,XMM1)
1341             SSE_SUB_PS(XMM4,XMM6)
1342 
1343             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1344             SSE_SHUFFLE(XMM7,XMM7,0x00)
1345             SSE_MULT_PS(XMM7,XMM2)
1346             SSE_SUB_PS(XMM4,XMM7)
1347 
1348             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1349             SSE_SHUFFLE(XMM5,XMM5,0x00)
1350             SSE_MULT_PS(XMM5,XMM3)
1351             SSE_SUB_PS(XMM4,XMM5)
1352 
1353             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1354             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1355 
1356             /* Second Column */
1357             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1358             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1359 
1360             /* Matrix-Vector Product: */
1361             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1362             SSE_SHUFFLE(XMM6,XMM6,0x00)
1363             SSE_MULT_PS(XMM6,XMM0)
1364             SSE_SUB_PS(XMM5,XMM6)
1365 
1366             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1367             SSE_SHUFFLE(XMM7,XMM7,0x00)
1368             SSE_MULT_PS(XMM7,XMM1)
1369             SSE_SUB_PS(XMM5,XMM7)
1370 
1371             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1372             SSE_SHUFFLE(XMM4,XMM4,0x00)
1373             SSE_MULT_PS(XMM4,XMM2)
1374             SSE_SUB_PS(XMM5,XMM4)
1375 
1376             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1377             SSE_SHUFFLE(XMM6,XMM6,0x00)
1378             SSE_MULT_PS(XMM6,XMM3)
1379             SSE_SUB_PS(XMM5,XMM6)
1380 
1381             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1382             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1383 
1384             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1385 
1386             /* Third Column */
1387             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1388             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1389 
1390             /* Matrix-Vector Product: */
1391             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1392             SSE_SHUFFLE(XMM7,XMM7,0x00)
1393             SSE_MULT_PS(XMM7,XMM0)
1394             SSE_SUB_PS(XMM6,XMM7)
1395 
1396             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1397             SSE_SHUFFLE(XMM4,XMM4,0x00)
1398             SSE_MULT_PS(XMM4,XMM1)
1399             SSE_SUB_PS(XMM6,XMM4)
1400 
1401             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1402             SSE_SHUFFLE(XMM5,XMM5,0x00)
1403             SSE_MULT_PS(XMM5,XMM2)
1404             SSE_SUB_PS(XMM6,XMM5)
1405 
1406             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1407             SSE_SHUFFLE(XMM7,XMM7,0x00)
1408             SSE_MULT_PS(XMM7,XMM3)
1409             SSE_SUB_PS(XMM6,XMM7)
1410 
1411             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1412             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1413 
1414             /* Fourth Column */
1415             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1416             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1417 
1418             /* Matrix-Vector Product: */
1419             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1420             SSE_SHUFFLE(XMM5,XMM5,0x00)
1421             SSE_MULT_PS(XMM5,XMM0)
1422             SSE_SUB_PS(XMM4,XMM5)
1423 
1424             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1425             SSE_SHUFFLE(XMM6,XMM6,0x00)
1426             SSE_MULT_PS(XMM6,XMM1)
1427             SSE_SUB_PS(XMM4,XMM6)
1428 
1429             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1430             SSE_SHUFFLE(XMM7,XMM7,0x00)
1431             SSE_MULT_PS(XMM7,XMM2)
1432             SSE_SUB_PS(XMM4,XMM7)
1433 
1434             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1435             SSE_SHUFFLE(XMM5,XMM5,0x00)
1436             SSE_MULT_PS(XMM5,XMM3)
1437             SSE_SUB_PS(XMM4,XMM5)
1438 
1439             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1440             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1441           SSE_INLINE_END_2;
1442           pv   += 16;
1443         }
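        /* Editorial sketch: forming the 4x4 multiplier against the inverted diagonal
           block costs 64 multiplies + 48 adds = 112 flops, and each of the nz block
           updates x := x - m*pv costs another 112 flops for the product plus 16
           subtractions = 128 flops, which appears to match the constants logged below. */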
1444         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1445       }
1446       row = (unsigned int)(*bjtmp++);
1447 /*        row = (*bjtmp++)/4; */
1448 /*        bjtmp++; */
1449     }
1450     /* finished row so stick it into b->a */
1451     pv = ba + 16*bi[i];
1452     pj = bj + bi[i];
1453     nz = bi[i+1] - bi[i];
1454 
1455     /* Copy x block back into pv block */
1456     for (j=0; j<nz; j++) {
1457       x  = rtmp+16*((unsigned int)pj[j]);
1458 /*        x  = rtmp+4*pj[j]; */
1459 
1460       SSE_INLINE_BEGIN_2(x,pv)
1461         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1462         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1463         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1464 
1465         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1466         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1467 
1468         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1469         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1470 
1471         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1472         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1473 
1474         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1475         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1476 
1477         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1478         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1479 
1480         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1481         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1482 
1483         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1484         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1485       SSE_INLINE_END_2;
1486       pv += 16;
1487     }
1488     /* invert diagonal block */
1489     w = ba + 16*diag_offset[i];
1490     if (pivotinblocks) {
1491       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1492     } else {
1493       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1494     }
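    /* Editorial note: judging by the names and arguments, Kernel_A_gets_inverse_A_4
       inverts the 4x4 diagonal block in place with pivoting, using the shift from
       info->shiftinblocks to guard against zero pivots, while the _nopivot variant
       skips the pivot search. */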
1495 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1496     /* Note: since Cramer's rule is used, the flop count below may be unfairly high or low. */
1497   }
1498 
1499   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1500   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1501   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1502   C->assembled = PETSC_TRUE;
1503   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1504   /* Flop Count from inverting diagonal blocks */
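  /* Editorial note: 1.3333*64 is roughly (4/3)*bs^3 with bs = 4, i.e. a nominal
     per-block inversion cost, logged once for each of the b->mbs diagonal blocks. */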
1505   SSE_SCOPE_END;
1506   PetscFunctionReturn(0);
1507 }
1508 
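/* Editorial sketch: per 4x4 block, and with the blocks apparently stored column-major,
   the SSE macro sequences in the surrounding kernels compute the multiplier
   m := pc*pvdiag (against the already inverted diagonal block, accumulated in
   registers before being stored back over pc) and the row update x := x - m*pv.
   The scalar reference below illustrates the update only; the function name is
   editorial, does not exist in PETSc, and the code is guarded out of compilation. */
#if 0
static void BAIJ4_BlockUpdate_ScalarRef(MatScalar *x,const MatScalar *m,const MatScalar *pv)
{
  int r,c,k;
  for (c=0; c<4; c++) {                            /* one column of x at a time */
    for (r=0; r<4; r++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += m[4*k+r]*pv[4*c+k]; /* (m*pv)(r,c)               */
      x[4*c+r] -= s;                               /* x := x - m*pv             */
    }
  }
}
#endif
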
1509 #undef __FUNCT__
1510 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1511 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1512 {
1513   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1514   PetscErrorCode ierr;
1515   int  i,j,n = a->mbs;
1516   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1517   unsigned int   row;
1518   int            *ajtmpold,nz,*bi=b->i;
1519   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1520   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1521   MatScalar      *ba = b->a,*aa = a->a;
1522   int            nonzero=0;
1523 /*    int            nonzero=0,colscale = 16; */
1524   PetscTruth     pivotinblocks = b->pivotinblocks;
1525   PetscReal      shift = info->shiftinblocks;
1526 
1527   PetscFunctionBegin;
1528   SSE_SCOPE_BEGIN;
1529 
1530   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16-byte aligned.  SSE will not work.");
1531   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16-byte aligned.  SSE will not work.");
1532   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1533   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16-byte aligned.  SSE will not work.");
1534 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1535 /*      colscale = 4; */
1536 /*    } */
1537   if ((unsigned long)bj==(unsigned long)aj) {
1538     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1539   }
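  /* Editorial note: this "usj" variant walks b->j through unsigned short pointers,
     presumably to halve the memory traffic for the factored column indices; when
     b->j aliases a->j the indices are still plain ints, so the work is handed off
     to the _Inplace variant instead. */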
1540 
1541   for (i=0; i<n; i++) {
1542     nz    = bi[i+1] - bi[i];
1543     bjtmp = bj + bi[i];
1544     /* zero out the 4x4 block accumulators */
1545     /* zero out one register */
1546     XOR_PS(XMM7,XMM7);
1547     for  (j=0; j<nz; j++) {
1548       x = rtmp+16*((unsigned int)bjtmp[j]);
1549 /*        x = rtmp+4*bjtmp[j]; */
1550       SSE_INLINE_BEGIN_1(x)
1551         /* Copy zero register to memory locations */
1552         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1553         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1554         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1555         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1556         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1557         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1558         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1559         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1560         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1561       SSE_INLINE_END_1;
1562     }
1563     /* load in initial (unfactored row) */
1564     nz       = ai[i+1] - ai[i];
1565     ajtmpold = aj + ai[i];
1566     v        = aa + 16*ai[i];
1567     for (j=0; j<nz; j++) {
1568       x = rtmp+16*ajtmpold[j];
1569 /*        x = rtmp+colscale*ajtmpold[j]; */
1570       /* Copy v block into x block */
1571       SSE_INLINE_BEGIN_2(v,x)
1572         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1573         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1574         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1575 
1576         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1577         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1578 
1579         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1580         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1581 
1582         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1583         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1584 
1585         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1586         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1587 
1588         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1589         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1590 
1591         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1592         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1593 
1594         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1595         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1596       SSE_INLINE_END_2;
1597 
1598       v += 16;
1599     }
1600 /*      row = (*bjtmp++)/4; */
1601     row = (unsigned int)(*bjtmp++);
1602     while (row < i) {
1603       pc  = rtmp + 16*row;
1604       SSE_INLINE_BEGIN_1(pc)
1605         /* Load block from lower triangle */
1606         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1607         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1608         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1609 
1610         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1611         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1612 
1613         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1614         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1615 
1616         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1617         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1618 
1619         /* Compare block to zero block */
1620 
1621         SSE_COPY_PS(XMM4,XMM7)
1622         SSE_CMPNEQ_PS(XMM4,XMM0)
1623 
1624         SSE_COPY_PS(XMM5,XMM7)
1625         SSE_CMPNEQ_PS(XMM5,XMM1)
1626 
1627         SSE_COPY_PS(XMM6,XMM7)
1628         SSE_CMPNEQ_PS(XMM6,XMM2)
1629 
1630         SSE_CMPNEQ_PS(XMM7,XMM3)
1631 
1632         /* Reduce the comparisons to one SSE register */
1633         SSE_OR_PS(XMM6,XMM7)
1634         SSE_OR_PS(XMM5,XMM4)
1635         SSE_OR_PS(XMM5,XMM6)
1636       SSE_INLINE_END_1;
1637 
1638       /* Reduce the one SSE register to an integer register for branching */
1639       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1640       MOVEMASK(nonzero,XMM5);
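      /* Editorial note: each CMPNEQ lane is all ones (sign bit set) where the block
         entry differs from zero; after the ORs, MOVEMASK packs the four sign bits of
         XMM5 into `nonzero', so the branch below is taken iff the 4x4 block has at
         least one nonzero entry. */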
1641 
1642       /* If block is nonzero ... */
1643       if (nonzero) {
1644         pv = ba + 16*diag_offset[row];
1645         PREFETCH_L1(&pv[16]);
1646         pj = bj + diag_offset[row] + 1;
1647 
1648         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1649         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1650         /* but the diagonal was inverted already */
1651         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1652 
1653         SSE_INLINE_BEGIN_2(pv,pc)
1654           /* Column 0, product is accumulated in XMM4 */
1655           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1656           SSE_SHUFFLE(XMM4,XMM4,0x00)
1657           SSE_MULT_PS(XMM4,XMM0)
1658 
1659           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1660           SSE_SHUFFLE(XMM5,XMM5,0x00)
1661           SSE_MULT_PS(XMM5,XMM1)
1662           SSE_ADD_PS(XMM4,XMM5)
1663 
1664           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1665           SSE_SHUFFLE(XMM6,XMM6,0x00)
1666           SSE_MULT_PS(XMM6,XMM2)
1667           SSE_ADD_PS(XMM4,XMM6)
1668 
1669           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1670           SSE_SHUFFLE(XMM7,XMM7,0x00)
1671           SSE_MULT_PS(XMM7,XMM3)
1672           SSE_ADD_PS(XMM4,XMM7)
1673 
1674           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1675           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1676 
1677           /* Column 1, product is accumulated in XMM5 */
1678           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1679           SSE_SHUFFLE(XMM5,XMM5,0x00)
1680           SSE_MULT_PS(XMM5,XMM0)
1681 
1682           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1683           SSE_SHUFFLE(XMM6,XMM6,0x00)
1684           SSE_MULT_PS(XMM6,XMM1)
1685           SSE_ADD_PS(XMM5,XMM6)
1686 
1687           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1688           SSE_SHUFFLE(XMM7,XMM7,0x00)
1689           SSE_MULT_PS(XMM7,XMM2)
1690           SSE_ADD_PS(XMM5,XMM7)
1691 
1692           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1693           SSE_SHUFFLE(XMM6,XMM6,0x00)
1694           SSE_MULT_PS(XMM6,XMM3)
1695           SSE_ADD_PS(XMM5,XMM6)
1696 
1697           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1698           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1699 
1700           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1701 
1702           /* Column 2, product is accumulated in XMM6 */
1703           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1704           SSE_SHUFFLE(XMM6,XMM6,0x00)
1705           SSE_MULT_PS(XMM6,XMM0)
1706 
1707           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1708           SSE_SHUFFLE(XMM7,XMM7,0x00)
1709           SSE_MULT_PS(XMM7,XMM1)
1710           SSE_ADD_PS(XMM6,XMM7)
1711 
1712           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1713           SSE_SHUFFLE(XMM7,XMM7,0x00)
1714           SSE_MULT_PS(XMM7,XMM2)
1715           SSE_ADD_PS(XMM6,XMM7)
1716 
1717           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1718           SSE_SHUFFLE(XMM7,XMM7,0x00)
1719           SSE_MULT_PS(XMM7,XMM3)
1720           SSE_ADD_PS(XMM6,XMM7)
1721 
1722           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1723           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1724 
1725           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1726           /* Column 3, product is accumulated in XMM0 */
1727           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1728           SSE_SHUFFLE(XMM7,XMM7,0x00)
1729           SSE_MULT_PS(XMM0,XMM7)
1730 
1731           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1732           SSE_SHUFFLE(XMM7,XMM7,0x00)
1733           SSE_MULT_PS(XMM1,XMM7)
1734           SSE_ADD_PS(XMM0,XMM1)
1735 
1736           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1737           SSE_SHUFFLE(XMM1,XMM1,0x00)
1738           SSE_MULT_PS(XMM1,XMM2)
1739           SSE_ADD_PS(XMM0,XMM1)
1740 
1741           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1742           SSE_SHUFFLE(XMM7,XMM7,0x00)
1743           SSE_MULT_PS(XMM3,XMM7)
1744           SSE_ADD_PS(XMM0,XMM3)
1745 
1746           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1747           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1748 
1749           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1750           /* This is code to be maintained and read by humans, after all. */
1751           /* Copy Multiplier Col 3 into XMM3 */
1752           SSE_COPY_PS(XMM3,XMM0)
1753           /* Copy Multiplier Col 2 into XMM2 */
1754           SSE_COPY_PS(XMM2,XMM6)
1755           /* Copy Multiplier Col 1 into XMM1 */
1756           SSE_COPY_PS(XMM1,XMM5)
1757           /* Copy Multiplier Col 0 into XMM0 */
1758           SSE_COPY_PS(XMM0,XMM4)
1759         SSE_INLINE_END_2;
1760 
1761         /* Update the row: */
1762         nz = bi[row+1] - diag_offset[row] - 1;
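        /* nz is the number of blocks in pivot row `row' strictly past its diagonal;
           each of them updates the block of the current row i in the same column. */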
1763         pv += 16;
1764         for (j=0; j<nz; j++) {
1765           PREFETCH_L1(&pv[16]);
1766           x = rtmp + 16*((unsigned int)pj[j]);
1767 /*            x = rtmp + 4*pj[j]; */
1768 
1769           /* X:=X-M*PV, One column at a time */
1770           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1771           SSE_INLINE_BEGIN_2(x,pv)
1772             /* Load First Column of X */
1773             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1774             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1775 
1776             /* Matrix-Vector Product: */
1777             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1778             SSE_SHUFFLE(XMM5,XMM5,0x00)
1779             SSE_MULT_PS(XMM5,XMM0)
1780             SSE_SUB_PS(XMM4,XMM5)
1781 
1782             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1783             SSE_SHUFFLE(XMM6,XMM6,0x00)
1784             SSE_MULT_PS(XMM6,XMM1)
1785             SSE_SUB_PS(XMM4,XMM6)
1786 
1787             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1788             SSE_SHUFFLE(XMM7,XMM7,0x00)
1789             SSE_MULT_PS(XMM7,XMM2)
1790             SSE_SUB_PS(XMM4,XMM7)
1791 
1792             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1793             SSE_SHUFFLE(XMM5,XMM5,0x00)
1794             SSE_MULT_PS(XMM5,XMM3)
1795             SSE_SUB_PS(XMM4,XMM5)
1796 
1797             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1798             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1799 
1800             /* Second Column */
1801             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1802             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1803 
1804             /* Matrix-Vector Product: */
1805             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1806             SSE_SHUFFLE(XMM6,XMM6,0x00)
1807             SSE_MULT_PS(XMM6,XMM0)
1808             SSE_SUB_PS(XMM5,XMM6)
1809 
1810             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1811             SSE_SHUFFLE(XMM7,XMM7,0x00)
1812             SSE_MULT_PS(XMM7,XMM1)
1813             SSE_SUB_PS(XMM5,XMM7)
1814 
1815             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1816             SSE_SHUFFLE(XMM4,XMM4,0x00)
1817             SSE_MULT_PS(XMM4,XMM2)
1818             SSE_SUB_PS(XMM5,XMM4)
1819 
1820             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1821             SSE_SHUFFLE(XMM6,XMM6,0x00)
1822             SSE_MULT_PS(XMM6,XMM3)
1823             SSE_SUB_PS(XMM5,XMM6)
1824 
1825             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1826             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1827 
1828             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1829 
1830             /* Third Column */
1831             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1832             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1833 
1834             /* Matrix-Vector Product: */
1835             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1836             SSE_SHUFFLE(XMM7,XMM7,0x00)
1837             SSE_MULT_PS(XMM7,XMM0)
1838             SSE_SUB_PS(XMM6,XMM7)
1839 
1840             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1841             SSE_SHUFFLE(XMM4,XMM4,0x00)
1842             SSE_MULT_PS(XMM4,XMM1)
1843             SSE_SUB_PS(XMM6,XMM4)
1844 
1845             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1846             SSE_SHUFFLE(XMM5,XMM5,0x00)
1847             SSE_MULT_PS(XMM5,XMM2)
1848             SSE_SUB_PS(XMM6,XMM5)
1849 
1850             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1851             SSE_SHUFFLE(XMM7,XMM7,0x00)
1852             SSE_MULT_PS(XMM7,XMM3)
1853             SSE_SUB_PS(XMM6,XMM7)
1854 
1855             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1856             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1857 
1858             /* Fourth Column */
1859             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1860             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1861 
1862             /* Matrix-Vector Product: */
1863             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1864             SSE_SHUFFLE(XMM5,XMM5,0x00)
1865             SSE_MULT_PS(XMM5,XMM0)
1866             SSE_SUB_PS(XMM4,XMM5)
1867 
1868             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1869             SSE_SHUFFLE(XMM6,XMM6,0x00)
1870             SSE_MULT_PS(XMM6,XMM1)
1871             SSE_SUB_PS(XMM4,XMM6)
1872 
1873             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1874             SSE_SHUFFLE(XMM7,XMM7,0x00)
1875             SSE_MULT_PS(XMM7,XMM2)
1876             SSE_SUB_PS(XMM4,XMM7)
1877 
1878             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1879             SSE_SHUFFLE(XMM5,XMM5,0x00)
1880             SSE_MULT_PS(XMM5,XMM3)
1881             SSE_SUB_PS(XMM4,XMM5)
1882 
1883             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1884             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1885           SSE_INLINE_END_2;
1886           pv   += 16;
1887         }
1888         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1889       }
1890       row = (unsigned int)(*bjtmp++);
1891 /*        row = (*bjtmp++)/4; */
1892 /*        bjtmp++; */
1893     }
1894     /* finished row so stick it into b->a */
1895     pv = ba + 16*bi[i];
1896     pj = bj + bi[i];
1897     nz = bi[i+1] - bi[i];
1898 
1899     /* Copy x block back into pv block */
1900     for (j=0; j<nz; j++) {
1901       x  = rtmp+16*((unsigned int)pj[j]);
1902 /*        x  = rtmp+4*pj[j]; */
1903 
1904       SSE_INLINE_BEGIN_2(x,pv)
1905         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1906         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1907         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1908 
1909         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1910         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1911 
1912         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1913         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1914 
1915         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1916         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1917 
1918         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1919         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1920 
1921         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1922         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1923 
1924         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1925         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1926 
1927         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1928         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1929       SSE_INLINE_END_2;
1930       pv += 16;
1931     }
1932     /* invert diagonal block */
1933     w = ba + 16*diag_offset[i];
1934     if (pivotinblocks) {
1935       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1936     } else {
1937       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1938     }
1939 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1940     /* Note: since Cramer's rule is used, the flop count below may be unfairly high or low. */
1941   }
1942 
1943   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1944   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1945   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1946   C->assembled = PETSC_TRUE;
1947   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1948   /* Flop Count from inverting diagonal blocks */
1949   SSE_SCOPE_END;
1950   PetscFunctionReturn(0);
1951 }
1952 
1953 #endif
1954