xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision f3fe499b4cc4d64bf04aa4f5e4963dcc4eb56541)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 /* ------------------------------------------------------------*/
10 /*
11       Version for when blocks are 4 by 4
12 */
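/*
    Overview of the block LU sweep implemented by the routines in this file
    (a reading aid; the authoritative details are the code below):

      for each block row i of the factor:
        1. scatter the 4x4 blocks of the corresponding row of A into the dense
           work row rtmp, 16 MatScalars per block column, stored column-major
           (as the index arithmetic below assumes);
        2. for every block column 'row' < i in the factor's sparsity pattern,
           form the multiplier pc = pc * inv(U(row,row)), where the diagonal
           block of 'row' was already inverted when that row was factored, and
           subtract pc * U(row,k) from the remaining blocks in rtmp;
        3. copy rtmp back into the factor and invert the new diagonal block
           with Kernel_A_gets_inverse_A_4 (with or without pivoting inside the
           block, and with an optional shift of small pivots).
*/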
13 #undef __FUNCT__
14 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
16 {
17   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
18   IS             isrow = b->row,isicol = b->icol;
19   PetscErrorCode ierr;
20   const PetscInt *r,*ic;
21   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
22   PetscInt       *ajtmpold,*ajtmp,nz,row;
23   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
24   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
25   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
26   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
27   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
28   MatScalar      m13,m14,m15,m16;
29   MatScalar      *ba = b->a,*aa = a->a;
30   PetscTruth     pivotinblocks = b->pivotinblocks;
31   PetscReal      shift = info->shiftamount;
32 
33   PetscFunctionBegin;
34   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
35   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
36   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
37 
38   for (i=0; i<n; i++) {
39     nz    = bi[i+1] - bi[i];
40     ajtmp = bj + bi[i];
41     for  (j=0; j<nz; j++) {
42       x = rtmp+16*ajtmp[j];
43       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
44       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
45     }
46     /* load in initial (unfactored row) */
47     idx      = r[i];
48     nz       = ai[idx+1] - ai[idx];
49     ajtmpold = aj + ai[idx];
50     v        = aa + 16*ai[idx];
51     for (j=0; j<nz; j++) {
52       x    = rtmp+16*ic[ajtmpold[j]];
53       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
54       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
55       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
56       x[14] = v[14]; x[15] = v[15];
57       v    += 16;
58     }
59     row = *ajtmp++;
60     while (row < i) {
61       pc  = rtmp + 16*row;
62       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
63       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
64       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
65       p15 = pc[14]; p16 = pc[15];
66       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
67           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
68           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
69           || p16 != 0.0) {
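        /* Form the multiplier block for this column: pc <- pc * pv, where pv is the
           already-inverted diagonal block of the previously factored row 'row'
           (see the block inversion at the end of the outer loop); the products
           m1..m16 are accumulated column by column */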
70         pv = ba + 16*diag_offset[row];
71         pj = bj + diag_offset[row] + 1;
72         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
73         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
74         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
75         x15 = pv[14]; x16 = pv[15];
76         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
77         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
78         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
79         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
80 
81         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
82         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
83         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
84         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
85 
86         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
87         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
88         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
89         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
90 
91         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
92         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
93         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
94         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
95 
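        /* Update the rest of row i: for each U block of 'row', subtract
           multiplier * U(row,pj[j]) from the matching block in rtmp */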
96         nz = bi[row+1] - diag_offset[row] - 1;
97         pv += 16;
98         for (j=0; j<nz; j++) {
99           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
100           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
101           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
102           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
103           x    = rtmp + 16*pj[j];
104           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
105           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
106           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
107           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
108 
109           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
110           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
111           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
112           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
113 
114           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
115           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
116           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
117           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
118 
119           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
120           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
121           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
122           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
123 
124           pv   += 16;
125         }
126         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* 128 = 2*4^3 per updated block, 112 = 2*4^3 - 4^2 for the multiplier */
127       }
128       row = *ajtmp++;
129     }
130     /* finished row so stick it into b->a */
131     pv = ba + 16*bi[i];
132     pj = bj + bi[i];
133     nz = bi[i+1] - bi[i];
134     for (j=0; j<nz; j++) {
135       x      = rtmp+16*pj[j];
136       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
137       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
138       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
139       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
140       pv   += 16;
141     }
142     /* invert diagonal block */
143     w    = ba + 16*diag_offset[i];
144     if (pivotinblocks) {
145       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
146     } else {
147       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148     }
149   }
150 
151   ierr = PetscFree(rtmp);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
153   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
155   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
156   C->assembled = PETSC_TRUE;
157   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
158   PetscFunctionReturn(0);
159 }
160 
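/*
   For orientation, a minimal usage sketch showing how these numeric factorization
   kernels are normally reached (through the generic factor interface, typically
   driven by KSP/PC with -pc_type lu).  This is an illustrative sketch only: n and
   nz below are placeholders and all error checking (CHKERRQ on the returned ierr)
   is omitted.

       Mat            A,F;
       IS             row,col;
       MatFactorInfo  info;

       MatCreateSeqBAIJ(PETSC_COMM_SELF,4,n,n,nz,PETSC_NULL,&A);
       ... assemble A with MatSetValuesBlocked(), MatAssemblyBegin/End() ...
       MatGetOrdering(A,"natural",&row,&col);
       MatFactorInfoInitialize(&info);
       MatGetFactor(A,"petsc",MAT_FACTOR_LU,&F);
       MatLUFactorSymbolic(F,A,row,col,&info);
       MatLUFactorNumeric(F,A,&info);    <-- dispatches to one of the kernels in this file
*/
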
161 /* MatLUFactorNumeric_SeqBAIJ_4 -
162      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace(), with fixed-size 4x4 versions manually substituted for
163        Kernel_A_gets_A_times_B()
164        Kernel_A_gets_A_minus_B_times_C()
165        Kernel_A_gets_inverse_A()
166 */
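/*
   Unlike the _inplace variant above, this version writes into a separately allocated
   factor that stores L and U split apart: the L blocks of row i occupy
   bj[bi[i]] .. bj[bi[i+1]-1], the (inverted) diagonal block lives at bdiag[i], and the
   strictly upper triangular blocks of row i start at bdiag[i+1]+1 and number
   bdiag[i]-bdiag[i+1]-1, as can be read off the index arithmetic below.
*/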
167 
168 #undef __FUNCT__
169 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
170 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
171 {
172   Mat            C=B;
173   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
174   IS             isrow = b->row,isicol = b->icol;
175   PetscErrorCode ierr;
176   const PetscInt *r,*ic;
177   PetscInt       i,j,k,nz,nzL,row;
178   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
179   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
180   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
181   PetscInt       flg;
182   PetscReal      shift;
183 
184   PetscFunctionBegin;
185   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
186   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
187 
188   if (info->shifttype == MAT_SHIFT_NONE){
189     shift = 0;
190   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
191     shift = info->shiftamount;
192   }
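  /* With MAT_SHIFT_INBLOCKS the shift amount is handed to Kernel_A_gets_inverse_A_4(),
     which presumably uses it to perturb unacceptably small pivots met while inverting a
     diagonal block; with MAT_SHIFT_NONE no perturbation is applied. */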
193 
194   /* generate work space needed by the factorization */
195   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
196   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
197 
198   for (i=0; i<n; i++){
199     /* zero rtmp */
200     /* L part */
201     nz    = bi[i+1] - bi[i];
202     bjtmp = bj + bi[i];
203     for  (j=0; j<nz; j++){
204       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
205     }
206 
207     /* U part */
208     nz = bdiag[i]-bdiag[i+1];
209     bjtmp = bj + bdiag[i+1]+1;
210     for  (j=0; j<nz; j++){
211       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
212     }
213 
214     /* load in initial (unfactored row) */
215     nz    = ai[r[i]+1] - ai[r[i]];
216     ajtmp = aj + ai[r[i]];
217     v     = aa + bs2*ai[r[i]];
218     for (j=0; j<nz; j++) {
219       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
220     }
221 
222     /* elimination */
223     bjtmp = bj + bi[i];
224     nzL   = bi[i+1] - bi[i];
225     for(k=0;k < nzL;k++) {
226       row = bjtmp[k];
227       pc = rtmp + bs2*row;
228       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
229       if (flg) {
230         pv = b->a + bs2*bdiag[row];
231         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
232         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
233 
234         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
235         pv = b->a + bs2*(bdiag[row+1]+1);
236         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
237         for (j=0; j<nz; j++) {
238           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
239           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
240           v    = rtmp + bs2*pj[j];
241           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
242           pv  += bs2;
243         }
244         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + (2*bs^3 - bs^2) = 128*nz + 112 for bs=4 */
245       }
246     }
247 
248     /* finished row so stick it into b->a */
249     /* L part */
250     pv   = b->a + bs2*bi[i] ;
251     pj   = b->j + bi[i] ;
252     nz   = bi[i+1] - bi[i];
253     for (j=0; j<nz; j++) {
254       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
255     }
256 
257     /* Mark diagonal and invert diagonal for simpler triangular solves */
258     pv   = b->a + bs2*bdiag[i];
259     pj   = b->j + bdiag[i];
260     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
261     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
262 
263     /* U part */
264     pv = b->a + bs2*(bdiag[i+1]+1);
265     pj = b->j + bdiag[i+1]+1;
266     nz = bdiag[i] - bdiag[i+1] - 1;
267     for (j=0; j<nz; j++){
268       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
269     }
270   }
271 
272   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
273   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
274   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
275   C->ops->solve          = MatSolve_SeqBAIJ_4;
276   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
277   C->assembled = PETSC_TRUE;
278   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
279   PetscFunctionReturn(0);
280 }
281 
282 #undef __FUNCT__
283 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
284 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
285 {
286 /*
287     Default Version for when blocks are 4 by 4 Using natural ordering
288 */
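/*
    The same elimination as MatLUFactorNumeric_SeqBAIJ_4_inplace() above, except that
    the isrow/isicol permutations are dropped because the natural ordering makes them
    the identity.
*/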
289   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
290   PetscErrorCode ierr;
291   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
292   PetscInt       *ajtmpold,*ajtmp,nz,row;
293   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
294   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
295   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
296   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
297   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
298   MatScalar      m13,m14,m15,m16;
299   MatScalar      *ba = b->a,*aa = a->a;
300   PetscTruth     pivotinblocks = b->pivotinblocks;
301   PetscReal      shift = info->shiftamount;
302 
303   PetscFunctionBegin;
304   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
305 
306   for (i=0; i<n; i++) {
307     nz    = bi[i+1] - bi[i];
308     ajtmp = bj + bi[i];
309     for  (j=0; j<nz; j++) {
310       x = rtmp+16*ajtmp[j];
311       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
312       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
313     }
314     /* load in initial (unfactored row) */
315     nz       = ai[i+1] - ai[i];
316     ajtmpold = aj + ai[i];
317     v        = aa + 16*ai[i];
318     for (j=0; j<nz; j++) {
319       x    = rtmp+16*ajtmpold[j];
320       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
321       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
322       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
323       x[14] = v[14]; x[15] = v[15];
324       v    += 16;
325     }
326     row = *ajtmp++;
327     while (row < i) {
328       pc  = rtmp + 16*row;
329       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
330       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
331       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
332       p15 = pc[14]; p16 = pc[15];
333       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
334           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
335           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
336           || p16 != 0.0) {
337         pv = ba + 16*diag_offset[row];
338         pj = bj + diag_offset[row] + 1;
339         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
340         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
341         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
342         x15 = pv[14]; x16 = pv[15];
343         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
344         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
345         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
346         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
347 
348         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
349         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
350         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
351         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
352 
353         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
354         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
355         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
356         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
357 
358         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
359         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
360         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
361         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
362         nz = bi[row+1] - diag_offset[row] - 1;
363         pv += 16;
364         for (j=0; j<nz; j++) {
365           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
366           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
367           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
368           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
369           x    = rtmp + 16*pj[j];
370           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
371           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
372           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
373           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
374 
375           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
376           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
377           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
378           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
379 
380           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
381           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
382           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
383           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
384 
385           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
386           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
387           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
388           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
389 
390           pv   += 16;
391         }
392         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
393       }
394       row = *ajtmp++;
395     }
396     /* finished row so stick it into b->a */
397     pv = ba + 16*bi[i];
398     pj = bj + bi[i];
399     nz = bi[i+1] - bi[i];
400     for (j=0; j<nz; j++) {
401       x      = rtmp+16*pj[j];
402       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
403       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
404       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
405       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
406       pv   += 16;
407     }
408     /* invert diagonal block */
409     w = ba + 16*diag_offset[i];
410     if (pivotinblocks) {
411       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
412     } else {
413       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
414     }
415   }
416 
417   ierr = PetscFree(rtmp);CHKERRQ(ierr);
418   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
419   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
420   C->assembled = PETSC_TRUE;
421   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
422   PetscFunctionReturn(0);
423 }
424 
425 /*
426   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
427     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
428 */
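/*
   Same algorithm as MatLUFactorNumeric_SeqBAIJ_4() above, again with the isrow/isicol
   permutations removed, writing into the same split L/U storage format.
*/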
429 #undef __FUNCT__
430 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
431 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
432 {
433   Mat            C=B;
434   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
435   PetscErrorCode ierr;
436   PetscInt       i,j,k,nz,nzL,row;
437   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
438   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
439   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
440   PetscInt       flg;
441   PetscReal      shift;
442 
443   PetscFunctionBegin;
444   /* generate work space needed by the factorization */
445   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
446   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
447 
448   if (info->shifttype == MAT_SHIFT_NONE){
449     shift = 0;
450   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
451     shift = info->shiftamount;
452   }
453 
454   for (i=0; i<n; i++){
455     /* zero rtmp */
456     /* L part */
457     nz    = bi[i+1] - bi[i];
458     bjtmp = bj + bi[i];
459     for  (j=0; j<nz; j++){
460       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
461     }
462 
463     /* U part */
464     nz = bdiag[i] - bdiag[i+1];
465     bjtmp = bj + bdiag[i+1]+1;
466     for  (j=0; j<nz; j++){
467       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
468     }
469 
470     /* load in initial (unfactored row) */
471     nz    = ai[i+1] - ai[i];
472     ajtmp = aj + ai[i];
473     v     = aa + bs2*ai[i];
474     for (j=0; j<nz; j++) {
475       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
476     }
477 
478     /* elimination */
479     bjtmp = bj + bi[i];
480     nzL   = bi[i+1] - bi[i];
481     for(k=0;k < nzL;k++) {
482       row = bjtmp[k];
483       pc = rtmp + bs2*row;
484       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
485       if (flg) {
486         pv = b->a + bs2*bdiag[row];
487         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
488         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
489 
490         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
491         pv = b->a + bs2*(bdiag[row+1]+1);
492         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
493         for (j=0; j<nz; j++) {
494           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
495           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
496           v    = rtmp + bs2*pj[j];
497           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
498           pv  += bs2;
499         }
500         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + (2*bs^3 - bs^2) = 128*nz + 112 for bs=4 */
501       }
502     }
503 
504     /* finished row so stick it into b->a */
505     /* L part */
506     pv   = b->a + bs2*bi[i] ;
507     pj   = b->j + bi[i] ;
508     nz   = bi[i+1] - bi[i];
509     for (j=0; j<nz; j++) {
510       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
511     }
512 
513     /* Mark diagonal and invert diagonal for simpler triangular solves */
514     pv   = b->a + bs2*bdiag[i];
515     pj   = b->j + bdiag[i];
516     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
517     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
518 
519     /* U part */
520     pv = b->a + bs2*(bdiag[i+1]+1);
521     pj = b->j + bdiag[i+1]+1;
522     nz = bdiag[i] - bdiag[i+1] - 1;
523     for (j=0; j<nz; j++){
524       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
525     }
526   }
527   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
528   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
529   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
530   C->assembled = PETSC_TRUE;
531   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
532   PetscFunctionReturn(0);
533 }
534 
535 #if defined(PETSC_HAVE_SSE)
536 
537 #include PETSC_HAVE_SSE
538 
539 /* SSE Version for when blocks are 4 by 4 Using natural ordering */
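/*
   Strategy of the SSE variants below: the packed single precision (_PS) operations
   assume MatScalar is a 4-byte float, so a 4x4 block is held in four XMM registers,
   one column per register.  Blocks are zeroed and copied with paired STOREL/STOREH
   moves, the "is this block entirely zero" test is a CMPNEQ against a zero register
   followed by MOVEMASK, and the multiplier and update products are built one
   broadcast scalar (LOAD_SS + SHUFFLE) at a time, mirroring the scalar code above.
*/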
540 #undef __FUNCT__
541 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
542 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C,Mat A,const MatFactorInfo *info)
543 {
544   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
545   PetscErrorCode ierr;
546   int i,j,n = a->mbs;
547   int         *bj = b->j,*bjtmp,*pj;
548   int         row;
549   int         *ajtmpold,nz,*bi=b->i;
550   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
551   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
552   MatScalar   *ba = b->a,*aa = a->a;
553   int         nonzero=0;
554 /*    int            nonzero=0,colscale = 16; */
555   PetscTruth  pivotinblocks = b->pivotinblocks;
556   PetscReal      shift = info->shiftamount;
557 
558   PetscFunctionBegin;
559   SSE_SCOPE_BEGIN;
560 
561   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
562   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
563   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
564   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
565 /*    if ((unsigned long)bj==(unsigned long)aj) { */
566 /*      colscale = 4; */
567 /*    } */
568   for (i=0; i<n; i++) {
569     nz    = bi[i+1] - bi[i];
570     bjtmp = bj + bi[i];
571     /* zero out the 4x4 block accumulators */
572     /* zero out one register */
573     XOR_PS(XMM7,XMM7);
574     for  (j=0; j<nz; j++) {
575       x = rtmp+16*bjtmp[j];
576 /*        x = rtmp+4*bjtmp[j]; */
577       SSE_INLINE_BEGIN_1(x)
578         /* Copy zero register to memory locations */
579         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
580         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
581         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
582         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
583         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
584         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
585         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
586         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
587         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
588       SSE_INLINE_END_1;
589     }
590     /* load in initial (unfactored row) */
591     nz       = ai[i+1] - ai[i];
592     ajtmpold = aj + ai[i];
593     v        = aa + 16*ai[i];
594     for (j=0; j<nz; j++) {
595       x = rtmp+16*ajtmpold[j];
596 /*        x = rtmp+colscale*ajtmpold[j]; */
597       /* Copy v block into x block */
598       SSE_INLINE_BEGIN_2(v,x)
599         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
600         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
601         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
602 
603         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
604         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
605 
606         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
607         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
608 
609         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
610         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
611 
612         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
613         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
614 
615         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
616         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
617 
618         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
619         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
620 
621         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
622         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
623       SSE_INLINE_END_2;
624 
625       v += 16;
626     }
627 /*      row = (*bjtmp++)/4; */
628     row = *bjtmp++;
629     while (row < i) {
630       pc  = rtmp + 16*row;
631       SSE_INLINE_BEGIN_1(pc)
632         /* Load block from lower triangle */
633         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
634         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
635         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
636 
637         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
638         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
639 
640         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
641         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
642 
643         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
644         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
645 
646         /* Compare block to zero block */
647 
648         SSE_COPY_PS(XMM4,XMM7)
649         SSE_CMPNEQ_PS(XMM4,XMM0)
650 
651         SSE_COPY_PS(XMM5,XMM7)
652         SSE_CMPNEQ_PS(XMM5,XMM1)
653 
654         SSE_COPY_PS(XMM6,XMM7)
655         SSE_CMPNEQ_PS(XMM6,XMM2)
656 
657         SSE_CMPNEQ_PS(XMM7,XMM3)
658 
659         /* Reduce the comparisons to one SSE register */
660         SSE_OR_PS(XMM6,XMM7)
661         SSE_OR_PS(XMM5,XMM4)
662         SSE_OR_PS(XMM5,XMM6)
663       SSE_INLINE_END_1;
664 
665       /* Reduce the one SSE register to an integer register for branching */
666       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
667       MOVEMASK(nonzero,XMM5);
668 
669       /* If block is nonzero ... */
670       if (nonzero) {
671         pv = ba + 16*diag_offset[row];
672         PREFETCH_L1(&pv[16]);
673         pj = bj + diag_offset[row] + 1;
674 
675         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
676         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
677         /* but the diagonal was inverted already */
678         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
679 
680         SSE_INLINE_BEGIN_2(pv,pc)
681           /* Column 0, product is accumulated in XMM4 */
682           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
683           SSE_SHUFFLE(XMM4,XMM4,0x00)
684           SSE_MULT_PS(XMM4,XMM0)
685 
686           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
687           SSE_SHUFFLE(XMM5,XMM5,0x00)
688           SSE_MULT_PS(XMM5,XMM1)
689           SSE_ADD_PS(XMM4,XMM5)
690 
691           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
692           SSE_SHUFFLE(XMM6,XMM6,0x00)
693           SSE_MULT_PS(XMM6,XMM2)
694           SSE_ADD_PS(XMM4,XMM6)
695 
696           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
697           SSE_SHUFFLE(XMM7,XMM7,0x00)
698           SSE_MULT_PS(XMM7,XMM3)
699           SSE_ADD_PS(XMM4,XMM7)
700 
701           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
702           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
703 
704           /* Column 1, product is accumulated in XMM5 */
705           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
706           SSE_SHUFFLE(XMM5,XMM5,0x00)
707           SSE_MULT_PS(XMM5,XMM0)
708 
709           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
710           SSE_SHUFFLE(XMM6,XMM6,0x00)
711           SSE_MULT_PS(XMM6,XMM1)
712           SSE_ADD_PS(XMM5,XMM6)
713 
714           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
715           SSE_SHUFFLE(XMM7,XMM7,0x00)
716           SSE_MULT_PS(XMM7,XMM2)
717           SSE_ADD_PS(XMM5,XMM7)
718 
719           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
720           SSE_SHUFFLE(XMM6,XMM6,0x00)
721           SSE_MULT_PS(XMM6,XMM3)
722           SSE_ADD_PS(XMM5,XMM6)
723 
724           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
725           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
726 
727           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
728 
729           /* Column 2, product is accumulated in XMM6 */
730           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
731           SSE_SHUFFLE(XMM6,XMM6,0x00)
732           SSE_MULT_PS(XMM6,XMM0)
733 
734           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
735           SSE_SHUFFLE(XMM7,XMM7,0x00)
736           SSE_MULT_PS(XMM7,XMM1)
737           SSE_ADD_PS(XMM6,XMM7)
738 
739           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
740           SSE_SHUFFLE(XMM7,XMM7,0x00)
741           SSE_MULT_PS(XMM7,XMM2)
742           SSE_ADD_PS(XMM6,XMM7)
743 
744           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
745           SSE_SHUFFLE(XMM7,XMM7,0x00)
746           SSE_MULT_PS(XMM7,XMM3)
747           SSE_ADD_PS(XMM6,XMM7)
748 
749           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
750           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
751 
752           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
753           /* Column 3, product is accumulated in XMM0 */
754           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
755           SSE_SHUFFLE(XMM7,XMM7,0x00)
756           SSE_MULT_PS(XMM0,XMM7)
757 
758           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
759           SSE_SHUFFLE(XMM7,XMM7,0x00)
760           SSE_MULT_PS(XMM1,XMM7)
761           SSE_ADD_PS(XMM0,XMM1)
762 
763           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
764           SSE_SHUFFLE(XMM1,XMM1,0x00)
765           SSE_MULT_PS(XMM1,XMM2)
766           SSE_ADD_PS(XMM0,XMM1)
767 
768           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
769           SSE_SHUFFLE(XMM7,XMM7,0x00)
770           SSE_MULT_PS(XMM3,XMM7)
771           SSE_ADD_PS(XMM0,XMM3)
772 
773           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
774           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
775 
776           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
777           /* This is code to be maintained and read by humans, after all. */
778           /* Copy Multiplier Col 3 into XMM3 */
779           SSE_COPY_PS(XMM3,XMM0)
780           /* Copy Multiplier Col 2 into XMM2 */
781           SSE_COPY_PS(XMM2,XMM6)
782           /* Copy Multiplier Col 1 into XMM1 */
783           SSE_COPY_PS(XMM1,XMM5)
784           /* Copy Multiplier Col 0 into XMM0 */
785           SSE_COPY_PS(XMM0,XMM4)
786         SSE_INLINE_END_2;
787 
788         /* Update the row: */
789         nz = bi[row+1] - diag_offset[row] - 1;
790         pv += 16;
791         for (j=0; j<nz; j++) {
792           PREFETCH_L1(&pv[16]);
793           x = rtmp + 16*pj[j];
794 /*            x = rtmp + 4*pj[j]; */
795 
796           /* X:=X-M*PV, One column at a time */
797           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
798           SSE_INLINE_BEGIN_2(x,pv)
799             /* Load First Column of X*/
800             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
801             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
802 
803             /* Matrix-Vector Product: */
804             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
805             SSE_SHUFFLE(XMM5,XMM5,0x00)
806             SSE_MULT_PS(XMM5,XMM0)
807             SSE_SUB_PS(XMM4,XMM5)
808 
809             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
810             SSE_SHUFFLE(XMM6,XMM6,0x00)
811             SSE_MULT_PS(XMM6,XMM1)
812             SSE_SUB_PS(XMM4,XMM6)
813 
814             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
815             SSE_SHUFFLE(XMM7,XMM7,0x00)
816             SSE_MULT_PS(XMM7,XMM2)
817             SSE_SUB_PS(XMM4,XMM7)
818 
819             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
820             SSE_SHUFFLE(XMM5,XMM5,0x00)
821             SSE_MULT_PS(XMM5,XMM3)
822             SSE_SUB_PS(XMM4,XMM5)
823 
824             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
825             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
826 
827             /* Second Column */
828             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
829             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
830 
831             /* Matrix-Vector Product: */
832             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
833             SSE_SHUFFLE(XMM6,XMM6,0x00)
834             SSE_MULT_PS(XMM6,XMM0)
835             SSE_SUB_PS(XMM5,XMM6)
836 
837             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
838             SSE_SHUFFLE(XMM7,XMM7,0x00)
839             SSE_MULT_PS(XMM7,XMM1)
840             SSE_SUB_PS(XMM5,XMM7)
841 
842             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
843             SSE_SHUFFLE(XMM4,XMM4,0x00)
844             SSE_MULT_PS(XMM4,XMM2)
845             SSE_SUB_PS(XMM5,XMM4)
846 
847             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
848             SSE_SHUFFLE(XMM6,XMM6,0x00)
849             SSE_MULT_PS(XMM6,XMM3)
850             SSE_SUB_PS(XMM5,XMM6)
851 
852             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
853             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
854 
855             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
856 
857             /* Third Column */
858             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
859             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
860 
861             /* Matrix-Vector Product: */
862             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
863             SSE_SHUFFLE(XMM7,XMM7,0x00)
864             SSE_MULT_PS(XMM7,XMM0)
865             SSE_SUB_PS(XMM6,XMM7)
866 
867             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
868             SSE_SHUFFLE(XMM4,XMM4,0x00)
869             SSE_MULT_PS(XMM4,XMM1)
870             SSE_SUB_PS(XMM6,XMM4)
871 
872             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
873             SSE_SHUFFLE(XMM5,XMM5,0x00)
874             SSE_MULT_PS(XMM5,XMM2)
875             SSE_SUB_PS(XMM6,XMM5)
876 
877             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
878             SSE_SHUFFLE(XMM7,XMM7,0x00)
879             SSE_MULT_PS(XMM7,XMM3)
880             SSE_SUB_PS(XMM6,XMM7)
881 
882             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
883             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
884 
885             /* Fourth Column */
886             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
887             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
888 
889             /* Matrix-Vector Product: */
890             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
891             SSE_SHUFFLE(XMM5,XMM5,0x00)
892             SSE_MULT_PS(XMM5,XMM0)
893             SSE_SUB_PS(XMM4,XMM5)
894 
895             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
896             SSE_SHUFFLE(XMM6,XMM6,0x00)
897             SSE_MULT_PS(XMM6,XMM1)
898             SSE_SUB_PS(XMM4,XMM6)
899 
900             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
901             SSE_SHUFFLE(XMM7,XMM7,0x00)
902             SSE_MULT_PS(XMM7,XMM2)
903             SSE_SUB_PS(XMM4,XMM7)
904 
905             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
906             SSE_SHUFFLE(XMM5,XMM5,0x00)
907             SSE_MULT_PS(XMM5,XMM3)
908             SSE_SUB_PS(XMM4,XMM5)
909 
910             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
911             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
912           SSE_INLINE_END_2;
913           pv   += 16;
914         }
915         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
916       }
917       row = *bjtmp++;
918 /*        row = (*bjtmp++)/4; */
919     }
920     /* finished row so stick it into b->a */
921     pv = ba + 16*bi[i];
922     pj = bj + bi[i];
923     nz = bi[i+1] - bi[i];
924 
925     /* Copy x block back into pv block */
926     for (j=0; j<nz; j++) {
927       x  = rtmp+16*pj[j];
928 /*        x  = rtmp+4*pj[j]; */
929 
930       SSE_INLINE_BEGIN_2(x,pv)
931         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
932         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
933         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
934 
935         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
936         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
937 
938         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
939         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
940 
941         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
942         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
943 
944         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
945         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
946 
947         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
948         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
949 
950         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
951         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
952 
953         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
954         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
955       SSE_INLINE_END_2;
956       pv += 16;
957     }
958     /* invert diagonal block */
959     w = ba + 16*diag_offset[i];
960     if (pivotinblocks) {
961       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
962     } else {
963       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
964     }
965 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
966     /* Note: since Cramer's rule is used for the block inversion, the flop count below may be unfairly high or low */
967   }
968 
969   ierr = PetscFree(rtmp);CHKERRQ(ierr);
970   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
971   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
972   C->assembled = PETSC_TRUE;
973   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
974   /* Flop Count from inverting diagonal blocks */
975   SSE_SCOPE_END;
976   PetscFunctionReturn(0);
977 }
978 
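/*
   The _usj variants treat the factor's column index arrays as unsigned short (which is
   presumably what "usj" abbreviates), halving the index traffic during the sweep; this
   relies on the indices having been packed in that form and on the number of block
   columns fitting in 16 bits.
*/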
979 #undef __FUNCT__
980 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
981 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
982 {
983   Mat            A=C;
984   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
985   PetscErrorCode ierr;
986   int i,j,n = a->mbs;
987   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
988   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
989   unsigned int   row;
990   int            nz,*bi=b->i;
991   int            *diag_offset = b->diag,*ai=a->i;
992   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
993   MatScalar      *ba = b->a,*aa = a->a;
994   int            nonzero=0;
995 /*    int            nonzero=0,colscale = 16; */
996   PetscTruth     pivotinblocks = b->pivotinblocks;
997   PetscReal      shift = info->shiftamount;
998 
999   PetscFunctionBegin;
1000   SSE_SCOPE_BEGIN;
1001 
1002   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1003   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1004   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1005   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1006 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1007 /*      colscale = 4; */
1008 /*    } */
1009 
1010   for (i=0; i<n; i++) {
1011     nz    = bi[i+1] - bi[i];
1012     bjtmp = bj + bi[i];
1013     /* zero out the 4x4 block accumulators */
1014     /* zero out one register */
1015     XOR_PS(XMM7,XMM7);
1016     for  (j=0; j<nz; j++) {
1017       x = rtmp+16*((unsigned int)bjtmp[j]);
1018 /*        x = rtmp+4*bjtmp[j]; */
1019       SSE_INLINE_BEGIN_1(x)
1020         /* Copy zero register to memory locations */
1021         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1022         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1023         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1024         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1025         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1026         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1027         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1028         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1029         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1030       SSE_INLINE_END_1;
1031     }
1032     /* load in initial (unfactored row) */
1033     nz    = ai[i+1] - ai[i];
1034     ajtmp = aj + ai[i];
1035     v     = aa + 16*ai[i];
1036     for (j=0; j<nz; j++) {
1037       x = rtmp+16*((unsigned int)ajtmp[j]);
1038 /*        x = rtmp+colscale*ajtmp[j]; */
1039       /* Copy v block into x block */
1040       SSE_INLINE_BEGIN_2(v,x)
1041         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1042         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1043         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1044 
1045         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1046         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1047 
1048         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1049         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1050 
1051         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1052         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1053 
1054         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1055         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1056 
1057         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1058         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1059 
1060         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1061         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1062 
1063         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1064         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1065       SSE_INLINE_END_2;
1066 
1067       v += 16;
1068     }
1069 /*      row = (*bjtmp++)/4; */
1070     row = (unsigned int)(*bjtmp++);
1071     while (row < i) {
1072       pc  = rtmp + 16*row;
1073       SSE_INLINE_BEGIN_1(pc)
1074         /* Load block from lower triangle */
1075         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1076         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1077         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1078 
1079         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1080         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1081 
1082         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1083         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1084 
1085         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1086         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1087 
1088         /* Compare block to zero block */
1089 
1090         SSE_COPY_PS(XMM4,XMM7)
1091         SSE_CMPNEQ_PS(XMM4,XMM0)
1092 
1093         SSE_COPY_PS(XMM5,XMM7)
1094         SSE_CMPNEQ_PS(XMM5,XMM1)
1095 
1096         SSE_COPY_PS(XMM6,XMM7)
1097         SSE_CMPNEQ_PS(XMM6,XMM2)
1098 
1099         SSE_CMPNEQ_PS(XMM7,XMM3)
1100 
1101         /* Reduce the comparisons to one SSE register */
1102         SSE_OR_PS(XMM6,XMM7)
1103         SSE_OR_PS(XMM5,XMM4)
1104         SSE_OR_PS(XMM5,XMM6)
1105       SSE_INLINE_END_1;
1106 
1107       /* Reduce the one SSE register to an integer register for branching */
1108       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1109       MOVEMASK(nonzero,XMM5);
1110 
1111       /* If block is nonzero ... */
1112       if (nonzero) {
1113         pv = ba + 16*diag_offset[row];
1114         PREFETCH_L1(&pv[16]);
1115         pj = bj + diag_offset[row] + 1;
1116 
1117         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1118         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1119         /* but the diagonal was inverted already */
1120         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1121 
1122         SSE_INLINE_BEGIN_2(pv,pc)
1123           /* Column 0, product is accumulated in XMM4 */
1124           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1125           SSE_SHUFFLE(XMM4,XMM4,0x00)
1126           SSE_MULT_PS(XMM4,XMM0)
1127 
1128           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1129           SSE_SHUFFLE(XMM5,XMM5,0x00)
1130           SSE_MULT_PS(XMM5,XMM1)
1131           SSE_ADD_PS(XMM4,XMM5)
1132 
1133           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1134           SSE_SHUFFLE(XMM6,XMM6,0x00)
1135           SSE_MULT_PS(XMM6,XMM2)
1136           SSE_ADD_PS(XMM4,XMM6)
1137 
1138           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1139           SSE_SHUFFLE(XMM7,XMM7,0x00)
1140           SSE_MULT_PS(XMM7,XMM3)
1141           SSE_ADD_PS(XMM4,XMM7)
1142 
1143           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1144           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1145 
1146           /* Column 1, product is accumulated in XMM5 */
1147           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1148           SSE_SHUFFLE(XMM5,XMM5,0x00)
1149           SSE_MULT_PS(XMM5,XMM0)
1150 
1151           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1152           SSE_SHUFFLE(XMM6,XMM6,0x00)
1153           SSE_MULT_PS(XMM6,XMM1)
1154           SSE_ADD_PS(XMM5,XMM6)
1155 
1156           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1157           SSE_SHUFFLE(XMM7,XMM7,0x00)
1158           SSE_MULT_PS(XMM7,XMM2)
1159           SSE_ADD_PS(XMM5,XMM7)
1160 
1161           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1162           SSE_SHUFFLE(XMM6,XMM6,0x00)
1163           SSE_MULT_PS(XMM6,XMM3)
1164           SSE_ADD_PS(XMM5,XMM6)
1165 
1166           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1167           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1168 
1169           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1170 
1171           /* Column 2, product is accumulated in XMM6 */
1172           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1173           SSE_SHUFFLE(XMM6,XMM6,0x00)
1174           SSE_MULT_PS(XMM6,XMM0)
1175 
1176           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1177           SSE_SHUFFLE(XMM7,XMM7,0x00)
1178           SSE_MULT_PS(XMM7,XMM1)
1179           SSE_ADD_PS(XMM6,XMM7)
1180 
1181           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1182           SSE_SHUFFLE(XMM7,XMM7,0x00)
1183           SSE_MULT_PS(XMM7,XMM2)
1184           SSE_ADD_PS(XMM6,XMM7)
1185 
1186           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1187           SSE_SHUFFLE(XMM7,XMM7,0x00)
1188           SSE_MULT_PS(XMM7,XMM3)
1189           SSE_ADD_PS(XMM6,XMM7)
1190 
1191           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1192           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1193 
1194           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1195           /* Column 3, product is accumulated in XMM0 */
1196           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1197           SSE_SHUFFLE(XMM7,XMM7,0x00)
1198           SSE_MULT_PS(XMM0,XMM7)
1199 
1200           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1201           SSE_SHUFFLE(XMM7,XMM7,0x00)
1202           SSE_MULT_PS(XMM1,XMM7)
1203           SSE_ADD_PS(XMM0,XMM1)
1204 
1205           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1206           SSE_SHUFFLE(XMM1,XMM1,0x00)
1207           SSE_MULT_PS(XMM1,XMM2)
1208           SSE_ADD_PS(XMM0,XMM1)
1209 
1210           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1211           SSE_SHUFFLE(XMM7,XMM7,0x00)
1212           SSE_MULT_PS(XMM3,XMM7)
1213           SSE_ADD_PS(XMM0,XMM3)
1214 
1215           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1216           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1217 
1218           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1219           /* This is code to be maintained and read by humans, after all. */
1220           /* Copy Multiplier Col 3 into XMM3 */
1221           SSE_COPY_PS(XMM3,XMM0)
1222           /* Copy Multiplier Col 2 into XMM2 */
1223           SSE_COPY_PS(XMM2,XMM6)
1224           /* Copy Multiplier Col 1 into XMM1 */
1225           SSE_COPY_PS(XMM1,XMM5)
1226           /* Copy Multiplier Col 0 into XMM0 */
1227           SSE_COPY_PS(XMM0,XMM4)
1228         SSE_INLINE_END_2;
1229 
1230         /* Update the row: */
1231         nz = bi[row+1] - diag_offset[row] - 1;
1232         pv += 16;
1233         for (j=0; j<nz; j++) {
1234           PREFETCH_L1(&pv[16]);
1235           x = rtmp + 16*((unsigned int)pj[j]);
1236 /*            x = rtmp + 4*pj[j]; */
1237 
1238           /* X:=X-M*PV, One column at a time */
1239           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1240           SSE_INLINE_BEGIN_2(x,pv)
1241             /* Load First Column of X*/
1242             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1243             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1244 
1245             /* Matrix-Vector Product: */
1246             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1247             SSE_SHUFFLE(XMM5,XMM5,0x00)
1248             SSE_MULT_PS(XMM5,XMM0)
1249             SSE_SUB_PS(XMM4,XMM5)
1250 
1251             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1252             SSE_SHUFFLE(XMM6,XMM6,0x00)
1253             SSE_MULT_PS(XMM6,XMM1)
1254             SSE_SUB_PS(XMM4,XMM6)
1255 
1256             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1257             SSE_SHUFFLE(XMM7,XMM7,0x00)
1258             SSE_MULT_PS(XMM7,XMM2)
1259             SSE_SUB_PS(XMM4,XMM7)
1260 
1261             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1262             SSE_SHUFFLE(XMM5,XMM5,0x00)
1263             SSE_MULT_PS(XMM5,XMM3)
1264             SSE_SUB_PS(XMM4,XMM5)
1265 
1266             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1267             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1268 
1269             /* Second Column */
1270             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1271             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1272 
1273             /* Matrix-Vector Product: */
1274             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1275             SSE_SHUFFLE(XMM6,XMM6,0x00)
1276             SSE_MULT_PS(XMM6,XMM0)
1277             SSE_SUB_PS(XMM5,XMM6)
1278 
1279             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1280             SSE_SHUFFLE(XMM7,XMM7,0x00)
1281             SSE_MULT_PS(XMM7,XMM1)
1282             SSE_SUB_PS(XMM5,XMM7)
1283 
1284             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1285             SSE_SHUFFLE(XMM4,XMM4,0x00)
1286             SSE_MULT_PS(XMM4,XMM2)
1287             SSE_SUB_PS(XMM5,XMM4)
1288 
1289             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1290             SSE_SHUFFLE(XMM6,XMM6,0x00)
1291             SSE_MULT_PS(XMM6,XMM3)
1292             SSE_SUB_PS(XMM5,XMM6)
1293 
1294             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1295             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1296 
1297             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1298 
1299             /* Third Column */
1300             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1301             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1302 
1303             /* Matrix-Vector Product: */
1304             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1305             SSE_SHUFFLE(XMM7,XMM7,0x00)
1306             SSE_MULT_PS(XMM7,XMM0)
1307             SSE_SUB_PS(XMM6,XMM7)
1308 
1309             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1310             SSE_SHUFFLE(XMM4,XMM4,0x00)
1311             SSE_MULT_PS(XMM4,XMM1)
1312             SSE_SUB_PS(XMM6,XMM4)
1313 
1314             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1315             SSE_SHUFFLE(XMM5,XMM5,0x00)
1316             SSE_MULT_PS(XMM5,XMM2)
1317             SSE_SUB_PS(XMM6,XMM5)
1318 
1319             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1320             SSE_SHUFFLE(XMM7,XMM7,0x00)
1321             SSE_MULT_PS(XMM7,XMM3)
1322             SSE_SUB_PS(XMM6,XMM7)
1323 
1324             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1325             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1326 
1327             /* Fourth Column */
1328             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1329             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1330 
1331             /* Matrix-Vector Product: */
1332             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1333             SSE_SHUFFLE(XMM5,XMM5,0x00)
1334             SSE_MULT_PS(XMM5,XMM0)
1335             SSE_SUB_PS(XMM4,XMM5)
1336 
1337             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1338             SSE_SHUFFLE(XMM6,XMM6,0x00)
1339             SSE_MULT_PS(XMM6,XMM1)
1340             SSE_SUB_PS(XMM4,XMM6)
1341 
1342             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1343             SSE_SHUFFLE(XMM7,XMM7,0x00)
1344             SSE_MULT_PS(XMM7,XMM2)
1345             SSE_SUB_PS(XMM4,XMM7)
1346 
1347             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1348             SSE_SHUFFLE(XMM5,XMM5,0x00)
1349             SSE_MULT_PS(XMM5,XMM3)
1350             SSE_SUB_PS(XMM4,XMM5)
1351 
1352             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1353             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1354           SSE_INLINE_END_2;
1355           pv   += 16;
1356         }
1357         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1358       }
1359       row = (unsigned int)(*bjtmp++);
1360 /*        row = (*bjtmp++)/4; */
1361 /*        bjtmp++; */
1362     }
1363     /* finished row so stick it into b->a */
1364     pv = ba + 16*bi[i];
1365     pj = bj + bi[i];
1366     nz = bi[i+1] - bi[i];
1367 
1368     /* Copy x block back into pv block */
1369     for (j=0; j<nz; j++) {
1370       x  = rtmp+16*((unsigned int)pj[j]);
1371 /*        x  = rtmp+4*pj[j]; */
1372 
1373       SSE_INLINE_BEGIN_2(x,pv)
1374         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1375         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1376         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1377 
1378         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1379         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1380 
1381         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1382         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1383 
1384         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1385         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1386 
1387         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1388         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1389 
1390         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1391         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1392 
1393         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1394         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1395 
1396         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1397         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1398       SSE_INLINE_END_2;
1399       pv += 16;
1400     }
1401     /* invert diagonal block */
1402     w = ba + 16*diag_offset[i];
1403     if (pivotinblocks) {
1404       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1405     } else {
1406       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1407     }
1408 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1409     /* Note: since Cramer's rule is used for the block inversion, the flop count below may be unfairly high or low */
1410   }
1411 
1412   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1413   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1414   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1415   C->assembled = PETSC_TRUE;
1416   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
1417   /* Flop Count from inverting diagonal blocks */
1418   SSE_SCOPE_END;
1419   PetscFunctionReturn(0);
1420 }
1421 
1422 #undef __FUNCT__
1423 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1424 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1425 {
1426   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1427   PetscErrorCode ierr;
1428   int  i,j,n = a->mbs;
1429   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
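   /* "usj" = unsigned short j: the factor's block column indices are read through unsigned
      short pointers (presumably packed to 16 bits by the matching symbolic factorization),
      which assumes the number of block rows/columns fits in 16 bits. */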
1430   unsigned int   row;
1431   int            *ajtmpold,nz,*bi=b->i;
1432   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1433   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1434   MatScalar      *ba = b->a,*aa = a->a;
1435   int            nonzero=0;
1436 /*    int            nonzero=0,colscale = 16; */
1437   PetscTruth     pivotinblocks = b->pivotinblocks;
1438   PetscReal      shift = info->shiftamount;
1439 
1440   PetscFunctionBegin;
1441   SSE_SCOPE_BEGIN;
1442 
1443   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16-byte aligned.  SSE will not work.");
1444   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16-byte aligned.  SSE will not work.");
1445   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1446   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16-byte aligned.  SSE will not work.");
1447 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1448 /*      colscale = 4; */
1449 /*    } */
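   /* When the factor aliases A's column-index array (in-place, ILU(0)-style factorization),
      hand the work off to the dedicated in-place kernel below. */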
1450   if ((unsigned long)bj==(unsigned long)aj) {
1451     ierr = PetscFree(rtmp);CHKERRQ(ierr); /* rtmp is not needed by the in-place kernel; free it to avoid a leak */
         ierr = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C);CHKERRQ(ierr);
         PetscFunctionReturn(0);
1452   }
1453 
1454   for (i=0; i<n; i++) {
1455     nz    = bi[i+1] - bi[i];
1456     bjtmp = bj + bi[i];
1457     /* zero out the 4x4 block accumulators */
1458     /* zero out one register */
1459     XOR_PS(XMM7,XMM7);
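     /* Store the zeroed register into all 16 entries of each block in this row of rtmp --
        the SSE analogue of the scalar x[0..15] = 0.0 initialization in the non-SSE kernel. */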
1460     for  (j=0; j<nz; j++) {
1461       x = rtmp+16*((unsigned int)bjtmp[j]);
1462 /*        x = rtmp+4*bjtmp[j]; */
1463       SSE_INLINE_BEGIN_1(x)
1464         /* Copy zero register to memory locations */
1465         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1466         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1467         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1468         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1469         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1470         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1471         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1472         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1473         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1474       SSE_INLINE_END_1;
1475     }
1476     /* load in initial (unfactored row) */
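     /* Copy the 4x4 blocks of row i of A into the work row rtmp; with natural ordering
        there is no row/column permutation to apply here. */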
1477     nz       = ai[i+1] - ai[i];
1478     ajtmpold = aj + ai[i];
1479     v        = aa + 16*ai[i];
1480     for (j=0; j<nz; j++) {
1481       x = rtmp+16*ajtmpold[j];
1482 /*        x = rtmp+colscale*ajtmpold[j]; */
1483       /* Copy v block into x block */
1484       SSE_INLINE_BEGIN_2(v,x)
1485         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1486         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1487         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1488 
1489         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1490         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1491 
1492         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1493         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1494 
1495         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1496         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1497 
1498         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1499         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1500 
1501         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1502         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1503 
1504         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1505         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1506 
1507         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1508         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1509       SSE_INLINE_END_2;
1510 
1511       v += 16;
1512     }
1513 /*      row = (*bjtmp++)/4; */
1514     row = (unsigned int)(*bjtmp++);
1515     while (row < i) {
1516       pc  = rtmp + 16*row;
1517       SSE_INLINE_BEGIN_1(pc)
1518         /* Load block from lower triangle */
1519         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1520         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1521         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1522 
1523         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1524         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1525 
1526         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1527         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1528 
1529         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1530         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1531 
1532         /* Compare block to zero block */
1533 
1534         SSE_COPY_PS(XMM4,XMM7)
1535         SSE_CMPNEQ_PS(XMM4,XMM0)
1536 
1537         SSE_COPY_PS(XMM5,XMM7)
1538         SSE_CMPNEQ_PS(XMM5,XMM1)
1539 
1540         SSE_COPY_PS(XMM6,XMM7)
1541         SSE_CMPNEQ_PS(XMM6,XMM2)
1542 
1543         SSE_CMPNEQ_PS(XMM7,XMM3)
1544 
1545         /* Reduce the comparisons to one SSE register */
1546         SSE_OR_PS(XMM6,XMM7)
1547         SSE_OR_PS(XMM5,XMM4)
1548         SSE_OR_PS(XMM5,XMM6)
1549       SSE_INLINE_END_1;
1550 
1551       /* Reduce the one SSE register to an integer register for branching */
1552       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1553       MOVEMASK(nonzero,XMM5);
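       /* Each CMPNEQ lane is all ones where the block entry differs from zero; the ORs
          collapse the four columns and MOVEMASK extracts the sign bits, so nonzero == 0
          exactly when the whole 4x4 block is zero.  This is the vector form of the scalar
          test  p1 != 0.0 || ... || p16 != 0.0. */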
1554 
1555       /* If block is nonzero ... */
1556       if (nonzero) {
1557         pv = ba + 16*diag_offset[row];
1558         PREFETCH_L1(&pv[16]);
1559         pj = bj + diag_offset[row] + 1;
1560 
1561         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1562         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1563         /* but the diagonal was inverted already */
1564         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
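         /* For reference, a scalar sketch of the block operation below (column-major 4x4;
            plain C written into a hypothetical temporary m[16] for clarity -- the real code
            overwrites pc column by column while the old columns stay in registers):
              for (c=0; c<4; c++)
                for (r=0; r<4; r++)
                  m[r+4*c] = pc[r   ]*pv[4*c  ] + pc[r+ 4]*pv[4*c+1]
                           + pc[r+ 8]*pv[4*c+2] + pc[r+12]*pv[4*c+3];
            i.e. the block times the already-inverted diagonal block. */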
1565 
1566         SSE_INLINE_BEGIN_2(pv,pc)
1567           /* Column 0, product is accumulated in XMM4 */
1568           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1569           SSE_SHUFFLE(XMM4,XMM4,0x00)
1570           SSE_MULT_PS(XMM4,XMM0)
1571 
1572           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1573           SSE_SHUFFLE(XMM5,XMM5,0x00)
1574           SSE_MULT_PS(XMM5,XMM1)
1575           SSE_ADD_PS(XMM4,XMM5)
1576 
1577           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1578           SSE_SHUFFLE(XMM6,XMM6,0x00)
1579           SSE_MULT_PS(XMM6,XMM2)
1580           SSE_ADD_PS(XMM4,XMM6)
1581 
1582           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1583           SSE_SHUFFLE(XMM7,XMM7,0x00)
1584           SSE_MULT_PS(XMM7,XMM3)
1585           SSE_ADD_PS(XMM4,XMM7)
1586 
1587           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1588           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1589 
1590           /* Column 1, product is accumulated in XMM5 */
1591           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1592           SSE_SHUFFLE(XMM5,XMM5,0x00)
1593           SSE_MULT_PS(XMM5,XMM0)
1594 
1595           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1596           SSE_SHUFFLE(XMM6,XMM6,0x00)
1597           SSE_MULT_PS(XMM6,XMM1)
1598           SSE_ADD_PS(XMM5,XMM6)
1599 
1600           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1601           SSE_SHUFFLE(XMM7,XMM7,0x00)
1602           SSE_MULT_PS(XMM7,XMM2)
1603           SSE_ADD_PS(XMM5,XMM7)
1604 
1605           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1606           SSE_SHUFFLE(XMM6,XMM6,0x00)
1607           SSE_MULT_PS(XMM6,XMM3)
1608           SSE_ADD_PS(XMM5,XMM6)
1609 
1610           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1611           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1612 
1613           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1614 
1615           /* Column 2, product is accumulated in XMM6 */
1616           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1617           SSE_SHUFFLE(XMM6,XMM6,0x00)
1618           SSE_MULT_PS(XMM6,XMM0)
1619 
1620           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1621           SSE_SHUFFLE(XMM7,XMM7,0x00)
1622           SSE_MULT_PS(XMM7,XMM1)
1623           SSE_ADD_PS(XMM6,XMM7)
1624 
1625           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1626           SSE_SHUFFLE(XMM7,XMM7,0x00)
1627           SSE_MULT_PS(XMM7,XMM2)
1628           SSE_ADD_PS(XMM6,XMM7)
1629 
1630           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1631           SSE_SHUFFLE(XMM7,XMM7,0x00)
1632           SSE_MULT_PS(XMM7,XMM3)
1633           SSE_ADD_PS(XMM6,XMM7)
1634 
1635           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1636           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1637 
1638           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1639           /* Column 3, product is accumulated in XMM0 */
1640           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1641           SSE_SHUFFLE(XMM7,XMM7,0x00)
1642           SSE_MULT_PS(XMM0,XMM7)
1643 
1644           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1645           SSE_SHUFFLE(XMM7,XMM7,0x00)
1646           SSE_MULT_PS(XMM1,XMM7)
1647           SSE_ADD_PS(XMM0,XMM1)
1648 
1649           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1650           SSE_SHUFFLE(XMM1,XMM1,0x00)
1651           SSE_MULT_PS(XMM1,XMM2)
1652           SSE_ADD_PS(XMM0,XMM1)
1653 
1654           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1655           SSE_SHUFFLE(XMM7,XMM7,0x00)
1656           SSE_MULT_PS(XMM3,XMM7)
1657           SSE_ADD_PS(XMM0,XMM3)
1658 
1659           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1660           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1661 
1662           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1663           /* This is code to be maintained and read by humans, after all. */
1664           /* Copy Multiplier Col 3 into XMM3 */
1665           SSE_COPY_PS(XMM3,XMM0)
1666           /* Copy Multiplier Col 2 into XMM2 */
1667           SSE_COPY_PS(XMM2,XMM6)
1668           /* Copy Multiplier Col 1 into XMM1 */
1669           SSE_COPY_PS(XMM1,XMM5)
1670           /* Copy Multiplier Col 0 into XMM0 */
1671           SSE_COPY_PS(XMM0,XMM4)
1672         SSE_INLINE_END_2;
1673 
1674         /* Update the row: */
1675         nz = bi[row+1] - diag_offset[row] - 1;
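         /* nz = number of blocks strictly to the right of the diagonal in row 'row' of the
            factor; these are the U blocks used to update the current row i. */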
1676         pv += 16;
1677         for (j=0; j<nz; j++) {
1678           PREFETCH_L1(&pv[16]);
1679           x = rtmp + 16*((unsigned int)pj[j]);
1680 /*            x = rtmp + 4*pj[j]; */
1681 
1682           /* X:=X-M*PV, One column at a time */
1683           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
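           /* Scalar sketch of the update below (plain-C equivalent, column-major 4x4),
              where m denotes the multiplier held in XMM0-XMM3:
                for (c=0; c<4; c++)
                  for (r=0; r<4; r++)
                    x[r+4*c] -= m[r   ]*pv[4*c  ] + m[r+ 4]*pv[4*c+1]
                              + m[r+ 8]*pv[4*c+2] + m[r+12]*pv[4*c+3];        */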
1684           SSE_INLINE_BEGIN_2(x,pv)
1685             /* Load First Column of X*/
1686             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1687             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1688 
1689             /* Matrix-Vector Product: */
1690             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1691             SSE_SHUFFLE(XMM5,XMM5,0x00)
1692             SSE_MULT_PS(XMM5,XMM0)
1693             SSE_SUB_PS(XMM4,XMM5)
1694 
1695             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1696             SSE_SHUFFLE(XMM6,XMM6,0x00)
1697             SSE_MULT_PS(XMM6,XMM1)
1698             SSE_SUB_PS(XMM4,XMM6)
1699 
1700             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1701             SSE_SHUFFLE(XMM7,XMM7,0x00)
1702             SSE_MULT_PS(XMM7,XMM2)
1703             SSE_SUB_PS(XMM4,XMM7)
1704 
1705             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1706             SSE_SHUFFLE(XMM5,XMM5,0x00)
1707             SSE_MULT_PS(XMM5,XMM3)
1708             SSE_SUB_PS(XMM4,XMM5)
1709 
1710             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1711             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1712 
1713             /* Second Column */
1714             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1715             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1716 
1717             /* Matrix-Vector Product: */
1718             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1719             SSE_SHUFFLE(XMM6,XMM6,0x00)
1720             SSE_MULT_PS(XMM6,XMM0)
1721             SSE_SUB_PS(XMM5,XMM6)
1722 
1723             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1724             SSE_SHUFFLE(XMM7,XMM7,0x00)
1725             SSE_MULT_PS(XMM7,XMM1)
1726             SSE_SUB_PS(XMM5,XMM7)
1727 
1728             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1729             SSE_SHUFFLE(XMM4,XMM4,0x00)
1730             SSE_MULT_PS(XMM4,XMM2)
1731             SSE_SUB_PS(XMM5,XMM4)
1732 
1733             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1734             SSE_SHUFFLE(XMM6,XMM6,0x00)
1735             SSE_MULT_PS(XMM6,XMM3)
1736             SSE_SUB_PS(XMM5,XMM6)
1737 
1738             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1739             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1740 
1741             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1742 
1743             /* Third Column */
1744             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1745             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1746 
1747             /* Matrix-Vector Product: */
1748             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1749             SSE_SHUFFLE(XMM7,XMM7,0x00)
1750             SSE_MULT_PS(XMM7,XMM0)
1751             SSE_SUB_PS(XMM6,XMM7)
1752 
1753             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1754             SSE_SHUFFLE(XMM4,XMM4,0x00)
1755             SSE_MULT_PS(XMM4,XMM1)
1756             SSE_SUB_PS(XMM6,XMM4)
1757 
1758             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1759             SSE_SHUFFLE(XMM5,XMM5,0x00)
1760             SSE_MULT_PS(XMM5,XMM2)
1761             SSE_SUB_PS(XMM6,XMM5)
1762 
1763             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1764             SSE_SHUFFLE(XMM7,XMM7,0x00)
1765             SSE_MULT_PS(XMM7,XMM3)
1766             SSE_SUB_PS(XMM6,XMM7)
1767 
1768             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1769             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1770 
1771             /* Fourth Column */
1772             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1773             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1774 
1775             /* Matrix-Vector Product: */
1776             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1777             SSE_SHUFFLE(XMM5,XMM5,0x00)
1778             SSE_MULT_PS(XMM5,XMM0)
1779             SSE_SUB_PS(XMM4,XMM5)
1780 
1781             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1782             SSE_SHUFFLE(XMM6,XMM6,0x00)
1783             SSE_MULT_PS(XMM6,XMM1)
1784             SSE_SUB_PS(XMM4,XMM6)
1785 
1786             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1787             SSE_SHUFFLE(XMM7,XMM7,0x00)
1788             SSE_MULT_PS(XMM7,XMM2)
1789             SSE_SUB_PS(XMM4,XMM7)
1790 
1791             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1792             SSE_SHUFFLE(XMM5,XMM5,0x00)
1793             SSE_MULT_PS(XMM5,XMM3)
1794             SSE_SUB_PS(XMM4,XMM5)
1795 
1796             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1797             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1798           SSE_INLINE_END_2;
1799           pv   += 16;
1800         }
1801         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1802       }
1803       row = (unsigned int)(*bjtmp++);
1804 /*        row = (*bjtmp++)/4; */
1805 /*        bjtmp++; */
1806     }
1807     /* finished row so stick it into b->a */
1808     pv = ba + 16*bi[i];
1809     pj = bj + bi[i];
1810     nz = bi[i+1] - bi[i];
1811 
1812     /* Copy x block back into pv block */
1813     for (j=0; j<nz; j++) {
1814       x  = rtmp+16*((unsigned int)pj[j]);
1815 /*        x  = rtmp+4*pj[j]; */
1816 
1817       SSE_INLINE_BEGIN_2(x,pv)
1818         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1819         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1820         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1821 
1822         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1823         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1824 
1825         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1826         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1827 
1828         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1829         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1830 
1831         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1832         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1833 
1834         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1835         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1836 
1837         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1838         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1839 
1840         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1841         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1842       SSE_INLINE_END_2;
1843       pv += 16;
1844     }
1845     /* invert diagonal block */
1846     w = ba + 16*diag_offset[i];
1847     if (pivotinblocks) {
1848       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1849     } else {
1850       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1851     }
1852 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1853     /* Note: using Cramer's rule here, the flop count below may be unfairly high or low. */
1854   }
1855 
1856   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1857   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1858   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1859   C->assembled = PETSC_TRUE;
1860   ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr); /* bs = 4, bs2 = 16 for this kernel */
1861   /* Flop Count from inverting diagonal blocks */
1862   SSE_SCOPE_END;
1863   PetscFunctionReturn(0);
1864 }
1865 
1866 #endif
1867