xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision eeffb40d691afbdd57a8091619e7ddd44ac5fdca)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 /* ------------------------------------------------------------*/
10 /*
11       Version for when blocks are 4 by 4
12 */
13 #undef __FUNCT__
14 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
16 {
17   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
18   IS             isrow = b->row,isicol = b->icol;
19   PetscErrorCode ierr;
20   const PetscInt *r,*ic;
21   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
22   PetscInt       *ajtmpold,*ajtmp,nz,row;
23   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
24   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
25   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
26   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
27   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
28   MatScalar      m13,m14,m15,m16;
29   MatScalar      *ba = b->a,*aa = a->a;
30   PetscTruth     pivotinblocks = b->pivotinblocks;
31   PetscReal      shift = info->shiftinblocks;
32 
33   PetscFunctionBegin;
34   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
35   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
36   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
37 
38   for (i=0; i<n; i++) {
39     nz    = bi[i+1] - bi[i];
40     ajtmp = bj + bi[i];
41     for  (j=0; j<nz; j++) {
42       x = rtmp+16*ajtmp[j];
43       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
44       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
45     }
46     /* load in the initial (unfactored) row */
47     idx      = r[i];
48     nz       = ai[idx+1] - ai[idx];
49     ajtmpold = aj + ai[idx];
50     v        = aa + 16*ai[idx];
51     for (j=0; j<nz; j++) {
52       x    = rtmp+16*ic[ajtmpold[j]];
53       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
54       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
55       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
56       x[14] = v[14]; x[15] = v[15];
57       v    += 16;
58     }
59     row = *ajtmp++;
60     while (row < i) {
61       pc  = rtmp + 16*row;
62       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
63       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
64       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
65       p15 = pc[14]; p16 = pc[15];
66       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
67           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
68           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
69           || p16 != 0.0) {
70         pv = ba + 16*diag_offset[row];
71         pj = bj + diag_offset[row] + 1;
72         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
73         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
74         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
75         x15 = pv[14]; x16 = pv[15];
76         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
77         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
78         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
79         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
80 
81         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
82         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
83         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
84         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
85 
86         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
87         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
88         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
89         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
90 
91         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
92         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
93         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
94         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
95 
96         nz = bi[row+1] - diag_offset[row] - 1;
97         pv += 16;
98         for (j=0; j<nz; j++) {
99           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
100           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
101           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
102           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
103           x    = rtmp + 16*pj[j];
104           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
105           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
106           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
107           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
108 
109           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
110           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
111           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
112           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
113 
114           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
115           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
116           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
117           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
118 
119           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
120           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
121           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
122           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
123 
124           pv   += 16;
125         }
126         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
127       }
128       row = *ajtmp++;
129     }
130     /* finished row so stick it into b->a */
131     pv = ba + 16*bi[i];
132     pj = bj + bi[i];
133     nz = bi[i+1] - bi[i];
134     for (j=0; j<nz; j++) {
135       x      = rtmp+16*pj[j];
136       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
137       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
138       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
139       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
140       pv   += 16;
141     }
142     /* invert diagonal block */
143     w    = ba + 16*diag_offset[i];
144     if (pivotinblocks) {
145       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
146     } else {
147       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148     }
149   }
150 
151   ierr = PetscFree(rtmp);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
153   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154   C->ops->solve          = MatSolve_SeqBAIJ_4;
155   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
156   C->assembled = PETSC_TRUE;
157   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
158   PetscFunctionReturn(0);
159 }
160 
161 /* MatLUFactorNumeric_SeqBAIJ_4_newdatastruct -
162      copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(), with hand-written block-size-4 versions of
163        Kernel_A_gets_A_times_B()
164        Kernel_A_gets_A_minus_B_times_C()
165        Kernel_A_gets_inverse_A()
166 */
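/*
   For reference, a plain-C sketch of what the inlined block-size-4 kernels compute
   on 4x4 blocks stored column major (a(i,j) = a[i+4*j]).  The sketch and its names
   are illustrative only and are excluded from the build.
*/
#if 0
static void Sketch_A_gets_A_times_B_4(MatScalar *a,const MatScalar *b,MatScalar *w)
{
  PetscInt i,j,k;
  for (i=0; i<16; i++) w[i] = a[i];                  /* w = copy of a */
  for (j=0; j<4; j++) {
    for (i=0; i<4; i++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += w[i+4*k]*b[k+4*j];    /* (a*b)(i,j)   */
      a[i+4*j] = s;                                  /* a <- a*b     */
    }
  }
}
static void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *a,const MatScalar *b,const MatScalar *c)
{
  PetscInt i,j,k;
  for (j=0; j<4; j++) {
    for (i=0; i<4; i++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += b[i+4*k]*c[k+4*j];    /* (b*c)(i,j)   */
      a[i+4*j] -= s;                                 /* a <- a - b*c */
    }
  }
}
#endif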
167 #undef __FUNCT__
168 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct"
169 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
170 {
171   Mat            C=B;
172   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
173   IS             isrow = b->row,isicol = b->icol;
174   PetscErrorCode ierr;
175   const PetscInt *r,*ic,*ics;
176   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
177   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
178   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
179   PetscInt       bs2 = a->bs2,flg;
180   PetscReal      shift = info->shiftinblocks;
181 
182   PetscFunctionBegin;
183   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
184   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
185 
186   /* generate work space needed by the factorization */
187   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
188   mwork = rtmp + bs2*n;
189   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
190   ics  = ic;
191 
192   for (i=0; i<n; i++){
193     /* zero rtmp */
194     /* L part */
195     nz    = bi[i+1] - bi[i];
196     bjtmp = bj + bi[i];
197     for  (j=0; j<nz; j++){
198       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
199     }
200 
201     /* U part */
202     nz = bi[2*n-i+1] - bi[2*n-i];
203     bjtmp = bj + bi[2*n-i];
204     for  (j=0; j<nz; j++){
205       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
206     }
207 
208     /* load in the initial (unfactored) row */
209     nz    = ai[r[i]+1] - ai[r[i]];
210     ajtmp = aj + ai[r[i]];
211     v     = aa + bs2*ai[r[i]];
212     for (j=0; j<nz; j++) {
213       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
214     }
215 
216     /* elimination */
217     bjtmp = bj + bi[i];
218     nzL   = bi[i+1] - bi[i];
219     for(k=0;k < nzL;k++) {
220       row = bjtmp[k];
221       pc = rtmp + bs2*row;
222       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
223       if (flg) {
224         pv = b->a + bs2*bdiag[row];
225         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
226         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
227 
228         pj = b->j + bi[2*n-row]; /* beginning of U(row,:) */
229         pv = b->a + bs2*bi[2*n-row];
230         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries in U(row,:), excluding diag */
231         for (j=0; j<nz; j++) {
232           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
233           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
234           v    = rtmp + bs2*pj[j];
235           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
236           pv  += bs2;
237         }
238         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
239       }
240     }
241 
242     /* finished row so stick it into b->a */
243     /* L part */
244     pv   = b->a + bs2*bi[i] ;
245     pj   = b->j + bi[i] ;
246     nz   = bi[i+1] - bi[i];
247     for (j=0; j<nz; j++) {
248       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
249     }
250 
251     /* Mark diagonal and invert diagonal for simpler triangular solves */
252     pv   = b->a + bs2*bdiag[i];
253     pj   = b->j + bdiag[i];
254     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
255     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
256     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
257 
258     /* U part */
259     pv = b->a + bs2*bi[2*n-i];
260     pj = b->j + bi[2*n-i];
261     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
262     for (j=0; j<nz; j++){
263       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
264     }
265   }
266 
267   ierr = PetscFree(rtmp);CHKERRQ(ierr);
268   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
269   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
270 
271   C->assembled = PETSC_TRUE;
272   ierr = PetscLogFlops(1.3333*4*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
273   PetscFunctionReturn(0);
274 }
275 
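/*
   MatLUFactorNumeric_SeqBAIJ_4_newdatastruct_v2 -
     same algorithm as MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(), but the U part
     of block row i is addressed through bdiag[]: the diagonal block is stored at
     bdiag[i] and the remaining U entries at bdiag[i+1]+1 ... bdiag[i]-1, instead of
     the bi[2*n-i] addressing used above.
*/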
276 #undef __FUNCT__
277 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct_v2"
278 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
279 {
280   Mat            C=B;
281   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
282   IS             isrow = b->row,isicol = b->icol;
283   PetscErrorCode ierr;
284   const PetscInt *r,*ic,*ics;
285   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
286   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
287   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
288   PetscInt       bs2 = a->bs2,flg;
289   PetscReal      shift = info->shiftinblocks;
290 
291   PetscFunctionBegin;
292   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
293   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
294 
295   /* generate work space needed by the factorization */
296   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
297   mwork = rtmp + bs2*n;
298   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
299   ics  = ic;
300 
301   for (i=0; i<n; i++){
302     /* zero rtmp */
303     /* L part */
304     nz    = bi[i+1] - bi[i];
305     bjtmp = bj + bi[i];
306     for  (j=0; j<nz; j++){
307       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
308     }
309 
310     /* U part */
311     nz = bdiag[i]-bdiag[i+1];
312     bjtmp = bj + bdiag[i+1]+1;
313     for  (j=0; j<nz; j++){
314       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
315     }
316 
317     /* load in the initial (unfactored) row */
318     nz    = ai[r[i]+1] - ai[r[i]];
319     ajtmp = aj + ai[r[i]];
320     v     = aa + bs2*ai[r[i]];
321     for (j=0; j<nz; j++) {
322       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
323     }
324 
325     /* elimination */
326     bjtmp = bj + bi[i];
327     nzL   = bi[i+1] - bi[i];
328     for(k=0;k < nzL;k++) {
329       row = bjtmp[k];
330       pc = rtmp + bs2*row;
331       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
332       if (flg) {
333         pv = b->a + bs2*bdiag[row];
334         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
335         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
336 
337         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
338         pv = b->a + bs2*(bdiag[row+1]+1);
339         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
340         for (j=0; j<nz; j++) {
341           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
342           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
343           v    = rtmp + bs2*pj[j];
344           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
345           pv  += bs2;
346         }
347         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
348       }
349     }
350 
351     /* finished row so stick it into b->a */
352     /* L part */
353     pv   = b->a + bs2*bi[i] ;
354     pj   = b->j + bi[i] ;
355     nz   = bi[i+1] - bi[i];
356     for (j=0; j<nz; j++) {
357       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
358     }
359 
360     /* Mark diagonal and invert diagonal for simpler triangular solves */
361     pv   = b->a + bs2*bdiag[i];
362     pj   = b->j + bdiag[i];
363     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
364     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
365     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
366 
367     /* U part */
368     pv = b->a + bs2*(bdiag[i+1]+1);
369     pj = b->j + bdiag[i+1]+1;
370     nz = bdiag[i] - bdiag[i+1] - 1;
371     for (j=0; j<nz; j++){
372       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
373     }
374   }
375 
376   ierr = PetscFree(rtmp);CHKERRQ(ierr);
377   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
378   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
379 
380   C->assembled = PETSC_TRUE;
381   ierr = PetscLogFlops(1.3333*4*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
382   PetscFunctionReturn(0);
383 }
384 
385 #undef __FUNCT__
386 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
387 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
388 {
389 /*
390     Default version for when blocks are 4 by 4, using the natural ordering
391 */
392   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
393   PetscErrorCode ierr;
394   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
395   PetscInt       *ajtmpold,*ajtmp,nz,row;
396   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
397   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
398   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
399   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
400   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
401   MatScalar      m13,m14,m15,m16;
402   MatScalar      *ba = b->a,*aa = a->a;
403   PetscTruth     pivotinblocks = b->pivotinblocks;
404   PetscReal      shift = info->shiftinblocks;
405 
406   PetscFunctionBegin;
407   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
408 
409   for (i=0; i<n; i++) {
410     nz    = bi[i+1] - bi[i];
411     ajtmp = bj + bi[i];
412     for  (j=0; j<nz; j++) {
413       x = rtmp+16*ajtmp[j];
414       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
415       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
416     }
417     /* load in the initial (unfactored) row */
418     nz       = ai[i+1] - ai[i];
419     ajtmpold = aj + ai[i];
420     v        = aa + 16*ai[i];
421     for (j=0; j<nz; j++) {
422       x    = rtmp+16*ajtmpold[j];
423       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
424       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
425       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
426       x[14] = v[14]; x[15] = v[15];
427       v    += 16;
428     }
429     row = *ajtmp++;
430     while (row < i) {
431       pc  = rtmp + 16*row;
432       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
433       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
434       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
435       p15 = pc[14]; p16 = pc[15];
436       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
437           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
438           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
439           || p16 != 0.0) {
440         pv = ba + 16*diag_offset[row];
441         pj = bj + diag_offset[row] + 1;
442         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
443         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
444         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
445         x15 = pv[14]; x16 = pv[15];
446         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
447         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
448         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
449         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
450 
451         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
452         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
453         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
454         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
455 
456         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
457         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
458         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
459         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
460 
461         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
462         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
463         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
464         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
465         nz = bi[row+1] - diag_offset[row] - 1;
466         pv += 16;
467         for (j=0; j<nz; j++) {
468           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
469           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
470           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
471           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
472           x    = rtmp + 16*pj[j];
473           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
474           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
475           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
476           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
477 
478           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
479           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
480           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
481           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
482 
483           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
484           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
485           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
486           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
487 
488           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
489           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
490           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
491           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
492 
493           pv   += 16;
494         }
495         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
496       }
497       row = *ajtmp++;
498     }
499     /* finished row so stick it into b->a */
500     pv = ba + 16*bi[i];
501     pj = bj + bi[i];
502     nz = bi[i+1] - bi[i];
503     for (j=0; j<nz; j++) {
504       x      = rtmp+16*pj[j];
505       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
506       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
507       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
508       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
509       pv   += 16;
510     }
511     /* invert diagonal block */
512     w = ba + 16*diag_offset[i];
513     if (pivotinblocks) {
514       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
515     } else {
516       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
517     }
518   }
519 
520   ierr = PetscFree(rtmp);CHKERRQ(ierr);
521   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
522   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
523   C->assembled = PETSC_TRUE;
524   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
525   PetscFunctionReturn(0);
526 }
527 /*
528   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct -
529     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct()
530 */
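/*
   With the natural ordering the row and column permutations are the identity, so
   no index sets are needed and block row i is loaded directly through ai/aj.
*/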
531 #undef __FUNCT__
532 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct"
533 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
534 {
535   Mat            C=B;
536   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
537   PetscErrorCode ierr;
538   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
539   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
540   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
541   PetscInt       bs2 = a->bs2,flg;
542   PetscReal      shift = info->shiftinblocks;
543 
544   PetscFunctionBegin;
545   /* generate work space needed by the factorization */
546   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
547   mwork = rtmp + bs2*n;
548   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
549 
550   for (i=0; i<n; i++){
551     /* zero rtmp */
552     /* L part */
553     nz    = bi[i+1] - bi[i];
554     bjtmp = bj + bi[i];
555     for  (j=0; j<nz; j++){
556       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
557     }
558 
559     /* U part */
560     nz = bi[2*n-i+1] - bi[2*n-i];
561     bjtmp = bj + bi[2*n-i];
562     for  (j=0; j<nz; j++){
563       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
564     }
565 
566     /* load in the initial (unfactored) row */
567     nz    = ai[i+1] - ai[i];
568     ajtmp = aj + ai[i];
569     v     = aa + bs2*ai[i];
570     for (j=0; j<nz; j++) {
571       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
572     }
573 
574     /* elimination */
575     bjtmp = bj + bi[i];
576     nzL   = bi[i+1] - bi[i];
577     for(k=0;k < nzL;k++) {
578       row = bjtmp[k];
579       pc = rtmp + bs2*row;
580       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
581       if (flg) {
582         pv = b->a + bs2*bdiag[row];
583         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
584         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
585 
586         pj = b->j + bi[2*n-row]; /* beginning of U(row,:) */
587         pv = b->a + bs2*bi[2*n-row];
588         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries in U(row,:), excluding diag */
589         for (j=0; j<nz; j++) {
590           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
591           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
592           v    = rtmp + bs2*pj[j];
593           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
594           pv  += bs2;
595         }
596         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
597       }
598     }
599 
600     /* finished row so stick it into b->a */
601     /* L part */
602     pv   = b->a + bs2*bi[i] ;
603     pj   = b->j + bi[i] ;
604     nz   = bi[i+1] - bi[i];
605     for (j=0; j<nz; j++) {
606       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
607     }
608 
609     /* Mark diagonal and invert diagonal for simpler triangular solves */
610     pv   = b->a + bs2*bdiag[i];
611     pj   = b->j + bdiag[i];
612     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
613     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
614     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
615 
616     /* U part */
617     pv = b->a + bs2*bi[2*n-i];
618     pj = b->j + bi[2*n-i];
619     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
620     for (j=0; j<nz; j++){
621       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
622     }
623   }
624 
625   ierr = PetscFree(rtmp);CHKERRQ(ierr);
626   C->assembled = PETSC_TRUE;
627   ierr = PetscLogFlops(1.3333*4*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
628   PetscFunctionReturn(0);
629 }
630 
631 #undef __FUNCT__
632 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
633 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
634 {
635   Mat            C=B;
636   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
637   PetscErrorCode ierr;
638   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
639   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
640   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
641   PetscInt       bs2 = a->bs2,flg;
642   PetscReal      shift = info->shiftinblocks;
643 
644   PetscFunctionBegin;
645   /* generate work space needed by the factorization */
646   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
647   mwork = rtmp + bs2*n;
648   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
649 
650   for (i=0; i<n; i++){
651     /* zero rtmp */
652     /* L part */
653     nz    = bi[i+1] - bi[i];
654     bjtmp = bj + bi[i];
655     for  (j=0; j<nz; j++){
656       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
657     }
658 
659     /* U part */
660     nz = bdiag[i] - bdiag[i+1];
661     bjtmp = bj + bdiag[i+1]+1;
662     for  (j=0; j<nz; j++){
663       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
664     }
665 
666     /* load in the initial (unfactored) row */
667     nz    = ai[i+1] - ai[i];
668     ajtmp = aj + ai[i];
669     v     = aa + bs2*ai[i];
670     for (j=0; j<nz; j++) {
671       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
672     }
673 
674     /* elimination */
675     bjtmp = bj + bi[i];
676     nzL   = bi[i+1] - bi[i];
677     for(k=0;k < nzL;k++) {
678       row = bjtmp[k];
679       pc = rtmp + bs2*row;
680       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
681       if (flg) {
682         pv = b->a + bs2*bdiag[row];
683         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
684         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
685 
686         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
687         pv = b->a + bs2*(bdiag[row+1]+1);
688         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
689         for (j=0; j<nz; j++) {
690           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
691           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
692           v    = rtmp + bs2*pj[j];
693           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
694           pv  += bs2;
695         }
696         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
697       }
698     }
699 
700     /* finished row so stick it into b->a */
701     /* L part */
702     pv   = b->a + bs2*bi[i] ;
703     pj   = b->j + bi[i] ;
704     nz   = bi[i+1] - bi[i];
705     for (j=0; j<nz; j++) {
706       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
707     }
708 
709     /* Mark diagonal and invert diagonal for simplier triangular solves */
710     pv   = b->a + bs2*bdiag[i];
711     pj   = b->j + bdiag[i];
712     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
713     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
714     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
715 
716     /* U part */
717     pv = b->a + bs2*(bdiag[i+1]+1);
718     pj = b->j + bdiag[i+1]+1;
719     nz = bdiag[i] - bdiag[i+1] - 1;
720     for (j=0; j<nz; j++){
721       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
722     }
723   }
724 
725   ierr = PetscFree(rtmp);CHKERRQ(ierr);
726   C->assembled = PETSC_TRUE;
727   ierr = PetscLogFlops(1.3333*4*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
728   PetscFunctionReturn(0);
729 }
730 
731 #if defined(PETSC_HAVE_SSE)
732 
733 #include PETSC_HAVE_SSE
734 
735 /* SSE version for when blocks are 4 by 4, using the natural ordering */
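/*
   In the SSE kernels below each XMM register holds one column of a 4x4 block
   (four single-precision MatScalar values): SSE_LOADL_PS/SSE_LOADH_PS assemble a
   column from two 64-bit halves, SSE_LOAD_SS followed by SSE_SHUFFLE broadcasts a
   single entry across a register, and the multiplier and row updates are formed
   one column at a time with SSE_MULT_PS/SSE_ADD_PS/SSE_SUB_PS, mirroring the
   scalar 4x4 code above.
*/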
736 #undef __FUNCT__
737 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
738 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
739 {
740   Mat         C = B;
      Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
741   PetscErrorCode ierr;
742   int i,j,n = a->mbs;
743   int         *bj = b->j,*bjtmp,*pj;
744   int         row;
745   int         *ajtmpold,nz,*bi=b->i;
746   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
747   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
748   MatScalar   *ba = b->a,*aa = a->a;
749   int         nonzero=0;
750 /*    int            nonzero=0,colscale = 16; */
751   PetscTruth  pivotinblocks = b->pivotinblocks;
752   PetscReal      shift = info->shiftinblocks;
753 
754   PetscFunctionBegin;
755   SSE_SCOPE_BEGIN;
756 
757   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
758   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
759   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
760   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
761 /*    if ((unsigned long)bj==(unsigned long)aj) { */
762 /*      colscale = 4; */
763 /*    } */
764   for (i=0; i<n; i++) {
765     nz    = bi[i+1] - bi[i];
766     bjtmp = bj + bi[i];
767     /* zero out the 4x4 block accumulators */
768     /* zero out one register */
769     XOR_PS(XMM7,XMM7);
770     for  (j=0; j<nz; j++) {
771       x = rtmp+16*bjtmp[j];
772 /*        x = rtmp+4*bjtmp[j]; */
773       SSE_INLINE_BEGIN_1(x)
774         /* Copy zero register to memory locations */
775         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
776         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
777         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
778         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
779         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
780         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
781         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
782         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
783         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
784       SSE_INLINE_END_1;
785     }
786     /* load in the initial (unfactored) row */
787     nz       = ai[i+1] - ai[i];
788     ajtmpold = aj + ai[i];
789     v        = aa + 16*ai[i];
790     for (j=0; j<nz; j++) {
791       x = rtmp+16*ajtmpold[j];
792 /*        x = rtmp+colscale*ajtmpold[j]; */
793       /* Copy v block into x block */
794       SSE_INLINE_BEGIN_2(v,x)
795         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
796         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
797         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
798 
799         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
800         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
801 
802         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
803         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
804 
805         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
806         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
807 
808         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
809         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
810 
811         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
812         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
813 
814         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
815         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
816 
817         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
818         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
819       SSE_INLINE_END_2;
820 
821       v += 16;
822     }
823 /*      row = (*bjtmp++)/4; */
824     row = *bjtmp++;
825     while (row < i) {
826       pc  = rtmp + 16*row;
827       SSE_INLINE_BEGIN_1(pc)
828         /* Load block from lower triangle */
829         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
830         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
831         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
832 
833         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
834         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
835 
836         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
837         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
838 
839         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
840         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
841 
842         /* Compare block to zero block */
843 
844         SSE_COPY_PS(XMM4,XMM7)
845         SSE_CMPNEQ_PS(XMM4,XMM0)
846 
847         SSE_COPY_PS(XMM5,XMM7)
848         SSE_CMPNEQ_PS(XMM5,XMM1)
849 
850         SSE_COPY_PS(XMM6,XMM7)
851         SSE_CMPNEQ_PS(XMM6,XMM2)
852 
853         SSE_CMPNEQ_PS(XMM7,XMM3)
854 
855         /* Reduce the comparisons to one SSE register */
856         SSE_OR_PS(XMM6,XMM7)
857         SSE_OR_PS(XMM5,XMM4)
858         SSE_OR_PS(XMM5,XMM6)
859       SSE_INLINE_END_1;
860 
861       /* Reduce the one SSE register to an integer register for branching */
862       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
863       MOVEMASK(nonzero,XMM5);
864 
865       /* If block is nonzero ... */
866       if (nonzero) {
867         pv = ba + 16*diag_offset[row];
868         PREFETCH_L1(&pv[16]);
869         pj = bj + diag_offset[row] + 1;
870 
871         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
872         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
873         /* but the diagonal was inverted already */
874         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
875 
876         SSE_INLINE_BEGIN_2(pv,pc)
877           /* Column 0, product is accumulated in XMM4 */
878           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
879           SSE_SHUFFLE(XMM4,XMM4,0x00)
880           SSE_MULT_PS(XMM4,XMM0)
881 
882           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
883           SSE_SHUFFLE(XMM5,XMM5,0x00)
884           SSE_MULT_PS(XMM5,XMM1)
885           SSE_ADD_PS(XMM4,XMM5)
886 
887           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
888           SSE_SHUFFLE(XMM6,XMM6,0x00)
889           SSE_MULT_PS(XMM6,XMM2)
890           SSE_ADD_PS(XMM4,XMM6)
891 
892           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
893           SSE_SHUFFLE(XMM7,XMM7,0x00)
894           SSE_MULT_PS(XMM7,XMM3)
895           SSE_ADD_PS(XMM4,XMM7)
896 
897           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
898           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
899 
900           /* Column 1, product is accumulated in XMM5 */
901           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
902           SSE_SHUFFLE(XMM5,XMM5,0x00)
903           SSE_MULT_PS(XMM5,XMM0)
904 
905           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
906           SSE_SHUFFLE(XMM6,XMM6,0x00)
907           SSE_MULT_PS(XMM6,XMM1)
908           SSE_ADD_PS(XMM5,XMM6)
909 
910           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
911           SSE_SHUFFLE(XMM7,XMM7,0x00)
912           SSE_MULT_PS(XMM7,XMM2)
913           SSE_ADD_PS(XMM5,XMM7)
914 
915           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
916           SSE_SHUFFLE(XMM6,XMM6,0x00)
917           SSE_MULT_PS(XMM6,XMM3)
918           SSE_ADD_PS(XMM5,XMM6)
919 
920           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
921           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
922 
923           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
924 
925           /* Column 2, product is accumulated in XMM6 */
926           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
927           SSE_SHUFFLE(XMM6,XMM6,0x00)
928           SSE_MULT_PS(XMM6,XMM0)
929 
930           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
931           SSE_SHUFFLE(XMM7,XMM7,0x00)
932           SSE_MULT_PS(XMM7,XMM1)
933           SSE_ADD_PS(XMM6,XMM7)
934 
935           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
936           SSE_SHUFFLE(XMM7,XMM7,0x00)
937           SSE_MULT_PS(XMM7,XMM2)
938           SSE_ADD_PS(XMM6,XMM7)
939 
940           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
941           SSE_SHUFFLE(XMM7,XMM7,0x00)
942           SSE_MULT_PS(XMM7,XMM3)
943           SSE_ADD_PS(XMM6,XMM7)
944 
945           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
946           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
947 
948           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
949           /* Column 3, product is accumulated in XMM0 */
950           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
951           SSE_SHUFFLE(XMM7,XMM7,0x00)
952           SSE_MULT_PS(XMM0,XMM7)
953 
954           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
955           SSE_SHUFFLE(XMM7,XMM7,0x00)
956           SSE_MULT_PS(XMM1,XMM7)
957           SSE_ADD_PS(XMM0,XMM1)
958 
959           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
960           SSE_SHUFFLE(XMM1,XMM1,0x00)
961           SSE_MULT_PS(XMM1,XMM2)
962           SSE_ADD_PS(XMM0,XMM1)
963 
964           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
965           SSE_SHUFFLE(XMM7,XMM7,0x00)
966           SSE_MULT_PS(XMM3,XMM7)
967           SSE_ADD_PS(XMM0,XMM3)
968 
969           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
970           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
971 
972           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
973           /* This is code to be maintained and read by humans after all. */
974           /* Copy Multiplier Col 3 into XMM3 */
975           SSE_COPY_PS(XMM3,XMM0)
976           /* Copy Multiplier Col 2 into XMM2 */
977           SSE_COPY_PS(XMM2,XMM6)
978           /* Copy Multiplier Col 1 into XMM1 */
979           SSE_COPY_PS(XMM1,XMM5)
980           /* Copy Multiplier Col 0 into XMM0 */
981           SSE_COPY_PS(XMM0,XMM4)
982         SSE_INLINE_END_2;
983 
984         /* Update the row: */
985         nz = bi[row+1] - diag_offset[row] - 1;
986         pv += 16;
987         for (j=0; j<nz; j++) {
988           PREFETCH_L1(&pv[16]);
989           x = rtmp + 16*pj[j];
990 /*            x = rtmp + 4*pj[j]; */
991 
992           /* X:=X-M*PV, One column at a time */
993           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
994           SSE_INLINE_BEGIN_2(x,pv)
995             /* Load First Column of X*/
996             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
997             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
998 
999             /* Matrix-Vector Product: */
1000             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1001             SSE_SHUFFLE(XMM5,XMM5,0x00)
1002             SSE_MULT_PS(XMM5,XMM0)
1003             SSE_SUB_PS(XMM4,XMM5)
1004 
1005             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1006             SSE_SHUFFLE(XMM6,XMM6,0x00)
1007             SSE_MULT_PS(XMM6,XMM1)
1008             SSE_SUB_PS(XMM4,XMM6)
1009 
1010             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1011             SSE_SHUFFLE(XMM7,XMM7,0x00)
1012             SSE_MULT_PS(XMM7,XMM2)
1013             SSE_SUB_PS(XMM4,XMM7)
1014 
1015             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1016             SSE_SHUFFLE(XMM5,XMM5,0x00)
1017             SSE_MULT_PS(XMM5,XMM3)
1018             SSE_SUB_PS(XMM4,XMM5)
1019 
1020             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1021             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1022 
1023             /* Second Column */
1024             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1025             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1026 
1027             /* Matrix-Vector Product: */
1028             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1029             SSE_SHUFFLE(XMM6,XMM6,0x00)
1030             SSE_MULT_PS(XMM6,XMM0)
1031             SSE_SUB_PS(XMM5,XMM6)
1032 
1033             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1034             SSE_SHUFFLE(XMM7,XMM7,0x00)
1035             SSE_MULT_PS(XMM7,XMM1)
1036             SSE_SUB_PS(XMM5,XMM7)
1037 
1038             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1039             SSE_SHUFFLE(XMM4,XMM4,0x00)
1040             SSE_MULT_PS(XMM4,XMM2)
1041             SSE_SUB_PS(XMM5,XMM4)
1042 
1043             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1044             SSE_SHUFFLE(XMM6,XMM6,0x00)
1045             SSE_MULT_PS(XMM6,XMM3)
1046             SSE_SUB_PS(XMM5,XMM6)
1047 
1048             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1049             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1050 
1051             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1052 
1053             /* Third Column */
1054             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1055             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1056 
1057             /* Matrix-Vector Product: */
1058             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1059             SSE_SHUFFLE(XMM7,XMM7,0x00)
1060             SSE_MULT_PS(XMM7,XMM0)
1061             SSE_SUB_PS(XMM6,XMM7)
1062 
1063             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1064             SSE_SHUFFLE(XMM4,XMM4,0x00)
1065             SSE_MULT_PS(XMM4,XMM1)
1066             SSE_SUB_PS(XMM6,XMM4)
1067 
1068             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1069             SSE_SHUFFLE(XMM5,XMM5,0x00)
1070             SSE_MULT_PS(XMM5,XMM2)
1071             SSE_SUB_PS(XMM6,XMM5)
1072 
1073             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1074             SSE_SHUFFLE(XMM7,XMM7,0x00)
1075             SSE_MULT_PS(XMM7,XMM3)
1076             SSE_SUB_PS(XMM6,XMM7)
1077 
1078             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1079             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1080 
1081             /* Fourth Column */
1082             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1083             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1084 
1085             /* Matrix-Vector Product: */
1086             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1087             SSE_SHUFFLE(XMM5,XMM5,0x00)
1088             SSE_MULT_PS(XMM5,XMM0)
1089             SSE_SUB_PS(XMM4,XMM5)
1090 
1091             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1092             SSE_SHUFFLE(XMM6,XMM6,0x00)
1093             SSE_MULT_PS(XMM6,XMM1)
1094             SSE_SUB_PS(XMM4,XMM6)
1095 
1096             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1097             SSE_SHUFFLE(XMM7,XMM7,0x00)
1098             SSE_MULT_PS(XMM7,XMM2)
1099             SSE_SUB_PS(XMM4,XMM7)
1100 
1101             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1102             SSE_SHUFFLE(XMM5,XMM5,0x00)
1103             SSE_MULT_PS(XMM5,XMM3)
1104             SSE_SUB_PS(XMM4,XMM5)
1105 
1106             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1107             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1108           SSE_INLINE_END_2;
1109           pv   += 16;
1110         }
1111         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1112       }
1113       row = *bjtmp++;
1114 /*        row = (*bjtmp++)/4; */
1115     }
1116     /* finished row so stick it into b->a */
1117     pv = ba + 16*bi[i];
1118     pj = bj + bi[i];
1119     nz = bi[i+1] - bi[i];
1120 
1121     /* Copy x block back into pv block */
1122     for (j=0; j<nz; j++) {
1123       x  = rtmp+16*pj[j];
1124 /*        x  = rtmp+4*pj[j]; */
1125 
1126       SSE_INLINE_BEGIN_2(x,pv)
1127         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1128         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1129         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1130 
1131         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1132         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1133 
1134         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1135         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1136 
1137         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1138         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1139 
1140         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1141         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1142 
1143         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1144         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1145 
1146         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1147         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1148 
1149         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1150         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1151       SSE_INLINE_END_2;
1152       pv += 16;
1153     }
1154     /* invert diagonal block */
1155     w = ba + 16*diag_offset[i];
1156     if (pivotinblocks) {
1157       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1158     } else {
1159       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1160     }
1161 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1162     /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1163   }
1164 
1165   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1166   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1167   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1168   C->assembled = PETSC_TRUE;
1169   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1170   /* Flop Count from inverting diagonal blocks */
1171   SSE_SCOPE_END;
1172   PetscFunctionReturn(0);
1173 }
1174 
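/*
   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace -
     in-place variant of the SSE routine above; the block-column indices in a->j and
     b->j are read as unsigned short (hence "usj"), which assumes the number of block
     columns fits in 16 bits.
*/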
1175 #undef __FUNCT__
1176 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
1177 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
1178 {
1179   Mat            A=C;
1180   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1181   PetscErrorCode ierr;
1182   int i,j,n = a->mbs;
1183   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1184   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
1185   unsigned int   row;
1186   int            nz,*bi=b->i;
1187   int            *diag_offset = b->diag,*ai=a->i;
1188   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1189   MatScalar      *ba = b->a,*aa = a->a;
1190   int            nonzero=0;
1191 /*    int            nonzero=0,colscale = 16; */
1192   PetscTruth     pivotinblocks = b->pivotinblocks;
1193   PetscReal      shift = 0.0; /* no MatFactorInfo is passed to this in-place variant; assume no shift in blocks */
1194 
1195   PetscFunctionBegin;
1196   SSE_SCOPE_BEGIN;
1197 
1198   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1199   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1200   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1201   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1202 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1203 /*      colscale = 4; */
1204 /*    } */
1205 
1206   for (i=0; i<n; i++) {
1207     nz    = bi[i+1] - bi[i];
1208     bjtmp = bj + bi[i];
1209     /* zero out the 4x4 block accumulators */
1210     /* zero out one register */
1211     XOR_PS(XMM7,XMM7);
1212     for  (j=0; j<nz; j++) {
1213       x = rtmp+16*((unsigned int)bjtmp[j]);
1214 /*        x = rtmp+4*bjtmp[j]; */
1215       SSE_INLINE_BEGIN_1(x)
1216         /* Copy zero register to memory locations */
1217         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1218         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1219         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1220         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1221         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1222         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1223         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1224         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1225         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1226       SSE_INLINE_END_1;
1227     }
1228     /* load in the initial (unfactored) row */
1229     nz    = ai[i+1] - ai[i];
1230     ajtmp = aj + ai[i];
1231     v     = aa + 16*ai[i];
1232     for (j=0; j<nz; j++) {
1233       x = rtmp+16*((unsigned int)ajtmp[j]);
1234 /*        x = rtmp+colscale*ajtmp[j]; */
1235       /* Copy v block into x block */
1236       SSE_INLINE_BEGIN_2(v,x)
1237         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1238         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1239         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1240 
1241         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1242         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1243 
1244         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1245         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1246 
1247         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1248         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1249 
1250         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1251         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1252 
1253         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1254         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1255 
1256         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1257         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1258 
1259         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1260         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1261       SSE_INLINE_END_2;
1262 
1263       v += 16;
1264     }
1265 /*      row = (*bjtmp++)/4; */
1266     row = (unsigned int)(*bjtmp++);
1267     while (row < i) {
1268       pc  = rtmp + 16*row;
1269       SSE_INLINE_BEGIN_1(pc)
1270         /* Load block from lower triangle */
1271         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1272         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1273         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1274 
1275         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1276         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1277 
1278         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1279         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1280 
1281         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1282         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1283 
1284         /* Compare block to zero block */
1285 
1286         SSE_COPY_PS(XMM4,XMM7)
1287         SSE_CMPNEQ_PS(XMM4,XMM0)
1288 
1289         SSE_COPY_PS(XMM5,XMM7)
1290         SSE_CMPNEQ_PS(XMM5,XMM1)
1291 
1292         SSE_COPY_PS(XMM6,XMM7)
1293         SSE_CMPNEQ_PS(XMM6,XMM2)
1294 
1295         SSE_CMPNEQ_PS(XMM7,XMM3)
1296 
1297         /* Reduce the comparisons to one SSE register */
1298         SSE_OR_PS(XMM6,XMM7)
1299         SSE_OR_PS(XMM5,XMM4)
1300         SSE_OR_PS(XMM5,XMM6)
1301       SSE_INLINE_END_1;
1302 
1303       /* Reduce the one SSE register to an integer register for branching */
1304       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1305       MOVEMASK(nonzero,XMM5);
1306 
1307       /* If block is nonzero ... */
1308       if (nonzero) {
1309         pv = ba + 16*diag_offset[row];
1310         PREFETCH_L1(&pv[16]);
1311         pj = bj + diag_offset[row] + 1;
1312 
1313         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1314         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1315         /* but the diagonal was inverted already */
1316         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1317 
1318         SSE_INLINE_BEGIN_2(pv,pc)
1319           /* Column 0, product is accumulated in XMM4 */
1320           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1321           SSE_SHUFFLE(XMM4,XMM4,0x00)
1322           SSE_MULT_PS(XMM4,XMM0)
1323 
1324           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1325           SSE_SHUFFLE(XMM5,XMM5,0x00)
1326           SSE_MULT_PS(XMM5,XMM1)
1327           SSE_ADD_PS(XMM4,XMM5)
1328 
1329           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1330           SSE_SHUFFLE(XMM6,XMM6,0x00)
1331           SSE_MULT_PS(XMM6,XMM2)
1332           SSE_ADD_PS(XMM4,XMM6)
1333 
1334           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1335           SSE_SHUFFLE(XMM7,XMM7,0x00)
1336           SSE_MULT_PS(XMM7,XMM3)
1337           SSE_ADD_PS(XMM4,XMM7)
1338 
1339           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1340           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1341 
1342           /* Column 1, product is accumulated in XMM5 */
1343           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1344           SSE_SHUFFLE(XMM5,XMM5,0x00)
1345           SSE_MULT_PS(XMM5,XMM0)
1346 
1347           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1348           SSE_SHUFFLE(XMM6,XMM6,0x00)
1349           SSE_MULT_PS(XMM6,XMM1)
1350           SSE_ADD_PS(XMM5,XMM6)
1351 
1352           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1353           SSE_SHUFFLE(XMM7,XMM7,0x00)
1354           SSE_MULT_PS(XMM7,XMM2)
1355           SSE_ADD_PS(XMM5,XMM7)
1356 
1357           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1358           SSE_SHUFFLE(XMM6,XMM6,0x00)
1359           SSE_MULT_PS(XMM6,XMM3)
1360           SSE_ADD_PS(XMM5,XMM6)
1361 
1362           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1363           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1364 
1365           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1366 
1367           /* Column 2, product is accumulated in XMM6 */
1368           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1369           SSE_SHUFFLE(XMM6,XMM6,0x00)
1370           SSE_MULT_PS(XMM6,XMM0)
1371 
1372           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1373           SSE_SHUFFLE(XMM7,XMM7,0x00)
1374           SSE_MULT_PS(XMM7,XMM1)
1375           SSE_ADD_PS(XMM6,XMM7)
1376 
1377           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1378           SSE_SHUFFLE(XMM7,XMM7,0x00)
1379           SSE_MULT_PS(XMM7,XMM2)
1380           SSE_ADD_PS(XMM6,XMM7)
1381 
1382           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1383           SSE_SHUFFLE(XMM7,XMM7,0x00)
1384           SSE_MULT_PS(XMM7,XMM3)
1385           SSE_ADD_PS(XMM6,XMM7)
1386 
1387           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1388           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1389 
1390           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1391           /* Column 3, product is accumulated in XMM0 */
1392           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1393           SSE_SHUFFLE(XMM7,XMM7,0x00)
1394           SSE_MULT_PS(XMM0,XMM7)
1395 
1396           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1397           SSE_SHUFFLE(XMM7,XMM7,0x00)
1398           SSE_MULT_PS(XMM1,XMM7)
1399           SSE_ADD_PS(XMM0,XMM1)
1400 
1401           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1402           SSE_SHUFFLE(XMM1,XMM1,0x00)
1403           SSE_MULT_PS(XMM1,XMM2)
1404           SSE_ADD_PS(XMM0,XMM1)
1405 
1406           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1407           SSE_SHUFFLE(XMM7,XMM7,0x00)
1408           SSE_MULT_PS(XMM3,XMM7)
1409           SSE_ADD_PS(XMM0,XMM3)
1410 
1411           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1412           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1413 
1414           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1415           /* This is code to be maintained and read by humans, after all. */
1416           /* Copy Multiplier Col 3 into XMM3 */
1417           SSE_COPY_PS(XMM3,XMM0)
1418           /* Copy Multiplier Col 2 into XMM2 */
1419           SSE_COPY_PS(XMM2,XMM6)
1420           /* Copy Multiplier Col 1 into XMM1 */
1421           SSE_COPY_PS(XMM1,XMM5)
1422           /* Copy Multiplier Col 0 into XMM0 */
1423           SSE_COPY_PS(XMM0,XMM4)
1424         SSE_INLINE_END_2;
1425 
1426         /* Update the row: */
1427         nz = bi[row+1] - diag_offset[row] - 1;
1428         pv += 16;
1429         for (j=0; j<nz; j++) {
1430           PREFETCH_L1(&pv[16]);
1431           x = rtmp + 16*((unsigned int)pj[j]);
1432 /*            x = rtmp + 4*pj[j]; */
1433 
1434           /* X:=X-M*PV, One column at a time */
1435           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
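          /* Readability sketch, not executed: with column-major 4x4 blocks this computes
             x(:,c) -= M * pv(:,c) for c = 0..3, i.e.
                 for (c=0; c<4; c++)
                   for (r=0; r<4; r++)
                     x[4*c+r] -= m[r]*pv[4*c] + m[4+r]*pv[4*c+1] + m[8+r]*pv[4*c+2] + m[12+r]*pv[4*c+3];
             where m is the multiplier held columnwise in XMM0-XMM3 (also stored into pc above). */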
1436           SSE_INLINE_BEGIN_2(x,pv)
1437             /* Load First Column of X */
1438             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1439             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1440 
1441             /* Matrix-Vector Product: */
1442             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1443             SSE_SHUFFLE(XMM5,XMM5,0x00)
1444             SSE_MULT_PS(XMM5,XMM0)
1445             SSE_SUB_PS(XMM4,XMM5)
1446 
1447             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1448             SSE_SHUFFLE(XMM6,XMM6,0x00)
1449             SSE_MULT_PS(XMM6,XMM1)
1450             SSE_SUB_PS(XMM4,XMM6)
1451 
1452             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1453             SSE_SHUFFLE(XMM7,XMM7,0x00)
1454             SSE_MULT_PS(XMM7,XMM2)
1455             SSE_SUB_PS(XMM4,XMM7)
1456 
1457             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1458             SSE_SHUFFLE(XMM5,XMM5,0x00)
1459             SSE_MULT_PS(XMM5,XMM3)
1460             SSE_SUB_PS(XMM4,XMM5)
1461 
1462             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1463             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1464 
1465             /* Second Column */
1466             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1467             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1468 
1469             /* Matrix-Vector Product: */
1470             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1471             SSE_SHUFFLE(XMM6,XMM6,0x00)
1472             SSE_MULT_PS(XMM6,XMM0)
1473             SSE_SUB_PS(XMM5,XMM6)
1474 
1475             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1476             SSE_SHUFFLE(XMM7,XMM7,0x00)
1477             SSE_MULT_PS(XMM7,XMM1)
1478             SSE_SUB_PS(XMM5,XMM7)
1479 
1480             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1481             SSE_SHUFFLE(XMM4,XMM4,0x00)
1482             SSE_MULT_PS(XMM4,XMM2)
1483             SSE_SUB_PS(XMM5,XMM4)
1484 
1485             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1486             SSE_SHUFFLE(XMM6,XMM6,0x00)
1487             SSE_MULT_PS(XMM6,XMM3)
1488             SSE_SUB_PS(XMM5,XMM6)
1489 
1490             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1491             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1492 
1493             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1494 
1495             /* Third Column */
1496             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1497             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1498 
1499             /* Matrix-Vector Product: */
1500             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1501             SSE_SHUFFLE(XMM7,XMM7,0x00)
1502             SSE_MULT_PS(XMM7,XMM0)
1503             SSE_SUB_PS(XMM6,XMM7)
1504 
1505             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1506             SSE_SHUFFLE(XMM4,XMM4,0x00)
1507             SSE_MULT_PS(XMM4,XMM1)
1508             SSE_SUB_PS(XMM6,XMM4)
1509 
1510             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1511             SSE_SHUFFLE(XMM5,XMM5,0x00)
1512             SSE_MULT_PS(XMM5,XMM2)
1513             SSE_SUB_PS(XMM6,XMM5)
1514 
1515             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1516             SSE_SHUFFLE(XMM7,XMM7,0x00)
1517             SSE_MULT_PS(XMM7,XMM3)
1518             SSE_SUB_PS(XMM6,XMM7)
1519 
1520             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1521             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1522 
1523             /* Fourth Column */
1524             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1525             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1526 
1527             /* Matrix-Vector Product: */
1528             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1529             SSE_SHUFFLE(XMM5,XMM5,0x00)
1530             SSE_MULT_PS(XMM5,XMM0)
1531             SSE_SUB_PS(XMM4,XMM5)
1532 
1533             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1534             SSE_SHUFFLE(XMM6,XMM6,0x00)
1535             SSE_MULT_PS(XMM6,XMM1)
1536             SSE_SUB_PS(XMM4,XMM6)
1537 
1538             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1539             SSE_SHUFFLE(XMM7,XMM7,0x00)
1540             SSE_MULT_PS(XMM7,XMM2)
1541             SSE_SUB_PS(XMM4,XMM7)
1542 
1543             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1544             SSE_SHUFFLE(XMM5,XMM5,0x00)
1545             SSE_MULT_PS(XMM5,XMM3)
1546             SSE_SUB_PS(XMM4,XMM5)
1547 
1548             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1549             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1550           SSE_INLINE_END_2;
1551           pv   += 16;
1552         }
1553         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
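        /* Flop accounting for the log call above: forming the 4x4 multiplier costs
           4 columns x (16 multiplies + 12 adds) = 112 flops, and each of the nz trailing
           block updates costs 4 columns x (16 multiplies + 16 subtracts) = 128 flops. */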
1554       }
1555       row = (unsigned int)(*bjtmp++);
1556 /*        row = (*bjtmp++)/4; */
1557 /*        bjtmp++; */
1558     }
1559     /* finished row so stick it into b->a */
1560     pv = ba + 16*bi[i];
1561     pj = bj + bi[i];
1562     nz = bi[i+1] - bi[i];
1563 
1564     /* Copy x block back into pv block */
1565     for (j=0; j<nz; j++) {
1566       x  = rtmp+16*((unsigned int)pj[j]);
1567 /*        x  = rtmp+4*pj[j]; */
1568 
1569       SSE_INLINE_BEGIN_2(x,pv)
1570         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1571         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1572         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1573 
1574         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1575         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1576 
1577         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1578         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1579 
1580         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1581         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1582 
1583         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1584         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1585 
1586         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1587         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1588 
1589         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1590         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1591 
1592         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1593         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1594       SSE_INLINE_END_2;
1595       pv += 16;
1596     }
1597     /* invert diagonal block */
1598     w = ba + 16*diag_offset[i];
1599     if (pivotinblocks) {
1600       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1601     } else {
1602       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1603     }
1604 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1605     /* Note: since the block inverse uses Cramer's rule, the flop count logged below may be unfairly high or low. */
1606   }
1607 
1608   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1609   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1610   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1611   C->assembled = PETSC_TRUE;
1612   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1613   /* Flop Count from inverting diagonal blocks */
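  /* 1.3333*64 is roughly (4/3)*4^3 ~= 85 flops charged per 4x4 block inverse; as noted above,
     the Cramer's-rule kernel may actually perform a somewhat different number of operations. */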
1614   SSE_SCOPE_END;
1615   PetscFunctionReturn(0);
1616 }
1617 
1618 #undef __FUNCT__
1619 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1620 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1621 {
1622   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1623   PetscErrorCode ierr;
1624   int  i,j,n = a->mbs;
1625   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1626   unsigned int   row;
1627   int            *ajtmpold,nz,*bi=b->i;
1628   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1629   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1630   MatScalar      *ba = b->a,*aa = a->a;
1631   int            nonzero=0;
1632 /*    int            nonzero=0,colscale = 16; */
1633   PetscTruth     pivotinblocks = b->pivotinblocks;
1634   PetscReal      shift = info->shiftinblocks;
1635 
1636   PetscFunctionBegin;
1637   SSE_SCOPE_BEGIN;
1638 
1639   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1640   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1641   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1642   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
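  /* Aligned 128-bit SSE memory operations (movaps and friends) fault on addresses that are not
     16-byte aligned, so the checks above fail fast with a clear error instead of risking a
     crash inside the kernels below. */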
1643 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1644 /*      colscale = 4; */
1645 /*    } */
1646   if ((unsigned long)bj==(unsigned long)aj) {
1647     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1648   }
1649 
1650   for (i=0; i<n; i++) {
1651     nz    = bi[i+1] - bi[i];
1652     bjtmp = bj + bi[i];
1653     /* zero out the 4x4 block accumulators */
1654     /* zero out one register */
1655     XOR_PS(XMM7,XMM7);
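    /* xorps of a register with itself is the standard zeroing idiom; the zeroed XMM7 is then
       streamed out below to clear each 4x4 accumulator block in rtmp. */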
1656     for  (j=0; j<nz; j++) {
1657       x = rtmp+16*((unsigned int)bjtmp[j]);
1658 /*        x = rtmp+4*bjtmp[j]; */
1659       SSE_INLINE_BEGIN_1(x)
1660         /* Copy zero register to memory locations */
1661         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1662         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1663         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1664         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1665         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1666         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1667         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1668         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1669         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1670       SSE_INLINE_END_1;
1671     }
1672     /* load in initial (unfactored row) */
1673     nz       = ai[i+1] - ai[i];
1674     ajtmpold = aj + ai[i];
1675     v        = aa + 16*ai[i];
1676     for (j=0; j<nz; j++) {
1677       x = rtmp+16*ajtmpold[j];
1678 /*        x = rtmp+colscale*ajtmpold[j]; */
1679       /* Copy v block into x block */
1680       SSE_INLINE_BEGIN_2(v,x)
1681         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1682         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1683         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1684 
1685         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1686         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1687 
1688         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1689         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1690 
1691         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1692         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1693 
1694         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1695         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1696 
1697         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1698         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1699 
1700         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1701         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1702 
1703         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1704         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1705       SSE_INLINE_END_2;
1706 
1707       v += 16;
1708     }
1709 /*      row = (*bjtmp++)/4; */
1710     row = (unsigned int)(*bjtmp++);
1711     while (row < i) {
1712       pc  = rtmp + 16*row;
1713       SSE_INLINE_BEGIN_1(pc)
1714         /* Load block from lower triangle */
1715         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1716         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1717         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1718 
1719         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1720         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1721 
1722         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1723         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1724 
1725         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1726         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1727 
1728         /* Compare block to zero block */
1729 
1730         SSE_COPY_PS(XMM4,XMM7)
1731         SSE_CMPNEQ_PS(XMM4,XMM0)
1732 
1733         SSE_COPY_PS(XMM5,XMM7)
1734         SSE_CMPNEQ_PS(XMM5,XMM1)
1735 
1736         SSE_COPY_PS(XMM6,XMM7)
1737         SSE_CMPNEQ_PS(XMM6,XMM2)
1738 
1739         SSE_CMPNEQ_PS(XMM7,XMM3)
1740 
1741         /* Reduce the comparisons to one SSE register */
1742         SSE_OR_PS(XMM6,XMM7)
1743         SSE_OR_PS(XMM5,XMM4)
1744         SSE_OR_PS(XMM5,XMM6)
1745       SSE_INLINE_END_1;
1746 
1747       /* Reduce the one SSE register to an integer register for branching */
1748       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1749       MOVEMASK(nonzero,XMM5);
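      /* MOVEMASK presumably wraps movmskps: it packs the four sign bits of the OR-reduced
         comparison mask in XMM5 into the integer 'nonzero', which is therefore non-zero exactly
         when some entry of the 4x4 block differs from zero. */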
1750 
1751       /* If block is nonzero ... */
1752       if (nonzero) {
1753         pv = ba + 16*diag_offset[row];
1754         PREFETCH_L1(&pv[16]);
1755         pj = bj + diag_offset[row] + 1;
1756 
1757         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1758         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1759         /* but the diagonal was inverted already */
1760         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
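        /* Scalar view (same computation as in the routine above): pc := pc * inv(L_jj),
           formed column by column and left in XMM0-XMM3 for the row update below. */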
1761 
1762         SSE_INLINE_BEGIN_2(pv,pc)
1763           /* Column 0, product is accumulated in XMM4 */
1764           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1765           SSE_SHUFFLE(XMM4,XMM4,0x00)
1766           SSE_MULT_PS(XMM4,XMM0)
1767 
1768           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1769           SSE_SHUFFLE(XMM5,XMM5,0x00)
1770           SSE_MULT_PS(XMM5,XMM1)
1771           SSE_ADD_PS(XMM4,XMM5)
1772 
1773           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1774           SSE_SHUFFLE(XMM6,XMM6,0x00)
1775           SSE_MULT_PS(XMM6,XMM2)
1776           SSE_ADD_PS(XMM4,XMM6)
1777 
1778           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1779           SSE_SHUFFLE(XMM7,XMM7,0x00)
1780           SSE_MULT_PS(XMM7,XMM3)
1781           SSE_ADD_PS(XMM4,XMM7)
1782 
1783           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1784           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1785 
1786           /* Column 1, product is accumulated in XMM5 */
1787           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1788           SSE_SHUFFLE(XMM5,XMM5,0x00)
1789           SSE_MULT_PS(XMM5,XMM0)
1790 
1791           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1792           SSE_SHUFFLE(XMM6,XMM6,0x00)
1793           SSE_MULT_PS(XMM6,XMM1)
1794           SSE_ADD_PS(XMM5,XMM6)
1795 
1796           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1797           SSE_SHUFFLE(XMM7,XMM7,0x00)
1798           SSE_MULT_PS(XMM7,XMM2)
1799           SSE_ADD_PS(XMM5,XMM7)
1800 
1801           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1802           SSE_SHUFFLE(XMM6,XMM6,0x00)
1803           SSE_MULT_PS(XMM6,XMM3)
1804           SSE_ADD_PS(XMM5,XMM6)
1805 
1806           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1807           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1808 
1809           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1810 
1811           /* Column 2, product is accumulated in XMM6 */
1812           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1813           SSE_SHUFFLE(XMM6,XMM6,0x00)
1814           SSE_MULT_PS(XMM6,XMM0)
1815 
1816           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1817           SSE_SHUFFLE(XMM7,XMM7,0x00)
1818           SSE_MULT_PS(XMM7,XMM1)
1819           SSE_ADD_PS(XMM6,XMM7)
1820 
1821           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1822           SSE_SHUFFLE(XMM7,XMM7,0x00)
1823           SSE_MULT_PS(XMM7,XMM2)
1824           SSE_ADD_PS(XMM6,XMM7)
1825 
1826           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1827           SSE_SHUFFLE(XMM7,XMM7,0x00)
1828           SSE_MULT_PS(XMM7,XMM3)
1829           SSE_ADD_PS(XMM6,XMM7)
1830 
1831           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1832           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1833 
1834           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1835           /* Column 3, product is accumulated in XMM0 */
1836           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1837           SSE_SHUFFLE(XMM7,XMM7,0x00)
1838           SSE_MULT_PS(XMM0,XMM7)
1839 
1840           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1841           SSE_SHUFFLE(XMM7,XMM7,0x00)
1842           SSE_MULT_PS(XMM1,XMM7)
1843           SSE_ADD_PS(XMM0,XMM1)
1844 
1845           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1846           SSE_SHUFFLE(XMM1,XMM1,0x00)
1847           SSE_MULT_PS(XMM1,XMM2)
1848           SSE_ADD_PS(XMM0,XMM1)
1849 
1850           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1851           SSE_SHUFFLE(XMM7,XMM7,0x00)
1852           SSE_MULT_PS(XMM3,XMM7)
1853           SSE_ADD_PS(XMM0,XMM3)
1854 
1855           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1856           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1857 
1858           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1859           /* This is code to be maintained and read by humans, after all. */
1860           /* Copy Multiplier Col 3 into XMM3 */
1861           SSE_COPY_PS(XMM3,XMM0)
1862           /* Copy Multiplier Col 2 into XMM2 */
1863           SSE_COPY_PS(XMM2,XMM6)
1864           /* Copy Multiplier Col 1 into XMM1 */
1865           SSE_COPY_PS(XMM1,XMM5)
1866           /* Copy Multiplier Col 0 into XMM0 */
1867           SSE_COPY_PS(XMM0,XMM4)
1868         SSE_INLINE_END_2;
1869 
1870         /* Update the row: */
1871         nz = bi[row+1] - diag_offset[row] - 1;
1872         pv += 16;
1873         for (j=0; j<nz; j++) {
1874           PREFETCH_L1(&pv[16]);
1875           x = rtmp + 16*((unsigned int)pj[j]);
1876 /*            x = rtmp + 4*pj[j]; */
1877 
1878           /* X:=X-M*PV, One column at a time */
1879           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
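          /* Scalar view: x(:,c) -= M * pv(:,c) for c = 0..3, exactly as in the row update of
             the routine above. */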
1880           SSE_INLINE_BEGIN_2(x,pv)
1881             /* Load First Column of X */
1882             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1883             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1884 
1885             /* Matrix-Vector Product: */
1886             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1887             SSE_SHUFFLE(XMM5,XMM5,0x00)
1888             SSE_MULT_PS(XMM5,XMM0)
1889             SSE_SUB_PS(XMM4,XMM5)
1890 
1891             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1892             SSE_SHUFFLE(XMM6,XMM6,0x00)
1893             SSE_MULT_PS(XMM6,XMM1)
1894             SSE_SUB_PS(XMM4,XMM6)
1895 
1896             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1897             SSE_SHUFFLE(XMM7,XMM7,0x00)
1898             SSE_MULT_PS(XMM7,XMM2)
1899             SSE_SUB_PS(XMM4,XMM7)
1900 
1901             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1902             SSE_SHUFFLE(XMM5,XMM5,0x00)
1903             SSE_MULT_PS(XMM5,XMM3)
1904             SSE_SUB_PS(XMM4,XMM5)
1905 
1906             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1907             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1908 
1909             /* Second Column */
1910             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1911             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1912 
1913             /* Matrix-Vector Product: */
1914             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1915             SSE_SHUFFLE(XMM6,XMM6,0x00)
1916             SSE_MULT_PS(XMM6,XMM0)
1917             SSE_SUB_PS(XMM5,XMM6)
1918 
1919             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1920             SSE_SHUFFLE(XMM7,XMM7,0x00)
1921             SSE_MULT_PS(XMM7,XMM1)
1922             SSE_SUB_PS(XMM5,XMM7)
1923 
1924             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1925             SSE_SHUFFLE(XMM4,XMM4,0x00)
1926             SSE_MULT_PS(XMM4,XMM2)
1927             SSE_SUB_PS(XMM5,XMM4)
1928 
1929             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1930             SSE_SHUFFLE(XMM6,XMM6,0x00)
1931             SSE_MULT_PS(XMM6,XMM3)
1932             SSE_SUB_PS(XMM5,XMM6)
1933 
1934             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1935             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1936 
1937             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1938 
1939             /* Third Column */
1940             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1941             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1942 
1943             /* Matrix-Vector Product: */
1944             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1945             SSE_SHUFFLE(XMM7,XMM7,0x00)
1946             SSE_MULT_PS(XMM7,XMM0)
1947             SSE_SUB_PS(XMM6,XMM7)
1948 
1949             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1950             SSE_SHUFFLE(XMM4,XMM4,0x00)
1951             SSE_MULT_PS(XMM4,XMM1)
1952             SSE_SUB_PS(XMM6,XMM4)
1953 
1954             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1955             SSE_SHUFFLE(XMM5,XMM5,0x00)
1956             SSE_MULT_PS(XMM5,XMM2)
1957             SSE_SUB_PS(XMM6,XMM5)
1958 
1959             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1960             SSE_SHUFFLE(XMM7,XMM7,0x00)
1961             SSE_MULT_PS(XMM7,XMM3)
1962             SSE_SUB_PS(XMM6,XMM7)
1963 
1964             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1965             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1966 
1967             /* Fourth Column */
1968             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1969             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1970 
1971             /* Matrix-Vector Product: */
1972             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1973             SSE_SHUFFLE(XMM5,XMM5,0x00)
1974             SSE_MULT_PS(XMM5,XMM0)
1975             SSE_SUB_PS(XMM4,XMM5)
1976 
1977             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1978             SSE_SHUFFLE(XMM6,XMM6,0x00)
1979             SSE_MULT_PS(XMM6,XMM1)
1980             SSE_SUB_PS(XMM4,XMM6)
1981 
1982             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1983             SSE_SHUFFLE(XMM7,XMM7,0x00)
1984             SSE_MULT_PS(XMM7,XMM2)
1985             SSE_SUB_PS(XMM4,XMM7)
1986 
1987             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1988             SSE_SHUFFLE(XMM5,XMM5,0x00)
1989             SSE_MULT_PS(XMM5,XMM3)
1990             SSE_SUB_PS(XMM4,XMM5)
1991 
1992             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1993             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1994           SSE_INLINE_END_2;
1995           pv   += 16;
1996         }
1997         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
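        /* Same accounting as above: 112 flops to form the multiplier plus 128 flops per
           trailing block update. */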
1998       }
1999       row = (unsigned int)(*bjtmp++);
2000 /*        row = (*bjtmp++)/4; */
2001 /*        bjtmp++; */
2002     }
2003     /* finished row so stick it into b->a */
2004     pv = ba + 16*bi[i];
2005     pj = bj + bi[i];
2006     nz = bi[i+1] - bi[i];
2007 
2008     /* Copy x block back into pv block */
2009     for (j=0; j<nz; j++) {
2010       x  = rtmp+16*((unsigned int)pj[j]);
2011 /*        x  = rtmp+4*pj[j]; */
2012 
2013       SSE_INLINE_BEGIN_2(x,pv)
2014         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
2015         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
2016         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
2017 
2018         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
2019         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
2020 
2021         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
2022         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
2023 
2024         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
2025         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
2026 
2027         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
2028         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
2029 
2030         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
2031         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
2032 
2033         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
2034         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
2035 
2036         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
2037         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
2038       SSE_INLINE_END_2;
2039       pv += 16;
2040     }
2041     /* invert diagonal block */
2042     w = ba + 16*diag_offset[i];
2043     if (pivotinblocks) {
2044       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
2045     } else {
2046       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
2047     }
2048 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
2049     /* Note: since the block inverse uses Cramer's rule, the flop count logged below may be unfairly high or low. */
2050   }
2051 
2052   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2053   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
2054   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
2055   C->assembled = PETSC_TRUE;
2056   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
2057   /* Flop Count from inverting diagonal blocks */
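  /* As above: 1.3333*64 is roughly (4/3)*4^3 flops charged per 4x4 block inverse. */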
2058   SSE_SCOPE_END;
2059   PetscFunctionReturn(0);
2060 }
2061 
2062 #endif
2063