1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 #include "../src/mat/impls/baij/seq/baij.h" 7 #include "../src/mat/blockinvert.h" 8 9 /* ------------------------------------------------------------*/ 10 /* 11 Version for when blocks are 4 by 4 12 */ 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace" 15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info) 16 { 17 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 18 IS isrow = b->row,isicol = b->icol; 19 PetscErrorCode ierr; 20 const PetscInt *r,*ic; 21 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 22 PetscInt *ajtmpold,*ajtmp,nz,row; 23 PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 24 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 25 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 26 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 27 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 28 MatScalar m13,m14,m15,m16; 29 MatScalar *ba = b->a,*aa = a->a; 30 PetscTruth pivotinblocks = b->pivotinblocks; 31 PetscReal shift = info->shiftamount; 32 33 PetscFunctionBegin; 34 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 35 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 36 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 37 38 for (i=0; i<n; i++) { 39 nz = bi[i+1] - bi[i]; 40 ajtmp = bj + bi[i]; 41 for (j=0; j<nz; j++) { 42 x = rtmp+16*ajtmp[j]; 43 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 44 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 45 } 46 /* load in initial (unfactored row) */ 47 idx = r[i]; 48 nz = ai[idx+1] - ai[idx]; 49 ajtmpold = aj + ai[idx]; 50 v = aa + 16*ai[idx]; 51 for (j=0; j<nz; j++) { 52 x = rtmp+16*ic[ajtmpold[j]]; 53 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 54 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 55 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 56 x[14] = v[14]; x[15] = v[15]; 57 v += 16; 58 } 59 row = *ajtmp++; 60 while (row < i) { 61 pc = rtmp + 16*row; 62 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 63 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 64 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 65 p15 = pc[14]; p16 = pc[15]; 66 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 67 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 68 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 69 || p16 != 0.0) { 70 pv = ba + 16*diag_offset[row]; 71 pj = bj + diag_offset[row] + 1; 72 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 73 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 74 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 75 x15 = pv[14]; x16 = pv[15]; 76 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 77 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 78 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 79 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 80 81 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 82 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 83 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 84 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 85 86 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 87 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 88 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 89 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 90 91 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 92 pc[13] = m14 = p2*x13 + p6*x14 + 
p10*x15 + p14*x16; 93 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 94 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 95 96 nz = bi[row+1] - diag_offset[row] - 1; 97 pv += 16; 98 for (j=0; j<nz; j++) { 99 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 100 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 101 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 102 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 103 x = rtmp + 16*pj[j]; 104 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 105 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 106 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 107 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 108 109 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 110 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 111 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 112 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 113 114 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 115 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 116 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 117 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 118 119 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 120 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 121 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 122 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 123 124 pv += 16; 125 } 126 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 127 } 128 row = *ajtmp++; 129 } 130 /* finished row so stick it into b->a */ 131 pv = ba + 16*bi[i]; 132 pj = bj + bi[i]; 133 nz = bi[i+1] - bi[i]; 134 for (j=0; j<nz; j++) { 135 x = rtmp+16*pj[j]; 136 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 137 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 138 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 139 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 140 pv += 16; 141 } 142 /* invert diagonal block */ 143 w = ba + 16*diag_offset[i]; 144 if (pivotinblocks) { 145 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 146 } else { 147 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 148 } 149 } 150 151 ierr = PetscFree(rtmp);CHKERRQ(ierr); 152 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 153 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 154 C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 155 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 156 C->assembled = PETSC_TRUE; 157 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 158 PetscFunctionReturn(0); 159 } 160 161 /* MatLUFactorNumeric_SeqBAIJ_4 - 162 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 163 Kernel_A_gets_A_times_B() 164 Kernel_A_gets_A_minus_B_times_C() 165 Kernel_A_gets_inverse_A() 166 */ 167 168 #undef __FUNCT__ 169 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4" 170 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info) 171 { 172 Mat C=B; 173 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 174 IS isrow = b->row,isicol = b->icol; 175 PetscErrorCode ierr; 176 const PetscInt *r,*ic,*ics; 177 PetscInt i,j,k,nz,nzL,row; 178 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 179 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 180 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 181 PetscInt flg; 182 PetscReal shift = info->shiftamount; 183 184 PetscFunctionBegin; 185 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 186 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 187 188 /* generate work space needed by the factorization */ 189 ierr = 
PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 190 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 191 ics = ic; 192 193 for (i=0; i<n; i++){ 194 /* zero rtmp */ 195 /* L part */ 196 nz = bi[i+1] - bi[i]; 197 bjtmp = bj + bi[i]; 198 for (j=0; j<nz; j++){ 199 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 200 } 201 202 /* U part */ 203 nz = bdiag[i]-bdiag[i+1]; 204 bjtmp = bj + bdiag[i+1]+1; 205 for (j=0; j<nz; j++){ 206 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 207 } 208 209 /* load in initial (unfactored row) */ 210 nz = ai[r[i]+1] - ai[r[i]]; 211 ajtmp = aj + ai[r[i]]; 212 v = aa + bs2*ai[r[i]]; 213 for (j=0; j<nz; j++) { 214 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 215 } 216 217 /* elimination */ 218 bjtmp = bj + bi[i]; 219 nzL = bi[i+1] - bi[i]; 220 for(k=0;k < nzL;k++) { 221 row = bjtmp[k]; 222 pc = rtmp + bs2*row; 223 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 224 if (flg) { 225 pv = b->a + bs2*bdiag[row]; 226 /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 227 ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 228 229 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 230 pv = b->a + bs2*(bdiag[row+1]+1); 231 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */ 232 for (j=0; j<nz; j++) { 233 /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 234 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 235 v = rtmp + bs2*pj[j]; 236 ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 237 pv += bs2; 238 } 239 ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */ 240 } 241 } 242 243 /* finished row so stick it into b->a */ 244 /* L part */ 245 pv = b->a + bs2*bi[i] ; 246 pj = b->j + bi[i] ; 247 nz = bi[i+1] - bi[i]; 248 for (j=0; j<nz; j++) { 249 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 250 } 251 252 /* Mark diagonal and invert diagonal for simpler triangular solves */ 253 pv = b->a + bs2*bdiag[i]; 254 pj = b->j + bdiag[i]; 255 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 256 /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 257 ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); 258 259 /* U part */ 260 pv = b->a + bs2*(bdiag[i+1]+1); 261 pj = b->j + bdiag[i+1]+1; 262 nz = bdiag[i] - bdiag[i+1] - 1; 263 for (j=0; j<nz; j++){ 264 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 265 } 266 } 267 268 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 269 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 270 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 271 C->ops->solve = MatSolve_SeqBAIJ_4; 272 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 273 C->assembled = PETSC_TRUE; 274 ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 275 PetscFunctionReturn(0); 276 } 277 278 #undef __FUNCT__ 279 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace" 280 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info) 281 { 282 /* 283 Default Version for when blocks are 4 by 4 Using natural ordering 284 */ 285 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 286 PetscErrorCode ierr; 287 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 288
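/*
   Outline of the factorization loop below: for each block row i, the nonzero 4x4 blocks
   of row i of A are gathered into the dense work row rtmp; for every already-factored
   block row "row" < i whose block in rtmp is nonzero, the multiplier is formed by
   multiplying that block with the already-inverted diagonal block of "row", and
   multiplier*U(row,:) is subtracted from rtmp; finally the finished row is copied back
   into b->a and its diagonal block is inverted (with or without in-block pivoting,
   depending on pivotinblocks) for use in the triangular solves.
*/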
PetscInt *ajtmpold,*ajtmp,nz,row; 289 PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 290 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 291 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 292 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 293 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 294 MatScalar m13,m14,m15,m16; 295 MatScalar *ba = b->a,*aa = a->a; 296 PetscTruth pivotinblocks = b->pivotinblocks; 297 PetscReal shift = info->shiftamount; 298 299 PetscFunctionBegin; 300 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 301 302 for (i=0; i<n; i++) { 303 nz = bi[i+1] - bi[i]; 304 ajtmp = bj + bi[i]; 305 for (j=0; j<nz; j++) { 306 x = rtmp+16*ajtmp[j]; 307 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 308 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 309 } 310 /* load in initial (unfactored row) */ 311 nz = ai[i+1] - ai[i]; 312 ajtmpold = aj + ai[i]; 313 v = aa + 16*ai[i]; 314 for (j=0; j<nz; j++) { 315 x = rtmp+16*ajtmpold[j]; 316 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 317 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 318 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 319 x[14] = v[14]; x[15] = v[15]; 320 v += 16; 321 } 322 row = *ajtmp++; 323 while (row < i) { 324 pc = rtmp + 16*row; 325 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 326 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 327 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 328 p15 = pc[14]; p16 = pc[15]; 329 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 330 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 331 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 332 || p16 != 0.0) { 333 pv = ba + 16*diag_offset[row]; 334 pj = bj + diag_offset[row] + 1; 335 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 336 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 337 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 338 x15 = pv[14]; x16 = pv[15]; 339 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 340 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 341 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 342 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 343 344 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 345 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 346 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 347 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 348 349 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 350 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 351 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 352 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 353 354 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 355 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 356 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 357 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 358 nz = bi[row+1] - diag_offset[row] - 1; 359 pv += 16; 360 for (j=0; j<nz; j++) { 361 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 362 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 363 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 364 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 365 x = rtmp + 16*pj[j]; 366 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 367 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 368 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 369 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 370 371 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 372 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 
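/* the statements below complete the unrolled 4x4 block update X -= M*V; blocks are stored column-major, so x[0..3], x[4..7], x[8..11], x[12..15] are the four columns */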
373 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 374 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 375 376 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 377 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 378 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 379 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 380 381 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 382 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 383 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 384 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 385 386 pv += 16; 387 } 388 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 389 } 390 row = *ajtmp++; 391 } 392 /* finished row so stick it into b->a */ 393 pv = ba + 16*bi[i]; 394 pj = bj + bi[i]; 395 nz = bi[i+1] - bi[i]; 396 for (j=0; j<nz; j++) { 397 x = rtmp+16*pj[j]; 398 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 399 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 400 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 401 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 402 pv += 16; 403 } 404 /* invert diagonal block */ 405 w = ba + 16*diag_offset[i]; 406 if (pivotinblocks) { 407 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 408 } else { 409 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 410 } 411 } 412 413 ierr = PetscFree(rtmp);CHKERRQ(ierr); 414 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 415 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 416 C->assembled = PETSC_TRUE; 417 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 418 PetscFunctionReturn(0); 419 } 420 421 /* 422 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 423 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 424 */ 425 #undef __FUNCT__ 426 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering" 427 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 428 { 429 Mat C=B; 430 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 431 PetscErrorCode ierr; 432 PetscInt i,j,k,nz,nzL,row; 433 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 434 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 435 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 436 PetscInt flg; 437 PetscReal shift = info->shiftamount; 438 439 PetscFunctionBegin; 440 /* generate work space needed by the factorization */ 441 ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 442 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 443 444 for (i=0; i<n; i++){ 445 /* zero rtmp */ 446 /* L part */ 447 nz = bi[i+1] - bi[i]; 448 bjtmp = bj + bi[i]; 449 for (j=0; j<nz; j++){ 450 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 451 } 452 453 /* U part */ 454 nz = bdiag[i] - bdiag[i+1]; 455 bjtmp = bj + bdiag[i+1]+1; 456 for (j=0; j<nz; j++){ 457 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 458 } 459 460 /* load in initial (unfactored row) */ 461 nz = ai[i+1] - ai[i]; 462 ajtmp = aj + ai[i]; 463 v = aa + bs2*ai[i]; 464 for (j=0; j<nz; j++) { 465 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 466 } 467 468 /* elimination */ 469 bjtmp = bj + bi[i]; 470 nzL = bi[i+1] - bi[i]; 471 for(k=0;k < nzL;k++) { 472 row = bjtmp[k]; 473 pc = rtmp + bs2*row; 474 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 475 if (flg) { 476 pv = b->a + bs2*bdiag[row]; 477 /* 
Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 478 ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 479 480 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 481 pv = b->a + bs2*(bdiag[row+1]+1); 482 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */ 483 for (j=0; j<nz; j++) { 484 /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 485 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 486 v = rtmp + bs2*pj[j]; 487 ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 488 pv += bs2; 489 } 490 ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */ 491 } 492 } 493 494 /* finished row so stick it into b->a */ 495 /* L part */ 496 pv = b->a + bs2*bi[i] ; 497 pj = b->j + bi[i] ; 498 nz = bi[i+1] - bi[i]; 499 for (j=0; j<nz; j++) { 500 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 501 } 502 503 /* Mark diagonal and invert diagonal for simpler triangular solves */ 504 pv = b->a + bs2*bdiag[i]; 505 pj = b->j + bdiag[i]; 506 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 507 /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 508 ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); 509 510 /* U part */ 511 pv = b->a + bs2*(bdiag[i+1]+1); 512 pj = b->j + bdiag[i+1]+1; 513 nz = bdiag[i] - bdiag[i+1] - 1; 514 for (j=0; j<nz; j++){ 515 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 516 } 517 } 518 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 519 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 520 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 521 C->assembled = PETSC_TRUE; 522 ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 523 PetscFunctionReturn(0); 524 } 525 526 #if defined(PETSC_HAVE_SSE) 527 528 #include PETSC_HAVE_SSE 529 530 /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 531 #undef __FUNCT__ 532 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE" 533 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info) 534 { 535 Mat C = B; Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 536 PetscErrorCode ierr; 537 int i,j,n = a->mbs; 538 int *bj = b->j,*bjtmp,*pj; 539 int row; 540 int *ajtmpold,nz,*bi=b->i; 541 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 542 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 543 MatScalar *ba = b->a,*aa = a->a; 544 int nonzero=0; 545 /* int nonzero=0,colscale = 16; */ 546 PetscTruth pivotinblocks = b->pivotinblocks; 547 PetscReal shift = info->shiftamount; 548 549 PetscFunctionBegin; 550 SSE_SCOPE_BEGIN; 551 552 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 553 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 554 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 555 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 556 /* if ((unsigned long)bj==(unsigned long)aj) { */ 557 /* colscale = 4; */ 558 /* } */ 559 for (i=0; i<n; i++) { 560 nz = bi[i+1] - bi[i]; 561 bjtmp = bj + bi[i]; 562 /* zero out the 4x4 block accumulators */ 563 /* zero out one register */ 564 XOR_PS(XMM7,XMM7); 565 for (j=0; j<nz; j++) { 566 x = rtmp+16*bjtmp[j]; 567 /* x = rtmp+4*bjtmp[j]; */ 568 SSE_INLINE_BEGIN_1(x) 569 /* Copy zero register to memory locations */ 570 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 571 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 572 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 573 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 574 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 575 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 576 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 577 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 578 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 579 SSE_INLINE_END_1; 580 } 581 /* load in initial (unfactored row) */ 582 nz = ai[i+1] - ai[i]; 583 ajtmpold = aj + ai[i]; 584 v = aa + 16*ai[i]; 585 for (j=0; j<nz; j++) { 586 x = rtmp+16*ajtmpold[j]; 587 /* x = rtmp+colscale*ajtmpold[j]; */ 588 /* Copy v block into x block */ 589 SSE_INLINE_BEGIN_2(v,x) 590 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 591 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 592 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 593 594 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 595 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 596 597 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 598 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 599 600 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 601 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 602 603 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 604 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 605 606 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 607 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 608 609 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 610 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 611 612 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 613 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 614 SSE_INLINE_END_2; 615 616 v += 16; 617 } 618 /* row = (*bjtmp++)/4; */ 619 row = *bjtmp++; 620 while (row < i) { 621 pc = rtmp + 16*row; 622 SSE_INLINE_BEGIN_1(pc) 623 /* Load block from lower triangle */ 624 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 625 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 626 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 627 628 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 629 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 630 631 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 632 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 633 634 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 635 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 636 637 /* Compare block to zero block */ 638 639 SSE_COPY_PS(XMM4,XMM7) 640 SSE_CMPNEQ_PS(XMM4,XMM0) 641 642 SSE_COPY_PS(XMM5,XMM7) 643 SSE_CMPNEQ_PS(XMM5,XMM1) 644 645 SSE_COPY_PS(XMM6,XMM7) 646 SSE_CMPNEQ_PS(XMM6,XMM2) 647 648 SSE_CMPNEQ_PS(XMM7,XMM3) 649 650 /* Reduce the comparisons to one SSE register */ 651 SSE_OR_PS(XMM6,XMM7) 652 SSE_OR_PS(XMM5,XMM4) 653 SSE_OR_PS(XMM5,XMM6) 654 SSE_INLINE_END_1; 655 656 /* Reduce the one SSE register to an integer register for branching */ 657 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 658 MOVEMASK(nonzero,XMM5); 659 660 /* If block is nonzero ... 
*/ 661 if (nonzero) { 662 pv = ba + 16*diag_offset[row]; 663 PREFETCH_L1(&pv[16]); 664 pj = bj + diag_offset[row] + 1; 665 666 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 667 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 668 /* but the diagonal was inverted already */ 669 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 670 671 SSE_INLINE_BEGIN_2(pv,pc) 672 /* Column 0, product is accumulated in XMM4 */ 673 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 674 SSE_SHUFFLE(XMM4,XMM4,0x00) 675 SSE_MULT_PS(XMM4,XMM0) 676 677 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 678 SSE_SHUFFLE(XMM5,XMM5,0x00) 679 SSE_MULT_PS(XMM5,XMM1) 680 SSE_ADD_PS(XMM4,XMM5) 681 682 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 683 SSE_SHUFFLE(XMM6,XMM6,0x00) 684 SSE_MULT_PS(XMM6,XMM2) 685 SSE_ADD_PS(XMM4,XMM6) 686 687 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 688 SSE_SHUFFLE(XMM7,XMM7,0x00) 689 SSE_MULT_PS(XMM7,XMM3) 690 SSE_ADD_PS(XMM4,XMM7) 691 692 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 693 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 694 695 /* Column 1, product is accumulated in XMM5 */ 696 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 697 SSE_SHUFFLE(XMM5,XMM5,0x00) 698 SSE_MULT_PS(XMM5,XMM0) 699 700 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 701 SSE_SHUFFLE(XMM6,XMM6,0x00) 702 SSE_MULT_PS(XMM6,XMM1) 703 SSE_ADD_PS(XMM5,XMM6) 704 705 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 706 SSE_SHUFFLE(XMM7,XMM7,0x00) 707 SSE_MULT_PS(XMM7,XMM2) 708 SSE_ADD_PS(XMM5,XMM7) 709 710 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 711 SSE_SHUFFLE(XMM6,XMM6,0x00) 712 SSE_MULT_PS(XMM6,XMM3) 713 SSE_ADD_PS(XMM5,XMM6) 714 715 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 716 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 717 718 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 719 720 /* Column 2, product is accumulated in XMM6 */ 721 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 722 SSE_SHUFFLE(XMM6,XMM6,0x00) 723 SSE_MULT_PS(XMM6,XMM0) 724 725 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 726 SSE_SHUFFLE(XMM7,XMM7,0x00) 727 SSE_MULT_PS(XMM7,XMM1) 728 SSE_ADD_PS(XMM6,XMM7) 729 730 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 731 SSE_SHUFFLE(XMM7,XMM7,0x00) 732 SSE_MULT_PS(XMM7,XMM2) 733 SSE_ADD_PS(XMM6,XMM7) 734 735 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 736 SSE_SHUFFLE(XMM7,XMM7,0x00) 737 SSE_MULT_PS(XMM7,XMM3) 738 SSE_ADD_PS(XMM6,XMM7) 739 740 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 741 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 742 743 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 744 /* Column 3, product is accumulated in XMM0 */ 745 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 746 SSE_SHUFFLE(XMM7,XMM7,0x00) 747 SSE_MULT_PS(XMM0,XMM7) 748 749 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 750 SSE_SHUFFLE(XMM7,XMM7,0x00) 751 SSE_MULT_PS(XMM1,XMM7) 752 SSE_ADD_PS(XMM0,XMM1) 753 754 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 755 SSE_SHUFFLE(XMM1,XMM1,0x00) 756 SSE_MULT_PS(XMM1,XMM2) 757 SSE_ADD_PS(XMM0,XMM1) 758 759 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 760 SSE_SHUFFLE(XMM7,XMM7,0x00) 761 SSE_MULT_PS(XMM3,XMM7) 762 SSE_ADD_PS(XMM0,XMM3) 763 764 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 765 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 766 767 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 768 /* This is code to be maintained and read by humans afterall. 
*/ 769 /* Copy Multiplier Col 3 into XMM3 */ 770 SSE_COPY_PS(XMM3,XMM0) 771 /* Copy Multiplier Col 2 into XMM2 */ 772 SSE_COPY_PS(XMM2,XMM6) 773 /* Copy Multiplier Col 1 into XMM1 */ 774 SSE_COPY_PS(XMM1,XMM5) 775 /* Copy Multiplier Col 0 into XMM0 */ 776 SSE_COPY_PS(XMM0,XMM4) 777 SSE_INLINE_END_2; 778 779 /* Update the row: */ 780 nz = bi[row+1] - diag_offset[row] - 1; 781 pv += 16; 782 for (j=0; j<nz; j++) { 783 PREFETCH_L1(&pv[16]); 784 x = rtmp + 16*pj[j]; 785 /* x = rtmp + 4*pj[j]; */ 786 787 /* X:=X-M*PV, One column at a time */ 788 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 789 SSE_INLINE_BEGIN_2(x,pv) 790 /* Load First Column of X*/ 791 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 792 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 793 794 /* Matrix-Vector Product: */ 795 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 796 SSE_SHUFFLE(XMM5,XMM5,0x00) 797 SSE_MULT_PS(XMM5,XMM0) 798 SSE_SUB_PS(XMM4,XMM5) 799 800 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 801 SSE_SHUFFLE(XMM6,XMM6,0x00) 802 SSE_MULT_PS(XMM6,XMM1) 803 SSE_SUB_PS(XMM4,XMM6) 804 805 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 806 SSE_SHUFFLE(XMM7,XMM7,0x00) 807 SSE_MULT_PS(XMM7,XMM2) 808 SSE_SUB_PS(XMM4,XMM7) 809 810 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 811 SSE_SHUFFLE(XMM5,XMM5,0x00) 812 SSE_MULT_PS(XMM5,XMM3) 813 SSE_SUB_PS(XMM4,XMM5) 814 815 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 816 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 817 818 /* Second Column */ 819 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 820 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 821 822 /* Matrix-Vector Product: */ 823 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 824 SSE_SHUFFLE(XMM6,XMM6,0x00) 825 SSE_MULT_PS(XMM6,XMM0) 826 SSE_SUB_PS(XMM5,XMM6) 827 828 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 829 SSE_SHUFFLE(XMM7,XMM7,0x00) 830 SSE_MULT_PS(XMM7,XMM1) 831 SSE_SUB_PS(XMM5,XMM7) 832 833 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 834 SSE_SHUFFLE(XMM4,XMM4,0x00) 835 SSE_MULT_PS(XMM4,XMM2) 836 SSE_SUB_PS(XMM5,XMM4) 837 838 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 839 SSE_SHUFFLE(XMM6,XMM6,0x00) 840 SSE_MULT_PS(XMM6,XMM3) 841 SSE_SUB_PS(XMM5,XMM6) 842 843 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 844 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 845 846 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 847 848 /* Third Column */ 849 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 850 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 851 852 /* Matrix-Vector Product: */ 853 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 854 SSE_SHUFFLE(XMM7,XMM7,0x00) 855 SSE_MULT_PS(XMM7,XMM0) 856 SSE_SUB_PS(XMM6,XMM7) 857 858 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 859 SSE_SHUFFLE(XMM4,XMM4,0x00) 860 SSE_MULT_PS(XMM4,XMM1) 861 SSE_SUB_PS(XMM6,XMM4) 862 863 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 864 SSE_SHUFFLE(XMM5,XMM5,0x00) 865 SSE_MULT_PS(XMM5,XMM2) 866 SSE_SUB_PS(XMM6,XMM5) 867 868 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 869 SSE_SHUFFLE(XMM7,XMM7,0x00) 870 SSE_MULT_PS(XMM7,XMM3) 871 SSE_SUB_PS(XMM6,XMM7) 872 873 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 874 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 875 876 /* Fourth Column */ 877 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 878 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 879 880 /* Matrix-Vector Product: */ 881 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 882 SSE_SHUFFLE(XMM5,XMM5,0x00) 883 SSE_MULT_PS(XMM5,XMM0) 884 SSE_SUB_PS(XMM4,XMM5) 885 886 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 887 SSE_SHUFFLE(XMM6,XMM6,0x00) 888 SSE_MULT_PS(XMM6,XMM1) 889 SSE_SUB_PS(XMM4,XMM6) 890 891 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 892 SSE_SHUFFLE(XMM7,XMM7,0x00) 893 SSE_MULT_PS(XMM7,XMM2) 894 SSE_SUB_PS(XMM4,XMM7) 895 896 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 897 SSE_SHUFFLE(XMM5,XMM5,0x00) 898 
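/* XMM5 now holds pv[15] broadcast across all four lanes; the multiply/subtract below applies it to multiplier column 3 (XMM3), completing the fourth column of X -= M*PV before it is stored */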
SSE_MULT_PS(XMM5,XMM3) 899 SSE_SUB_PS(XMM4,XMM5) 900 901 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 902 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 903 SSE_INLINE_END_2; 904 pv += 16; 905 } 906 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 907 } 908 row = *bjtmp++; 909 /* row = (*bjtmp++)/4; */ 910 } 911 /* finished row so stick it into b->a */ 912 pv = ba + 16*bi[i]; 913 pj = bj + bi[i]; 914 nz = bi[i+1] - bi[i]; 915 916 /* Copy x block back into pv block */ 917 for (j=0; j<nz; j++) { 918 x = rtmp+16*pj[j]; 919 /* x = rtmp+4*pj[j]; */ 920 921 SSE_INLINE_BEGIN_2(x,pv) 922 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 923 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 924 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 925 926 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 927 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 928 929 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 930 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 931 932 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 933 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 934 935 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 936 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 937 938 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 939 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 940 941 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 942 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 943 944 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 945 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 946 SSE_INLINE_END_2; 947 pv += 16; 948 } 949 /* invert diagonal block */ 950 w = ba + 16*diag_offset[i]; 951 if (pivotinblocks) { 952 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 953 } else { 954 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 955 } 956 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 957 /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */ 958 } 959 960 ierr = PetscFree(rtmp);CHKERRQ(ierr); 961 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 962 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 963 C->assembled = PETSC_TRUE; 964 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); 965 /* Flop Count from inverting diagonal blocks */ 966 SSE_SCOPE_END; 967 PetscFunctionReturn(0); 968 } 969 970 #undef __FUNCT__ 971 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace" 972 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 973 { 974 Mat A=C; 975 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 976 PetscErrorCode ierr; 977 int i,j,n = a->mbs; 978 unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 979 unsigned short *aj = (unsigned short *)(a->j),*ajtmp; 980 unsigned int row; 981 int nz,*bi=b->i; 982 int *diag_offset = b->diag,*ai=a->i; 983 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 984 MatScalar *ba = b->a,*aa = a->a; 985 int nonzero=0; 986 /* int nonzero=0,colscale = 16; */ 987 PetscTruth pivotinblocks = b->pivotinblocks; 988 PetscReal shift = 0.0; /* no MatFactorInfo is passed to this in-place variant */ 989 990 PetscFunctionBegin; 991 SSE_SCOPE_BEGIN; 992 993 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 994 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 995 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 996 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 997 /* if ((unsigned long)bj==(unsigned long)aj) { */ 998 /* colscale = 4; */ 999 /* } */ 1000 1001 for (i=0; i<n; i++) { 1002 nz = bi[i+1] - bi[i]; 1003 bjtmp = bj + bi[i]; 1004 /* zero out the 4x4 block accumulators */ 1005 /* zero out one register */ 1006 XOR_PS(XMM7,XMM7); 1007 for (j=0; j<nz; j++) { 1008 x = rtmp+16*((unsigned int)bjtmp[j]); 1009 /* x = rtmp+4*bjtmp[j]; */ 1010 SSE_INLINE_BEGIN_1(x) 1011 /* Copy zero register to memory locations */ 1012 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1013 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1014 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1015 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1016 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1017 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1018 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1019 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1020 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1021 SSE_INLINE_END_1; 1022 } 1023 /* load in initial (unfactored row) */ 1024 nz = ai[i+1] - ai[i]; 1025 ajtmp = aj + ai[i]; 1026 v = aa + 16*ai[i]; 1027 for (j=0; j<nz; j++) { 1028 x = rtmp+16*((unsigned int)ajtmp[j]); 1029 /* x = rtmp+colscale*ajtmp[j]; */ 1030 /* Copy v block into x block */ 1031 SSE_INLINE_BEGIN_2(v,x) 1032 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1033 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1034 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1035 1036 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1037 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1038 1039 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1040 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1041 1042 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1043 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1044 1045 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1046 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1047 1048 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1049 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1050 1051 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1052 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1053 1054 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1055 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1056 SSE_INLINE_END_2; 1057 1058 v += 16; 1059 } 1060 /* row = (*bjtmp++)/4; */ 1061 row = (unsigned int)(*bjtmp++); 1062 while (row < i) { 1063 pc = rtmp + 16*row; 1064 SSE_INLINE_BEGIN_1(pc) 1065 /* Load block from lower triangle */ 1066 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1067 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1068 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1069 1070 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1071 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1072 1073 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1074 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1075 1076 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1077 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1078 1079 /* Compare block to zero block */ 1080 1081 SSE_COPY_PS(XMM4,XMM7) 1082 SSE_CMPNEQ_PS(XMM4,XMM0) 1083 1084 SSE_COPY_PS(XMM5,XMM7) 1085 SSE_CMPNEQ_PS(XMM5,XMM1) 1086 1087 SSE_COPY_PS(XMM6,XMM7) 1088 SSE_CMPNEQ_PS(XMM6,XMM2) 1089 1090 SSE_CMPNEQ_PS(XMM7,XMM3) 1091 1092 /* Reduce the comparisons to one SSE register */ 1093 SSE_OR_PS(XMM6,XMM7) 1094 SSE_OR_PS(XMM5,XMM4) 1095 SSE_OR_PS(XMM5,XMM6) 1096 SSE_INLINE_END_1; 1097 1098 /* Reduce the one SSE register to an integer register for branching */ 1099 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1100 MOVEMASK(nonzero,XMM5); 1101 1102 /* If block is nonzero ... 
*/ 1103 if (nonzero) { 1104 pv = ba + 16*diag_offset[row]; 1105 PREFETCH_L1(&pv[16]); 1106 pj = bj + diag_offset[row] + 1; 1107 1108 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1109 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1110 /* but the diagonal was inverted already */ 1111 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1112 1113 SSE_INLINE_BEGIN_2(pv,pc) 1114 /* Column 0, product is accumulated in XMM4 */ 1115 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1116 SSE_SHUFFLE(XMM4,XMM4,0x00) 1117 SSE_MULT_PS(XMM4,XMM0) 1118 1119 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1120 SSE_SHUFFLE(XMM5,XMM5,0x00) 1121 SSE_MULT_PS(XMM5,XMM1) 1122 SSE_ADD_PS(XMM4,XMM5) 1123 1124 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1125 SSE_SHUFFLE(XMM6,XMM6,0x00) 1126 SSE_MULT_PS(XMM6,XMM2) 1127 SSE_ADD_PS(XMM4,XMM6) 1128 1129 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1130 SSE_SHUFFLE(XMM7,XMM7,0x00) 1131 SSE_MULT_PS(XMM7,XMM3) 1132 SSE_ADD_PS(XMM4,XMM7) 1133 1134 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1135 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1136 1137 /* Column 1, product is accumulated in XMM5 */ 1138 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1139 SSE_SHUFFLE(XMM5,XMM5,0x00) 1140 SSE_MULT_PS(XMM5,XMM0) 1141 1142 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1143 SSE_SHUFFLE(XMM6,XMM6,0x00) 1144 SSE_MULT_PS(XMM6,XMM1) 1145 SSE_ADD_PS(XMM5,XMM6) 1146 1147 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1148 SSE_SHUFFLE(XMM7,XMM7,0x00) 1149 SSE_MULT_PS(XMM7,XMM2) 1150 SSE_ADD_PS(XMM5,XMM7) 1151 1152 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1153 SSE_SHUFFLE(XMM6,XMM6,0x00) 1154 SSE_MULT_PS(XMM6,XMM3) 1155 SSE_ADD_PS(XMM5,XMM6) 1156 1157 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1158 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1159 1160 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1161 1162 /* Column 2, product is accumulated in XMM6 */ 1163 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1164 SSE_SHUFFLE(XMM6,XMM6,0x00) 1165 SSE_MULT_PS(XMM6,XMM0) 1166 1167 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1168 SSE_SHUFFLE(XMM7,XMM7,0x00) 1169 SSE_MULT_PS(XMM7,XMM1) 1170 SSE_ADD_PS(XMM6,XMM7) 1171 1172 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1173 SSE_SHUFFLE(XMM7,XMM7,0x00) 1174 SSE_MULT_PS(XMM7,XMM2) 1175 SSE_ADD_PS(XMM6,XMM7) 1176 1177 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1178 SSE_SHUFFLE(XMM7,XMM7,0x00) 1179 SSE_MULT_PS(XMM7,XMM3) 1180 SSE_ADD_PS(XMM6,XMM7) 1181 1182 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1183 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1184 1185 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1186 /* Column 3, product is accumulated in XMM0 */ 1187 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1188 SSE_SHUFFLE(XMM7,XMM7,0x00) 1189 SSE_MULT_PS(XMM0,XMM7) 1190 1191 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1192 SSE_SHUFFLE(XMM7,XMM7,0x00) 1193 SSE_MULT_PS(XMM1,XMM7) 1194 SSE_ADD_PS(XMM0,XMM1) 1195 1196 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1197 SSE_SHUFFLE(XMM1,XMM1,0x00) 1198 SSE_MULT_PS(XMM1,XMM2) 1199 SSE_ADD_PS(XMM0,XMM1) 1200 1201 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1202 SSE_SHUFFLE(XMM7,XMM7,0x00) 1203 SSE_MULT_PS(XMM3,XMM7) 1204 SSE_ADD_PS(XMM0,XMM3) 1205 1206 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1207 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1208 1209 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1210 /* This is code to be maintained and read by humans afterall. 
*/ 1211 /* Copy Multiplier Col 3 into XMM3 */ 1212 SSE_COPY_PS(XMM3,XMM0) 1213 /* Copy Multiplier Col 2 into XMM2 */ 1214 SSE_COPY_PS(XMM2,XMM6) 1215 /* Copy Multiplier Col 1 into XMM1 */ 1216 SSE_COPY_PS(XMM1,XMM5) 1217 /* Copy Multiplier Col 0 into XMM0 */ 1218 SSE_COPY_PS(XMM0,XMM4) 1219 SSE_INLINE_END_2; 1220 1221 /* Update the row: */ 1222 nz = bi[row+1] - diag_offset[row] - 1; 1223 pv += 16; 1224 for (j=0; j<nz; j++) { 1225 PREFETCH_L1(&pv[16]); 1226 x = rtmp + 16*((unsigned int)pj[j]); 1227 /* x = rtmp + 4*pj[j]; */ 1228 1229 /* X:=X-M*PV, One column at a time */ 1230 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1231 SSE_INLINE_BEGIN_2(x,pv) 1232 /* Load First Column of X*/ 1233 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1234 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1235 1236 /* Matrix-Vector Product: */ 1237 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1238 SSE_SHUFFLE(XMM5,XMM5,0x00) 1239 SSE_MULT_PS(XMM5,XMM0) 1240 SSE_SUB_PS(XMM4,XMM5) 1241 1242 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1243 SSE_SHUFFLE(XMM6,XMM6,0x00) 1244 SSE_MULT_PS(XMM6,XMM1) 1245 SSE_SUB_PS(XMM4,XMM6) 1246 1247 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1248 SSE_SHUFFLE(XMM7,XMM7,0x00) 1249 SSE_MULT_PS(XMM7,XMM2) 1250 SSE_SUB_PS(XMM4,XMM7) 1251 1252 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1253 SSE_SHUFFLE(XMM5,XMM5,0x00) 1254 SSE_MULT_PS(XMM5,XMM3) 1255 SSE_SUB_PS(XMM4,XMM5) 1256 1257 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1258 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1259 1260 /* Second Column */ 1261 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1262 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1263 1264 /* Matrix-Vector Product: */ 1265 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1266 SSE_SHUFFLE(XMM6,XMM6,0x00) 1267 SSE_MULT_PS(XMM6,XMM0) 1268 SSE_SUB_PS(XMM5,XMM6) 1269 1270 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1271 SSE_SHUFFLE(XMM7,XMM7,0x00) 1272 SSE_MULT_PS(XMM7,XMM1) 1273 SSE_SUB_PS(XMM5,XMM7) 1274 1275 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1276 SSE_SHUFFLE(XMM4,XMM4,0x00) 1277 SSE_MULT_PS(XMM4,XMM2) 1278 SSE_SUB_PS(XMM5,XMM4) 1279 1280 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1281 SSE_SHUFFLE(XMM6,XMM6,0x00) 1282 SSE_MULT_PS(XMM6,XMM3) 1283 SSE_SUB_PS(XMM5,XMM6) 1284 1285 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1286 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1287 1288 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1289 1290 /* Third Column */ 1291 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1292 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1293 1294 /* Matrix-Vector Product: */ 1295 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1296 SSE_SHUFFLE(XMM7,XMM7,0x00) 1297 SSE_MULT_PS(XMM7,XMM0) 1298 SSE_SUB_PS(XMM6,XMM7) 1299 1300 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1301 SSE_SHUFFLE(XMM4,XMM4,0x00) 1302 SSE_MULT_PS(XMM4,XMM1) 1303 SSE_SUB_PS(XMM6,XMM4) 1304 1305 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1306 SSE_SHUFFLE(XMM5,XMM5,0x00) 1307 SSE_MULT_PS(XMM5,XMM2) 1308 SSE_SUB_PS(XMM6,XMM5) 1309 1310 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1311 SSE_SHUFFLE(XMM7,XMM7,0x00) 1312 SSE_MULT_PS(XMM7,XMM3) 1313 SSE_SUB_PS(XMM6,XMM7) 1314 1315 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1316 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1317 1318 /* Fourth Column */ 1319 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1320 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1321 1322 /* Matrix-Vector Product: */ 1323 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1324 SSE_SHUFFLE(XMM5,XMM5,0x00) 1325 SSE_MULT_PS(XMM5,XMM0) 1326 SSE_SUB_PS(XMM4,XMM5) 1327 1328 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1329 SSE_SHUFFLE(XMM6,XMM6,0x00) 1330 SSE_MULT_PS(XMM6,XMM1) 1331 SSE_SUB_PS(XMM4,XMM6) 1332 1333 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1334 
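/* pv[14] was just loaded into the low lane of XMM7; the shuffle below broadcasts it, and it then scales multiplier column 2 (XMM2) as part of the fourth-column update of X -= M*PV */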
SSE_SHUFFLE(XMM7,XMM7,0x00) 1335 SSE_MULT_PS(XMM7,XMM2) 1336 SSE_SUB_PS(XMM4,XMM7) 1337 1338 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1339 SSE_SHUFFLE(XMM5,XMM5,0x00) 1340 SSE_MULT_PS(XMM5,XMM3) 1341 SSE_SUB_PS(XMM4,XMM5) 1342 1343 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1344 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1345 SSE_INLINE_END_2; 1346 pv += 16; 1347 } 1348 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1349 } 1350 row = (unsigned int)(*bjtmp++); 1351 /* row = (*bjtmp++)/4; */ 1352 /* bjtmp++; */ 1353 } 1354 /* finished row so stick it into b->a */ 1355 pv = ba + 16*bi[i]; 1356 pj = bj + bi[i]; 1357 nz = bi[i+1] - bi[i]; 1358 1359 /* Copy x block back into pv block */ 1360 for (j=0; j<nz; j++) { 1361 x = rtmp+16*((unsigned int)pj[j]); 1362 /* x = rtmp+4*pj[j]; */ 1363 1364 SSE_INLINE_BEGIN_2(x,pv) 1365 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1366 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1367 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1368 1369 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1370 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1371 1372 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1373 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1374 1375 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1376 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1377 1378 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1379 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1380 1381 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1382 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1383 1384 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1385 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1386 1387 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1388 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1389 SSE_INLINE_END_2; 1390 pv += 16; 1391 } 1392 /* invert diagonal block */ 1393 w = ba + 16*diag_offset[i]; 1394 if (pivotinblocks) { 1395 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1396 } else { 1397 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1398 } 1399 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1400 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1401 } 1402 1403 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1404 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1405 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1406 C->assembled = PETSC_TRUE; 1407 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1408 /* Flop Count from inverting diagonal blocks */ 1409 SSE_SCOPE_END; 1410 PetscFunctionReturn(0); 1411 } 1412 1413 #undef __FUNCT__ 1414 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" 1415 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1416 { 1417 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1418 PetscErrorCode ierr; 1419 int i,j,n = a->mbs; 1420 unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 1421 unsigned int row; 1422 int *ajtmpold,nz,*bi=b->i; 1423 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1424 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1425 MatScalar *ba = b->a,*aa = a->a; 1426 int nonzero=0; 1427 /* int nonzero=0,colscale = 16; */ 1428 PetscTruth pivotinblocks = b->pivotinblocks; 1429 PetscReal shift = info->shiftamount; 1430 1431 PetscFunctionBegin; 1432 SSE_SCOPE_BEGIN; 1433 1434 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1435 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. 
SSE will not work."); 1436 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1437 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1438 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1439 /* colscale = 4; */ 1440 /* } */ 1441 if ((unsigned long)bj==(unsigned long)aj) { 1442 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1443 } 1444 1445 for (i=0; i<n; i++) { 1446 nz = bi[i+1] - bi[i]; 1447 bjtmp = bj + bi[i]; 1448 /* zero out the 4x4 block accumulators */ 1449 /* zero out one register */ 1450 XOR_PS(XMM7,XMM7); 1451 for (j=0; j<nz; j++) { 1452 x = rtmp+16*((unsigned int)bjtmp[j]); 1453 /* x = rtmp+4*bjtmp[j]; */ 1454 SSE_INLINE_BEGIN_1(x) 1455 /* Copy zero register to memory locations */ 1456 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1457 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1458 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1459 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1460 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1461 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1462 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1463 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1464 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1465 SSE_INLINE_END_1; 1466 } 1467 /* load in initial (unfactored row) */ 1468 nz = ai[i+1] - ai[i]; 1469 ajtmpold = aj + ai[i]; 1470 v = aa + 16*ai[i]; 1471 for (j=0; j<nz; j++) { 1472 x = rtmp+16*ajtmpold[j]; 1473 /* x = rtmp+colscale*ajtmpold[j]; */ 1474 /* Copy v block into x block */ 1475 SSE_INLINE_BEGIN_2(v,x) 1476 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1477 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1478 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1479 1480 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1481 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1482 1483 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1484 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1485 1486 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1487 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1488 1489 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1490 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1491 1492 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1493 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1494 1495 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1496 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1497 1498 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1499 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1500 SSE_INLINE_END_2; 1501 1502 v += 16; 1503 } 1504 /* row = (*bjtmp++)/4; */ 1505 row = (unsigned int)(*bjtmp++); 1506 while (row < i) { 1507 pc = rtmp + 16*row; 1508 SSE_INLINE_BEGIN_1(pc) 1509 /* Load block from lower triangle */ 1510 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1511 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1512 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1513 1514 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1515 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1516 1517 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1518 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1519 1520 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1521 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1522 1523 /* Compare block to zero block */ 1524 1525 SSE_COPY_PS(XMM4,XMM7) 1526 SSE_CMPNEQ_PS(XMM4,XMM0) 1527 1528 SSE_COPY_PS(XMM5,XMM7) 1529 SSE_CMPNEQ_PS(XMM5,XMM1) 1530 1531 SSE_COPY_PS(XMM6,XMM7) 1532 SSE_CMPNEQ_PS(XMM6,XMM2) 1533 1534 SSE_CMPNEQ_PS(XMM7,XMM3) 1535 1536 /* Reduce the comparisons to one SSE register */ 1537 SSE_OR_PS(XMM6,XMM7) 1538 SSE_OR_PS(XMM5,XMM4) 1539 SSE_OR_PS(XMM5,XMM6) 1540 SSE_INLINE_END_1; 1541 1542 /* Reduce the one SSE register to an integer register 
for branching */ 1543 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1544 MOVEMASK(nonzero,XMM5); 1545 1546 /* If block is nonzero ... */ 1547 if (nonzero) { 1548 pv = ba + 16*diag_offset[row]; 1549 PREFETCH_L1(&pv[16]); 1550 pj = bj + diag_offset[row] + 1; 1551 1552 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1553 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1554 /* but the diagonal was inverted already */ 1555 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1556 1557 SSE_INLINE_BEGIN_2(pv,pc) 1558 /* Column 0, product is accumulated in XMM4 */ 1559 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1560 SSE_SHUFFLE(XMM4,XMM4,0x00) 1561 SSE_MULT_PS(XMM4,XMM0) 1562 1563 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1564 SSE_SHUFFLE(XMM5,XMM5,0x00) 1565 SSE_MULT_PS(XMM5,XMM1) 1566 SSE_ADD_PS(XMM4,XMM5) 1567 1568 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1569 SSE_SHUFFLE(XMM6,XMM6,0x00) 1570 SSE_MULT_PS(XMM6,XMM2) 1571 SSE_ADD_PS(XMM4,XMM6) 1572 1573 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1574 SSE_SHUFFLE(XMM7,XMM7,0x00) 1575 SSE_MULT_PS(XMM7,XMM3) 1576 SSE_ADD_PS(XMM4,XMM7) 1577 1578 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1579 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1580 1581 /* Column 1, product is accumulated in XMM5 */ 1582 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1583 SSE_SHUFFLE(XMM5,XMM5,0x00) 1584 SSE_MULT_PS(XMM5,XMM0) 1585 1586 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1587 SSE_SHUFFLE(XMM6,XMM6,0x00) 1588 SSE_MULT_PS(XMM6,XMM1) 1589 SSE_ADD_PS(XMM5,XMM6) 1590 1591 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1592 SSE_SHUFFLE(XMM7,XMM7,0x00) 1593 SSE_MULT_PS(XMM7,XMM2) 1594 SSE_ADD_PS(XMM5,XMM7) 1595 1596 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1597 SSE_SHUFFLE(XMM6,XMM6,0x00) 1598 SSE_MULT_PS(XMM6,XMM3) 1599 SSE_ADD_PS(XMM5,XMM6) 1600 1601 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1602 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1603 1604 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1605 1606 /* Column 2, product is accumulated in XMM6 */ 1607 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1608 SSE_SHUFFLE(XMM6,XMM6,0x00) 1609 SSE_MULT_PS(XMM6,XMM0) 1610 1611 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1612 SSE_SHUFFLE(XMM7,XMM7,0x00) 1613 SSE_MULT_PS(XMM7,XMM1) 1614 SSE_ADD_PS(XMM6,XMM7) 1615 1616 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1617 SSE_SHUFFLE(XMM7,XMM7,0x00) 1618 SSE_MULT_PS(XMM7,XMM2) 1619 SSE_ADD_PS(XMM6,XMM7) 1620 1621 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1622 SSE_SHUFFLE(XMM7,XMM7,0x00) 1623 SSE_MULT_PS(XMM7,XMM3) 1624 SSE_ADD_PS(XMM6,XMM7) 1625 1626 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1627 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1628 1629 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1630 /* Column 3, product is accumulated in XMM0 */ 1631 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1632 SSE_SHUFFLE(XMM7,XMM7,0x00) 1633 SSE_MULT_PS(XMM0,XMM7) 1634 1635 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1636 SSE_SHUFFLE(XMM7,XMM7,0x00) 1637 SSE_MULT_PS(XMM1,XMM7) 1638 SSE_ADD_PS(XMM0,XMM1) 1639 1640 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1641 SSE_SHUFFLE(XMM1,XMM1,0x00) 1642 SSE_MULT_PS(XMM1,XMM2) 1643 SSE_ADD_PS(XMM0,XMM1) 1644 1645 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1646 SSE_SHUFFLE(XMM7,XMM7,0x00) 1647 SSE_MULT_PS(XMM3,XMM7) 1648 SSE_ADD_PS(XMM0,XMM3) 1649 1650 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1651 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1652 1653 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1654 /* This is code to be maintained and read by humans afterall. 
*/ 1655 /* Copy Multiplier Col 3 into XMM3 */ 1656 SSE_COPY_PS(XMM3,XMM0) 1657 /* Copy Multiplier Col 2 into XMM2 */ 1658 SSE_COPY_PS(XMM2,XMM6) 1659 /* Copy Multiplier Col 1 into XMM1 */ 1660 SSE_COPY_PS(XMM1,XMM5) 1661 /* Copy Multiplier Col 0 into XMM0 */ 1662 SSE_COPY_PS(XMM0,XMM4) 1663 SSE_INLINE_END_2; 1664 1665 /* Update the row: */ 1666 nz = bi[row+1] - diag_offset[row] - 1; 1667 pv += 16; 1668 for (j=0; j<nz; j++) { 1669 PREFETCH_L1(&pv[16]); 1670 x = rtmp + 16*((unsigned int)pj[j]); 1671 /* x = rtmp + 4*pj[j]; */ 1672 1673 /* X:=X-M*PV, One column at a time */ 1674 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1675 SSE_INLINE_BEGIN_2(x,pv) 1676 /* Load First Column of X*/ 1677 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1678 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1679 1680 /* Matrix-Vector Product: */ 1681 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1682 SSE_SHUFFLE(XMM5,XMM5,0x00) 1683 SSE_MULT_PS(XMM5,XMM0) 1684 SSE_SUB_PS(XMM4,XMM5) 1685 1686 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1687 SSE_SHUFFLE(XMM6,XMM6,0x00) 1688 SSE_MULT_PS(XMM6,XMM1) 1689 SSE_SUB_PS(XMM4,XMM6) 1690 1691 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1692 SSE_SHUFFLE(XMM7,XMM7,0x00) 1693 SSE_MULT_PS(XMM7,XMM2) 1694 SSE_SUB_PS(XMM4,XMM7) 1695 1696 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1697 SSE_SHUFFLE(XMM5,XMM5,0x00) 1698 SSE_MULT_PS(XMM5,XMM3) 1699 SSE_SUB_PS(XMM4,XMM5) 1700 1701 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1702 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1703 1704 /* Second Column */ 1705 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1706 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1707 1708 /* Matrix-Vector Product: */ 1709 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1710 SSE_SHUFFLE(XMM6,XMM6,0x00) 1711 SSE_MULT_PS(XMM6,XMM0) 1712 SSE_SUB_PS(XMM5,XMM6) 1713 1714 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1715 SSE_SHUFFLE(XMM7,XMM7,0x00) 1716 SSE_MULT_PS(XMM7,XMM1) 1717 SSE_SUB_PS(XMM5,XMM7) 1718 1719 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1720 SSE_SHUFFLE(XMM4,XMM4,0x00) 1721 SSE_MULT_PS(XMM4,XMM2) 1722 SSE_SUB_PS(XMM5,XMM4) 1723 1724 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1725 SSE_SHUFFLE(XMM6,XMM6,0x00) 1726 SSE_MULT_PS(XMM6,XMM3) 1727 SSE_SUB_PS(XMM5,XMM6) 1728 1729 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1730 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1731 1732 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1733 1734 /* Third Column */ 1735 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1736 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1737 1738 /* Matrix-Vector Product: */ 1739 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1740 SSE_SHUFFLE(XMM7,XMM7,0x00) 1741 SSE_MULT_PS(XMM7,XMM0) 1742 SSE_SUB_PS(XMM6,XMM7) 1743 1744 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1745 SSE_SHUFFLE(XMM4,XMM4,0x00) 1746 SSE_MULT_PS(XMM4,XMM1) 1747 SSE_SUB_PS(XMM6,XMM4) 1748 1749 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1750 SSE_SHUFFLE(XMM5,XMM5,0x00) 1751 SSE_MULT_PS(XMM5,XMM2) 1752 SSE_SUB_PS(XMM6,XMM5) 1753 1754 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1755 SSE_SHUFFLE(XMM7,XMM7,0x00) 1756 SSE_MULT_PS(XMM7,XMM3) 1757 SSE_SUB_PS(XMM6,XMM7) 1758 1759 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1760 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1761 1762 /* Fourth Column */ 1763 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1764 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1765 1766 /* Matrix-Vector Product: */ 1767 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1768 SSE_SHUFFLE(XMM5,XMM5,0x00) 1769 SSE_MULT_PS(XMM5,XMM0) 1770 SSE_SUB_PS(XMM4,XMM5) 1771 1772 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1773 SSE_SHUFFLE(XMM6,XMM6,0x00) 1774 SSE_MULT_PS(XMM6,XMM1) 1775 SSE_SUB_PS(XMM4,XMM6) 1776 1777 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1778 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1779 SSE_MULT_PS(XMM7,XMM2) 1780 SSE_SUB_PS(XMM4,XMM7) 1781 1782 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1783 SSE_SHUFFLE(XMM5,XMM5,0x00) 1784 SSE_MULT_PS(XMM5,XMM3) 1785 SSE_SUB_PS(XMM4,XMM5) 1786 1787 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1788 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1789 SSE_INLINE_END_2; 1790 pv += 16; 1791 } 1792 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1793 } 1794 row = (unsigned int)(*bjtmp++); 1795 /* row = (*bjtmp++)/4; */ 1796 /* bjtmp++; */ 1797 } 1798 /* finished row so stick it into b->a */ 1799 pv = ba + 16*bi[i]; 1800 pj = bj + bi[i]; 1801 nz = bi[i+1] - bi[i]; 1802 1803 /* Copy x block back into pv block */ 1804 for (j=0; j<nz; j++) { 1805 x = rtmp+16*((unsigned int)pj[j]); 1806 /* x = rtmp+4*pj[j]; */ 1807 1808 SSE_INLINE_BEGIN_2(x,pv) 1809 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1810 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1811 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1812 1813 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1814 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1815 1816 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1817 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1818 1819 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1820 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1821 1822 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1823 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1824 1825 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1826 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1827 1828 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1829 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1830 1831 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1832 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1833 SSE_INLINE_END_2; 1834 pv += 16; 1835 } 1836 /* invert diagonal block */ 1837 w = ba + 16*diag_offset[i]; 1838 if (pivotinblocks) { 1839 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1840 } else { 1841 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1842 } 1843 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1844 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1845 } 1846 1847 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1848 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1849 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1850 C->assembled = PETSC_TRUE; 1851 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1852 /* Flop Count from inverting diagonal blocks */ 1853 SSE_SCOPE_END; 1854 PetscFunctionReturn(0); 1855 } 1856 1857 #endif 1858
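/*
   The routines above lean on the 4x4 block kernels Kernel_A_gets_A_times_B_4,
   Kernel_A_gets_A_minus_B_times_C_4 and Kernel_A_gets_inverse_A_4 (see the
   blockinvert.h include at the top of this file).  As a reading aid only, here is a
   hedged plain-C sketch of what the first two compute for the column-major 4x4
   blocks used in this file.  The BAIJ4Sketch_* names are hypothetical, the code is
   excluded from the build by #if 0, and it is not the actual PETSc implementation.
*/
#if 0
/* sketch: A := A*B for 4x4 column-major blocks (cf. Kernel_A_gets_A_times_B_4(pc,pv,mwork)) */
static void BAIJ4Sketch_A_gets_A_times_B(double a[16],const double b[16],double w[16])
{
  int i,j,k;
  for (i=0; i<16; i++) w[i] = a[i];                 /* work copy of A */
  for (j=0; j<4; j++) {
    for (i=0; i<4; i++) {
      double s = 0.0;
      for (k=0; k<4; k++) s += w[k*4+i]*b[j*4+k];   /* A(i,k)*B(k,j), column-major */
      a[j*4+i] = s;
    }
  }
}

/* sketch: A := A - B*C for 4x4 column-major blocks (cf. Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv)) */
static void BAIJ4Sketch_A_gets_A_minus_B_times_C(double a[16],const double b[16],const double c[16])
{
  int i,j,k;
  for (j=0; j<4; j++) {
    for (i=0; i<4; i++) {
      double s = 0.0;
      for (k=0; k<4; k++) s += b[k*4+i]*c[j*4+k];   /* B(i,k)*C(k,j) */
      a[j*4+i] -= s;
    }
  }
}
#endif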