/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/* ------------------------------------------------------------*/
/*
   Version for when blocks are 4 by 4
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai = a->i,*aj = a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks,zeropivotdetected;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8] = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9 = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x   = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4 -
    copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
      PetscKernel_A_gets_A_times_B()
      PetscKernel_A_gets_A_minus_B_times_C()
      PetscKernel_A_gets_inverse_A()
*/

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*bi = b->i,*bj = b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag = b->diag,*pj,bs2 = a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa = a->a;
  PetscInt       flg;
  PetscReal      shift;
  PetscBool      zeropivotdetected;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
PetscErrorCode
MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
/*
    Default version for when blocks are 4 by 4 using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai = a->i,*aj = a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks,zeropivotdetected;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmpold[j];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8] = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9 = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x   = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*bi = b->i,*bj = b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag = b->diag,*pj,bs2 = a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa = a->a;
  PetscInt       flg;
  PetscReal      shift;
  PetscBool      zeropivotdetected;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi = b->i;
  int            *diag_offset = b->diag,*ai = a->i,*aj = a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero = 0;
  /* int            nonzero=0,colscale = 16; */
  PetscBool      pivotinblocks = b->pivotinblocks,zeropivotdetected;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0)
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 585 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 586 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 587 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 588 /* if ((unsigned long)bj==(unsigned long)aj) { */ 589 /* colscale = 4; */ 590 /* } */ 591 for (i=0; i<n; i++) { 592 nz = bi[i+1] - bi[i]; 593 bjtmp = bj + bi[i]; 594 /* zero out the 4x4 block accumulators */ 595 /* zero out one register */ 596 XOR_PS(XMM7,XMM7); 597 for (j=0; j<nz; j++) { 598 x = rtmp+16*bjtmp[j]; 599 /* x = rtmp+4*bjtmp[j]; */ 600 SSE_INLINE_BEGIN_1(x) 601 /* Copy zero register to memory locations */ 602 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 603 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 604 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 605 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 606 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 607 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 608 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 609 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 610 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 611 SSE_INLINE_END_1; 612 } 613 /* load in initial (unfactored row) */ 614 nz = ai[i+1] - ai[i]; 615 ajtmpold = aj + ai[i]; 616 v = aa + 16*ai[i]; 617 for (j=0; j<nz; j++) { 618 x = rtmp+16*ajtmpold[j]; 619 /* x = rtmp+colscale*ajtmpold[j]; */ 620 /* Copy v block into x block */ 621 SSE_INLINE_BEGIN_2(v,x) 622 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 623 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 624 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 625 626 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 627 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 628 629 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 630 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 631 632 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 633 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 634 635 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 636 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 637 638 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 639 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 640 641 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 642 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 643 644 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 645 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 646 SSE_INLINE_END_2; 647 648 v += 16; 649 } 650 /* row = (*bjtmp++)/4; */ 651 row = *bjtmp++; 652 while (row < i) { 653 pc = rtmp + 16*row; 654 SSE_INLINE_BEGIN_1(pc) 655 /* Load block from lower triangle */ 656 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 657 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 658 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 659 660 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 661 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 662 663 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 664 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 665 666 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 667 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 668 669 /* Compare block to zero block */ 670 671 SSE_COPY_PS(XMM4,XMM7) 672 SSE_CMPNEQ_PS(XMM4,XMM0) 673 674 SSE_COPY_PS(XMM5,XMM7) 675 SSE_CMPNEQ_PS(XMM5,XMM1) 676 677 SSE_COPY_PS(XMM6,XMM7) 678 SSE_CMPNEQ_PS(XMM6,XMM2) 679 680 SSE_CMPNEQ_PS(XMM7,XMM3) 681 682 /* Reduce the comparisons to one SSE register */ 683 SSE_OR_PS(XMM6,XMM7) 684 SSE_OR_PS(XMM5,XMM4) 685 SSE_OR_PS(XMM5,XMM6) 686 SSE_INLINE_END_1; 687 688 /* Reduce the one SSE register to an integer register for branching */ 689 /* Note: Since nonzero is an int, 
there is no INLINE block version of this call */ 690 MOVEMASK(nonzero,XMM5); 691 692 /* If block is nonzero ... */ 693 if (nonzero) { 694 pv = ba + 16*diag_offset[row]; 695 PREFETCH_L1(&pv[16]); 696 pj = bj + diag_offset[row] + 1; 697 698 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 699 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 700 /* but the diagonal was inverted already */ 701 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 702 703 SSE_INLINE_BEGIN_2(pv,pc) 704 /* Column 0, product is accumulated in XMM4 */ 705 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 706 SSE_SHUFFLE(XMM4,XMM4,0x00) 707 SSE_MULT_PS(XMM4,XMM0) 708 709 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 710 SSE_SHUFFLE(XMM5,XMM5,0x00) 711 SSE_MULT_PS(XMM5,XMM1) 712 SSE_ADD_PS(XMM4,XMM5) 713 714 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 715 SSE_SHUFFLE(XMM6,XMM6,0x00) 716 SSE_MULT_PS(XMM6,XMM2) 717 SSE_ADD_PS(XMM4,XMM6) 718 719 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 720 SSE_SHUFFLE(XMM7,XMM7,0x00) 721 SSE_MULT_PS(XMM7,XMM3) 722 SSE_ADD_PS(XMM4,XMM7) 723 724 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 725 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 726 727 /* Column 1, product is accumulated in XMM5 */ 728 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 729 SSE_SHUFFLE(XMM5,XMM5,0x00) 730 SSE_MULT_PS(XMM5,XMM0) 731 732 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 733 SSE_SHUFFLE(XMM6,XMM6,0x00) 734 SSE_MULT_PS(XMM6,XMM1) 735 SSE_ADD_PS(XMM5,XMM6) 736 737 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 738 SSE_SHUFFLE(XMM7,XMM7,0x00) 739 SSE_MULT_PS(XMM7,XMM2) 740 SSE_ADD_PS(XMM5,XMM7) 741 742 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 743 SSE_SHUFFLE(XMM6,XMM6,0x00) 744 SSE_MULT_PS(XMM6,XMM3) 745 SSE_ADD_PS(XMM5,XMM6) 746 747 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 748 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 749 750 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 751 752 /* Column 2, product is accumulated in XMM6 */ 753 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 754 SSE_SHUFFLE(XMM6,XMM6,0x00) 755 SSE_MULT_PS(XMM6,XMM0) 756 757 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 758 SSE_SHUFFLE(XMM7,XMM7,0x00) 759 SSE_MULT_PS(XMM7,XMM1) 760 SSE_ADD_PS(XMM6,XMM7) 761 762 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 763 SSE_SHUFFLE(XMM7,XMM7,0x00) 764 SSE_MULT_PS(XMM7,XMM2) 765 SSE_ADD_PS(XMM6,XMM7) 766 767 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 768 SSE_SHUFFLE(XMM7,XMM7,0x00) 769 SSE_MULT_PS(XMM7,XMM3) 770 SSE_ADD_PS(XMM6,XMM7) 771 772 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 773 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 774 775 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 776 /* Column 3, product is accumulated in XMM0 */ 777 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 778 SSE_SHUFFLE(XMM7,XMM7,0x00) 779 SSE_MULT_PS(XMM0,XMM7) 780 781 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 782 SSE_SHUFFLE(XMM7,XMM7,0x00) 783 SSE_MULT_PS(XMM1,XMM7) 784 SSE_ADD_PS(XMM0,XMM1) 785 786 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 787 SSE_SHUFFLE(XMM1,XMM1,0x00) 788 SSE_MULT_PS(XMM1,XMM2) 789 SSE_ADD_PS(XMM0,XMM1) 790 791 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 792 SSE_SHUFFLE(XMM7,XMM7,0x00) 793 SSE_MULT_PS(XMM3,XMM7) 794 SSE_ADD_PS(XMM0,XMM3) 795 796 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 797 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 798 799 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 800 /* This is code to be maintained and read by humans afterall. 
*/ 801 /* Copy Multiplier Col 3 into XMM3 */ 802 SSE_COPY_PS(XMM3,XMM0) 803 /* Copy Multiplier Col 2 into XMM2 */ 804 SSE_COPY_PS(XMM2,XMM6) 805 /* Copy Multiplier Col 1 into XMM1 */ 806 SSE_COPY_PS(XMM1,XMM5) 807 /* Copy Multiplier Col 0 into XMM0 */ 808 SSE_COPY_PS(XMM0,XMM4) 809 SSE_INLINE_END_2; 810 811 /* Update the row: */ 812 nz = bi[row+1] - diag_offset[row] - 1; 813 pv += 16; 814 for (j=0; j<nz; j++) { 815 PREFETCH_L1(&pv[16]); 816 x = rtmp + 16*pj[j]; 817 /* x = rtmp + 4*pj[j]; */ 818 819 /* X:=X-M*PV, One column at a time */ 820 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 821 SSE_INLINE_BEGIN_2(x,pv) 822 /* Load First Column of X*/ 823 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 824 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 825 826 /* Matrix-Vector Product: */ 827 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 828 SSE_SHUFFLE(XMM5,XMM5,0x00) 829 SSE_MULT_PS(XMM5,XMM0) 830 SSE_SUB_PS(XMM4,XMM5) 831 832 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 833 SSE_SHUFFLE(XMM6,XMM6,0x00) 834 SSE_MULT_PS(XMM6,XMM1) 835 SSE_SUB_PS(XMM4,XMM6) 836 837 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 838 SSE_SHUFFLE(XMM7,XMM7,0x00) 839 SSE_MULT_PS(XMM7,XMM2) 840 SSE_SUB_PS(XMM4,XMM7) 841 842 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 843 SSE_SHUFFLE(XMM5,XMM5,0x00) 844 SSE_MULT_PS(XMM5,XMM3) 845 SSE_SUB_PS(XMM4,XMM5) 846 847 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 848 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 849 850 /* Second Column */ 851 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 852 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 853 854 /* Matrix-Vector Product: */ 855 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 856 SSE_SHUFFLE(XMM6,XMM6,0x00) 857 SSE_MULT_PS(XMM6,XMM0) 858 SSE_SUB_PS(XMM5,XMM6) 859 860 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 861 SSE_SHUFFLE(XMM7,XMM7,0x00) 862 SSE_MULT_PS(XMM7,XMM1) 863 SSE_SUB_PS(XMM5,XMM7) 864 865 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 866 SSE_SHUFFLE(XMM4,XMM4,0x00) 867 SSE_MULT_PS(XMM4,XMM2) 868 SSE_SUB_PS(XMM5,XMM4) 869 870 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 871 SSE_SHUFFLE(XMM6,XMM6,0x00) 872 SSE_MULT_PS(XMM6,XMM3) 873 SSE_SUB_PS(XMM5,XMM6) 874 875 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 876 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 877 878 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 879 880 /* Third Column */ 881 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 882 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 883 884 /* Matrix-Vector Product: */ 885 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 886 SSE_SHUFFLE(XMM7,XMM7,0x00) 887 SSE_MULT_PS(XMM7,XMM0) 888 SSE_SUB_PS(XMM6,XMM7) 889 890 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 891 SSE_SHUFFLE(XMM4,XMM4,0x00) 892 SSE_MULT_PS(XMM4,XMM1) 893 SSE_SUB_PS(XMM6,XMM4) 894 895 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 896 SSE_SHUFFLE(XMM5,XMM5,0x00) 897 SSE_MULT_PS(XMM5,XMM2) 898 SSE_SUB_PS(XMM6,XMM5) 899 900 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 901 SSE_SHUFFLE(XMM7,XMM7,0x00) 902 SSE_MULT_PS(XMM7,XMM3) 903 SSE_SUB_PS(XMM6,XMM7) 904 905 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 906 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 907 908 /* Fourth Column */ 909 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 910 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 911 912 /* Matrix-Vector Product: */ 913 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 914 SSE_SHUFFLE(XMM5,XMM5,0x00) 915 SSE_MULT_PS(XMM5,XMM0) 916 SSE_SUB_PS(XMM4,XMM5) 917 918 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 919 SSE_SHUFFLE(XMM6,XMM6,0x00) 920 SSE_MULT_PS(XMM6,XMM1) 921 SSE_SUB_PS(XMM4,XMM6) 922 923 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 924 SSE_SHUFFLE(XMM7,XMM7,0x00) 925 SSE_MULT_PS(XMM7,XMM2) 926 SSE_SUB_PS(XMM4,XMM7) 927 928 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 929 SSE_SHUFFLE(XMM5,XMM5,0x00) 930 
SSE_MULT_PS(XMM5,XMM3) 931 SSE_SUB_PS(XMM4,XMM5) 932 933 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 934 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 935 SSE_INLINE_END_2; 936 pv += 16; 937 } 938 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 939 } 940 row = *bjtmp++; 941 /* row = (*bjtmp++)/4; */ 942 } 943 /* finished row so stick it into b->a */ 944 pv = ba + 16*bi[i]; 945 pj = bj + bi[i]; 946 nz = bi[i+1] - bi[i]; 947 948 /* Copy x block back into pv block */ 949 for (j=0; j<nz; j++) { 950 x = rtmp+16*pj[j]; 951 /* x = rtmp+4*pj[j]; */ 952 953 SSE_INLINE_BEGIN_2(x,pv) 954 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 955 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 956 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 957 958 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 959 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 960 961 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 962 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 963 964 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 965 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 966 967 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 968 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 969 970 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 971 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 972 973 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 974 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 975 976 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 977 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 978 SSE_INLINE_END_2; 979 pv += 16; 980 } 981 /* invert diagonal block */ 982 w = ba + 16*diag_offset[i]; 983 if (pivotinblocks) { 984 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr); 985 if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 986 } else { 987 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 988 } 989 /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 990 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 991 } 992 993 ierr = PetscFree(rtmp);CHKERRQ(ierr); 994 995 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 996 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 997 C->assembled = PETSC_TRUE; 998 999 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1000 /* Flop Count from inverting diagonal blocks */ 1001 SSE_SCOPE_END; 1002 PetscFunctionReturn(0); 1003 } 1004 1005 #undef __FUNCT__ 1006 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace" 1007 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 1008 { 1009 Mat A =C; 1010 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1011 PetscErrorCode ierr; 1012 int i,j,n = a->mbs; 1013 unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; 1014 unsigned short *aj = (unsigned short*)(a->j),*ajtmp; 1015 unsigned int row; 1016 int nz,*bi=b->i; 1017 int *diag_offset = b->diag,*ai=a->i; 1018 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1019 MatScalar *ba = b->a,*aa = a->a; 1020 int nonzero=0; 1021 /* int nonzero=0,colscale = 16; */ 1022 PetscBool pivotinblocks = b->pivotinblocks,zeropivotdetected; 1023 PetscReal shift = info->shiftamount; 1024 1025 PetscFunctionBegin; 1026 SSE_SCOPE_BEGIN; 1027 1028 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1029 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. 
SSE will not work."); 1030 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 1031 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1032 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1033 /* colscale = 4; */ 1034 /* } */ 1035 1036 for (i=0; i<n; i++) { 1037 nz = bi[i+1] - bi[i]; 1038 bjtmp = bj + bi[i]; 1039 /* zero out the 4x4 block accumulators */ 1040 /* zero out one register */ 1041 XOR_PS(XMM7,XMM7); 1042 for (j=0; j<nz; j++) { 1043 x = rtmp+16*((unsigned int)bjtmp[j]); 1044 /* x = rtmp+4*bjtmp[j]; */ 1045 SSE_INLINE_BEGIN_1(x) 1046 /* Copy zero register to memory locations */ 1047 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1048 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1049 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1050 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1051 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1052 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1053 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1054 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1055 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1056 SSE_INLINE_END_1; 1057 } 1058 /* load in initial (unfactored row) */ 1059 nz = ai[i+1] - ai[i]; 1060 ajtmp = aj + ai[i]; 1061 v = aa + 16*ai[i]; 1062 for (j=0; j<nz; j++) { 1063 x = rtmp+16*((unsigned int)ajtmp[j]); 1064 /* x = rtmp+colscale*ajtmp[j]; */ 1065 /* Copy v block into x block */ 1066 SSE_INLINE_BEGIN_2(v,x) 1067 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1068 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1069 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1070 1071 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1072 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1073 1074 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1075 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1076 1077 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1078 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1079 1080 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1081 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1082 1083 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1084 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1085 1086 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1087 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1088 1089 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1090 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1091 SSE_INLINE_END_2; 1092 1093 v += 16; 1094 } 1095 /* row = (*bjtmp++)/4; */ 1096 row = (unsigned int)(*bjtmp++); 1097 while (row < i) { 1098 pc = rtmp + 16*row; 1099 SSE_INLINE_BEGIN_1(pc) 1100 /* Load block from lower triangle */ 1101 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1102 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1103 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1104 1105 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1106 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1107 1108 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1109 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1110 1111 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1112 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1113 1114 /* Compare block to zero block */ 1115 1116 SSE_COPY_PS(XMM4,XMM7) 1117 SSE_CMPNEQ_PS(XMM4,XMM0) 1118 1119 SSE_COPY_PS(XMM5,XMM7) 1120 SSE_CMPNEQ_PS(XMM5,XMM1) 1121 1122 SSE_COPY_PS(XMM6,XMM7) 1123 SSE_CMPNEQ_PS(XMM6,XMM2) 1124 1125 SSE_CMPNEQ_PS(XMM7,XMM3) 1126 1127 /* Reduce the comparisons to one SSE register */ 1128 SSE_OR_PS(XMM6,XMM7) 1129 SSE_OR_PS(XMM5,XMM4) 1130 SSE_OR_PS(XMM5,XMM6) 1131 SSE_INLINE_END_1; 1132 1133 /* Reduce the one SSE register to an integer register for branching */ 1134 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1135 
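      /* MOVEMASK presumably wraps movmskps: it packs the sign bit of each of the four lanes of XMM5
         into an ordinary int.  After the CMPNEQ/OR reduction above, a lane is all-ones exactly when
         the corresponding entries of the 4x4 L block differ from zero, so a zero mask means the whole
         block is zero and the elimination step below can be skipped. */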
MOVEMASK(nonzero,XMM5); 1136 1137 /* If block is nonzero ... */ 1138 if (nonzero) { 1139 pv = ba + 16*diag_offset[row]; 1140 PREFETCH_L1(&pv[16]); 1141 pj = bj + diag_offset[row] + 1; 1142 1143 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1144 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1145 /* but the diagonal was inverted already */ 1146 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1147 1148 SSE_INLINE_BEGIN_2(pv,pc) 1149 /* Column 0, product is accumulated in XMM4 */ 1150 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1151 SSE_SHUFFLE(XMM4,XMM4,0x00) 1152 SSE_MULT_PS(XMM4,XMM0) 1153 1154 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1155 SSE_SHUFFLE(XMM5,XMM5,0x00) 1156 SSE_MULT_PS(XMM5,XMM1) 1157 SSE_ADD_PS(XMM4,XMM5) 1158 1159 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1160 SSE_SHUFFLE(XMM6,XMM6,0x00) 1161 SSE_MULT_PS(XMM6,XMM2) 1162 SSE_ADD_PS(XMM4,XMM6) 1163 1164 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1165 SSE_SHUFFLE(XMM7,XMM7,0x00) 1166 SSE_MULT_PS(XMM7,XMM3) 1167 SSE_ADD_PS(XMM4,XMM7) 1168 1169 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1170 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1171 1172 /* Column 1, product is accumulated in XMM5 */ 1173 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1174 SSE_SHUFFLE(XMM5,XMM5,0x00) 1175 SSE_MULT_PS(XMM5,XMM0) 1176 1177 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1178 SSE_SHUFFLE(XMM6,XMM6,0x00) 1179 SSE_MULT_PS(XMM6,XMM1) 1180 SSE_ADD_PS(XMM5,XMM6) 1181 1182 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1183 SSE_SHUFFLE(XMM7,XMM7,0x00) 1184 SSE_MULT_PS(XMM7,XMM2) 1185 SSE_ADD_PS(XMM5,XMM7) 1186 1187 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1188 SSE_SHUFFLE(XMM6,XMM6,0x00) 1189 SSE_MULT_PS(XMM6,XMM3) 1190 SSE_ADD_PS(XMM5,XMM6) 1191 1192 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1193 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1194 1195 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1196 1197 /* Column 2, product is accumulated in XMM6 */ 1198 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1199 SSE_SHUFFLE(XMM6,XMM6,0x00) 1200 SSE_MULT_PS(XMM6,XMM0) 1201 1202 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1203 SSE_SHUFFLE(XMM7,XMM7,0x00) 1204 SSE_MULT_PS(XMM7,XMM1) 1205 SSE_ADD_PS(XMM6,XMM7) 1206 1207 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1208 SSE_SHUFFLE(XMM7,XMM7,0x00) 1209 SSE_MULT_PS(XMM7,XMM2) 1210 SSE_ADD_PS(XMM6,XMM7) 1211 1212 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1213 SSE_SHUFFLE(XMM7,XMM7,0x00) 1214 SSE_MULT_PS(XMM7,XMM3) 1215 SSE_ADD_PS(XMM6,XMM7) 1216 1217 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1218 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1219 1220 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1221 /* Column 3, product is accumulated in XMM0 */ 1222 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1223 SSE_SHUFFLE(XMM7,XMM7,0x00) 1224 SSE_MULT_PS(XMM0,XMM7) 1225 1226 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1227 SSE_SHUFFLE(XMM7,XMM7,0x00) 1228 SSE_MULT_PS(XMM1,XMM7) 1229 SSE_ADD_PS(XMM0,XMM1) 1230 1231 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1232 SSE_SHUFFLE(XMM1,XMM1,0x00) 1233 SSE_MULT_PS(XMM1,XMM2) 1234 SSE_ADD_PS(XMM0,XMM1) 1235 1236 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1237 SSE_SHUFFLE(XMM7,XMM7,0x00) 1238 SSE_MULT_PS(XMM3,XMM7) 1239 SSE_ADD_PS(XMM0,XMM3) 1240 1241 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1242 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1243 1244 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1245 /* This is code to be maintained and read by humans afterall. 
*/ 1246 /* Copy Multiplier Col 3 into XMM3 */ 1247 SSE_COPY_PS(XMM3,XMM0) 1248 /* Copy Multiplier Col 2 into XMM2 */ 1249 SSE_COPY_PS(XMM2,XMM6) 1250 /* Copy Multiplier Col 1 into XMM1 */ 1251 SSE_COPY_PS(XMM1,XMM5) 1252 /* Copy Multiplier Col 0 into XMM0 */ 1253 SSE_COPY_PS(XMM0,XMM4) 1254 SSE_INLINE_END_2; 1255 1256 /* Update the row: */ 1257 nz = bi[row+1] - diag_offset[row] - 1; 1258 pv += 16; 1259 for (j=0; j<nz; j++) { 1260 PREFETCH_L1(&pv[16]); 1261 x = rtmp + 16*((unsigned int)pj[j]); 1262 /* x = rtmp + 4*pj[j]; */ 1263 1264 /* X:=X-M*PV, One column at a time */ 1265 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1266 SSE_INLINE_BEGIN_2(x,pv) 1267 /* Load First Column of X*/ 1268 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1269 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1270 1271 /* Matrix-Vector Product: */ 1272 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1273 SSE_SHUFFLE(XMM5,XMM5,0x00) 1274 SSE_MULT_PS(XMM5,XMM0) 1275 SSE_SUB_PS(XMM4,XMM5) 1276 1277 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1278 SSE_SHUFFLE(XMM6,XMM6,0x00) 1279 SSE_MULT_PS(XMM6,XMM1) 1280 SSE_SUB_PS(XMM4,XMM6) 1281 1282 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1283 SSE_SHUFFLE(XMM7,XMM7,0x00) 1284 SSE_MULT_PS(XMM7,XMM2) 1285 SSE_SUB_PS(XMM4,XMM7) 1286 1287 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1288 SSE_SHUFFLE(XMM5,XMM5,0x00) 1289 SSE_MULT_PS(XMM5,XMM3) 1290 SSE_SUB_PS(XMM4,XMM5) 1291 1292 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1293 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1294 1295 /* Second Column */ 1296 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1297 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1298 1299 /* Matrix-Vector Product: */ 1300 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1301 SSE_SHUFFLE(XMM6,XMM6,0x00) 1302 SSE_MULT_PS(XMM6,XMM0) 1303 SSE_SUB_PS(XMM5,XMM6) 1304 1305 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1306 SSE_SHUFFLE(XMM7,XMM7,0x00) 1307 SSE_MULT_PS(XMM7,XMM1) 1308 SSE_SUB_PS(XMM5,XMM7) 1309 1310 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1311 SSE_SHUFFLE(XMM4,XMM4,0x00) 1312 SSE_MULT_PS(XMM4,XMM2) 1313 SSE_SUB_PS(XMM5,XMM4) 1314 1315 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1316 SSE_SHUFFLE(XMM6,XMM6,0x00) 1317 SSE_MULT_PS(XMM6,XMM3) 1318 SSE_SUB_PS(XMM5,XMM6) 1319 1320 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1321 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1322 1323 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1324 1325 /* Third Column */ 1326 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1327 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1328 1329 /* Matrix-Vector Product: */ 1330 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1331 SSE_SHUFFLE(XMM7,XMM7,0x00) 1332 SSE_MULT_PS(XMM7,XMM0) 1333 SSE_SUB_PS(XMM6,XMM7) 1334 1335 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1336 SSE_SHUFFLE(XMM4,XMM4,0x00) 1337 SSE_MULT_PS(XMM4,XMM1) 1338 SSE_SUB_PS(XMM6,XMM4) 1339 1340 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1341 SSE_SHUFFLE(XMM5,XMM5,0x00) 1342 SSE_MULT_PS(XMM5,XMM2) 1343 SSE_SUB_PS(XMM6,XMM5) 1344 1345 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1346 SSE_SHUFFLE(XMM7,XMM7,0x00) 1347 SSE_MULT_PS(XMM7,XMM3) 1348 SSE_SUB_PS(XMM6,XMM7) 1349 1350 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1351 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1352 1353 /* Fourth Column */ 1354 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1355 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1356 1357 /* Matrix-Vector Product: */ 1358 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1359 SSE_SHUFFLE(XMM5,XMM5,0x00) 1360 SSE_MULT_PS(XMM5,XMM0) 1361 SSE_SUB_PS(XMM4,XMM5) 1362 1363 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1364 SSE_SHUFFLE(XMM6,XMM6,0x00) 1365 SSE_MULT_PS(XMM6,XMM1) 1366 SSE_SUB_PS(XMM4,XMM6) 1367 1368 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1369 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1370 SSE_MULT_PS(XMM7,XMM2) 1371 SSE_SUB_PS(XMM4,XMM7) 1372 1373 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1374 SSE_SHUFFLE(XMM5,XMM5,0x00) 1375 SSE_MULT_PS(XMM5,XMM3) 1376 SSE_SUB_PS(XMM4,XMM5) 1377 1378 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1379 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1380 SSE_INLINE_END_2; 1381 pv += 16; 1382 } 1383 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1384 } 1385 row = (unsigned int)(*bjtmp++); 1386 /* row = (*bjtmp++)/4; */ 1387 /* bjtmp++; */ 1388 } 1389 /* finished row so stick it into b->a */ 1390 pv = ba + 16*bi[i]; 1391 pj = bj + bi[i]; 1392 nz = bi[i+1] - bi[i]; 1393 1394 /* Copy x block back into pv block */ 1395 for (j=0; j<nz; j++) { 1396 x = rtmp+16*((unsigned int)pj[j]); 1397 /* x = rtmp+4*pj[j]; */ 1398 1399 SSE_INLINE_BEGIN_2(x,pv) 1400 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1401 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1402 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1403 1404 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1405 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1406 1407 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1408 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1409 1410 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1411 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1412 1413 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1414 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1415 1416 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1417 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1418 1419 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1420 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1421 1422 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1423 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1424 SSE_INLINE_END_2; 1425 pv += 16; 1426 } 1427 /* invert diagonal block */ 1428 w = ba + 16*diag_offset[i]; 1429 if (pivotinblocks) { 1430 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr); 1431 if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1432 } else { 1433 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1434 } 1435 /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1436 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1437 } 1438 1439 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1440 1441 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1442 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1443 C->assembled = PETSC_TRUE; 1444 1445 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1446 /* Flop Count from inverting diagonal blocks */ 1447 SSE_SCOPE_END; 1448 PetscFunctionReturn(0); 1449 } 1450 1451 #undef __FUNCT__ 1452 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" 1453 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1454 { 1455 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1456 PetscErrorCode ierr; 1457 int i,j,n = a->mbs; 1458 unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; 1459 unsigned int row; 1460 int *ajtmpold,nz,*bi=b->i; 1461 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1462 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1463 MatScalar *ba = b->a,*aa = a->a; 1464 int nonzero=0; 1465 /* int nonzero=0,colscale = 16; */ 1466 PetscBool pivotinblocks = b->pivotinblocks,zeropivotdetected; 1467 PetscReal shift = info->shiftamount; 1468 1469 PetscFunctionBegin; 1470 SSE_SCOPE_BEGIN; 1471 1472 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. 
SSE will not work."); 1473 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1474 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 1475 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1476 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1477 /* colscale = 4; */ 1478 /* } */ 1479 if ((unsigned long)bj==(unsigned long)aj) { 1480 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1481 } 1482 1483 for (i=0; i<n; i++) { 1484 nz = bi[i+1] - bi[i]; 1485 bjtmp = bj + bi[i]; 1486 /* zero out the 4x4 block accumulators */ 1487 /* zero out one register */ 1488 XOR_PS(XMM7,XMM7); 1489 for (j=0; j<nz; j++) { 1490 x = rtmp+16*((unsigned int)bjtmp[j]); 1491 /* x = rtmp+4*bjtmp[j]; */ 1492 SSE_INLINE_BEGIN_1(x) 1493 /* Copy zero register to memory locations */ 1494 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1495 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1496 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1497 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1498 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1499 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1500 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1501 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1502 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1503 SSE_INLINE_END_1; 1504 } 1505 /* load in initial (unfactored row) */ 1506 nz = ai[i+1] - ai[i]; 1507 ajtmpold = aj + ai[i]; 1508 v = aa + 16*ai[i]; 1509 for (j=0; j<nz; j++) { 1510 x = rtmp+16*ajtmpold[j]; 1511 /* x = rtmp+colscale*ajtmpold[j]; */ 1512 /* Copy v block into x block */ 1513 SSE_INLINE_BEGIN_2(v,x) 1514 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1515 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1516 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1517 1518 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1519 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1520 1521 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1522 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1523 1524 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1525 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1526 1527 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1528 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1529 1530 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1531 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1532 1533 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1534 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1535 1536 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1537 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1538 SSE_INLINE_END_2; 1539 1540 v += 16; 1541 } 1542 /* row = (*bjtmp++)/4; */ 1543 row = (unsigned int)(*bjtmp++); 1544 while (row < i) { 1545 pc = rtmp + 16*row; 1546 SSE_INLINE_BEGIN_1(pc) 1547 /* Load block from lower triangle */ 1548 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1549 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1550 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1551 1552 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1553 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1554 1555 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1556 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1557 1558 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1559 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1560 1561 /* Compare block to zero block */ 1562 1563 SSE_COPY_PS(XMM4,XMM7) 1564 SSE_CMPNEQ_PS(XMM4,XMM0) 1565 1566 SSE_COPY_PS(XMM5,XMM7) 1567 SSE_CMPNEQ_PS(XMM5,XMM1) 1568 1569 SSE_COPY_PS(XMM6,XMM7) 1570 SSE_CMPNEQ_PS(XMM6,XMM2) 1571 1572 SSE_CMPNEQ_PS(XMM7,XMM3) 1573 1574 /* Reduce the comparisons to one SSE register */ 1575 SSE_OR_PS(XMM6,XMM7) 1576 
SSE_OR_PS(XMM5,XMM4) 1577 SSE_OR_PS(XMM5,XMM6) 1578 SSE_INLINE_END_1; 1579 1580 /* Reduce the one SSE register to an integer register for branching */ 1581 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1582 MOVEMASK(nonzero,XMM5); 1583 1584 /* If block is nonzero ... */ 1585 if (nonzero) { 1586 pv = ba + 16*diag_offset[row]; 1587 PREFETCH_L1(&pv[16]); 1588 pj = bj + diag_offset[row] + 1; 1589 1590 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1591 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1592 /* but the diagonal was inverted already */ 1593 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1594 1595 SSE_INLINE_BEGIN_2(pv,pc) 1596 /* Column 0, product is accumulated in XMM4 */ 1597 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1598 SSE_SHUFFLE(XMM4,XMM4,0x00) 1599 SSE_MULT_PS(XMM4,XMM0) 1600 1601 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1602 SSE_SHUFFLE(XMM5,XMM5,0x00) 1603 SSE_MULT_PS(XMM5,XMM1) 1604 SSE_ADD_PS(XMM4,XMM5) 1605 1606 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1607 SSE_SHUFFLE(XMM6,XMM6,0x00) 1608 SSE_MULT_PS(XMM6,XMM2) 1609 SSE_ADD_PS(XMM4,XMM6) 1610 1611 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1612 SSE_SHUFFLE(XMM7,XMM7,0x00) 1613 SSE_MULT_PS(XMM7,XMM3) 1614 SSE_ADD_PS(XMM4,XMM7) 1615 1616 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1617 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1618 1619 /* Column 1, product is accumulated in XMM5 */ 1620 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1621 SSE_SHUFFLE(XMM5,XMM5,0x00) 1622 SSE_MULT_PS(XMM5,XMM0) 1623 1624 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1625 SSE_SHUFFLE(XMM6,XMM6,0x00) 1626 SSE_MULT_PS(XMM6,XMM1) 1627 SSE_ADD_PS(XMM5,XMM6) 1628 1629 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1630 SSE_SHUFFLE(XMM7,XMM7,0x00) 1631 SSE_MULT_PS(XMM7,XMM2) 1632 SSE_ADD_PS(XMM5,XMM7) 1633 1634 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1635 SSE_SHUFFLE(XMM6,XMM6,0x00) 1636 SSE_MULT_PS(XMM6,XMM3) 1637 SSE_ADD_PS(XMM5,XMM6) 1638 1639 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1640 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1641 1642 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1643 1644 /* Column 2, product is accumulated in XMM6 */ 1645 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1646 SSE_SHUFFLE(XMM6,XMM6,0x00) 1647 SSE_MULT_PS(XMM6,XMM0) 1648 1649 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1650 SSE_SHUFFLE(XMM7,XMM7,0x00) 1651 SSE_MULT_PS(XMM7,XMM1) 1652 SSE_ADD_PS(XMM6,XMM7) 1653 1654 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1655 SSE_SHUFFLE(XMM7,XMM7,0x00) 1656 SSE_MULT_PS(XMM7,XMM2) 1657 SSE_ADD_PS(XMM6,XMM7) 1658 1659 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1660 SSE_SHUFFLE(XMM7,XMM7,0x00) 1661 SSE_MULT_PS(XMM7,XMM3) 1662 SSE_ADD_PS(XMM6,XMM7) 1663 1664 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1665 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1666 1667 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1668 /* Column 3, product is accumulated in XMM0 */ 1669 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1670 SSE_SHUFFLE(XMM7,XMM7,0x00) 1671 SSE_MULT_PS(XMM0,XMM7) 1672 1673 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1674 SSE_SHUFFLE(XMM7,XMM7,0x00) 1675 SSE_MULT_PS(XMM1,XMM7) 1676 SSE_ADD_PS(XMM0,XMM1) 1677 1678 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1679 SSE_SHUFFLE(XMM1,XMM1,0x00) 1680 SSE_MULT_PS(XMM1,XMM2) 1681 SSE_ADD_PS(XMM0,XMM1) 1682 1683 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1684 SSE_SHUFFLE(XMM7,XMM7,0x00) 1685 SSE_MULT_PS(XMM3,XMM7) 1686 SSE_ADD_PS(XMM0,XMM3) 1687 1688 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1689 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1690 1691 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1692 /* This is code to 
be maintained and read by humans afterall. */ 1693 /* Copy Multiplier Col 3 into XMM3 */ 1694 SSE_COPY_PS(XMM3,XMM0) 1695 /* Copy Multiplier Col 2 into XMM2 */ 1696 SSE_COPY_PS(XMM2,XMM6) 1697 /* Copy Multiplier Col 1 into XMM1 */ 1698 SSE_COPY_PS(XMM1,XMM5) 1699 /* Copy Multiplier Col 0 into XMM0 */ 1700 SSE_COPY_PS(XMM0,XMM4) 1701 SSE_INLINE_END_2; 1702 1703 /* Update the row: */ 1704 nz = bi[row+1] - diag_offset[row] - 1; 1705 pv += 16; 1706 for (j=0; j<nz; j++) { 1707 PREFETCH_L1(&pv[16]); 1708 x = rtmp + 16*((unsigned int)pj[j]); 1709 /* x = rtmp + 4*pj[j]; */ 1710 1711 /* X:=X-M*PV, One column at a time */ 1712 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1713 SSE_INLINE_BEGIN_2(x,pv) 1714 /* Load First Column of X*/ 1715 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1716 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1717 1718 /* Matrix-Vector Product: */ 1719 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1720 SSE_SHUFFLE(XMM5,XMM5,0x00) 1721 SSE_MULT_PS(XMM5,XMM0) 1722 SSE_SUB_PS(XMM4,XMM5) 1723 1724 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1725 SSE_SHUFFLE(XMM6,XMM6,0x00) 1726 SSE_MULT_PS(XMM6,XMM1) 1727 SSE_SUB_PS(XMM4,XMM6) 1728 1729 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1730 SSE_SHUFFLE(XMM7,XMM7,0x00) 1731 SSE_MULT_PS(XMM7,XMM2) 1732 SSE_SUB_PS(XMM4,XMM7) 1733 1734 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1735 SSE_SHUFFLE(XMM5,XMM5,0x00) 1736 SSE_MULT_PS(XMM5,XMM3) 1737 SSE_SUB_PS(XMM4,XMM5) 1738 1739 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1740 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1741 1742 /* Second Column */ 1743 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1744 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1745 1746 /* Matrix-Vector Product: */ 1747 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1748 SSE_SHUFFLE(XMM6,XMM6,0x00) 1749 SSE_MULT_PS(XMM6,XMM0) 1750 SSE_SUB_PS(XMM5,XMM6) 1751 1752 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1753 SSE_SHUFFLE(XMM7,XMM7,0x00) 1754 SSE_MULT_PS(XMM7,XMM1) 1755 SSE_SUB_PS(XMM5,XMM7) 1756 1757 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1758 SSE_SHUFFLE(XMM4,XMM4,0x00) 1759 SSE_MULT_PS(XMM4,XMM2) 1760 SSE_SUB_PS(XMM5,XMM4) 1761 1762 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1763 SSE_SHUFFLE(XMM6,XMM6,0x00) 1764 SSE_MULT_PS(XMM6,XMM3) 1765 SSE_SUB_PS(XMM5,XMM6) 1766 1767 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1768 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1769 1770 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1771 1772 /* Third Column */ 1773 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1774 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1775 1776 /* Matrix-Vector Product: */ 1777 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1778 SSE_SHUFFLE(XMM7,XMM7,0x00) 1779 SSE_MULT_PS(XMM7,XMM0) 1780 SSE_SUB_PS(XMM6,XMM7) 1781 1782 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1783 SSE_SHUFFLE(XMM4,XMM4,0x00) 1784 SSE_MULT_PS(XMM4,XMM1) 1785 SSE_SUB_PS(XMM6,XMM4) 1786 1787 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1788 SSE_SHUFFLE(XMM5,XMM5,0x00) 1789 SSE_MULT_PS(XMM5,XMM2) 1790 SSE_SUB_PS(XMM6,XMM5) 1791 1792 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1793 SSE_SHUFFLE(XMM7,XMM7,0x00) 1794 SSE_MULT_PS(XMM7,XMM3) 1795 SSE_SUB_PS(XMM6,XMM7) 1796 1797 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1798 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1799 1800 /* Fourth Column */ 1801 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1802 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1803 1804 /* Matrix-Vector Product: */ 1805 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1806 SSE_SHUFFLE(XMM5,XMM5,0x00) 1807 SSE_MULT_PS(XMM5,XMM0) 1808 SSE_SUB_PS(XMM4,XMM5) 1809 1810 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1811 SSE_SHUFFLE(XMM6,XMM6,0x00) 1812 SSE_MULT_PS(XMM6,XMM1) 1813 SSE_SUB_PS(XMM4,XMM6) 1814 1815 
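          /* Each LOAD_SS + SHUFFLE(0x00) pair broadcasts one scalar of the U block (pv) across a
             register; the following MULT_PS/SUB_PS subtracts scalar*column-of-M from the column of x
             currently held in XMM4.  Four such groups complete one column of the X -= M*PV update. */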
SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1816 SSE_SHUFFLE(XMM7,XMM7,0x00) 1817 SSE_MULT_PS(XMM7,XMM2) 1818 SSE_SUB_PS(XMM4,XMM7) 1819 1820 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1821 SSE_SHUFFLE(XMM5,XMM5,0x00) 1822 SSE_MULT_PS(XMM5,XMM3) 1823 SSE_SUB_PS(XMM4,XMM5) 1824 1825 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1826 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1827 SSE_INLINE_END_2; 1828 pv += 16; 1829 } 1830 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1831 } 1832 row = (unsigned int)(*bjtmp++); 1833 /* row = (*bjtmp++)/4; */ 1834 /* bjtmp++; */ 1835 } 1836 /* finished row so stick it into b->a */ 1837 pv = ba + 16*bi[i]; 1838 pj = bj + bi[i]; 1839 nz = bi[i+1] - bi[i]; 1840 1841 /* Copy x block back into pv block */ 1842 for (j=0; j<nz; j++) { 1843 x = rtmp+16*((unsigned int)pj[j]); 1844 /* x = rtmp+4*pj[j]; */ 1845 1846 SSE_INLINE_BEGIN_2(x,pv) 1847 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1848 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1849 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1850 1851 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1852 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1853 1854 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1855 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1856 1857 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1858 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1859 1860 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1861 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1862 1863 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1864 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1865 1866 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1867 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1868 1869 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1870 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1871 SSE_INLINE_END_2; 1872 pv += 16; 1873 } 1874 /* invert diagonal block */ 1875 w = ba + 16*diag_offset[i]; 1876 if (pivotinblocks) { 1877 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr); 1878 if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1879 } else { 1880 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1881 } 1882 /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1883 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1884 } 1885 1886 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1887 1888 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1889 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1890 C->assembled = PETSC_TRUE; 1891 1892 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1893 /* Flop Count from inverting diagonal blocks */ 1894 SSE_SCOPE_END; 1895 PetscFunctionReturn(0); 1896 } 1897 1898 #endif 1899
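
/*
   Usage sketch (illustrative only; not compiled as part of this file).  It shows how the numeric
   factorization kernels above are normally reached through the public PETSc interface rather than
   being called directly.  The matrix A, the vectors b and x, and the index sets rperm/cperm are
   example names assumed to be created and assembled elsewhere; the assumption is that a MATSEQBAIJ
   matrix with block size 4, factored with the built-in "petsc" LU solver and a natural ordering,
   ends up in one of the MatLUFactorNumeric_SeqBAIJ_4* routines in this file.  Error checking is
   omitted for brevity.

     Mat           A,F;
     Vec           b,x;
     IS            rperm,cperm;
     MatFactorInfo info;

     MatFactorInfoInitialize(&info);
     MatGetOrdering(A,MATORDERINGNATURAL,&rperm,&cperm);
     MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
     MatLUFactorSymbolic(F,A,rperm,cperm,&info);
     MatLUFactorNumeric(F,A,&info);
     MatSolve(F,b,x);

   The symbolic phase selects the numeric routine (presumably by installing the factor's
   lufactornumeric operation), so which variant runs (inplace, natural ordering, or the SSE
   versions guarded above) depends on the ordering and on how PETSc was configured.
*/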