/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/* ------------------------------------------------------------*/
/*
     Version for when blocks are 4 by 4
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt *ajtmpold,*ajtmp,nz,row;
  PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar m13,m14,m15,m16;
  MatScalar *ba = b->a,*aa = a->a;
  PetscBool pivotinblocks = b->pivotinblocks;
  PetscReal shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmp[j];
      x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ic[ajtmpold[j]];
      x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
      x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
      x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
      p5  = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
      p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv = ba + 16*diag_offset[row];
        pj = bj + diag_offset[row] + 1;
        x1  = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
        x5  = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
        x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
          x5  = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
          x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
      pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
      pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      PetscBool allowzeropivot,zeropivotdetected;
      allowzeropivot = PetscNot(A->erroriffailure);
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/* MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       PetscKernel_A_gets_A_times_B()
       PetscKernel_A_gets_A_minus_B_times_C()
       PetscKernel_A_gets_inverse_A()
*/
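/*
   Storage convention assumed by the non-inplace kernels below, as suggested by the
   index arithmetic (a reading of the code, not a new contract): the factor B keeps the
   L blocks of block row i in bj[bi[i]..bi[i+1]-1]; the U blocks are addressed through
   b->diag, where bdiag[i] points at the diagonal block (stored already inverted) and
   bdiag[i+1]+1..bdiag[i]-1 hold the strictly upper blocks, so the U row length is
   bdiag[i] - bdiag[i+1] - 1.  Every block is a dense bs2 = 16 chunk of MatScalar.
*/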
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat C = B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt flg;
  PetscReal shift;
  PetscBool allowzeropivot,zeropivotdetected;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i]-bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2*bdiag[i];
    pj = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    allowzeropivot = PetscNot(A->erroriffailure);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
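/*
   Flop accounting used above and in the kernels below (a derivation, for reference):
   forming one 4x4 multiplier pc = pc*inv(diag) costs 16 entries * (4 mults + 3 adds)
   = 2*4^3 - 4^2 = 112 flops, and each trailing-block update v -= pc*pv costs
   2*4^3 = 128 flops, hence PetscLogFlops(128*nz+112) per eliminated block row.
   Inverting one 4x4 diagonal block is counted as (4/3)*4^3 flops, which is where the
   1.333333333333*4*4*4 terms come from.
*/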
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  /*
     Default Version for when blocks are 4 by 4 Using natural ordering
  */
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt *ajtmpold,*ajtmp,nz,row;
  PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar m13,m14,m15,m16;
  MatScalar *ba = b->a,*aa = a->a;
  PetscBool pivotinblocks = b->pivotinblocks;
  PetscReal shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmp[j];
      x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
      x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
      x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
      p5  = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
      p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv = ba + 16*diag_offset[row];
        pj = bj + diag_offset[row] + 1;
        x1  = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
        x5  = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
        x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
          x5  = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
          x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
      pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
      pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      PetscBool allowzeropivot,zeropivotdetected;
      allowzeropivot = PetscNot(A->erroriffailure);
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
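/*
   The _NaturalOrdering variants assume the identity row/column permutation: the
   r[]/ic[] indirection obtained from ISGetIndices() in the permuted versions above
   drops out and block row i of A is read directly from aj[ai[i]..ai[i+1]-1].  The
   elimination itself is unchanged.
*/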
/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat C = B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt flg;
  PetscReal shift;
  PetscBool allowzeropivot,zeropivotdetected;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }
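  /*
     Zero-pivot policy used below (a summary of the calls, not new behaviour): the
     shift from info->shiftamount (MAT_SHIFT_INBLOCKS) is handed to
     PetscKernel_A_gets_inverse_A_4() together with allowzeropivot =
     PetscNot(A->erroriffailure); when a zero pivot is detected and allowed, the
     routine only records C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT and keeps
     factoring instead of raising an error.
  */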
  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2*bdiag[i];
    pj = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    allowzeropivot = PetscNot(A->erroriffailure);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat C = B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int i,j,n = a->mbs;
  int *bj = b->j,*bjtmp,*pj;
  int row;
  int *ajtmpold,nz,*bi=b->i;
  int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar *ba = b->a,*aa = a->a;
  int nonzero=0;
  PetscBool pivotinblocks =
b->pivotinblocks; 584 PetscReal shift = info->shiftamount; 585 586 PetscFunctionBegin; 587 SSE_SCOPE_BEGIN; 588 589 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 590 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 591 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 592 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 593 /* if ((unsigned long)bj==(unsigned long)aj) { */ 594 /* colscale = 4; */ 595 /* } */ 596 for (i=0; i<n; i++) { 597 nz = bi[i+1] - bi[i]; 598 bjtmp = bj + bi[i]; 599 /* zero out the 4x4 block accumulators */ 600 /* zero out one register */ 601 XOR_PS(XMM7,XMM7); 602 for (j=0; j<nz; j++) { 603 x = rtmp+16*bjtmp[j]; 604 /* x = rtmp+4*bjtmp[j]; */ 605 SSE_INLINE_BEGIN_1(x) 606 /* Copy zero register to memory locations */ 607 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 608 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 609 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 610 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 611 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 612 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 613 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 614 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 615 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 616 SSE_INLINE_END_1; 617 } 618 /* load in initial (unfactored row) */ 619 nz = ai[i+1] - ai[i]; 620 ajtmpold = aj + ai[i]; 621 v = aa + 16*ai[i]; 622 for (j=0; j<nz; j++) { 623 x = rtmp+16*ajtmpold[j]; 624 /* x = rtmp+colscale*ajtmpold[j]; */ 625 /* Copy v block into x block */ 626 SSE_INLINE_BEGIN_2(v,x) 627 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 628 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 629 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 630 631 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 632 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 633 634 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 635 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 636 637 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 638 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 639 640 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 641 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 642 643 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 644 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 645 646 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 647 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 648 649 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 650 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 651 SSE_INLINE_END_2; 652 653 v += 16; 654 } 655 /* row = (*bjtmp++)/4; */ 656 row = *bjtmp++; 657 while (row < i) { 658 pc = rtmp + 16*row; 659 SSE_INLINE_BEGIN_1(pc) 660 /* Load block from lower triangle */ 661 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 662 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 663 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 664 665 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 666 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 667 668 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 669 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 670 671 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 672 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 673 674 /* Compare block to zero block */ 675 676 SSE_COPY_PS(XMM4,XMM7) 677 SSE_CMPNEQ_PS(XMM4,XMM0) 678 679 SSE_COPY_PS(XMM5,XMM7) 680 SSE_CMPNEQ_PS(XMM5,XMM1) 681 682 SSE_COPY_PS(XMM6,XMM7) 683 SSE_CMPNEQ_PS(XMM6,XMM2) 684 685 SSE_CMPNEQ_PS(XMM7,XMM3) 686 687 /* Reduce the comparisons to one SSE register */ 688 SSE_OR_PS(XMM6,XMM7) 689 SSE_OR_PS(XMM5,XMM4) 690 
SSE_OR_PS(XMM5,XMM6) 691 SSE_INLINE_END_1; 692 693 /* Reduce the one SSE register to an integer register for branching */ 694 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 695 MOVEMASK(nonzero,XMM5); 696 697 /* If block is nonzero ... */ 698 if (nonzero) { 699 pv = ba + 16*diag_offset[row]; 700 PREFETCH_L1(&pv[16]); 701 pj = bj + diag_offset[row] + 1; 702 703 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 704 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 705 /* but the diagonal was inverted already */ 706 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 707 708 SSE_INLINE_BEGIN_2(pv,pc) 709 /* Column 0, product is accumulated in XMM4 */ 710 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 711 SSE_SHUFFLE(XMM4,XMM4,0x00) 712 SSE_MULT_PS(XMM4,XMM0) 713 714 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 715 SSE_SHUFFLE(XMM5,XMM5,0x00) 716 SSE_MULT_PS(XMM5,XMM1) 717 SSE_ADD_PS(XMM4,XMM5) 718 719 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 720 SSE_SHUFFLE(XMM6,XMM6,0x00) 721 SSE_MULT_PS(XMM6,XMM2) 722 SSE_ADD_PS(XMM4,XMM6) 723 724 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 725 SSE_SHUFFLE(XMM7,XMM7,0x00) 726 SSE_MULT_PS(XMM7,XMM3) 727 SSE_ADD_PS(XMM4,XMM7) 728 729 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 730 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 731 732 /* Column 1, product is accumulated in XMM5 */ 733 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 734 SSE_SHUFFLE(XMM5,XMM5,0x00) 735 SSE_MULT_PS(XMM5,XMM0) 736 737 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 738 SSE_SHUFFLE(XMM6,XMM6,0x00) 739 SSE_MULT_PS(XMM6,XMM1) 740 SSE_ADD_PS(XMM5,XMM6) 741 742 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 743 SSE_SHUFFLE(XMM7,XMM7,0x00) 744 SSE_MULT_PS(XMM7,XMM2) 745 SSE_ADD_PS(XMM5,XMM7) 746 747 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 748 SSE_SHUFFLE(XMM6,XMM6,0x00) 749 SSE_MULT_PS(XMM6,XMM3) 750 SSE_ADD_PS(XMM5,XMM6) 751 752 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 753 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 754 755 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 756 757 /* Column 2, product is accumulated in XMM6 */ 758 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 759 SSE_SHUFFLE(XMM6,XMM6,0x00) 760 SSE_MULT_PS(XMM6,XMM0) 761 762 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 763 SSE_SHUFFLE(XMM7,XMM7,0x00) 764 SSE_MULT_PS(XMM7,XMM1) 765 SSE_ADD_PS(XMM6,XMM7) 766 767 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 768 SSE_SHUFFLE(XMM7,XMM7,0x00) 769 SSE_MULT_PS(XMM7,XMM2) 770 SSE_ADD_PS(XMM6,XMM7) 771 772 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 773 SSE_SHUFFLE(XMM7,XMM7,0x00) 774 SSE_MULT_PS(XMM7,XMM3) 775 SSE_ADD_PS(XMM6,XMM7) 776 777 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 778 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 779 780 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 781 /* Column 3, product is accumulated in XMM0 */ 782 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 783 SSE_SHUFFLE(XMM7,XMM7,0x00) 784 SSE_MULT_PS(XMM0,XMM7) 785 786 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 787 SSE_SHUFFLE(XMM7,XMM7,0x00) 788 SSE_MULT_PS(XMM1,XMM7) 789 SSE_ADD_PS(XMM0,XMM1) 790 791 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 792 SSE_SHUFFLE(XMM1,XMM1,0x00) 793 SSE_MULT_PS(XMM1,XMM2) 794 SSE_ADD_PS(XMM0,XMM1) 795 796 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 797 SSE_SHUFFLE(XMM7,XMM7,0x00) 798 SSE_MULT_PS(XMM3,XMM7) 799 SSE_ADD_PS(XMM0,XMM3) 800 801 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 802 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 803 804 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 805 /* This is code to be maintained and read by humans afterall. 
*/ 806 /* Copy Multiplier Col 3 into XMM3 */ 807 SSE_COPY_PS(XMM3,XMM0) 808 /* Copy Multiplier Col 2 into XMM2 */ 809 SSE_COPY_PS(XMM2,XMM6) 810 /* Copy Multiplier Col 1 into XMM1 */ 811 SSE_COPY_PS(XMM1,XMM5) 812 /* Copy Multiplier Col 0 into XMM0 */ 813 SSE_COPY_PS(XMM0,XMM4) 814 SSE_INLINE_END_2; 815 816 /* Update the row: */ 817 nz = bi[row+1] - diag_offset[row] - 1; 818 pv += 16; 819 for (j=0; j<nz; j++) { 820 PREFETCH_L1(&pv[16]); 821 x = rtmp + 16*pj[j]; 822 /* x = rtmp + 4*pj[j]; */ 823 824 /* X:=X-M*PV, One column at a time */ 825 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 826 SSE_INLINE_BEGIN_2(x,pv) 827 /* Load First Column of X*/ 828 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 829 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 830 831 /* Matrix-Vector Product: */ 832 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 833 SSE_SHUFFLE(XMM5,XMM5,0x00) 834 SSE_MULT_PS(XMM5,XMM0) 835 SSE_SUB_PS(XMM4,XMM5) 836 837 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 838 SSE_SHUFFLE(XMM6,XMM6,0x00) 839 SSE_MULT_PS(XMM6,XMM1) 840 SSE_SUB_PS(XMM4,XMM6) 841 842 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 843 SSE_SHUFFLE(XMM7,XMM7,0x00) 844 SSE_MULT_PS(XMM7,XMM2) 845 SSE_SUB_PS(XMM4,XMM7) 846 847 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 848 SSE_SHUFFLE(XMM5,XMM5,0x00) 849 SSE_MULT_PS(XMM5,XMM3) 850 SSE_SUB_PS(XMM4,XMM5) 851 852 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 853 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 854 855 /* Second Column */ 856 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 857 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 858 859 /* Matrix-Vector Product: */ 860 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 861 SSE_SHUFFLE(XMM6,XMM6,0x00) 862 SSE_MULT_PS(XMM6,XMM0) 863 SSE_SUB_PS(XMM5,XMM6) 864 865 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 866 SSE_SHUFFLE(XMM7,XMM7,0x00) 867 SSE_MULT_PS(XMM7,XMM1) 868 SSE_SUB_PS(XMM5,XMM7) 869 870 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 871 SSE_SHUFFLE(XMM4,XMM4,0x00) 872 SSE_MULT_PS(XMM4,XMM2) 873 SSE_SUB_PS(XMM5,XMM4) 874 875 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 876 SSE_SHUFFLE(XMM6,XMM6,0x00) 877 SSE_MULT_PS(XMM6,XMM3) 878 SSE_SUB_PS(XMM5,XMM6) 879 880 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 881 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 882 883 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 884 885 /* Third Column */ 886 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 887 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 888 889 /* Matrix-Vector Product: */ 890 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 891 SSE_SHUFFLE(XMM7,XMM7,0x00) 892 SSE_MULT_PS(XMM7,XMM0) 893 SSE_SUB_PS(XMM6,XMM7) 894 895 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 896 SSE_SHUFFLE(XMM4,XMM4,0x00) 897 SSE_MULT_PS(XMM4,XMM1) 898 SSE_SUB_PS(XMM6,XMM4) 899 900 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 901 SSE_SHUFFLE(XMM5,XMM5,0x00) 902 SSE_MULT_PS(XMM5,XMM2) 903 SSE_SUB_PS(XMM6,XMM5) 904 905 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 906 SSE_SHUFFLE(XMM7,XMM7,0x00) 907 SSE_MULT_PS(XMM7,XMM3) 908 SSE_SUB_PS(XMM6,XMM7) 909 910 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 911 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 912 913 /* Fourth Column */ 914 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 915 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 916 917 /* Matrix-Vector Product: */ 918 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 919 SSE_SHUFFLE(XMM5,XMM5,0x00) 920 SSE_MULT_PS(XMM5,XMM0) 921 SSE_SUB_PS(XMM4,XMM5) 922 923 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 924 SSE_SHUFFLE(XMM6,XMM6,0x00) 925 SSE_MULT_PS(XMM6,XMM1) 926 SSE_SUB_PS(XMM4,XMM6) 927 928 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 929 SSE_SHUFFLE(XMM7,XMM7,0x00) 930 SSE_MULT_PS(XMM7,XMM2) 931 SSE_SUB_PS(XMM4,XMM7) 932 933 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 934 SSE_SHUFFLE(XMM5,XMM5,0x00) 935 
SSE_MULT_PS(XMM5,XMM3) 936 SSE_SUB_PS(XMM4,XMM5) 937 938 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 939 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 940 SSE_INLINE_END_2; 941 pv += 16; 942 } 943 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 944 } 945 row = *bjtmp++; 946 /* row = (*bjtmp++)/4; */ 947 } 948 /* finished row so stick it into b->a */ 949 pv = ba + 16*bi[i]; 950 pj = bj + bi[i]; 951 nz = bi[i+1] - bi[i]; 952 953 /* Copy x block back into pv block */ 954 for (j=0; j<nz; j++) { 955 x = rtmp+16*pj[j]; 956 /* x = rtmp+4*pj[j]; */ 957 958 SSE_INLINE_BEGIN_2(x,pv) 959 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 960 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 961 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 962 963 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 964 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 965 966 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 967 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 968 969 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 970 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 971 972 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 973 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 974 975 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 976 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 977 978 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 979 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 980 981 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 982 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 983 SSE_INLINE_END_2; 984 pv += 16; 985 } 986 /* invert diagonal block */ 987 w = ba + 16*diag_offset[i]; 988 if (pivotinblocks) { 989 PetscBool allowzeropivot,zeropivotdetected; 990 allowzeropivot = PetscNot(A->erroriffailure); 991 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 992 if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 993 } else { 994 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 995 } 996 /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 997 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 998 } 999 1000 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1001 1002 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1003 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1004 C->assembled = PETSC_TRUE; 1005 1006 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1007 /* Flop Count from inverting diagonal blocks */ 1008 SSE_SCOPE_END; 1009 PetscFunctionReturn(0); 1010 } 1011 1012 #undef __FUNCT__ 1013 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace" 1014 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 1015 { 1016 Mat A =C; 1017 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1018 PetscErrorCode ierr; 1019 int i,j,n = a->mbs; 1020 unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; 1021 unsigned short *aj = (unsigned short*)(a->j),*ajtmp; 1022 unsigned int row; 1023 int nz,*bi=b->i; 1024 int *diag_offset = b->diag,*ai=a->i; 1025 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1026 MatScalar *ba = b->a,*aa = a->a; 1027 int nonzero=0; 1028 PetscBool pivotinblocks = b->pivotinblocks; 1029 PetscReal shift = info->shiftamount; 1030 1031 PetscFunctionBegin; 1032 SSE_SCOPE_BEGIN; 1033 1034 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1035 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. 
SSE will not work."); 1036 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 1037 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1038 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1039 /* colscale = 4; */ 1040 /* } */ 1041 1042 for (i=0; i<n; i++) { 1043 nz = bi[i+1] - bi[i]; 1044 bjtmp = bj + bi[i]; 1045 /* zero out the 4x4 block accumulators */ 1046 /* zero out one register */ 1047 XOR_PS(XMM7,XMM7); 1048 for (j=0; j<nz; j++) { 1049 x = rtmp+16*((unsigned int)bjtmp[j]); 1050 /* x = rtmp+4*bjtmp[j]; */ 1051 SSE_INLINE_BEGIN_1(x) 1052 /* Copy zero register to memory locations */ 1053 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1054 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1055 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1056 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1057 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1058 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1059 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1060 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1061 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1062 SSE_INLINE_END_1; 1063 } 1064 /* load in initial (unfactored row) */ 1065 nz = ai[i+1] - ai[i]; 1066 ajtmp = aj + ai[i]; 1067 v = aa + 16*ai[i]; 1068 for (j=0; j<nz; j++) { 1069 x = rtmp+16*((unsigned int)ajtmp[j]); 1070 /* x = rtmp+colscale*ajtmp[j]; */ 1071 /* Copy v block into x block */ 1072 SSE_INLINE_BEGIN_2(v,x) 1073 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1074 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1075 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1076 1077 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1078 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1079 1080 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1081 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1082 1083 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1084 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1085 1086 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1087 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1088 1089 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1090 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1091 1092 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1093 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1094 1095 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1096 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1097 SSE_INLINE_END_2; 1098 1099 v += 16; 1100 } 1101 /* row = (*bjtmp++)/4; */ 1102 row = (unsigned int)(*bjtmp++); 1103 while (row < i) { 1104 pc = rtmp + 16*row; 1105 SSE_INLINE_BEGIN_1(pc) 1106 /* Load block from lower triangle */ 1107 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1108 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1109 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1110 1111 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1112 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1113 1114 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1115 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1116 1117 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1118 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1119 1120 /* Compare block to zero block */ 1121 1122 SSE_COPY_PS(XMM4,XMM7) 1123 SSE_CMPNEQ_PS(XMM4,XMM0) 1124 1125 SSE_COPY_PS(XMM5,XMM7) 1126 SSE_CMPNEQ_PS(XMM5,XMM1) 1127 1128 SSE_COPY_PS(XMM6,XMM7) 1129 SSE_CMPNEQ_PS(XMM6,XMM2) 1130 1131 SSE_CMPNEQ_PS(XMM7,XMM3) 1132 1133 /* Reduce the comparisons to one SSE register */ 1134 SSE_OR_PS(XMM6,XMM7) 1135 SSE_OR_PS(XMM5,XMM4) 1136 SSE_OR_PS(XMM5,XMM6) 1137 SSE_INLINE_END_1; 1138 1139 /* Reduce the one SSE register to an integer register for branching */ 1140 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1141 
MOVEMASK(nonzero,XMM5); 1142 1143 /* If block is nonzero ... */ 1144 if (nonzero) { 1145 pv = ba + 16*diag_offset[row]; 1146 PREFETCH_L1(&pv[16]); 1147 pj = bj + diag_offset[row] + 1; 1148 1149 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1150 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1151 /* but the diagonal was inverted already */ 1152 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1153 1154 SSE_INLINE_BEGIN_2(pv,pc) 1155 /* Column 0, product is accumulated in XMM4 */ 1156 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1157 SSE_SHUFFLE(XMM4,XMM4,0x00) 1158 SSE_MULT_PS(XMM4,XMM0) 1159 1160 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1161 SSE_SHUFFLE(XMM5,XMM5,0x00) 1162 SSE_MULT_PS(XMM5,XMM1) 1163 SSE_ADD_PS(XMM4,XMM5) 1164 1165 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1166 SSE_SHUFFLE(XMM6,XMM6,0x00) 1167 SSE_MULT_PS(XMM6,XMM2) 1168 SSE_ADD_PS(XMM4,XMM6) 1169 1170 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1171 SSE_SHUFFLE(XMM7,XMM7,0x00) 1172 SSE_MULT_PS(XMM7,XMM3) 1173 SSE_ADD_PS(XMM4,XMM7) 1174 1175 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1176 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1177 1178 /* Column 1, product is accumulated in XMM5 */ 1179 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1180 SSE_SHUFFLE(XMM5,XMM5,0x00) 1181 SSE_MULT_PS(XMM5,XMM0) 1182 1183 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1184 SSE_SHUFFLE(XMM6,XMM6,0x00) 1185 SSE_MULT_PS(XMM6,XMM1) 1186 SSE_ADD_PS(XMM5,XMM6) 1187 1188 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1189 SSE_SHUFFLE(XMM7,XMM7,0x00) 1190 SSE_MULT_PS(XMM7,XMM2) 1191 SSE_ADD_PS(XMM5,XMM7) 1192 1193 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1194 SSE_SHUFFLE(XMM6,XMM6,0x00) 1195 SSE_MULT_PS(XMM6,XMM3) 1196 SSE_ADD_PS(XMM5,XMM6) 1197 1198 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1199 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1200 1201 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1202 1203 /* Column 2, product is accumulated in XMM6 */ 1204 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1205 SSE_SHUFFLE(XMM6,XMM6,0x00) 1206 SSE_MULT_PS(XMM6,XMM0) 1207 1208 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1209 SSE_SHUFFLE(XMM7,XMM7,0x00) 1210 SSE_MULT_PS(XMM7,XMM1) 1211 SSE_ADD_PS(XMM6,XMM7) 1212 1213 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1214 SSE_SHUFFLE(XMM7,XMM7,0x00) 1215 SSE_MULT_PS(XMM7,XMM2) 1216 SSE_ADD_PS(XMM6,XMM7) 1217 1218 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1219 SSE_SHUFFLE(XMM7,XMM7,0x00) 1220 SSE_MULT_PS(XMM7,XMM3) 1221 SSE_ADD_PS(XMM6,XMM7) 1222 1223 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1224 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1225 1226 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1227 /* Column 3, product is accumulated in XMM0 */ 1228 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1229 SSE_SHUFFLE(XMM7,XMM7,0x00) 1230 SSE_MULT_PS(XMM0,XMM7) 1231 1232 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1233 SSE_SHUFFLE(XMM7,XMM7,0x00) 1234 SSE_MULT_PS(XMM1,XMM7) 1235 SSE_ADD_PS(XMM0,XMM1) 1236 1237 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1238 SSE_SHUFFLE(XMM1,XMM1,0x00) 1239 SSE_MULT_PS(XMM1,XMM2) 1240 SSE_ADD_PS(XMM0,XMM1) 1241 1242 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1243 SSE_SHUFFLE(XMM7,XMM7,0x00) 1244 SSE_MULT_PS(XMM3,XMM7) 1245 SSE_ADD_PS(XMM0,XMM3) 1246 1247 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1248 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1249 1250 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1251 /* This is code to be maintained and read by humans afterall. 
*/ 1252 /* Copy Multiplier Col 3 into XMM3 */ 1253 SSE_COPY_PS(XMM3,XMM0) 1254 /* Copy Multiplier Col 2 into XMM2 */ 1255 SSE_COPY_PS(XMM2,XMM6) 1256 /* Copy Multiplier Col 1 into XMM1 */ 1257 SSE_COPY_PS(XMM1,XMM5) 1258 /* Copy Multiplier Col 0 into XMM0 */ 1259 SSE_COPY_PS(XMM0,XMM4) 1260 SSE_INLINE_END_2; 1261 1262 /* Update the row: */ 1263 nz = bi[row+1] - diag_offset[row] - 1; 1264 pv += 16; 1265 for (j=0; j<nz; j++) { 1266 PREFETCH_L1(&pv[16]); 1267 x = rtmp + 16*((unsigned int)pj[j]); 1268 /* x = rtmp + 4*pj[j]; */ 1269 1270 /* X:=X-M*PV, One column at a time */ 1271 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1272 SSE_INLINE_BEGIN_2(x,pv) 1273 /* Load First Column of X*/ 1274 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1275 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1276 1277 /* Matrix-Vector Product: */ 1278 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1279 SSE_SHUFFLE(XMM5,XMM5,0x00) 1280 SSE_MULT_PS(XMM5,XMM0) 1281 SSE_SUB_PS(XMM4,XMM5) 1282 1283 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1284 SSE_SHUFFLE(XMM6,XMM6,0x00) 1285 SSE_MULT_PS(XMM6,XMM1) 1286 SSE_SUB_PS(XMM4,XMM6) 1287 1288 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1289 SSE_SHUFFLE(XMM7,XMM7,0x00) 1290 SSE_MULT_PS(XMM7,XMM2) 1291 SSE_SUB_PS(XMM4,XMM7) 1292 1293 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1294 SSE_SHUFFLE(XMM5,XMM5,0x00) 1295 SSE_MULT_PS(XMM5,XMM3) 1296 SSE_SUB_PS(XMM4,XMM5) 1297 1298 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1299 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1300 1301 /* Second Column */ 1302 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1303 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1304 1305 /* Matrix-Vector Product: */ 1306 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1307 SSE_SHUFFLE(XMM6,XMM6,0x00) 1308 SSE_MULT_PS(XMM6,XMM0) 1309 SSE_SUB_PS(XMM5,XMM6) 1310 1311 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1312 SSE_SHUFFLE(XMM7,XMM7,0x00) 1313 SSE_MULT_PS(XMM7,XMM1) 1314 SSE_SUB_PS(XMM5,XMM7) 1315 1316 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1317 SSE_SHUFFLE(XMM4,XMM4,0x00) 1318 SSE_MULT_PS(XMM4,XMM2) 1319 SSE_SUB_PS(XMM5,XMM4) 1320 1321 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1322 SSE_SHUFFLE(XMM6,XMM6,0x00) 1323 SSE_MULT_PS(XMM6,XMM3) 1324 SSE_SUB_PS(XMM5,XMM6) 1325 1326 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1327 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1328 1329 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1330 1331 /* Third Column */ 1332 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1333 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1334 1335 /* Matrix-Vector Product: */ 1336 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1337 SSE_SHUFFLE(XMM7,XMM7,0x00) 1338 SSE_MULT_PS(XMM7,XMM0) 1339 SSE_SUB_PS(XMM6,XMM7) 1340 1341 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1342 SSE_SHUFFLE(XMM4,XMM4,0x00) 1343 SSE_MULT_PS(XMM4,XMM1) 1344 SSE_SUB_PS(XMM6,XMM4) 1345 1346 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1347 SSE_SHUFFLE(XMM5,XMM5,0x00) 1348 SSE_MULT_PS(XMM5,XMM2) 1349 SSE_SUB_PS(XMM6,XMM5) 1350 1351 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1352 SSE_SHUFFLE(XMM7,XMM7,0x00) 1353 SSE_MULT_PS(XMM7,XMM3) 1354 SSE_SUB_PS(XMM6,XMM7) 1355 1356 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1357 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1358 1359 /* Fourth Column */ 1360 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1361 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1362 1363 /* Matrix-Vector Product: */ 1364 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1365 SSE_SHUFFLE(XMM5,XMM5,0x00) 1366 SSE_MULT_PS(XMM5,XMM0) 1367 SSE_SUB_PS(XMM4,XMM5) 1368 1369 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1370 SSE_SHUFFLE(XMM6,XMM6,0x00) 1371 SSE_MULT_PS(XMM6,XMM1) 1372 SSE_SUB_PS(XMM4,XMM6) 1373 1374 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1375 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1376 SSE_MULT_PS(XMM7,XMM2) 1377 SSE_SUB_PS(XMM4,XMM7) 1378 1379 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1380 SSE_SHUFFLE(XMM5,XMM5,0x00) 1381 SSE_MULT_PS(XMM5,XMM3) 1382 SSE_SUB_PS(XMM4,XMM5) 1383 1384 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1385 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1386 SSE_INLINE_END_2; 1387 pv += 16; 1388 } 1389 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1390 } 1391 row = (unsigned int)(*bjtmp++); 1392 /* row = (*bjtmp++)/4; */ 1393 /* bjtmp++; */ 1394 } 1395 /* finished row so stick it into b->a */ 1396 pv = ba + 16*bi[i]; 1397 pj = bj + bi[i]; 1398 nz = bi[i+1] - bi[i]; 1399 1400 /* Copy x block back into pv block */ 1401 for (j=0; j<nz; j++) { 1402 x = rtmp+16*((unsigned int)pj[j]); 1403 /* x = rtmp+4*pj[j]; */ 1404 1405 SSE_INLINE_BEGIN_2(x,pv) 1406 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1407 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1408 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1409 1410 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1411 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1412 1413 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1414 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1415 1416 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1417 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1418 1419 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1420 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1421 1422 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1423 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1424 1425 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1426 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1427 1428 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1429 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1430 SSE_INLINE_END_2; 1431 pv += 16; 1432 } 1433 /* invert diagonal block */ 1434 w = ba + 16*diag_offset[i]; 1435 if (pivotinblocks) { 1436 PetscBool allowzeropivot,zeropivotdetected; 1437 allowzeropivot = PetscNot(A->erroriffailure); 1438 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 1439 if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1440 } else { 1441 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1442 } 1443 /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1444 /* Note: Using Kramer's rule, flop count below might be infairly high or low? 
*/ 1445 } 1446 1447 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1448 1449 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1450 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1451 C->assembled = PETSC_TRUE; 1452 1453 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1454 /* Flop Count from inverting diagonal blocks */ 1455 SSE_SCOPE_END; 1456 PetscFunctionReturn(0); 1457 } 1458 1459 #undef __FUNCT__ 1460 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" 1461 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1462 { 1463 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1464 PetscErrorCode ierr; 1465 int i,j,n = a->mbs; 1466 unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; 1467 unsigned int row; 1468 int *ajtmpold,nz,*bi=b->i; 1469 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1470 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1471 MatScalar *ba = b->a,*aa = a->a; 1472 int nonzero=0; 1473 PetscBool pivotinblocks = b->pivotinblocks; 1474 PetscReal shift = info->shiftamount; 1475 1476 PetscFunctionBegin; 1477 SSE_SCOPE_BEGIN; 1478 1479 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1480 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1481 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 1482 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1483 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1484 /* colscale = 4; */ 1485 /* } */ 1486 if ((unsigned long)bj==(unsigned long)aj) { 1487 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1488 } 1489 1490 for (i=0; i<n; i++) { 1491 nz = bi[i+1] - bi[i]; 1492 bjtmp = bj + bi[i]; 1493 /* zero out the 4x4 block accumulators */ 1494 /* zero out one register */ 1495 XOR_PS(XMM7,XMM7); 1496 for (j=0; j<nz; j++) { 1497 x = rtmp+16*((unsigned int)bjtmp[j]); 1498 /* x = rtmp+4*bjtmp[j]; */ 1499 SSE_INLINE_BEGIN_1(x) 1500 /* Copy zero register to memory locations */ 1501 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1502 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1503 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1504 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1505 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1506 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1507 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1508 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1509 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1510 SSE_INLINE_END_1; 1511 } 1512 /* load in initial (unfactored row) */ 1513 nz = ai[i+1] - ai[i]; 1514 ajtmpold = aj + ai[i]; 1515 v = aa + 16*ai[i]; 1516 for (j=0; j<nz; j++) { 1517 x = rtmp+16*ajtmpold[j]; 1518 /* x = rtmp+colscale*ajtmpold[j]; */ 1519 /* Copy v block into x block */ 1520 SSE_INLINE_BEGIN_2(v,x) 1521 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1522 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1523 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1524 1525 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1526 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1527 1528 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1529 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1530 1531 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1532 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1533 1534 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1535 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1536 1537 
SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1538 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1539 1540 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1541 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1542 1543 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1544 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1545 SSE_INLINE_END_2; 1546 1547 v += 16; 1548 } 1549 /* row = (*bjtmp++)/4; */ 1550 row = (unsigned int)(*bjtmp++); 1551 while (row < i) { 1552 pc = rtmp + 16*row; 1553 SSE_INLINE_BEGIN_1(pc) 1554 /* Load block from lower triangle */ 1555 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1556 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1557 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1558 1559 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1560 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1561 1562 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1563 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1564 1565 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1566 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1567 1568 /* Compare block to zero block */ 1569 1570 SSE_COPY_PS(XMM4,XMM7) 1571 SSE_CMPNEQ_PS(XMM4,XMM0) 1572 1573 SSE_COPY_PS(XMM5,XMM7) 1574 SSE_CMPNEQ_PS(XMM5,XMM1) 1575 1576 SSE_COPY_PS(XMM6,XMM7) 1577 SSE_CMPNEQ_PS(XMM6,XMM2) 1578 1579 SSE_CMPNEQ_PS(XMM7,XMM3) 1580 1581 /* Reduce the comparisons to one SSE register */ 1582 SSE_OR_PS(XMM6,XMM7) 1583 SSE_OR_PS(XMM5,XMM4) 1584 SSE_OR_PS(XMM5,XMM6) 1585 SSE_INLINE_END_1; 1586 1587 /* Reduce the one SSE register to an integer register for branching */ 1588 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1589 MOVEMASK(nonzero,XMM5); 1590 1591 /* If block is nonzero ... */ 1592 if (nonzero) { 1593 pv = ba + 16*diag_offset[row]; 1594 PREFETCH_L1(&pv[16]); 1595 pj = bj + diag_offset[row] + 1; 1596 1597 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1598 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1599 /* but the diagonal was inverted already */ 1600 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1601 1602 SSE_INLINE_BEGIN_2(pv,pc) 1603 /* Column 0, product is accumulated in XMM4 */ 1604 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1605 SSE_SHUFFLE(XMM4,XMM4,0x00) 1606 SSE_MULT_PS(XMM4,XMM0) 1607 1608 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1609 SSE_SHUFFLE(XMM5,XMM5,0x00) 1610 SSE_MULT_PS(XMM5,XMM1) 1611 SSE_ADD_PS(XMM4,XMM5) 1612 1613 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1614 SSE_SHUFFLE(XMM6,XMM6,0x00) 1615 SSE_MULT_PS(XMM6,XMM2) 1616 SSE_ADD_PS(XMM4,XMM6) 1617 1618 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1619 SSE_SHUFFLE(XMM7,XMM7,0x00) 1620 SSE_MULT_PS(XMM7,XMM3) 1621 SSE_ADD_PS(XMM4,XMM7) 1622 1623 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1624 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1625 1626 /* Column 1, product is accumulated in XMM5 */ 1627 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1628 SSE_SHUFFLE(XMM5,XMM5,0x00) 1629 SSE_MULT_PS(XMM5,XMM0) 1630 1631 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1632 SSE_SHUFFLE(XMM6,XMM6,0x00) 1633 SSE_MULT_PS(XMM6,XMM1) 1634 SSE_ADD_PS(XMM5,XMM6) 1635 1636 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1637 SSE_SHUFFLE(XMM7,XMM7,0x00) 1638 SSE_MULT_PS(XMM7,XMM2) 1639 SSE_ADD_PS(XMM5,XMM7) 1640 1641 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1642 SSE_SHUFFLE(XMM6,XMM6,0x00) 1643 SSE_MULT_PS(XMM6,XMM3) 1644 SSE_ADD_PS(XMM5,XMM6) 1645 1646 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1647 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1648 1649 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1650 1651 /* Column 2, product is accumulated in XMM6 */ 1652 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1653 SSE_SHUFFLE(XMM6,XMM6,0x00) 1654 SSE_MULT_PS(XMM6,XMM0) 1655 1656 
SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1657 SSE_SHUFFLE(XMM7,XMM7,0x00) 1658 SSE_MULT_PS(XMM7,XMM1) 1659 SSE_ADD_PS(XMM6,XMM7) 1660 1661 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1662 SSE_SHUFFLE(XMM7,XMM7,0x00) 1663 SSE_MULT_PS(XMM7,XMM2) 1664 SSE_ADD_PS(XMM6,XMM7) 1665 1666 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1667 SSE_SHUFFLE(XMM7,XMM7,0x00) 1668 SSE_MULT_PS(XMM7,XMM3) 1669 SSE_ADD_PS(XMM6,XMM7) 1670 1671 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1672 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1673 1674 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1675 /* Column 3, product is accumulated in XMM0 */ 1676 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1677 SSE_SHUFFLE(XMM7,XMM7,0x00) 1678 SSE_MULT_PS(XMM0,XMM7) 1679 1680 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1681 SSE_SHUFFLE(XMM7,XMM7,0x00) 1682 SSE_MULT_PS(XMM1,XMM7) 1683 SSE_ADD_PS(XMM0,XMM1) 1684 1685 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1686 SSE_SHUFFLE(XMM1,XMM1,0x00) 1687 SSE_MULT_PS(XMM1,XMM2) 1688 SSE_ADD_PS(XMM0,XMM1) 1689 1690 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1691 SSE_SHUFFLE(XMM7,XMM7,0x00) 1692 SSE_MULT_PS(XMM3,XMM7) 1693 SSE_ADD_PS(XMM0,XMM3) 1694 1695 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1696 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1697 1698 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1699 /* This is code to be maintained and read by humans afterall. */ 1700 /* Copy Multiplier Col 3 into XMM3 */ 1701 SSE_COPY_PS(XMM3,XMM0) 1702 /* Copy Multiplier Col 2 into XMM2 */ 1703 SSE_COPY_PS(XMM2,XMM6) 1704 /* Copy Multiplier Col 1 into XMM1 */ 1705 SSE_COPY_PS(XMM1,XMM5) 1706 /* Copy Multiplier Col 0 into XMM0 */ 1707 SSE_COPY_PS(XMM0,XMM4) 1708 SSE_INLINE_END_2; 1709 1710 /* Update the row: */ 1711 nz = bi[row+1] - diag_offset[row] - 1; 1712 pv += 16; 1713 for (j=0; j<nz; j++) { 1714 PREFETCH_L1(&pv[16]); 1715 x = rtmp + 16*((unsigned int)pj[j]); 1716 /* x = rtmp + 4*pj[j]; */ 1717 1718 /* X:=X-M*PV, One column at a time */ 1719 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1720 SSE_INLINE_BEGIN_2(x,pv) 1721 /* Load First Column of X*/ 1722 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1723 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1724 1725 /* Matrix-Vector Product: */ 1726 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1727 SSE_SHUFFLE(XMM5,XMM5,0x00) 1728 SSE_MULT_PS(XMM5,XMM0) 1729 SSE_SUB_PS(XMM4,XMM5) 1730 1731 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1732 SSE_SHUFFLE(XMM6,XMM6,0x00) 1733 SSE_MULT_PS(XMM6,XMM1) 1734 SSE_SUB_PS(XMM4,XMM6) 1735 1736 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1737 SSE_SHUFFLE(XMM7,XMM7,0x00) 1738 SSE_MULT_PS(XMM7,XMM2) 1739 SSE_SUB_PS(XMM4,XMM7) 1740 1741 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1742 SSE_SHUFFLE(XMM5,XMM5,0x00) 1743 SSE_MULT_PS(XMM5,XMM3) 1744 SSE_SUB_PS(XMM4,XMM5) 1745 1746 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1747 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1748 1749 /* Second Column */ 1750 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1751 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1752 1753 /* Matrix-Vector Product: */ 1754 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1755 SSE_SHUFFLE(XMM6,XMM6,0x00) 1756 SSE_MULT_PS(XMM6,XMM0) 1757 SSE_SUB_PS(XMM5,XMM6) 1758 1759 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1760 SSE_SHUFFLE(XMM7,XMM7,0x00) 1761 SSE_MULT_PS(XMM7,XMM1) 1762 SSE_SUB_PS(XMM5,XMM7) 1763 1764 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1765 SSE_SHUFFLE(XMM4,XMM4,0x00) 1766 SSE_MULT_PS(XMM4,XMM2) 1767 SSE_SUB_PS(XMM5,XMM4) 1768 1769 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1770 SSE_SHUFFLE(XMM6,XMM6,0x00) 1771 SSE_MULT_PS(XMM6,XMM3) 1772 SSE_SUB_PS(XMM5,XMM6) 1773 1774 
SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1775 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1776 1777 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1778 1779 /* Third Column */ 1780 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1781 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1782 1783 /* Matrix-Vector Product: */ 1784 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1785 SSE_SHUFFLE(XMM7,XMM7,0x00) 1786 SSE_MULT_PS(XMM7,XMM0) 1787 SSE_SUB_PS(XMM6,XMM7) 1788 1789 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1790 SSE_SHUFFLE(XMM4,XMM4,0x00) 1791 SSE_MULT_PS(XMM4,XMM1) 1792 SSE_SUB_PS(XMM6,XMM4) 1793 1794 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1795 SSE_SHUFFLE(XMM5,XMM5,0x00) 1796 SSE_MULT_PS(XMM5,XMM2) 1797 SSE_SUB_PS(XMM6,XMM5) 1798 1799 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1800 SSE_SHUFFLE(XMM7,XMM7,0x00) 1801 SSE_MULT_PS(XMM7,XMM3) 1802 SSE_SUB_PS(XMM6,XMM7) 1803 1804 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1805 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1806 1807 /* Fourth Column */ 1808 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1809 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1810 1811 /* Matrix-Vector Product: */ 1812 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1813 SSE_SHUFFLE(XMM5,XMM5,0x00) 1814 SSE_MULT_PS(XMM5,XMM0) 1815 SSE_SUB_PS(XMM4,XMM5) 1816 1817 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1818 SSE_SHUFFLE(XMM6,XMM6,0x00) 1819 SSE_MULT_PS(XMM6,XMM1) 1820 SSE_SUB_PS(XMM4,XMM6) 1821 1822 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1823 SSE_SHUFFLE(XMM7,XMM7,0x00) 1824 SSE_MULT_PS(XMM7,XMM2) 1825 SSE_SUB_PS(XMM4,XMM7) 1826 1827 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1828 SSE_SHUFFLE(XMM5,XMM5,0x00) 1829 SSE_MULT_PS(XMM5,XMM3) 1830 SSE_SUB_PS(XMM4,XMM5) 1831 1832 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1833 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1834 SSE_INLINE_END_2; 1835 pv += 16; 1836 } 1837 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1838 } 1839 row = (unsigned int)(*bjtmp++); 1840 /* row = (*bjtmp++)/4; */ 1841 /* bjtmp++; */ 1842 } 1843 /* finished row so stick it into b->a */ 1844 pv = ba + 16*bi[i]; 1845 pj = bj + bi[i]; 1846 nz = bi[i+1] - bi[i]; 1847 1848 /* Copy x block back into pv block */ 1849 for (j=0; j<nz; j++) { 1850 x = rtmp+16*((unsigned int)pj[j]); 1851 /* x = rtmp+4*pj[j]; */ 1852 1853 SSE_INLINE_BEGIN_2(x,pv) 1854 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1855 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1856 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1857 1858 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1859 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1860 1861 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1862 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1863 1864 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1865 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1866 1867 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1868 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1869 1870 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1871 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1872 1873 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1874 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1875 1876 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1877 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1878 SSE_INLINE_END_2; 1879 pv += 16; 1880 } 1881 /* invert diagonal block */ 1882 w = ba + 16*diag_offset[i]; 1883 if (pivotinblocks) { 1884 PetscBool allowzeropivot,zeropivotdetected; 1885 allowzeropivot = PetscNot(A->erroriffailure); 1886 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,,&zeropivotdetected);CHKERRQ(ierr); 1887 if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1888 } else { 1889 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1890 } 1891 /* ierr = 
PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
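/*
   Illustrative usage sketch (not part of this file): one way these bs=4 kernels are
   typically reached is through the generic factorization interface, where the symbolic
   phase installs the block-size-specific numeric routine defined above.  Error checking
   is omitted and the matrix A and vectors rhs, x are assumed to be created elsewhere.

     Mat           F;
     IS            rowperm,colperm;
     MatFactorInfo info;

     MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
     MatGetOrdering(A,MATORDERINGNATURAL,&rowperm,&colperm);
     MatFactorInfoInitialize(&info);
     MatLUFactorSymbolic(F,A,rowperm,colperm,&info);
     MatLUFactorNumeric(F,A,&info);   (dispatches to a MatLUFactorNumeric_SeqBAIJ_4 variant for bs=4)
     MatSolve(F,rhs,x);
*/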