
/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <../src/mat/blockinvert.h>

/* ------------------------------------------------------------*/
/*
   Version for when blocks are 4 by 4
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
/*
   MatLUFactorNumeric_SeqBAIJ_4_inplace - numeric LU factorization of a
   SeqBAIJ matrix with 4x4 blocks, using the row permutation (b->row) and
   inverse column permutation (b->icol) stored in the factor matrix C.

   For each permuted block row i:
     1. scatter the row of A into the dense work row rtmp (16 scalars per
        4x4 block, one block slot per block column),
     2. eliminate against every previously finished row: the multiplier
        block is formed by multiplying with that row's already-inverted
        diagonal block, then applied to the remainder of U(row,:),
     3. gather the finished row back into b->a and invert its diagonal
        block so later rows (and the triangular solves) multiply rather
        than solve.
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  /* dense work row: one 4x4 (=16 scalar) block per block column */
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    /* zero the work-row blocks this row will touch */
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      /* pc -> 4x4 block L(i,row) accumulated in the work row */
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      /* skip the whole elimination step if the block is (still) zero */
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        /* multiplier M = P * (inverted diagonal block of 'row'):
           a fully unrolled column-major 4x4 matrix-matrix product */
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11 + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;

        /* apply the multiplier to the rest of U(row,:):  X -= M * U(row,k) */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x   = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block (stored inverted for the triangular solves) */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
   MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       Kernel_A_gets_A_times_B()
       Kernel_A_gets_A_minus_B_times_C()
       Kernel_A_gets_inverse_A()
*/

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
/*
   Numeric LU factorization for 4x4-block SeqBAIJ with row/column
   permutations (b->row, b->icol).  Unlike the _inplace variant above, the
   factor C=B stores L and U parts separately (U rows located via bdiag),
   and the 4x4 block operations use the specialized Kernel_A_gets_* routines.
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE){
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i]-bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      /* flg = 1 iff the 4x4 block L(i,row) has any nonzero entry */
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;          /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1;  /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i] ;
    pj = b->j + bi[i] ;
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose =
MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
/*
   Same in-place 4x4-block LU factorization as
   MatLUFactorNumeric_SeqBAIJ_4_inplace above, but for the natural ordering:
   no row/column index sets are consulted, so rows of A are loaded directly.
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
/*
    Default Version for when blocks are 4 by 4 Using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  /* dense work row: one 4x4 (=16 scalar) block per block column */
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    /* zero the work-row blocks this row will touch */
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmpold[j];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      /* pc -> 4x4 block L(i,row) accumulated in the work row */
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      /* skip the whole elimination step if the block is (still) zero */
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        /* multiplier M = P * (inverted diagonal block of 'row') */
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11 + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
        /* apply the multiplier to the rest of U(row,:):  X -= M * U(row,k) */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x   = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block (stored inverted for the triangular solves) */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
#undef
 __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
/*
   Same L/U-separated 4x4-block factorization as MatLUFactorNumeric_SeqBAIJ_4
   above, but for the natural ordering: no row/column index sets are used,
   so rows of A are loaded directly.
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE){
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      /* flg = 1 iff the 4x4 block L(i,row) has any nonzero entry */
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;          /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1;  /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i] ;
    pj = b->j + bi[i] ;
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
/*
   SSE-accelerated natural-ordering 4x4-block LU factorization: same
   algorithm as MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace, with
   the per-block zero test, multiplier formation and row updates done with
   SSE single-precision packed macros.  Requires aa, ba and rtmp to be
   16-byte aligned.
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;  /* FIX: 'C' was referenced throughout (b, ops, assembled) but never declared */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int            nonzero=0,colscale = 16; */
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /*  if ((unsigned long)bj==(unsigned long)aj) { */
  /*    colscale = 4; */
  /*  } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*pj[j];
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X*/
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
      /* row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  /* FIX: was 1.333333333333*bs*bs2*b->mbs, but neither bs nor bs2 is declared
     in this function; block size is fixed at 4 here (bs2 = 16), matching the
     count logged by the sibling bs=4 factorizations above */
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
{
  Mat            A=C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int            nonzero=0,colscale = 16; */
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount; /* NOTE(review): 'info' is not a parameter of this function -- confirm intended signature */

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 1005 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1006 /* colscale = 4; */ 1007 /* } */ 1008 1009 for (i=0; i<n; i++) { 1010 nz = bi[i+1] - bi[i]; 1011 bjtmp = bj + bi[i]; 1012 /* zero out the 4x4 block accumulators */ 1013 /* zero out one register */ 1014 XOR_PS(XMM7,XMM7); 1015 for (j=0; j<nz; j++) { 1016 x = rtmp+16*((unsigned int)bjtmp[j]); 1017 /* x = rtmp+4*bjtmp[j]; */ 1018 SSE_INLINE_BEGIN_1(x) 1019 /* Copy zero register to memory locations */ 1020 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1021 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1022 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1023 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1024 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1025 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1026 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1027 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1028 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1029 SSE_INLINE_END_1; 1030 } 1031 /* load in initial (unfactored row) */ 1032 nz = ai[i+1] - ai[i]; 1033 ajtmp = aj + ai[i]; 1034 v = aa + 16*ai[i]; 1035 for (j=0; j<nz; j++) { 1036 x = rtmp+16*((unsigned int)ajtmp[j]); 1037 /* x = rtmp+colscale*ajtmp[j]; */ 1038 /* Copy v block into x block */ 1039 SSE_INLINE_BEGIN_2(v,x) 1040 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1041 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1042 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1043 1044 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1045 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1046 1047 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1048 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1049 1050 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1051 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1052 1053 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1054 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1055 1056 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1057 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1058 1059 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1060 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1061 1062 
SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1063 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1064 SSE_INLINE_END_2; 1065 1066 v += 16; 1067 } 1068 /* row = (*bjtmp++)/4; */ 1069 row = (unsigned int)(*bjtmp++); 1070 while (row < i) { 1071 pc = rtmp + 16*row; 1072 SSE_INLINE_BEGIN_1(pc) 1073 /* Load block from lower triangle */ 1074 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1075 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1076 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1077 1078 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1079 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1080 1081 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1082 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1083 1084 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1085 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1086 1087 /* Compare block to zero block */ 1088 1089 SSE_COPY_PS(XMM4,XMM7) 1090 SSE_CMPNEQ_PS(XMM4,XMM0) 1091 1092 SSE_COPY_PS(XMM5,XMM7) 1093 SSE_CMPNEQ_PS(XMM5,XMM1) 1094 1095 SSE_COPY_PS(XMM6,XMM7) 1096 SSE_CMPNEQ_PS(XMM6,XMM2) 1097 1098 SSE_CMPNEQ_PS(XMM7,XMM3) 1099 1100 /* Reduce the comparisons to one SSE register */ 1101 SSE_OR_PS(XMM6,XMM7) 1102 SSE_OR_PS(XMM5,XMM4) 1103 SSE_OR_PS(XMM5,XMM6) 1104 SSE_INLINE_END_1; 1105 1106 /* Reduce the one SSE register to an integer register for branching */ 1107 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1108 MOVEMASK(nonzero,XMM5); 1109 1110 /* If block is nonzero ... 
*/ 1111 if (nonzero) { 1112 pv = ba + 16*diag_offset[row]; 1113 PREFETCH_L1(&pv[16]); 1114 pj = bj + diag_offset[row] + 1; 1115 1116 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1117 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1118 /* but the diagonal was inverted already */ 1119 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1120 1121 SSE_INLINE_BEGIN_2(pv,pc) 1122 /* Column 0, product is accumulated in XMM4 */ 1123 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1124 SSE_SHUFFLE(XMM4,XMM4,0x00) 1125 SSE_MULT_PS(XMM4,XMM0) 1126 1127 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1128 SSE_SHUFFLE(XMM5,XMM5,0x00) 1129 SSE_MULT_PS(XMM5,XMM1) 1130 SSE_ADD_PS(XMM4,XMM5) 1131 1132 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1133 SSE_SHUFFLE(XMM6,XMM6,0x00) 1134 SSE_MULT_PS(XMM6,XMM2) 1135 SSE_ADD_PS(XMM4,XMM6) 1136 1137 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1138 SSE_SHUFFLE(XMM7,XMM7,0x00) 1139 SSE_MULT_PS(XMM7,XMM3) 1140 SSE_ADD_PS(XMM4,XMM7) 1141 1142 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1143 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1144 1145 /* Column 1, product is accumulated in XMM5 */ 1146 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1147 SSE_SHUFFLE(XMM5,XMM5,0x00) 1148 SSE_MULT_PS(XMM5,XMM0) 1149 1150 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1151 SSE_SHUFFLE(XMM6,XMM6,0x00) 1152 SSE_MULT_PS(XMM6,XMM1) 1153 SSE_ADD_PS(XMM5,XMM6) 1154 1155 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1156 SSE_SHUFFLE(XMM7,XMM7,0x00) 1157 SSE_MULT_PS(XMM7,XMM2) 1158 SSE_ADD_PS(XMM5,XMM7) 1159 1160 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1161 SSE_SHUFFLE(XMM6,XMM6,0x00) 1162 SSE_MULT_PS(XMM6,XMM3) 1163 SSE_ADD_PS(XMM5,XMM6) 1164 1165 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1166 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1167 1168 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1169 1170 /* Column 2, product is accumulated in XMM6 */ 1171 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1172 SSE_SHUFFLE(XMM6,XMM6,0x00) 1173 SSE_MULT_PS(XMM6,XMM0) 1174 1175 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1176 SSE_SHUFFLE(XMM7,XMM7,0x00) 1177 
SSE_MULT_PS(XMM7,XMM1) 1178 SSE_ADD_PS(XMM6,XMM7) 1179 1180 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1181 SSE_SHUFFLE(XMM7,XMM7,0x00) 1182 SSE_MULT_PS(XMM7,XMM2) 1183 SSE_ADD_PS(XMM6,XMM7) 1184 1185 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1186 SSE_SHUFFLE(XMM7,XMM7,0x00) 1187 SSE_MULT_PS(XMM7,XMM3) 1188 SSE_ADD_PS(XMM6,XMM7) 1189 1190 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1191 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1192 1193 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1194 /* Column 3, product is accumulated in XMM0 */ 1195 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1196 SSE_SHUFFLE(XMM7,XMM7,0x00) 1197 SSE_MULT_PS(XMM0,XMM7) 1198 1199 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1200 SSE_SHUFFLE(XMM7,XMM7,0x00) 1201 SSE_MULT_PS(XMM1,XMM7) 1202 SSE_ADD_PS(XMM0,XMM1) 1203 1204 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1205 SSE_SHUFFLE(XMM1,XMM1,0x00) 1206 SSE_MULT_PS(XMM1,XMM2) 1207 SSE_ADD_PS(XMM0,XMM1) 1208 1209 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1210 SSE_SHUFFLE(XMM7,XMM7,0x00) 1211 SSE_MULT_PS(XMM3,XMM7) 1212 SSE_ADD_PS(XMM0,XMM3) 1213 1214 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1215 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1216 1217 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1218 /* This is code to be maintained and read by humans afterall. 
*/ 1219 /* Copy Multiplier Col 3 into XMM3 */ 1220 SSE_COPY_PS(XMM3,XMM0) 1221 /* Copy Multiplier Col 2 into XMM2 */ 1222 SSE_COPY_PS(XMM2,XMM6) 1223 /* Copy Multiplier Col 1 into XMM1 */ 1224 SSE_COPY_PS(XMM1,XMM5) 1225 /* Copy Multiplier Col 0 into XMM0 */ 1226 SSE_COPY_PS(XMM0,XMM4) 1227 SSE_INLINE_END_2; 1228 1229 /* Update the row: */ 1230 nz = bi[row+1] - diag_offset[row] - 1; 1231 pv += 16; 1232 for (j=0; j<nz; j++) { 1233 PREFETCH_L1(&pv[16]); 1234 x = rtmp + 16*((unsigned int)pj[j]); 1235 /* x = rtmp + 4*pj[j]; */ 1236 1237 /* X:=X-M*PV, One column at a time */ 1238 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1239 SSE_INLINE_BEGIN_2(x,pv) 1240 /* Load First Column of X*/ 1241 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1242 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1243 1244 /* Matrix-Vector Product: */ 1245 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1246 SSE_SHUFFLE(XMM5,XMM5,0x00) 1247 SSE_MULT_PS(XMM5,XMM0) 1248 SSE_SUB_PS(XMM4,XMM5) 1249 1250 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1251 SSE_SHUFFLE(XMM6,XMM6,0x00) 1252 SSE_MULT_PS(XMM6,XMM1) 1253 SSE_SUB_PS(XMM4,XMM6) 1254 1255 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1256 SSE_SHUFFLE(XMM7,XMM7,0x00) 1257 SSE_MULT_PS(XMM7,XMM2) 1258 SSE_SUB_PS(XMM4,XMM7) 1259 1260 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1261 SSE_SHUFFLE(XMM5,XMM5,0x00) 1262 SSE_MULT_PS(XMM5,XMM3) 1263 SSE_SUB_PS(XMM4,XMM5) 1264 1265 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1266 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1267 1268 /* Second Column */ 1269 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1270 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1271 1272 /* Matrix-Vector Product: */ 1273 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1274 SSE_SHUFFLE(XMM6,XMM6,0x00) 1275 SSE_MULT_PS(XMM6,XMM0) 1276 SSE_SUB_PS(XMM5,XMM6) 1277 1278 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1279 SSE_SHUFFLE(XMM7,XMM7,0x00) 1280 SSE_MULT_PS(XMM7,XMM1) 1281 SSE_SUB_PS(XMM5,XMM7) 1282 1283 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1284 SSE_SHUFFLE(XMM4,XMM4,0x00) 1285 SSE_MULT_PS(XMM4,XMM2) 1286 
SSE_SUB_PS(XMM5,XMM4) 1287 1288 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1289 SSE_SHUFFLE(XMM6,XMM6,0x00) 1290 SSE_MULT_PS(XMM6,XMM3) 1291 SSE_SUB_PS(XMM5,XMM6) 1292 1293 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1294 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1295 1296 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1297 1298 /* Third Column */ 1299 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1300 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1301 1302 /* Matrix-Vector Product: */ 1303 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1304 SSE_SHUFFLE(XMM7,XMM7,0x00) 1305 SSE_MULT_PS(XMM7,XMM0) 1306 SSE_SUB_PS(XMM6,XMM7) 1307 1308 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1309 SSE_SHUFFLE(XMM4,XMM4,0x00) 1310 SSE_MULT_PS(XMM4,XMM1) 1311 SSE_SUB_PS(XMM6,XMM4) 1312 1313 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1314 SSE_SHUFFLE(XMM5,XMM5,0x00) 1315 SSE_MULT_PS(XMM5,XMM2) 1316 SSE_SUB_PS(XMM6,XMM5) 1317 1318 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1319 SSE_SHUFFLE(XMM7,XMM7,0x00) 1320 SSE_MULT_PS(XMM7,XMM3) 1321 SSE_SUB_PS(XMM6,XMM7) 1322 1323 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1324 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1325 1326 /* Fourth Column */ 1327 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1328 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1329 1330 /* Matrix-Vector Product: */ 1331 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1332 SSE_SHUFFLE(XMM5,XMM5,0x00) 1333 SSE_MULT_PS(XMM5,XMM0) 1334 SSE_SUB_PS(XMM4,XMM5) 1335 1336 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1337 SSE_SHUFFLE(XMM6,XMM6,0x00) 1338 SSE_MULT_PS(XMM6,XMM1) 1339 SSE_SUB_PS(XMM4,XMM6) 1340 1341 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1342 SSE_SHUFFLE(XMM7,XMM7,0x00) 1343 SSE_MULT_PS(XMM7,XMM2) 1344 SSE_SUB_PS(XMM4,XMM7) 1345 1346 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1347 SSE_SHUFFLE(XMM5,XMM5,0x00) 1348 SSE_MULT_PS(XMM5,XMM3) 1349 SSE_SUB_PS(XMM4,XMM5) 1350 1351 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1352 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1353 SSE_INLINE_END_2; 1354 pv += 16; 1355 } 1356 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1357 } 1358 row = (unsigned int)(*bjtmp++); 
1359 /* row = (*bjtmp++)/4; */ 1360 /* bjtmp++; */ 1361 } 1362 /* finished row so stick it into b->a */ 1363 pv = ba + 16*bi[i]; 1364 pj = bj + bi[i]; 1365 nz = bi[i+1] - bi[i]; 1366 1367 /* Copy x block back into pv block */ 1368 for (j=0; j<nz; j++) { 1369 x = rtmp+16*((unsigned int)pj[j]); 1370 /* x = rtmp+4*pj[j]; */ 1371 1372 SSE_INLINE_BEGIN_2(x,pv) 1373 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1374 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1375 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1376 1377 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1378 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1379 1380 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1381 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1382 1383 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1384 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1385 1386 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1387 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1388 1389 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1390 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1391 1392 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1393 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1394 1395 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1396 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1397 SSE_INLINE_END_2; 1398 pv += 16; 1399 } 1400 /* invert diagonal block */ 1401 w = ba + 16*diag_offset[i]; 1402 if (pivotinblocks) { 1403 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1404 } else { 1405 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1406 } 1407 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1408 /* Note: Using Kramer's rule, flop count below might be infairly high or low? 
*/ 1409 } 1410 1411 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1412 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1413 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1414 C->assembled = PETSC_TRUE; 1415 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1416 /* Flop Count from inverting diagonal blocks */ 1417 SSE_SCOPE_END; 1418 PetscFunctionReturn(0); 1419 } 1420 1421 #undef __FUNCT__ 1422 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" 1423 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1424 { 1425 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1426 PetscErrorCode ierr; 1427 int i,j,n = a->mbs; 1428 unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 1429 unsigned int row; 1430 int *ajtmpold,nz,*bi=b->i; 1431 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1432 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1433 MatScalar *ba = b->a,*aa = a->a; 1434 int nonzero=0; 1435 /* int nonzero=0,colscale = 16; */ 1436 PetscBool pivotinblocks = b->pivotinblocks; 1437 PetscReal shift = info->shiftamount; 1438 1439 PetscFunctionBegin; 1440 SSE_SCOPE_BEGIN; 1441 1442 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1443 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1444 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1445 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 1446 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1447 /* colscale = 4; */ 1448 /* } */ 1449 if ((unsigned long)bj==(unsigned long)aj) { 1450 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1451 } 1452 1453 for (i=0; i<n; i++) { 1454 nz = bi[i+1] - bi[i]; 1455 bjtmp = bj + bi[i]; 1456 /* zero out the 4x4 block accumulators */ 1457 /* zero out one register */ 1458 XOR_PS(XMM7,XMM7); 1459 for (j=0; j<nz; j++) { 1460 x = rtmp+16*((unsigned int)bjtmp[j]); 1461 /* x = rtmp+4*bjtmp[j]; */ 1462 SSE_INLINE_BEGIN_1(x) 1463 /* Copy zero register to memory locations */ 1464 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1465 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1466 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1467 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1468 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1469 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1470 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1471 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1472 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1473 SSE_INLINE_END_1; 1474 } 1475 /* load in initial (unfactored row) */ 1476 nz = ai[i+1] - ai[i]; 1477 ajtmpold = aj + ai[i]; 1478 v = aa + 16*ai[i]; 1479 for (j=0; j<nz; j++) { 1480 x = rtmp+16*ajtmpold[j]; 1481 /* x = rtmp+colscale*ajtmpold[j]; */ 1482 /* Copy v block into x block */ 1483 SSE_INLINE_BEGIN_2(v,x) 1484 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1485 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1486 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1487 1488 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1489 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1490 1491 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1492 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1493 1494 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1495 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1496 1497 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1498 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1499 1500 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1501 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 
1502 1503 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1504 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1505 1506 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1507 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1508 SSE_INLINE_END_2; 1509 1510 v += 16; 1511 } 1512 /* row = (*bjtmp++)/4; */ 1513 row = (unsigned int)(*bjtmp++); 1514 while (row < i) { 1515 pc = rtmp + 16*row; 1516 SSE_INLINE_BEGIN_1(pc) 1517 /* Load block from lower triangle */ 1518 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1519 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1520 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1521 1522 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1523 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1524 1525 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1526 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1527 1528 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1529 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1530 1531 /* Compare block to zero block */ 1532 1533 SSE_COPY_PS(XMM4,XMM7) 1534 SSE_CMPNEQ_PS(XMM4,XMM0) 1535 1536 SSE_COPY_PS(XMM5,XMM7) 1537 SSE_CMPNEQ_PS(XMM5,XMM1) 1538 1539 SSE_COPY_PS(XMM6,XMM7) 1540 SSE_CMPNEQ_PS(XMM6,XMM2) 1541 1542 SSE_CMPNEQ_PS(XMM7,XMM3) 1543 1544 /* Reduce the comparisons to one SSE register */ 1545 SSE_OR_PS(XMM6,XMM7) 1546 SSE_OR_PS(XMM5,XMM4) 1547 SSE_OR_PS(XMM5,XMM6) 1548 SSE_INLINE_END_1; 1549 1550 /* Reduce the one SSE register to an integer register for branching */ 1551 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1552 MOVEMASK(nonzero,XMM5); 1553 1554 /* If block is nonzero ... 
*/ 1555 if (nonzero) { 1556 pv = ba + 16*diag_offset[row]; 1557 PREFETCH_L1(&pv[16]); 1558 pj = bj + diag_offset[row] + 1; 1559 1560 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1561 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1562 /* but the diagonal was inverted already */ 1563 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1564 1565 SSE_INLINE_BEGIN_2(pv,pc) 1566 /* Column 0, product is accumulated in XMM4 */ 1567 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1568 SSE_SHUFFLE(XMM4,XMM4,0x00) 1569 SSE_MULT_PS(XMM4,XMM0) 1570 1571 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1572 SSE_SHUFFLE(XMM5,XMM5,0x00) 1573 SSE_MULT_PS(XMM5,XMM1) 1574 SSE_ADD_PS(XMM4,XMM5) 1575 1576 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1577 SSE_SHUFFLE(XMM6,XMM6,0x00) 1578 SSE_MULT_PS(XMM6,XMM2) 1579 SSE_ADD_PS(XMM4,XMM6) 1580 1581 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1582 SSE_SHUFFLE(XMM7,XMM7,0x00) 1583 SSE_MULT_PS(XMM7,XMM3) 1584 SSE_ADD_PS(XMM4,XMM7) 1585 1586 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1587 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1588 1589 /* Column 1, product is accumulated in XMM5 */ 1590 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1591 SSE_SHUFFLE(XMM5,XMM5,0x00) 1592 SSE_MULT_PS(XMM5,XMM0) 1593 1594 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1595 SSE_SHUFFLE(XMM6,XMM6,0x00) 1596 SSE_MULT_PS(XMM6,XMM1) 1597 SSE_ADD_PS(XMM5,XMM6) 1598 1599 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1600 SSE_SHUFFLE(XMM7,XMM7,0x00) 1601 SSE_MULT_PS(XMM7,XMM2) 1602 SSE_ADD_PS(XMM5,XMM7) 1603 1604 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1605 SSE_SHUFFLE(XMM6,XMM6,0x00) 1606 SSE_MULT_PS(XMM6,XMM3) 1607 SSE_ADD_PS(XMM5,XMM6) 1608 1609 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1610 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1611 1612 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1613 1614 /* Column 2, product is accumulated in XMM6 */ 1615 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1616 SSE_SHUFFLE(XMM6,XMM6,0x00) 1617 SSE_MULT_PS(XMM6,XMM0) 1618 1619 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1620 SSE_SHUFFLE(XMM7,XMM7,0x00) 1621 
SSE_MULT_PS(XMM7,XMM1) 1622 SSE_ADD_PS(XMM6,XMM7) 1623 1624 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1625 SSE_SHUFFLE(XMM7,XMM7,0x00) 1626 SSE_MULT_PS(XMM7,XMM2) 1627 SSE_ADD_PS(XMM6,XMM7) 1628 1629 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1630 SSE_SHUFFLE(XMM7,XMM7,0x00) 1631 SSE_MULT_PS(XMM7,XMM3) 1632 SSE_ADD_PS(XMM6,XMM7) 1633 1634 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1635 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1636 1637 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1638 /* Column 3, product is accumulated in XMM0 */ 1639 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1640 SSE_SHUFFLE(XMM7,XMM7,0x00) 1641 SSE_MULT_PS(XMM0,XMM7) 1642 1643 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1644 SSE_SHUFFLE(XMM7,XMM7,0x00) 1645 SSE_MULT_PS(XMM1,XMM7) 1646 SSE_ADD_PS(XMM0,XMM1) 1647 1648 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1649 SSE_SHUFFLE(XMM1,XMM1,0x00) 1650 SSE_MULT_PS(XMM1,XMM2) 1651 SSE_ADD_PS(XMM0,XMM1) 1652 1653 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1654 SSE_SHUFFLE(XMM7,XMM7,0x00) 1655 SSE_MULT_PS(XMM3,XMM7) 1656 SSE_ADD_PS(XMM0,XMM3) 1657 1658 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1659 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1660 1661 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1662 /* This is code to be maintained and read by humans afterall. 
*/ 1663 /* Copy Multiplier Col 3 into XMM3 */ 1664 SSE_COPY_PS(XMM3,XMM0) 1665 /* Copy Multiplier Col 2 into XMM2 */ 1666 SSE_COPY_PS(XMM2,XMM6) 1667 /* Copy Multiplier Col 1 into XMM1 */ 1668 SSE_COPY_PS(XMM1,XMM5) 1669 /* Copy Multiplier Col 0 into XMM0 */ 1670 SSE_COPY_PS(XMM0,XMM4) 1671 SSE_INLINE_END_2; 1672 1673 /* Update the row: */ 1674 nz = bi[row+1] - diag_offset[row] - 1; 1675 pv += 16; 1676 for (j=0; j<nz; j++) { 1677 PREFETCH_L1(&pv[16]); 1678 x = rtmp + 16*((unsigned int)pj[j]); 1679 /* x = rtmp + 4*pj[j]; */ 1680 1681 /* X:=X-M*PV, One column at a time */ 1682 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1683 SSE_INLINE_BEGIN_2(x,pv) 1684 /* Load First Column of X*/ 1685 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1686 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1687 1688 /* Matrix-Vector Product: */ 1689 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1690 SSE_SHUFFLE(XMM5,XMM5,0x00) 1691 SSE_MULT_PS(XMM5,XMM0) 1692 SSE_SUB_PS(XMM4,XMM5) 1693 1694 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1695 SSE_SHUFFLE(XMM6,XMM6,0x00) 1696 SSE_MULT_PS(XMM6,XMM1) 1697 SSE_SUB_PS(XMM4,XMM6) 1698 1699 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1700 SSE_SHUFFLE(XMM7,XMM7,0x00) 1701 SSE_MULT_PS(XMM7,XMM2) 1702 SSE_SUB_PS(XMM4,XMM7) 1703 1704 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1705 SSE_SHUFFLE(XMM5,XMM5,0x00) 1706 SSE_MULT_PS(XMM5,XMM3) 1707 SSE_SUB_PS(XMM4,XMM5) 1708 1709 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1710 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1711 1712 /* Second Column */ 1713 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1714 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1715 1716 /* Matrix-Vector Product: */ 1717 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1718 SSE_SHUFFLE(XMM6,XMM6,0x00) 1719 SSE_MULT_PS(XMM6,XMM0) 1720 SSE_SUB_PS(XMM5,XMM6) 1721 1722 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1723 SSE_SHUFFLE(XMM7,XMM7,0x00) 1724 SSE_MULT_PS(XMM7,XMM1) 1725 SSE_SUB_PS(XMM5,XMM7) 1726 1727 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1728 SSE_SHUFFLE(XMM4,XMM4,0x00) 1729 SSE_MULT_PS(XMM4,XMM2) 1730 
SSE_SUB_PS(XMM5,XMM4) 1731 1732 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1733 SSE_SHUFFLE(XMM6,XMM6,0x00) 1734 SSE_MULT_PS(XMM6,XMM3) 1735 SSE_SUB_PS(XMM5,XMM6) 1736 1737 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1738 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1739 1740 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1741 1742 /* Third Column */ 1743 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1744 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1745 1746 /* Matrix-Vector Product: */ 1747 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1748 SSE_SHUFFLE(XMM7,XMM7,0x00) 1749 SSE_MULT_PS(XMM7,XMM0) 1750 SSE_SUB_PS(XMM6,XMM7) 1751 1752 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1753 SSE_SHUFFLE(XMM4,XMM4,0x00) 1754 SSE_MULT_PS(XMM4,XMM1) 1755 SSE_SUB_PS(XMM6,XMM4) 1756 1757 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1758 SSE_SHUFFLE(XMM5,XMM5,0x00) 1759 SSE_MULT_PS(XMM5,XMM2) 1760 SSE_SUB_PS(XMM6,XMM5) 1761 1762 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1763 SSE_SHUFFLE(XMM7,XMM7,0x00) 1764 SSE_MULT_PS(XMM7,XMM3) 1765 SSE_SUB_PS(XMM6,XMM7) 1766 1767 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1768 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1769 1770 /* Fourth Column */ 1771 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1772 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1773 1774 /* Matrix-Vector Product: */ 1775 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1776 SSE_SHUFFLE(XMM5,XMM5,0x00) 1777 SSE_MULT_PS(XMM5,XMM0) 1778 SSE_SUB_PS(XMM4,XMM5) 1779 1780 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1781 SSE_SHUFFLE(XMM6,XMM6,0x00) 1782 SSE_MULT_PS(XMM6,XMM1) 1783 SSE_SUB_PS(XMM4,XMM6) 1784 1785 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1786 SSE_SHUFFLE(XMM7,XMM7,0x00) 1787 SSE_MULT_PS(XMM7,XMM2) 1788 SSE_SUB_PS(XMM4,XMM7) 1789 1790 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1791 SSE_SHUFFLE(XMM5,XMM5,0x00) 1792 SSE_MULT_PS(XMM5,XMM3) 1793 SSE_SUB_PS(XMM4,XMM5) 1794 1795 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1796 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1797 SSE_INLINE_END_2; 1798 pv += 16; 1799 } 1800 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1801 } 1802 row = (unsigned int)(*bjtmp++); 
1803 /* row = (*bjtmp++)/4; */ 1804 /* bjtmp++; */ 1805 } 1806 /* finished row so stick it into b->a */ 1807 pv = ba + 16*bi[i]; 1808 pj = bj + bi[i]; 1809 nz = bi[i+1] - bi[i]; 1810 1811 /* Copy x block back into pv block */ 1812 for (j=0; j<nz; j++) { 1813 x = rtmp+16*((unsigned int)pj[j]); 1814 /* x = rtmp+4*pj[j]; */ 1815 1816 SSE_INLINE_BEGIN_2(x,pv) 1817 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1818 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1819 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1820 1821 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1822 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1823 1824 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1825 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1826 1827 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1828 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1829 1830 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1831 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1832 1833 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1834 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1835 1836 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1837 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1838 1839 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1840 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1841 SSE_INLINE_END_2; 1842 pv += 16; 1843 } 1844 /* invert diagonal block */ 1845 w = ba + 16*diag_offset[i]; 1846 if (pivotinblocks) { 1847 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1848 } else { 1849 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1850 } 1851 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1852 /* Note: Using Kramer's rule, flop count below might be infairly high or low? 
*/ 1853 } 1854 1855 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1856 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1857 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1858 C->assembled = PETSC_TRUE; 1859 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1860 /* Flop Count from inverting diagonal blocks */ 1861 SSE_SCOPE_END; 1862 PetscFunctionReturn(0); 1863 } 1864 1865 #endif 1866