1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 #include "../src/mat/impls/baij/seq/baij.h" 7 #include "../src/mat/blockinvert.h" 8 9 /* ------------------------------------------------------------*/ 10 /* 11 Version for when blocks are 4 by 4 12 */ 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace" 15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info) 16 { 17 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 18 IS isrow = b->row,isicol = b->icol; 19 PetscErrorCode ierr; 20 const PetscInt *r,*ic; 21 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 22 PetscInt *ajtmpold,*ajtmp,nz,row; 23 PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 24 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 25 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 26 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 27 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 28 MatScalar m13,m14,m15,m16; 29 MatScalar *ba = b->a,*aa = a->a; 30 PetscTruth pivotinblocks = b->pivotinblocks; 31 PetscReal shift = info->shiftamount; 32 33 PetscFunctionBegin; 34 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 35 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 36 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 37 38 for (i=0; i<n; i++) { 39 nz = bi[i+1] - bi[i]; 40 ajtmp = bj + bi[i]; 41 for (j=0; j<nz; j++) { 42 x = rtmp+16*ajtmp[j]; 43 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 44 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 45 } 46 /* load in initial (unfactored row) */ 47 idx = r[i]; 48 nz = ai[idx+1] - ai[idx]; 49 ajtmpold = aj + ai[idx]; 50 v = aa + 16*ai[idx]; 51 for (j=0; j<nz; j++) { 52 x = rtmp+16*ic[ajtmpold[j]]; 53 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 54 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 55 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = 
v[13]; 56 x[14] = v[14]; x[15] = v[15]; 57 v += 16; 58 } 59 row = *ajtmp++; 60 while (row < i) { 61 pc = rtmp + 16*row; 62 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 63 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 64 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 65 p15 = pc[14]; p16 = pc[15]; 66 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 67 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 68 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 69 || p16 != 0.0) { 70 pv = ba + 16*diag_offset[row]; 71 pj = bj + diag_offset[row] + 1; 72 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 73 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 74 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 75 x15 = pv[14]; x16 = pv[15]; 76 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 77 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 78 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 79 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 80 81 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 82 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 83 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 84 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 85 86 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 87 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 88 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 89 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 90 91 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 92 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 93 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 94 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 95 96 nz = bi[row+1] - diag_offset[row] - 1; 97 pv += 16; 98 for (j=0; j<nz; j++) { 99 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 100 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 101 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 102 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 103 x = 
rtmp + 16*pj[j]; 104 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 105 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 106 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 107 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 108 109 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 110 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 111 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 112 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 113 114 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 115 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 116 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 117 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 118 119 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 120 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 121 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 122 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 123 124 pv += 16; 125 } 126 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 127 } 128 row = *ajtmp++; 129 } 130 /* finished row so stick it into b->a */ 131 pv = ba + 16*bi[i]; 132 pj = bj + bi[i]; 133 nz = bi[i+1] - bi[i]; 134 for (j=0; j<nz; j++) { 135 x = rtmp+16*pj[j]; 136 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 137 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 138 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 139 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 140 pv += 16; 141 } 142 /* invert diagonal block */ 143 w = ba + 16*diag_offset[i]; 144 if (pivotinblocks) { 145 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 146 } else { 147 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 148 } 149 } 150 151 ierr = PetscFree(rtmp);CHKERRQ(ierr); 152 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 153 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 154 C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 155 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 156 C->assembled = PETSC_TRUE; 157 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 158 PetscFunctionReturn(0); 
159 } 160 161 /* MatLUFactorNumeric_SeqBAIJ_4 - 162 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 163 Kernel_A_gets_A_times_B() 164 Kernel_A_gets_A_minus_B_times_C() 165 Kernel_A_gets_inverse_A() 166 */ 167 168 #undef __FUNCT__ 169 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4" 170 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info) 171 { 172 Mat C=B; 173 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 174 IS isrow = b->row,isicol = b->icol; 175 PetscErrorCode ierr; 176 const PetscInt *r,*ic; 177 PetscInt i,j,k,nz,nzL,row; 178 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 179 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 180 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 181 PetscInt flg; 182 PetscReal shift; 183 184 PetscFunctionBegin; 185 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 186 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 187 188 if (info->shifttype == MAT_SHIFT_NONE){ 189 shift = 0; 190 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 191 shift = info->shiftamount; 192 } 193 194 /* generate work space needed by the factorization */ 195 ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 196 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 197 198 for (i=0; i<n; i++){ 199 /* zero rtmp */ 200 /* L part */ 201 nz = bi[i+1] - bi[i]; 202 bjtmp = bj + bi[i]; 203 for (j=0; j<nz; j++){ 204 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 205 } 206 207 /* U part */ 208 nz = bdiag[i]-bdiag[i+1]; 209 bjtmp = bj + bdiag[i+1]+1; 210 for (j=0; j<nz; j++){ 211 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 212 } 213 214 /* load in initial (unfactored row) */ 215 nz = ai[r[i]+1] - ai[r[i]]; 216 ajtmp = aj + ai[r[i]]; 217 v = aa + bs2*ai[r[i]]; 218 for (j=0; j<nz; j++) { 219 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 
220 } 221 222 /* elimination */ 223 bjtmp = bj + bi[i]; 224 nzL = bi[i+1] - bi[i]; 225 for(k=0;k < nzL;k++) { 226 row = bjtmp[k]; 227 pc = rtmp + bs2*row; 228 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 229 if (flg) { 230 pv = b->a + bs2*bdiag[row]; 231 /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 232 ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 233 234 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 235 pv = b->a + bs2*(bdiag[row+1]+1); 236 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 237 for (j=0; j<nz; j++) { 238 /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 239 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 240 v = rtmp + bs2*pj[j]; 241 ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 242 pv += bs2; 243 } 244 ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 245 } 246 } 247 248 /* finished row so stick it into b->a */ 249 /* L part */ 250 pv = b->a + bs2*bi[i] ; 251 pj = b->j + bi[i] ; 252 nz = bi[i+1] - bi[i]; 253 for (j=0; j<nz; j++) { 254 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 255 } 256 257 /* Mark diagonal and invert diagonal for simplier triangular solves */ 258 pv = b->a + bs2*bdiag[i]; 259 pj = b->j + bdiag[i]; 260 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 261 ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); 262 263 /* U part */ 264 pv = b->a + bs2*(bdiag[i+1]+1); 265 pj = b->j + bdiag[i+1]+1; 266 nz = bdiag[i] - bdiag[i+1] - 1; 267 for (j=0; j<nz; j++){ 268 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 269 } 270 } 271 272 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 273 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 274 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 275 C->ops->solve = MatSolve_SeqBAIJ_4; 276 C->ops->solvetranspose = 
MatSolveTranspose_SeqBAIJ_4; 277 C->assembled = PETSC_TRUE; 278 ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 279 PetscFunctionReturn(0); 280 } 281 282 #undef __FUNCT__ 283 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace" 284 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info) 285 { 286 /* 287 Default Version for when blocks are 4 by 4 Using natural ordering 288 */ 289 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 290 PetscErrorCode ierr; 291 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 292 PetscInt *ajtmpold,*ajtmp,nz,row; 293 PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 294 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 295 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 296 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 297 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 298 MatScalar m13,m14,m15,m16; 299 MatScalar *ba = b->a,*aa = a->a; 300 PetscTruth pivotinblocks = b->pivotinblocks; 301 PetscReal shift = info->shiftamount; 302 303 PetscFunctionBegin; 304 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 305 306 for (i=0; i<n; i++) { 307 nz = bi[i+1] - bi[i]; 308 ajtmp = bj + bi[i]; 309 for (j=0; j<nz; j++) { 310 x = rtmp+16*ajtmp[j]; 311 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 312 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 313 } 314 /* load in initial (unfactored row) */ 315 nz = ai[i+1] - ai[i]; 316 ajtmpold = aj + ai[i]; 317 v = aa + 16*ai[i]; 318 for (j=0; j<nz; j++) { 319 x = rtmp+16*ajtmpold[j]; 320 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 321 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 322 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 323 x[14] = v[14]; x[15] = v[15]; 324 v += 16; 325 } 326 row = *ajtmp++; 327 while (row < i) { 328 pc = rtmp + 
16*row; 329 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 330 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 331 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 332 p15 = pc[14]; p16 = pc[15]; 333 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 334 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 335 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 336 || p16 != 0.0) { 337 pv = ba + 16*diag_offset[row]; 338 pj = bj + diag_offset[row] + 1; 339 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 340 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 341 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 342 x15 = pv[14]; x16 = pv[15]; 343 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 344 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 345 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 346 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 347 348 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 349 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 350 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 351 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 352 353 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 354 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 355 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 356 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 357 358 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 359 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 360 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 361 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 362 nz = bi[row+1] - diag_offset[row] - 1; 363 pv += 16; 364 for (j=0; j<nz; j++) { 365 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 366 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 367 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 368 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 369 x = rtmp + 16*pj[j]; 370 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 371 x[1] -= m2*x1 + 
m6*x2 + m10*x3 + m14*x4; 372 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 373 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 374 375 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 376 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 377 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 378 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 379 380 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 381 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 382 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 383 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 384 385 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 386 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 387 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 388 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 389 390 pv += 16; 391 } 392 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 393 } 394 row = *ajtmp++; 395 } 396 /* finished row so stick it into b->a */ 397 pv = ba + 16*bi[i]; 398 pj = bj + bi[i]; 399 nz = bi[i+1] - bi[i]; 400 for (j=0; j<nz; j++) { 401 x = rtmp+16*pj[j]; 402 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 403 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 404 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 405 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 406 pv += 16; 407 } 408 /* invert diagonal block */ 409 w = ba + 16*diag_offset[i]; 410 if (pivotinblocks) { 411 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 412 } else { 413 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 414 } 415 } 416 417 ierr = PetscFree(rtmp);CHKERRQ(ierr); 418 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 419 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 420 C->assembled = PETSC_TRUE; 421 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 422 PetscFunctionReturn(0); 423 } 424 425 /* 426 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 427 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 428 */ 429 #undef 
__FUNCT__ 430 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering" 431 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 432 { 433 Mat C=B; 434 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 435 PetscErrorCode ierr; 436 PetscInt i,j,k,nz,nzL,row; 437 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 438 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 439 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 440 PetscInt flg; 441 PetscReal shift; 442 443 PetscFunctionBegin; 444 /* generate work space needed by the factorization */ 445 ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 446 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 447 448 if (info->shifttype == MAT_SHIFT_NONE){ 449 shift = 0; 450 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 451 shift = info->shiftamount; 452 } 453 454 for (i=0; i<n; i++){ 455 /* zero rtmp */ 456 /* L part */ 457 nz = bi[i+1] - bi[i]; 458 bjtmp = bj + bi[i]; 459 for (j=0; j<nz; j++){ 460 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 461 } 462 463 /* U part */ 464 nz = bdiag[i] - bdiag[i+1]; 465 bjtmp = bj + bdiag[i+1]+1; 466 for (j=0; j<nz; j++){ 467 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 468 } 469 470 /* load in initial (unfactored row) */ 471 nz = ai[i+1] - ai[i]; 472 ajtmp = aj + ai[i]; 473 v = aa + bs2*ai[i]; 474 for (j=0; j<nz; j++) { 475 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 476 } 477 478 /* elimination */ 479 bjtmp = bj + bi[i]; 480 nzL = bi[i+1] - bi[i]; 481 for(k=0;k < nzL;k++) { 482 row = bjtmp[k]; 483 pc = rtmp + bs2*row; 484 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 485 if (flg) { 486 pv = b->a + bs2*bdiag[row]; 487 /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 488 ierr = 
Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 489 490 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 491 pv = b->a + bs2*(bdiag[row+1]+1); 492 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 493 for (j=0; j<nz; j++) { 494 /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 495 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 496 v = rtmp + bs2*pj[j]; 497 ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 498 pv += bs2; 499 } 500 ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 501 } 502 } 503 504 /* finished row so stick it into b->a */ 505 /* L part */ 506 pv = b->a + bs2*bi[i] ; 507 pj = b->j + bi[i] ; 508 nz = bi[i+1] - bi[i]; 509 for (j=0; j<nz; j++) { 510 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 511 } 512 513 /* Mark diagonal and invert diagonal for simplier triangular solves */ 514 pv = b->a + bs2*bdiag[i]; 515 pj = b->j + bdiag[i]; 516 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 517 ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); 518 519 /* U part */ 520 pv = b->a + bs2*(bdiag[i+1]+1); 521 pj = b->j + bdiag[i+1]+1; 522 nz = bdiag[i] - bdiag[i+1] - 1; 523 for (j=0; j<nz; j++){ 524 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 525 } 526 } 527 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 528 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 529 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 530 C->assembled = PETSC_TRUE; 531 ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 532 PetscFunctionReturn(0); 533 } 534 535 #if defined(PETSC_HAVE_SSE) 536 537 #include PETSC_HAVE_SSE 538 539 /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 540 #undef __FUNCT__ 541 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE" 542 
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info) 543 { 544 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 545 PetscErrorCode ierr; 546 int i,j,n = a->mbs; 547 int *bj = b->j,*bjtmp,*pj; 548 int row; 549 int *ajtmpold,nz,*bi=b->i; 550 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 551 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 552 MatScalar *ba = b->a,*aa = a->a; 553 int nonzero=0; 554 /* int nonzero=0,colscale = 16; */ 555 PetscTruth pivotinblocks = b->pivotinblocks; 556 PetscReal shift = info->shiftamount; 557 558 PetscFunctionBegin; 559 SSE_SCOPE_BEGIN; 560 561 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 562 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 563 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 564 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 565 /* if ((unsigned long)bj==(unsigned long)aj) { */ 566 /* colscale = 4; */ 567 /* } */ 568 for (i=0; i<n; i++) { 569 nz = bi[i+1] - bi[i]; 570 bjtmp = bj + bi[i]; 571 /* zero out the 4x4 block accumulators */ 572 /* zero out one register */ 573 XOR_PS(XMM7,XMM7); 574 for (j=0; j<nz; j++) { 575 x = rtmp+16*bjtmp[j]; 576 /* x = rtmp+4*bjtmp[j]; */ 577 SSE_INLINE_BEGIN_1(x) 578 /* Copy zero register to memory locations */ 579 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 580 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 581 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 582 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 583 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 584 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 585 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 586 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 587 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 588 SSE_INLINE_END_1; 589 } 590 /* load in initial (unfactored row) */ 591 nz = ai[i+1] - ai[i]; 592 ajtmpold = aj + ai[i]; 593 v = aa + 16*ai[i]; 594 for (j=0; j<nz; j++) { 595 x = rtmp+16*ajtmpold[j]; 596 /* x = rtmp+colscale*ajtmpold[j]; */ 597 /* Copy v block into x block */ 598 SSE_INLINE_BEGIN_2(v,x) 599 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 600 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 601 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 602 603 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 604 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 605 606 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 607 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 608 609 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 610 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 611 612 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 613 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 614 615 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 616 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 617 618 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 619 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 620 621 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 622 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 623 SSE_INLINE_END_2; 624 
625 v += 16; 626 } 627 /* row = (*bjtmp++)/4; */ 628 row = *bjtmp++; 629 while (row < i) { 630 pc = rtmp + 16*row; 631 SSE_INLINE_BEGIN_1(pc) 632 /* Load block from lower triangle */ 633 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 634 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 635 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 636 637 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 638 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 639 640 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 641 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 642 643 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 644 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 645 646 /* Compare block to zero block */ 647 648 SSE_COPY_PS(XMM4,XMM7) 649 SSE_CMPNEQ_PS(XMM4,XMM0) 650 651 SSE_COPY_PS(XMM5,XMM7) 652 SSE_CMPNEQ_PS(XMM5,XMM1) 653 654 SSE_COPY_PS(XMM6,XMM7) 655 SSE_CMPNEQ_PS(XMM6,XMM2) 656 657 SSE_CMPNEQ_PS(XMM7,XMM3) 658 659 /* Reduce the comparisons to one SSE register */ 660 SSE_OR_PS(XMM6,XMM7) 661 SSE_OR_PS(XMM5,XMM4) 662 SSE_OR_PS(XMM5,XMM6) 663 SSE_INLINE_END_1; 664 665 /* Reduce the one SSE register to an integer register for branching */ 666 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 667 MOVEMASK(nonzero,XMM5); 668 669 /* If block is nonzero ... 
*/ 670 if (nonzero) { 671 pv = ba + 16*diag_offset[row]; 672 PREFETCH_L1(&pv[16]); 673 pj = bj + diag_offset[row] + 1; 674 675 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 676 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 677 /* but the diagonal was inverted already */ 678 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 679 680 SSE_INLINE_BEGIN_2(pv,pc) 681 /* Column 0, product is accumulated in XMM4 */ 682 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 683 SSE_SHUFFLE(XMM4,XMM4,0x00) 684 SSE_MULT_PS(XMM4,XMM0) 685 686 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 687 SSE_SHUFFLE(XMM5,XMM5,0x00) 688 SSE_MULT_PS(XMM5,XMM1) 689 SSE_ADD_PS(XMM4,XMM5) 690 691 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 692 SSE_SHUFFLE(XMM6,XMM6,0x00) 693 SSE_MULT_PS(XMM6,XMM2) 694 SSE_ADD_PS(XMM4,XMM6) 695 696 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 697 SSE_SHUFFLE(XMM7,XMM7,0x00) 698 SSE_MULT_PS(XMM7,XMM3) 699 SSE_ADD_PS(XMM4,XMM7) 700 701 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 702 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 703 704 /* Column 1, product is accumulated in XMM5 */ 705 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 706 SSE_SHUFFLE(XMM5,XMM5,0x00) 707 SSE_MULT_PS(XMM5,XMM0) 708 709 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 710 SSE_SHUFFLE(XMM6,XMM6,0x00) 711 SSE_MULT_PS(XMM6,XMM1) 712 SSE_ADD_PS(XMM5,XMM6) 713 714 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 715 SSE_SHUFFLE(XMM7,XMM7,0x00) 716 SSE_MULT_PS(XMM7,XMM2) 717 SSE_ADD_PS(XMM5,XMM7) 718 719 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 720 SSE_SHUFFLE(XMM6,XMM6,0x00) 721 SSE_MULT_PS(XMM6,XMM3) 722 SSE_ADD_PS(XMM5,XMM6) 723 724 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 725 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 726 727 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 728 729 /* Column 2, product is accumulated in XMM6 */ 730 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 731 SSE_SHUFFLE(XMM6,XMM6,0x00) 732 SSE_MULT_PS(XMM6,XMM0) 733 734 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 735 SSE_SHUFFLE(XMM7,XMM7,0x00) 736 SSE_MULT_PS(XMM7,XMM1) 737 SSE_ADD_PS(XMM6,XMM7) 738 739 
SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 740 SSE_SHUFFLE(XMM7,XMM7,0x00) 741 SSE_MULT_PS(XMM7,XMM2) 742 SSE_ADD_PS(XMM6,XMM7) 743 744 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 745 SSE_SHUFFLE(XMM7,XMM7,0x00) 746 SSE_MULT_PS(XMM7,XMM3) 747 SSE_ADD_PS(XMM6,XMM7) 748 749 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 750 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 751 752 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 753 /* Column 3, product is accumulated in XMM0 */ 754 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 755 SSE_SHUFFLE(XMM7,XMM7,0x00) 756 SSE_MULT_PS(XMM0,XMM7) 757 758 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 759 SSE_SHUFFLE(XMM7,XMM7,0x00) 760 SSE_MULT_PS(XMM1,XMM7) 761 SSE_ADD_PS(XMM0,XMM1) 762 763 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 764 SSE_SHUFFLE(XMM1,XMM1,0x00) 765 SSE_MULT_PS(XMM1,XMM2) 766 SSE_ADD_PS(XMM0,XMM1) 767 768 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 769 SSE_SHUFFLE(XMM7,XMM7,0x00) 770 SSE_MULT_PS(XMM3,XMM7) 771 SSE_ADD_PS(XMM0,XMM3) 772 773 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 774 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 775 776 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 777 /* This is code to be maintained and read by humans afterall. 
*/ 778 /* Copy Multiplier Col 3 into XMM3 */ 779 SSE_COPY_PS(XMM3,XMM0) 780 /* Copy Multiplier Col 2 into XMM2 */ 781 SSE_COPY_PS(XMM2,XMM6) 782 /* Copy Multiplier Col 1 into XMM1 */ 783 SSE_COPY_PS(XMM1,XMM5) 784 /* Copy Multiplier Col 0 into XMM0 */ 785 SSE_COPY_PS(XMM0,XMM4) 786 SSE_INLINE_END_2; 787 788 /* Update the row: */ 789 nz = bi[row+1] - diag_offset[row] - 1; 790 pv += 16; 791 for (j=0; j<nz; j++) { 792 PREFETCH_L1(&pv[16]); 793 x = rtmp + 16*pj[j]; 794 /* x = rtmp + 4*pj[j]; */ 795 796 /* X:=X-M*PV, One column at a time */ 797 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 798 SSE_INLINE_BEGIN_2(x,pv) 799 /* Load First Column of X*/ 800 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 801 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 802 803 /* Matrix-Vector Product: */ 804 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 805 SSE_SHUFFLE(XMM5,XMM5,0x00) 806 SSE_MULT_PS(XMM5,XMM0) 807 SSE_SUB_PS(XMM4,XMM5) 808 809 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 810 SSE_SHUFFLE(XMM6,XMM6,0x00) 811 SSE_MULT_PS(XMM6,XMM1) 812 SSE_SUB_PS(XMM4,XMM6) 813 814 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 815 SSE_SHUFFLE(XMM7,XMM7,0x00) 816 SSE_MULT_PS(XMM7,XMM2) 817 SSE_SUB_PS(XMM4,XMM7) 818 819 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 820 SSE_SHUFFLE(XMM5,XMM5,0x00) 821 SSE_MULT_PS(XMM5,XMM3) 822 SSE_SUB_PS(XMM4,XMM5) 823 824 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 825 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 826 827 /* Second Column */ 828 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 829 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 830 831 /* Matrix-Vector Product: */ 832 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 833 SSE_SHUFFLE(XMM6,XMM6,0x00) 834 SSE_MULT_PS(XMM6,XMM0) 835 SSE_SUB_PS(XMM5,XMM6) 836 837 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 838 SSE_SHUFFLE(XMM7,XMM7,0x00) 839 SSE_MULT_PS(XMM7,XMM1) 840 SSE_SUB_PS(XMM5,XMM7) 841 842 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 843 SSE_SHUFFLE(XMM4,XMM4,0x00) 844 SSE_MULT_PS(XMM4,XMM2) 845 SSE_SUB_PS(XMM5,XMM4) 846 847 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 848 SSE_SHUFFLE(XMM6,XMM6,0x00) 
849 SSE_MULT_PS(XMM6,XMM3) 850 SSE_SUB_PS(XMM5,XMM6) 851 852 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 853 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 854 855 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 856 857 /* Third Column */ 858 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 859 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 860 861 /* Matrix-Vector Product: */ 862 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 863 SSE_SHUFFLE(XMM7,XMM7,0x00) 864 SSE_MULT_PS(XMM7,XMM0) 865 SSE_SUB_PS(XMM6,XMM7) 866 867 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 868 SSE_SHUFFLE(XMM4,XMM4,0x00) 869 SSE_MULT_PS(XMM4,XMM1) 870 SSE_SUB_PS(XMM6,XMM4) 871 872 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 873 SSE_SHUFFLE(XMM5,XMM5,0x00) 874 SSE_MULT_PS(XMM5,XMM2) 875 SSE_SUB_PS(XMM6,XMM5) 876 877 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 878 SSE_SHUFFLE(XMM7,XMM7,0x00) 879 SSE_MULT_PS(XMM7,XMM3) 880 SSE_SUB_PS(XMM6,XMM7) 881 882 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 883 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 884 885 /* Fourth Column */ 886 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 887 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 888 889 /* Matrix-Vector Product: */ 890 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 891 SSE_SHUFFLE(XMM5,XMM5,0x00) 892 SSE_MULT_PS(XMM5,XMM0) 893 SSE_SUB_PS(XMM4,XMM5) 894 895 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 896 SSE_SHUFFLE(XMM6,XMM6,0x00) 897 SSE_MULT_PS(XMM6,XMM1) 898 SSE_SUB_PS(XMM4,XMM6) 899 900 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 901 SSE_SHUFFLE(XMM7,XMM7,0x00) 902 SSE_MULT_PS(XMM7,XMM2) 903 SSE_SUB_PS(XMM4,XMM7) 904 905 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 906 SSE_SHUFFLE(XMM5,XMM5,0x00) 907 SSE_MULT_PS(XMM5,XMM3) 908 SSE_SUB_PS(XMM4,XMM5) 909 910 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 911 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 912 SSE_INLINE_END_2; 913 pv += 16; 914 } 915 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 916 } 917 row = *bjtmp++; 918 /* row = (*bjtmp++)/4; */ 919 } 920 /* finished row so stick it into b->a */ 921 pv = ba + 16*bi[i]; 922 pj = bj + bi[i]; 923 nz = bi[i+1] - bi[i]; 924 925 /* Copy x block back into 
pv block */ 926 for (j=0; j<nz; j++) { 927 x = rtmp+16*pj[j]; 928 /* x = rtmp+4*pj[j]; */ 929 930 SSE_INLINE_BEGIN_2(x,pv) 931 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 932 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 933 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 934 935 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 936 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 937 938 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 939 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 940 941 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 942 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 943 944 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 945 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 946 947 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 948 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 949 950 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 951 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 952 953 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 954 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 955 SSE_INLINE_END_2; 956 pv += 16; 957 } 958 /* invert diagonal block */ 959 w = ba + 16*diag_offset[i]; 960 if (pivotinblocks) { 961 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 962 } else { 963 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 964 } 965 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 966 /* Note: Using Kramer's rule, flop count below might be infairly high or low? 
*/
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;
  /* NOTE(review): 'bs' and 'bs2' are not declared in any function visible in this file chunk --
     confirm they are defined at file scope, otherwise this does not compile. */
  ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

/*
   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace - numeric LU
   factorization for a SeqBAIJ matrix with 4x4 blocks and natural ordering,
   using single-precision SSE kernels, for the in-place case: the factor C is
   the input matrix itself (A==C) and the column indices are stored as
   unsigned short ("usj").  Dispatched to from the out-of-place variant below
   when b->j and a->j are the same array.

   Per block-row i: scatter the row's blocks into the dense work row rtmp,
   eliminate against every prior pivot row with a nonzero block in this row
   (multiplier = block * inverted pivot diagonal), gather the result back into
   b->a, and invert the 4x4 diagonal block so later rows can multiply by it.
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
{
  /* In-place: the "input" matrix A is the factor C itself */
  Mat            A = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  /* column indices are stored as unsigned short in this "usj" variant */
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  /* NOTE(review): 'info' is not a parameter of this function and no local/visible
     declaration provides it -- this cannot compile as written.  The out-of-place
     variant receives 'const MatFactorInfo *info'; the shift amount must be passed
     in (or stored on the factor) for this in-place entry point.  Confirm. */
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  /* The SSE kernels use 16-byte aligned packed loads/stores; fail hard on misalignment */
  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  /* rtmp is the dense work row: one 16-entry (4x4) block per block column */
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)ajtmp[j]);
      /* x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    /* eliminate against every already-factored row that appears before the diagonal */
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X*/
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;
  /* NOTE(review): 'bs' and 'bs2' are not declared in this function -- confirm file-scope definitions exist */
  ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

/*
   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj - out-of-place variant
   of the factorization above (separate input matrix A and factor C); A's
   column indices are plain int while the factor's are unsigned short.
   Dispatches to the Inplace variant when both matrices share the same column
   index array.
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  /* factor column indices are unsigned short; A's are plain int (ajtmpold) */
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned int   row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  /* The SSE kernels use 16-byte aligned packed loads/stores; fail hard on misalignment */
  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  /* Same column-index array for A and the factor means the factorization is in place */
  if ((unsigned long)bj==(unsigned long)aj) {
    /* NOTE(review): bare 'return' after PetscFunctionBegin bypasses PetscFunctionReturn --
       confirm whether PETSc call-stack balancing is required here */
    return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
  }

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    /* eliminate against every already-factored row that appears before the diagonal */
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X*/
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;
  /* NOTE(review): 'bs' and 'bs2' are not declared in this function -- confirm file-scope definitions exist */
  ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif