#define PETSCMAT_DLL

/*
    Factorization code for BAIJ format.
*/
#include "../src/mat/impls/baij/seq/baij.h"
#include "../src/mat/blockinvert.h"

/* ------------------------------------------------------------*/
/*
   Version for when blocks are 4 by 4
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8] = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv = ba + 16*diag_offset[row];
        pj = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
   MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       Kernel_A_gets_A_times_B()
       Kernel_A_gets_A_minus_B_times_C()
       Kernel_A_gets_inverse_A()
*/

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic,*ics;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
  ics  = ic;

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; } }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  /*
     Default Version for when blocks are 4 by 4 Using natural ordering
  */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmpold[j];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8] = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv = ba + 16*diag_offset[row];
        pj = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; } }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B; /* assumed alias, matching the non-SSE variants; the body below refers to C but the original declared no C */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int            nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.
SSE will not work."); 566 /* if ((unsigned long)bj==(unsigned long)aj) { */ 567 /* colscale = 4; */ 568 /* } */ 569 for (i=0; i<n; i++) { 570 nz = bi[i+1] - bi[i]; 571 bjtmp = bj + bi[i]; 572 /* zero out the 4x4 block accumulators */ 573 /* zero out one register */ 574 XOR_PS(XMM7,XMM7); 575 for (j=0; j<nz; j++) { 576 x = rtmp+16*bjtmp[j]; 577 /* x = rtmp+4*bjtmp[j]; */ 578 SSE_INLINE_BEGIN_1(x) 579 /* Copy zero register to memory locations */ 580 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 581 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 582 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 583 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 584 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 585 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 586 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 587 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 588 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 589 SSE_INLINE_END_1; 590 } 591 /* load in initial (unfactored row) */ 592 nz = ai[i+1] - ai[i]; 593 ajtmpold = aj + ai[i]; 594 v = aa + 16*ai[i]; 595 for (j=0; j<nz; j++) { 596 x = rtmp+16*ajtmpold[j]; 597 /* x = rtmp+colscale*ajtmpold[j]; */ 598 /* Copy v block into x block */ 599 SSE_INLINE_BEGIN_2(v,x) 600 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 601 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 602 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 603 604 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 605 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 606 607 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 608 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 609 610 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 611 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 612 613 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 614 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 615 616 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 617 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 618 619 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 620 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 621 622 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 623 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 624 SSE_INLINE_END_2; 625 626 v += 16; 627 } 628 /* row = (*bjtmp++)/4; */ 629 row = *bjtmp++; 630 while (row < i) { 631 pc = rtmp + 16*row; 632 SSE_INLINE_BEGIN_1(pc) 633 /* Load block from lower triangle */ 634 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 635 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 636 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 637 638 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 639 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 640 641 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 642 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 643 644 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 645 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 646 647 /* Compare block to zero block */ 648 649 SSE_COPY_PS(XMM4,XMM7) 650 SSE_CMPNEQ_PS(XMM4,XMM0) 651 652 SSE_COPY_PS(XMM5,XMM7) 653 SSE_CMPNEQ_PS(XMM5,XMM1) 654 655 SSE_COPY_PS(XMM6,XMM7) 656 SSE_CMPNEQ_PS(XMM6,XMM2) 657 658 SSE_CMPNEQ_PS(XMM7,XMM3) 659 660 /* Reduce the comparisons to one SSE register */ 661 SSE_OR_PS(XMM6,XMM7) 662 SSE_OR_PS(XMM5,XMM4) 663 SSE_OR_PS(XMM5,XMM6) 664 SSE_INLINE_END_1; 665 666 /* Reduce the one SSE register to an integer register for branching */ 667 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 668 MOVEMASK(nonzero,XMM5); 669 670 /* If block is nonzero ... 
*/ 671 if (nonzero) { 672 pv = ba + 16*diag_offset[row]; 673 PREFETCH_L1(&pv[16]); 674 pj = bj + diag_offset[row] + 1; 675 676 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 677 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 678 /* but the diagonal was inverted already */ 679 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 680 681 SSE_INLINE_BEGIN_2(pv,pc) 682 /* Column 0, product is accumulated in XMM4 */ 683 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 684 SSE_SHUFFLE(XMM4,XMM4,0x00) 685 SSE_MULT_PS(XMM4,XMM0) 686 687 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 688 SSE_SHUFFLE(XMM5,XMM5,0x00) 689 SSE_MULT_PS(XMM5,XMM1) 690 SSE_ADD_PS(XMM4,XMM5) 691 692 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 693 SSE_SHUFFLE(XMM6,XMM6,0x00) 694 SSE_MULT_PS(XMM6,XMM2) 695 SSE_ADD_PS(XMM4,XMM6) 696 697 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 698 SSE_SHUFFLE(XMM7,XMM7,0x00) 699 SSE_MULT_PS(XMM7,XMM3) 700 SSE_ADD_PS(XMM4,XMM7) 701 702 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 703 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 704 705 /* Column 1, product is accumulated in XMM5 */ 706 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 707 SSE_SHUFFLE(XMM5,XMM5,0x00) 708 SSE_MULT_PS(XMM5,XMM0) 709 710 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 711 SSE_SHUFFLE(XMM6,XMM6,0x00) 712 SSE_MULT_PS(XMM6,XMM1) 713 SSE_ADD_PS(XMM5,XMM6) 714 715 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 716 SSE_SHUFFLE(XMM7,XMM7,0x00) 717 SSE_MULT_PS(XMM7,XMM2) 718 SSE_ADD_PS(XMM5,XMM7) 719 720 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 721 SSE_SHUFFLE(XMM6,XMM6,0x00) 722 SSE_MULT_PS(XMM6,XMM3) 723 SSE_ADD_PS(XMM5,XMM6) 724 725 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 726 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 727 728 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 729 730 /* Column 2, product is accumulated in XMM6 */ 731 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 732 SSE_SHUFFLE(XMM6,XMM6,0x00) 733 SSE_MULT_PS(XMM6,XMM0) 734 735 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 736 SSE_SHUFFLE(XMM7,XMM7,0x00) 737 SSE_MULT_PS(XMM7,XMM1) 738 SSE_ADD_PS(XMM6,XMM7) 739 740 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 741 SSE_SHUFFLE(XMM7,XMM7,0x00) 742 SSE_MULT_PS(XMM7,XMM2) 743 SSE_ADD_PS(XMM6,XMM7) 744 745 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 746 SSE_SHUFFLE(XMM7,XMM7,0x00) 747 SSE_MULT_PS(XMM7,XMM3) 748 SSE_ADD_PS(XMM6,XMM7) 749 750 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 751 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 752 753 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 754 /* Column 3, product is accumulated in XMM0 */ 755 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 756 SSE_SHUFFLE(XMM7,XMM7,0x00) 757 SSE_MULT_PS(XMM0,XMM7) 758 759 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 760 SSE_SHUFFLE(XMM7,XMM7,0x00) 761 SSE_MULT_PS(XMM1,XMM7) 762 SSE_ADD_PS(XMM0,XMM1) 763 764 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 765 SSE_SHUFFLE(XMM1,XMM1,0x00) 766 SSE_MULT_PS(XMM1,XMM2) 767 SSE_ADD_PS(XMM0,XMM1) 768 769 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 770 SSE_SHUFFLE(XMM7,XMM7,0x00) 771 SSE_MULT_PS(XMM3,XMM7) 772 SSE_ADD_PS(XMM0,XMM3) 773 774 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 775 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 776 777 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 778 /* This is code to be maintained and read by humans afterall. 
*/ 779 /* Copy Multiplier Col 3 into XMM3 */ 780 SSE_COPY_PS(XMM3,XMM0) 781 /* Copy Multiplier Col 2 into XMM2 */ 782 SSE_COPY_PS(XMM2,XMM6) 783 /* Copy Multiplier Col 1 into XMM1 */ 784 SSE_COPY_PS(XMM1,XMM5) 785 /* Copy Multiplier Col 0 into XMM0 */ 786 SSE_COPY_PS(XMM0,XMM4) 787 SSE_INLINE_END_2; 788 789 /* Update the row: */ 790 nz = bi[row+1] - diag_offset[row] - 1; 791 pv += 16; 792 for (j=0; j<nz; j++) { 793 PREFETCH_L1(&pv[16]); 794 x = rtmp + 16*pj[j]; 795 /* x = rtmp + 4*pj[j]; */ 796 797 /* X:=X-M*PV, One column at a time */ 798 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 799 SSE_INLINE_BEGIN_2(x,pv) 800 /* Load First Column of X*/ 801 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 802 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 803 804 /* Matrix-Vector Product: */ 805 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 806 SSE_SHUFFLE(XMM5,XMM5,0x00) 807 SSE_MULT_PS(XMM5,XMM0) 808 SSE_SUB_PS(XMM4,XMM5) 809 810 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 811 SSE_SHUFFLE(XMM6,XMM6,0x00) 812 SSE_MULT_PS(XMM6,XMM1) 813 SSE_SUB_PS(XMM4,XMM6) 814 815 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 816 SSE_SHUFFLE(XMM7,XMM7,0x00) 817 SSE_MULT_PS(XMM7,XMM2) 818 SSE_SUB_PS(XMM4,XMM7) 819 820 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 821 SSE_SHUFFLE(XMM5,XMM5,0x00) 822 SSE_MULT_PS(XMM5,XMM3) 823 SSE_SUB_PS(XMM4,XMM5) 824 825 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 826 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 827 828 /* Second Column */ 829 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 830 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 831 832 /* Matrix-Vector Product: */ 833 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 834 SSE_SHUFFLE(XMM6,XMM6,0x00) 835 SSE_MULT_PS(XMM6,XMM0) 836 SSE_SUB_PS(XMM5,XMM6) 837 838 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 839 SSE_SHUFFLE(XMM7,XMM7,0x00) 840 SSE_MULT_PS(XMM7,XMM1) 841 SSE_SUB_PS(XMM5,XMM7) 842 843 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 844 SSE_SHUFFLE(XMM4,XMM4,0x00) 845 SSE_MULT_PS(XMM4,XMM2) 846 SSE_SUB_PS(XMM5,XMM4) 847 848 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 849 SSE_SHUFFLE(XMM6,XMM6,0x00) 850 SSE_MULT_PS(XMM6,XMM3) 851 SSE_SUB_PS(XMM5,XMM6) 852 853 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 854 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 855 856 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 857 858 /* Third Column */ 859 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 860 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 861 862 /* Matrix-Vector Product: */ 863 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 864 SSE_SHUFFLE(XMM7,XMM7,0x00) 865 SSE_MULT_PS(XMM7,XMM0) 866 SSE_SUB_PS(XMM6,XMM7) 867 868 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 869 SSE_SHUFFLE(XMM4,XMM4,0x00) 870 SSE_MULT_PS(XMM4,XMM1) 871 SSE_SUB_PS(XMM6,XMM4) 872 873 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 874 SSE_SHUFFLE(XMM5,XMM5,0x00) 875 SSE_MULT_PS(XMM5,XMM2) 876 SSE_SUB_PS(XMM6,XMM5) 877 878 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 879 SSE_SHUFFLE(XMM7,XMM7,0x00) 880 SSE_MULT_PS(XMM7,XMM3) 881 SSE_SUB_PS(XMM6,XMM7) 882 883 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 884 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 885 886 /* Fourth Column */ 887 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 888 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 889 890 /* Matrix-Vector Product: */ 891 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 892 SSE_SHUFFLE(XMM5,XMM5,0x00) 893 SSE_MULT_PS(XMM5,XMM0) 894 SSE_SUB_PS(XMM4,XMM5) 895 896 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 897 SSE_SHUFFLE(XMM6,XMM6,0x00) 898 SSE_MULT_PS(XMM6,XMM1) 899 SSE_SUB_PS(XMM4,XMM6) 900 901 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 902 SSE_SHUFFLE(XMM7,XMM7,0x00) 903 SSE_MULT_PS(XMM7,XMM2) 904 SSE_SUB_PS(XMM4,XMM7) 905 906 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 907 SSE_SHUFFLE(XMM5,XMM5,0x00) 908 
SSE_MULT_PS(XMM5,XMM3) 909 SSE_SUB_PS(XMM4,XMM5) 910 911 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 912 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 913 SSE_INLINE_END_2; 914 pv += 16; 915 } 916 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 917 } 918 row = *bjtmp++; 919 /* row = (*bjtmp++)/4; */ 920 } 921 /* finished row so stick it into b->a */ 922 pv = ba + 16*bi[i]; 923 pj = bj + bi[i]; 924 nz = bi[i+1] - bi[i]; 925 926 /* Copy x block back into pv block */ 927 for (j=0; j<nz; j++) { 928 x = rtmp+16*pj[j]; 929 /* x = rtmp+4*pj[j]; */ 930 931 SSE_INLINE_BEGIN_2(x,pv) 932 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 933 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 934 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 935 936 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 937 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 938 939 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 940 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 941 942 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 943 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 944 945 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 946 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 947 948 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 949 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 950 951 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 952 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 953 954 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 955 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 956 SSE_INLINE_END_2; 957 pv += 16; 958 } 959 /* invert diagonal block */ 960 w = ba + 16*diag_offset[i]; 961 if (pivotinblocks) { 962 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 963 } else { 964 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 965 } 966 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 967 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 968 } 969 970 ierr = PetscFree(rtmp);CHKERRQ(ierr); 971 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 972 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 973 C->assembled = PETSC_TRUE; 974 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 975 /* Flop Count from inverting diagonal blocks */ 976 SSE_SCOPE_END; 977 PetscFunctionReturn(0); 978 } 979 980 #undef __FUNCT__ 981 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace" 982 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 983 { 984 Mat A=C; 985 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 986 PetscErrorCode ierr; 987 int i,j,n = a->mbs; 988 unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 989 unsigned short *aj = (unsigned short *)(a->j),*ajtmp; 990 unsigned int row; 991 int nz,*bi=b->i; 992 int *diag_offset = b->diag,*ai=a->i; 993 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 994 MatScalar *ba = b->a,*aa = a->a; 995 int nonzero=0; 996 /* int nonzero=0,colscale = 16; */ 997 PetscTruth pivotinblocks = b->pivotinblocks; 998 PetscReal shift = info->shiftamount; 999 1000 PetscFunctionBegin; 1001 SSE_SCOPE_BEGIN; 1002 1003 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1004 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1005 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1006 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 1007 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1008 /* colscale = 4; */ 1009 /* } */ 1010 1011 for (i=0; i<n; i++) { 1012 nz = bi[i+1] - bi[i]; 1013 bjtmp = bj + bi[i]; 1014 /* zero out the 4x4 block accumulators */ 1015 /* zero out one register */ 1016 XOR_PS(XMM7,XMM7); 1017 for (j=0; j<nz; j++) { 1018 x = rtmp+16*((unsigned int)bjtmp[j]); 1019 /* x = rtmp+4*bjtmp[j]; */ 1020 SSE_INLINE_BEGIN_1(x) 1021 /* Copy zero register to memory locations */ 1022 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1023 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1024 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1025 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1026 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1027 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1028 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1029 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1030 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1031 SSE_INLINE_END_1; 1032 } 1033 /* load in initial (unfactored row) */ 1034 nz = ai[i+1] - ai[i]; 1035 ajtmp = aj + ai[i]; 1036 v = aa + 16*ai[i]; 1037 for (j=0; j<nz; j++) { 1038 x = rtmp+16*((unsigned int)ajtmp[j]); 1039 /* x = rtmp+colscale*ajtmp[j]; */ 1040 /* Copy v block into x block */ 1041 SSE_INLINE_BEGIN_2(v,x) 1042 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1043 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1044 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1045 1046 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1047 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1048 1049 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1050 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1051 1052 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1053 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1054 1055 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1056 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1057 1058 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1059 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1060 1061 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1062 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1063 1064 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1065 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1066 SSE_INLINE_END_2; 1067 1068 v += 16; 1069 } 1070 /* row = (*bjtmp++)/4; */ 1071 row = (unsigned int)(*bjtmp++); 1072 while (row < i) { 1073 pc = rtmp + 16*row; 1074 SSE_INLINE_BEGIN_1(pc) 1075 /* Load block from lower triangle */ 1076 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1077 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1078 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1079 1080 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1081 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1082 1083 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1084 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1085 1086 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1087 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1088 1089 /* Compare block to zero block */ 1090 1091 SSE_COPY_PS(XMM4,XMM7) 1092 SSE_CMPNEQ_PS(XMM4,XMM0) 1093 1094 SSE_COPY_PS(XMM5,XMM7) 1095 SSE_CMPNEQ_PS(XMM5,XMM1) 1096 1097 SSE_COPY_PS(XMM6,XMM7) 1098 SSE_CMPNEQ_PS(XMM6,XMM2) 1099 1100 SSE_CMPNEQ_PS(XMM7,XMM3) 1101 1102 /* Reduce the comparisons to one SSE register */ 1103 SSE_OR_PS(XMM6,XMM7) 1104 SSE_OR_PS(XMM5,XMM4) 1105 SSE_OR_PS(XMM5,XMM6) 1106 SSE_INLINE_END_1; 1107 1108 /* Reduce the one SSE register to an integer register for branching */ 1109 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1110 MOVEMASK(nonzero,XMM5); 1111 1112 /* If block is nonzero ... 
*/ 1113 if (nonzero) { 1114 pv = ba + 16*diag_offset[row]; 1115 PREFETCH_L1(&pv[16]); 1116 pj = bj + diag_offset[row] + 1; 1117 1118 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1119 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1120 /* but the diagonal was inverted already */ 1121 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1122 1123 SSE_INLINE_BEGIN_2(pv,pc) 1124 /* Column 0, product is accumulated in XMM4 */ 1125 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1126 SSE_SHUFFLE(XMM4,XMM4,0x00) 1127 SSE_MULT_PS(XMM4,XMM0) 1128 1129 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1130 SSE_SHUFFLE(XMM5,XMM5,0x00) 1131 SSE_MULT_PS(XMM5,XMM1) 1132 SSE_ADD_PS(XMM4,XMM5) 1133 1134 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1135 SSE_SHUFFLE(XMM6,XMM6,0x00) 1136 SSE_MULT_PS(XMM6,XMM2) 1137 SSE_ADD_PS(XMM4,XMM6) 1138 1139 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1140 SSE_SHUFFLE(XMM7,XMM7,0x00) 1141 SSE_MULT_PS(XMM7,XMM3) 1142 SSE_ADD_PS(XMM4,XMM7) 1143 1144 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1145 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1146 1147 /* Column 1, product is accumulated in XMM5 */ 1148 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1149 SSE_SHUFFLE(XMM5,XMM5,0x00) 1150 SSE_MULT_PS(XMM5,XMM0) 1151 1152 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1153 SSE_SHUFFLE(XMM6,XMM6,0x00) 1154 SSE_MULT_PS(XMM6,XMM1) 1155 SSE_ADD_PS(XMM5,XMM6) 1156 1157 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1158 SSE_SHUFFLE(XMM7,XMM7,0x00) 1159 SSE_MULT_PS(XMM7,XMM2) 1160 SSE_ADD_PS(XMM5,XMM7) 1161 1162 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1163 SSE_SHUFFLE(XMM6,XMM6,0x00) 1164 SSE_MULT_PS(XMM6,XMM3) 1165 SSE_ADD_PS(XMM5,XMM6) 1166 1167 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1168 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1169 1170 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1171 1172 /* Column 2, product is accumulated in XMM6 */ 1173 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1174 SSE_SHUFFLE(XMM6,XMM6,0x00) 1175 SSE_MULT_PS(XMM6,XMM0) 1176 1177 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1178 SSE_SHUFFLE(XMM7,XMM7,0x00) 1179 SSE_MULT_PS(XMM7,XMM1) 1180 SSE_ADD_PS(XMM6,XMM7) 1181 1182 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1183 SSE_SHUFFLE(XMM7,XMM7,0x00) 1184 SSE_MULT_PS(XMM7,XMM2) 1185 SSE_ADD_PS(XMM6,XMM7) 1186 1187 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1188 SSE_SHUFFLE(XMM7,XMM7,0x00) 1189 SSE_MULT_PS(XMM7,XMM3) 1190 SSE_ADD_PS(XMM6,XMM7) 1191 1192 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1193 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1194 1195 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1196 /* Column 3, product is accumulated in XMM0 */ 1197 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1198 SSE_SHUFFLE(XMM7,XMM7,0x00) 1199 SSE_MULT_PS(XMM0,XMM7) 1200 1201 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1202 SSE_SHUFFLE(XMM7,XMM7,0x00) 1203 SSE_MULT_PS(XMM1,XMM7) 1204 SSE_ADD_PS(XMM0,XMM1) 1205 1206 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1207 SSE_SHUFFLE(XMM1,XMM1,0x00) 1208 SSE_MULT_PS(XMM1,XMM2) 1209 SSE_ADD_PS(XMM0,XMM1) 1210 1211 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1212 SSE_SHUFFLE(XMM7,XMM7,0x00) 1213 SSE_MULT_PS(XMM3,XMM7) 1214 SSE_ADD_PS(XMM0,XMM3) 1215 1216 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1217 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1218 1219 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1220 /* This is code to be maintained and read by humans afterall. 
*/ 1221 /* Copy Multiplier Col 3 into XMM3 */ 1222 SSE_COPY_PS(XMM3,XMM0) 1223 /* Copy Multiplier Col 2 into XMM2 */ 1224 SSE_COPY_PS(XMM2,XMM6) 1225 /* Copy Multiplier Col 1 into XMM1 */ 1226 SSE_COPY_PS(XMM1,XMM5) 1227 /* Copy Multiplier Col 0 into XMM0 */ 1228 SSE_COPY_PS(XMM0,XMM4) 1229 SSE_INLINE_END_2; 1230 1231 /* Update the row: */ 1232 nz = bi[row+1] - diag_offset[row] - 1; 1233 pv += 16; 1234 for (j=0; j<nz; j++) { 1235 PREFETCH_L1(&pv[16]); 1236 x = rtmp + 16*((unsigned int)pj[j]); 1237 /* x = rtmp + 4*pj[j]; */ 1238 1239 /* X:=X-M*PV, One column at a time */ 1240 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1241 SSE_INLINE_BEGIN_2(x,pv) 1242 /* Load First Column of X*/ 1243 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1244 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1245 1246 /* Matrix-Vector Product: */ 1247 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1248 SSE_SHUFFLE(XMM5,XMM5,0x00) 1249 SSE_MULT_PS(XMM5,XMM0) 1250 SSE_SUB_PS(XMM4,XMM5) 1251 1252 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1253 SSE_SHUFFLE(XMM6,XMM6,0x00) 1254 SSE_MULT_PS(XMM6,XMM1) 1255 SSE_SUB_PS(XMM4,XMM6) 1256 1257 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1258 SSE_SHUFFLE(XMM7,XMM7,0x00) 1259 SSE_MULT_PS(XMM7,XMM2) 1260 SSE_SUB_PS(XMM4,XMM7) 1261 1262 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1263 SSE_SHUFFLE(XMM5,XMM5,0x00) 1264 SSE_MULT_PS(XMM5,XMM3) 1265 SSE_SUB_PS(XMM4,XMM5) 1266 1267 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1268 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1269 1270 /* Second Column */ 1271 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1272 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1273 1274 /* Matrix-Vector Product: */ 1275 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1276 SSE_SHUFFLE(XMM6,XMM6,0x00) 1277 SSE_MULT_PS(XMM6,XMM0) 1278 SSE_SUB_PS(XMM5,XMM6) 1279 1280 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1281 SSE_SHUFFLE(XMM7,XMM7,0x00) 1282 SSE_MULT_PS(XMM7,XMM1) 1283 SSE_SUB_PS(XMM5,XMM7) 1284 1285 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1286 SSE_SHUFFLE(XMM4,XMM4,0x00) 1287 SSE_MULT_PS(XMM4,XMM2) 1288 SSE_SUB_PS(XMM5,XMM4) 1289 1290 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1291 SSE_SHUFFLE(XMM6,XMM6,0x00) 1292 SSE_MULT_PS(XMM6,XMM3) 1293 SSE_SUB_PS(XMM5,XMM6) 1294 1295 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1296 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1297 1298 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1299 1300 /* Third Column */ 1301 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1302 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1303 1304 /* Matrix-Vector Product: */ 1305 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1306 SSE_SHUFFLE(XMM7,XMM7,0x00) 1307 SSE_MULT_PS(XMM7,XMM0) 1308 SSE_SUB_PS(XMM6,XMM7) 1309 1310 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1311 SSE_SHUFFLE(XMM4,XMM4,0x00) 1312 SSE_MULT_PS(XMM4,XMM1) 1313 SSE_SUB_PS(XMM6,XMM4) 1314 1315 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1316 SSE_SHUFFLE(XMM5,XMM5,0x00) 1317 SSE_MULT_PS(XMM5,XMM2) 1318 SSE_SUB_PS(XMM6,XMM5) 1319 1320 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1321 SSE_SHUFFLE(XMM7,XMM7,0x00) 1322 SSE_MULT_PS(XMM7,XMM3) 1323 SSE_SUB_PS(XMM6,XMM7) 1324 1325 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1326 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1327 1328 /* Fourth Column */ 1329 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1330 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1331 1332 /* Matrix-Vector Product: */ 1333 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1334 SSE_SHUFFLE(XMM5,XMM5,0x00) 1335 SSE_MULT_PS(XMM5,XMM0) 1336 SSE_SUB_PS(XMM4,XMM5) 1337 1338 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1339 SSE_SHUFFLE(XMM6,XMM6,0x00) 1340 SSE_MULT_PS(XMM6,XMM1) 1341 SSE_SUB_PS(XMM4,XMM6) 1342 1343 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1344 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1345 SSE_MULT_PS(XMM7,XMM2) 1346 SSE_SUB_PS(XMM4,XMM7) 1347 1348 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1349 SSE_SHUFFLE(XMM5,XMM5,0x00) 1350 SSE_MULT_PS(XMM5,XMM3) 1351 SSE_SUB_PS(XMM4,XMM5) 1352 1353 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1354 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1355 SSE_INLINE_END_2; 1356 pv += 16; 1357 } 1358 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1359 } 1360 row = (unsigned int)(*bjtmp++); 1361 /* row = (*bjtmp++)/4; */ 1362 /* bjtmp++; */ 1363 } 1364 /* finished row so stick it into b->a */ 1365 pv = ba + 16*bi[i]; 1366 pj = bj + bi[i]; 1367 nz = bi[i+1] - bi[i]; 1368 1369 /* Copy x block back into pv block */ 1370 for (j=0; j<nz; j++) { 1371 x = rtmp+16*((unsigned int)pj[j]); 1372 /* x = rtmp+4*pj[j]; */ 1373 1374 SSE_INLINE_BEGIN_2(x,pv) 1375 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1376 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1377 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1378 1379 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1380 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1381 1382 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1383 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1384 1385 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1386 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1387 1388 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1389 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1390 1391 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1392 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1393 1394 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1395 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1396 1397 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1398 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1399 SSE_INLINE_END_2; 1400 pv += 16; 1401 } 1402 /* invert diagonal block */ 1403 w = ba + 16*diag_offset[i]; 1404 if (pivotinblocks) { 1405 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1406 } else { 1407 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1408 } 1409 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1410 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1411 } 1412 1413 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1414 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1415 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1416 C->assembled = PETSC_TRUE; 1417 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1418 /* Flop Count from inverting diagonal blocks */ 1419 SSE_SCOPE_END; 1420 PetscFunctionReturn(0); 1421 } 1422 1423 #undef __FUNCT__ 1424 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" 1425 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1426 { 1427 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1428 PetscErrorCode ierr; 1429 int i,j,n = a->mbs; 1430 unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 1431 unsigned int row; 1432 int *ajtmpold,nz,*bi=b->i; 1433 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1434 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1435 MatScalar *ba = b->a,*aa = a->a; 1436 int nonzero=0; 1437 /* int nonzero=0,colscale = 16; */ 1438 PetscTruth pivotinblocks = b->pivotinblocks; 1439 PetscReal shift = info->shiftamount; 1440 1441 PetscFunctionBegin; 1442 SSE_SCOPE_BEGIN; 1443 1444 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1445 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. 
SSE will not work."); 1446 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1447 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1448 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1449 /* colscale = 4; */ 1450 /* } */ 1451 if ((unsigned long)bj==(unsigned long)aj) { 1452 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1453 } 1454 1455 for (i=0; i<n; i++) { 1456 nz = bi[i+1] - bi[i]; 1457 bjtmp = bj + bi[i]; 1458 /* zero out the 4x4 block accumulators */ 1459 /* zero out one register */ 1460 XOR_PS(XMM7,XMM7); 1461 for (j=0; j<nz; j++) { 1462 x = rtmp+16*((unsigned int)bjtmp[j]); 1463 /* x = rtmp+4*bjtmp[j]; */ 1464 SSE_INLINE_BEGIN_1(x) 1465 /* Copy zero register to memory locations */ 1466 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1467 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1468 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1469 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1470 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1471 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1472 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1473 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1474 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1475 SSE_INLINE_END_1; 1476 } 1477 /* load in initial (unfactored row) */ 1478 nz = ai[i+1] - ai[i]; 1479 ajtmpold = aj + ai[i]; 1480 v = aa + 16*ai[i]; 1481 for (j=0; j<nz; j++) { 1482 x = rtmp+16*ajtmpold[j]; 1483 /* x = rtmp+colscale*ajtmpold[j]; */ 1484 /* Copy v block into x block */ 1485 SSE_INLINE_BEGIN_2(v,x) 1486 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1487 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1488 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1489 1490 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1491 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1492 1493 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1494 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1495 1496 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1497 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1498 1499 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1500 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1501 1502 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1503 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1504 1505 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1506 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1507 1508 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1509 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1510 SSE_INLINE_END_2; 1511 1512 v += 16; 1513 } 1514 /* row = (*bjtmp++)/4; */ 1515 row = (unsigned int)(*bjtmp++); 1516 while (row < i) { 1517 pc = rtmp + 16*row; 1518 SSE_INLINE_BEGIN_1(pc) 1519 /* Load block from lower triangle */ 1520 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1521 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1522 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1523 1524 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1525 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1526 1527 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1528 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1529 1530 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1531 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1532 1533 /* Compare block to zero block */ 1534 1535 SSE_COPY_PS(XMM4,XMM7) 1536 SSE_CMPNEQ_PS(XMM4,XMM0) 1537 1538 SSE_COPY_PS(XMM5,XMM7) 1539 SSE_CMPNEQ_PS(XMM5,XMM1) 1540 1541 SSE_COPY_PS(XMM6,XMM7) 1542 SSE_CMPNEQ_PS(XMM6,XMM2) 1543 1544 SSE_CMPNEQ_PS(XMM7,XMM3) 1545 1546 /* Reduce the comparisons to one SSE register */ 1547 SSE_OR_PS(XMM6,XMM7) 1548 SSE_OR_PS(XMM5,XMM4) 1549 SSE_OR_PS(XMM5,XMM6) 1550 SSE_INLINE_END_1; 1551 1552 /* Reduce the one SSE register to an 
integer register for branching */ 1553 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1554 MOVEMASK(nonzero,XMM5); 1555 1556 /* If block is nonzero ... */ 1557 if (nonzero) { 1558 pv = ba + 16*diag_offset[row]; 1559 PREFETCH_L1(&pv[16]); 1560 pj = bj + diag_offset[row] + 1; 1561 1562 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1563 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1564 /* but the diagonal was inverted already */ 1565 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1566 1567 SSE_INLINE_BEGIN_2(pv,pc) 1568 /* Column 0, product is accumulated in XMM4 */ 1569 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1570 SSE_SHUFFLE(XMM4,XMM4,0x00) 1571 SSE_MULT_PS(XMM4,XMM0) 1572 1573 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1574 SSE_SHUFFLE(XMM5,XMM5,0x00) 1575 SSE_MULT_PS(XMM5,XMM1) 1576 SSE_ADD_PS(XMM4,XMM5) 1577 1578 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1579 SSE_SHUFFLE(XMM6,XMM6,0x00) 1580 SSE_MULT_PS(XMM6,XMM2) 1581 SSE_ADD_PS(XMM4,XMM6) 1582 1583 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1584 SSE_SHUFFLE(XMM7,XMM7,0x00) 1585 SSE_MULT_PS(XMM7,XMM3) 1586 SSE_ADD_PS(XMM4,XMM7) 1587 1588 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1589 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1590 1591 /* Column 1, product is accumulated in XMM5 */ 1592 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1593 SSE_SHUFFLE(XMM5,XMM5,0x00) 1594 SSE_MULT_PS(XMM5,XMM0) 1595 1596 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1597 SSE_SHUFFLE(XMM6,XMM6,0x00) 1598 SSE_MULT_PS(XMM6,XMM1) 1599 SSE_ADD_PS(XMM5,XMM6) 1600 1601 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1602 SSE_SHUFFLE(XMM7,XMM7,0x00) 1603 SSE_MULT_PS(XMM7,XMM2) 1604 SSE_ADD_PS(XMM5,XMM7) 1605 1606 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1607 SSE_SHUFFLE(XMM6,XMM6,0x00) 1608 SSE_MULT_PS(XMM6,XMM3) 1609 SSE_ADD_PS(XMM5,XMM6) 1610 1611 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1612 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1613 1614 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1615 1616 /* Column 2, product is accumulated in XMM6 */ 1617 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1618 SSE_SHUFFLE(XMM6,XMM6,0x00) 1619 SSE_MULT_PS(XMM6,XMM0) 1620 1621 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1622 SSE_SHUFFLE(XMM7,XMM7,0x00) 1623 SSE_MULT_PS(XMM7,XMM1) 1624 SSE_ADD_PS(XMM6,XMM7) 1625 1626 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1627 SSE_SHUFFLE(XMM7,XMM7,0x00) 1628 SSE_MULT_PS(XMM7,XMM2) 1629 SSE_ADD_PS(XMM6,XMM7) 1630 1631 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1632 SSE_SHUFFLE(XMM7,XMM7,0x00) 1633 SSE_MULT_PS(XMM7,XMM3) 1634 SSE_ADD_PS(XMM6,XMM7) 1635 1636 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1637 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1638 1639 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1640 /* Column 3, product is accumulated in XMM0 */ 1641 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1642 SSE_SHUFFLE(XMM7,XMM7,0x00) 1643 SSE_MULT_PS(XMM0,XMM7) 1644 1645 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1646 SSE_SHUFFLE(XMM7,XMM7,0x00) 1647 SSE_MULT_PS(XMM1,XMM7) 1648 SSE_ADD_PS(XMM0,XMM1) 1649 1650 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1651 SSE_SHUFFLE(XMM1,XMM1,0x00) 1652 SSE_MULT_PS(XMM1,XMM2) 1653 SSE_ADD_PS(XMM0,XMM1) 1654 1655 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1656 SSE_SHUFFLE(XMM7,XMM7,0x00) 1657 SSE_MULT_PS(XMM3,XMM7) 1658 SSE_ADD_PS(XMM0,XMM3) 1659 1660 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1661 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1662 1663 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1664 /* This is code to be maintained and read by humans afterall. 
*/ 1665 /* Copy Multiplier Col 3 into XMM3 */ 1666 SSE_COPY_PS(XMM3,XMM0) 1667 /* Copy Multiplier Col 2 into XMM2 */ 1668 SSE_COPY_PS(XMM2,XMM6) 1669 /* Copy Multiplier Col 1 into XMM1 */ 1670 SSE_COPY_PS(XMM1,XMM5) 1671 /* Copy Multiplier Col 0 into XMM0 */ 1672 SSE_COPY_PS(XMM0,XMM4) 1673 SSE_INLINE_END_2; 1674 1675 /* Update the row: */ 1676 nz = bi[row+1] - diag_offset[row] - 1; 1677 pv += 16; 1678 for (j=0; j<nz; j++) { 1679 PREFETCH_L1(&pv[16]); 1680 x = rtmp + 16*((unsigned int)pj[j]); 1681 /* x = rtmp + 4*pj[j]; */ 1682 1683 /* X:=X-M*PV, One column at a time */ 1684 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1685 SSE_INLINE_BEGIN_2(x,pv) 1686 /* Load First Column of X*/ 1687 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1688 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1689 1690 /* Matrix-Vector Product: */ 1691 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1692 SSE_SHUFFLE(XMM5,XMM5,0x00) 1693 SSE_MULT_PS(XMM5,XMM0) 1694 SSE_SUB_PS(XMM4,XMM5) 1695 1696 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1697 SSE_SHUFFLE(XMM6,XMM6,0x00) 1698 SSE_MULT_PS(XMM6,XMM1) 1699 SSE_SUB_PS(XMM4,XMM6) 1700 1701 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1702 SSE_SHUFFLE(XMM7,XMM7,0x00) 1703 SSE_MULT_PS(XMM7,XMM2) 1704 SSE_SUB_PS(XMM4,XMM7) 1705 1706 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1707 SSE_SHUFFLE(XMM5,XMM5,0x00) 1708 SSE_MULT_PS(XMM5,XMM3) 1709 SSE_SUB_PS(XMM4,XMM5) 1710 1711 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1712 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1713 1714 /* Second Column */ 1715 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1716 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1717 1718 /* Matrix-Vector Product: */ 1719 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1720 SSE_SHUFFLE(XMM6,XMM6,0x00) 1721 SSE_MULT_PS(XMM6,XMM0) 1722 SSE_SUB_PS(XMM5,XMM6) 1723 1724 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1725 SSE_SHUFFLE(XMM7,XMM7,0x00) 1726 SSE_MULT_PS(XMM7,XMM1) 1727 SSE_SUB_PS(XMM5,XMM7) 1728 1729 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1730 SSE_SHUFFLE(XMM4,XMM4,0x00) 1731 SSE_MULT_PS(XMM4,XMM2) 1732 SSE_SUB_PS(XMM5,XMM4) 1733 1734 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1735 SSE_SHUFFLE(XMM6,XMM6,0x00) 1736 SSE_MULT_PS(XMM6,XMM3) 1737 SSE_SUB_PS(XMM5,XMM6) 1738 1739 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1740 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1741 1742 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1743 1744 /* Third Column */ 1745 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1746 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1747 1748 /* Matrix-Vector Product: */ 1749 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1750 SSE_SHUFFLE(XMM7,XMM7,0x00) 1751 SSE_MULT_PS(XMM7,XMM0) 1752 SSE_SUB_PS(XMM6,XMM7) 1753 1754 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1755 SSE_SHUFFLE(XMM4,XMM4,0x00) 1756 SSE_MULT_PS(XMM4,XMM1) 1757 SSE_SUB_PS(XMM6,XMM4) 1758 1759 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1760 SSE_SHUFFLE(XMM5,XMM5,0x00) 1761 SSE_MULT_PS(XMM5,XMM2) 1762 SSE_SUB_PS(XMM6,XMM5) 1763 1764 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1765 SSE_SHUFFLE(XMM7,XMM7,0x00) 1766 SSE_MULT_PS(XMM7,XMM3) 1767 SSE_SUB_PS(XMM6,XMM7) 1768 1769 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1770 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1771 1772 /* Fourth Column */ 1773 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1774 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1775 1776 /* Matrix-Vector Product: */ 1777 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1778 SSE_SHUFFLE(XMM5,XMM5,0x00) 1779 SSE_MULT_PS(XMM5,XMM0) 1780 SSE_SUB_PS(XMM4,XMM5) 1781 1782 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1783 SSE_SHUFFLE(XMM6,XMM6,0x00) 1784 SSE_MULT_PS(XMM6,XMM1) 1785 SSE_SUB_PS(XMM4,XMM6) 1786 1787 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1788 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1789 SSE_MULT_PS(XMM7,XMM2) 1790 SSE_SUB_PS(XMM4,XMM7) 1791 1792 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1793 SSE_SHUFFLE(XMM5,XMM5,0x00) 1794 SSE_MULT_PS(XMM5,XMM3) 1795 SSE_SUB_PS(XMM4,XMM5) 1796 1797 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1798 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1799 SSE_INLINE_END_2; 1800 pv += 16; 1801 } 1802 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1803 } 1804 row = (unsigned int)(*bjtmp++); 1805 /* row = (*bjtmp++)/4; */ 1806 /* bjtmp++; */ 1807 } 1808 /* finished row so stick it into b->a */ 1809 pv = ba + 16*bi[i]; 1810 pj = bj + bi[i]; 1811 nz = bi[i+1] - bi[i]; 1812 1813 /* Copy x block back into pv block */ 1814 for (j=0; j<nz; j++) { 1815 x = rtmp+16*((unsigned int)pj[j]); 1816 /* x = rtmp+4*pj[j]; */ 1817 1818 SSE_INLINE_BEGIN_2(x,pv) 1819 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1820 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1821 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1822 1823 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1824 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1825 1826 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1827 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1828 1829 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1830 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1831 1832 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1833 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1834 1835 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1836 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1837 1838 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1839 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1840 1841 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1842 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1843 SSE_INLINE_END_2; 1844 pv += 16; 1845 } 1846 /* invert diagonal block */ 1847 w = ba + 16*diag_offset[i]; 1848 if (pivotinblocks) { 1849 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1850 } else { 1851 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1852 } 1853 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1854 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1855 } 1856 1857 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1858 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1859 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1860 C->assembled = PETSC_TRUE; 1861 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1862 /* Flop Count from inverting diagonal blocks */ 1863 SSE_SCOPE_END; 1864 PetscFunctionReturn(0); 1865 } 1866 1867 #endif 1868