#define PETSCMAT_DLL

/*
    Factorization code for BAIJ format.
*/
#include "../src/mat/impls/baij/seq/baij.h"
#include "../src/mat/blockinvert.h"

/* ------------------------------------------------------------*/
/*
    Version for when blocks are 4 by 4
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8] = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9 = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
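        /*
           pc now holds the multiplier block M = (old block)*inv(U_kk); the
           inverse was stored in place of the diagonal block when row `row'
           was finished.  The loop below applies x -= M*pv to each block to
           the right of the diagonal in row `row'.  Flops: forming M costs
           16*(4 mults + 3 adds) = 112; each block update costs
           16*(4 mults + 4 subs) = 128, which is where the
           PetscLogFlops(128.0*nz+112.0) below comes from.
        */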
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
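/*
   Same algorithm as MatLUFactorNumeric_SeqBAIJ_4 above, but for the natural
   ordering: the row permutation and inverse column permutation are the
   identity, so the r[] and ic[] index arrays (and the ISGetIndices calls)
   drop out and rows of A are loaded directly.
*/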
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
{
/*
    Default Version for when blocks are 4 by 4 Using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmpold[j];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8] = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9 = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;

        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
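    /*
       The diagonal block is inverted in place so that later rows can form
       their multipliers with a plain 4x4 matrix-matrix product instead of
       solving against U_kk (see "the diagonal was inverted already" in the
       SSE kernels below).  The inversion flops are logged once at the end
       of the factorization: 1.3333*64 = (4/3)*4^3 per block.
    */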
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}


#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
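/*
   Note: these kernels are written with packed single precision operations
   (the _PS forms), so they assume MatScalar is a 4-byte float.  Each 4x4
   block is 16 floats stored one column after another; an XMM register holds
   one 4-entry column, filled by a LOADL/LOADH pair (2 floats each).  Block
   storage must be 16-byte aligned, which is checked before any SSE code
   runs.
*/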
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int         nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);
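      /*
         MOVEMASK collects the sign bits of the lane-wise CMPNEQ results, so
         nonzero == 0 exactly when all 16 entries of the block are zero and
         the whole update for this block can be skipped.
      */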
      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans, after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;
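        /*
           The nz computed below counts the blocks to the right of the
           diagonal in row `row'; each is a U block that must update the
           corresponding block of the current row i.
        */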
        /* Update the row: */
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*pj[j];
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
      /* row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Since Cramer's rule is used, the flop count below may be unfairly high or low. */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
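/*
   The _usj kernels (presumably "unsigned short j") store the factor's
   block-column indices as unsigned short, halving the index bandwidth at
   the cost of limiting the matrix to 65535 block rows.  This in-place
   variant is selected by MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj
   below when the factor shares its column-index array with A, i.e. when the
   factored pattern equals A's pattern (as in an ILU(0)-style factorization).
*/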
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
{
  Mat            A=C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int         nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
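  /*
     Here C and A are the same matrix (note Mat A=C above) and a->j and b->j
     alias the same unsigned short index array, so the factorization simply
     overwrites A's values block by block.
  */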
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)ajtmp[j]);
      /* x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);
      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans, after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;
        /* Update the row: */
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Since Cramer's rule is used, the flop count below may be unfairly high or low. */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
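/*
   Out-of-place _usj kernel: A keeps its ordinary int column indices while
   the factor's indices are unsigned short.  If the two index arrays turn
   out to be the same array, the in-place kernel above is used instead.
*/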
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned int   row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int         nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  /* Factor in place if the factor shares A's column indices; do this before allocating rtmp */
  if ((unsigned long)bj==(unsigned long)aj) {
    return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
  }
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans, after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;
        /* Update the row: */
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Since Cramer's rule is used, the flop count below may be unfairly high or low. */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif