#define PETSCMAT_DLL

/*
    Factorization code for BAIJ format.
*/
#include "../src/mat/impls/baij/seq/baij.h"
#include "../src/mat/blockinvert.h"

/* ------------------------------------------------------------*/
/*
     Version for when blocks are 4 by 4
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored) row */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ic[ajtmpold[j]];
      x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
      x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
      x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
      p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
      p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv = ba + 16*diag_offset[row];
        pj = bj + diag_offset[row] + 1;
        x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
        x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
        x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11 + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
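        /* Update the remainder of the work row: subtract the multiplier block
           M (= m1..m16, just stored in pc) times each block of U(row,:). */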
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
          x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
          x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
      pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
      pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/* MatLUFactorNumeric_SeqBAIJ_4_newdatastruct -
     copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented
       Kernel_A_gets_A_times_B()
       Kernel_A_gets_A_minus_B_times_C()
       Kernel_A_gets_inverse_A()
*/
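/*
   For reference only (hence the #if 0): a minimal sketch of the operation the
   hand-unrolled 4x4 kernels perform. Blocks are stored column-major, with
   entry (r,c) at index 4*c+r, and Kernel_A_gets_A_times_B_4(a,b,w) computes
   a = a*b using w as scratch. The sketch name is hypothetical; the real
   kernels live in ../src/mat/blockinvert.h.
*/
#if 0
static void Sketch_A_gets_A_times_B_4(MatScalar *a,const MatScalar *b,MatScalar *w)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {                       /* one column of the product at a time */
    for (r=0; r<4; r++) {
      w[4*c+r] = 0.0;
      for (k=0; k<4; k++) w[4*c+r] += a[4*k+r]*b[4*c+k];
    }
  }
  for (k=0; k<16; k++) a[k] = w[k];           /* a <- a*b */
}
#endif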
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic,*ics;
  PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       bs2 = a->bs2,flg;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
  ics  = ic;

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i]-bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored) row */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
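    /* In this data structure U is stored from the end of b->a: bdiag[] decreases
       with i, bdiag[i] locating the diagonal block of row i and bdiag[i+1]+1 the
       first off-diagonal block of U(i,:). The diagonal block is stored already
       inverted, so the triangular solves need only multiplications. */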
    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2*bdiag[i];
    pj = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
{
/*
    Default version for when blocks are 4 by 4, using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored) row */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
      x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
      x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
      p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
      p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv = ba + 16*diag_offset[row];
        pj = bj + diag_offset[row] + 1;
        x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
        x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
        x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11 + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
          x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
          x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
      pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
      pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct()
*/
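/*
   Performs the same elimination as MatLUFactorNumeric_SeqBAIJ_4_newdatastruct()
   above; the natural ordering simply makes the r[] and ic[] permutation lookups
   unnecessary.
*/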
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       bs2 = a->bs2,flg;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored) row */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2*bdiag[i];
    pj = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
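/*
   Note: the _PS operations used below are packed single precision, so these
   kernels assume MatScalar is a 4-byte float; they also require the 16-byte
   alignment of aa, ba, and rtmp that is checked on entry to each routine.
*/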
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int            nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored) row */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);
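      /* MOVEMASK extracts the four sign bits of the comparison mask, so
         nonzero is 0 only if every entry of the block compared equal to zero. */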
      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
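          /* For each column of the product: LOAD_SS grabs one entry of the
             (already inverted) diagonal block, SHUFFLE 0x00 broadcasts it to
             all four lanes, and MULT_PS scales the matching column of the
             block held in XMM0-XMM3; the partial products are then summed. */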
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*pj[j];
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
      /* row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: if Cramer's rule is used there, the flop count below may be unfairly high or low */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
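/*
   The _usj variants read the column indices as unsigned short to halve the
   index bandwidth; this assumes fewer than 65536 block rows and columns.
*/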
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
{
  Mat            A=C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int            nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored) row */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)ajtmp[j]);
      /* x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);
      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: if Cramer's rule is used there, the flop count below may be unfairly high or low */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned int   row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int            nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
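  /* If the factor shares its column index array with A, the factorization is
     running in place; dispatch to the specialized in-place variant. */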
  if ((unsigned long)bj==(unsigned long)aj) {
    ierr = PetscFree(rtmp);CHKERRQ(ierr);
    return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
  }

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored) row */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);
      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: if Cramer's rule is used there, the flop count below may be unfairly high or low */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif