1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 #include "../src/mat/impls/baij/seq/baij.h" 7 #include "../src/mat/blockinvert.h" 8 9 /* ------------------------------------------------------------*/ 10 /* 11 Version for when blocks are 4 by 4 12 */ 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4" 15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info) 16 { 17 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 18 IS isrow = b->row,isicol = b->icol; 19 PetscErrorCode ierr; 20 const PetscInt *r,*ic; 21 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 22 PetscInt *ajtmpold,*ajtmp,nz,row; 23 PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 24 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 25 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 26 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 27 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 28 MatScalar m13,m14,m15,m16; 29 MatScalar *ba = b->a,*aa = a->a; 30 PetscTruth pivotinblocks = b->pivotinblocks; 31 PetscReal shift = info->shiftinblocks; 32 33 PetscFunctionBegin; 34 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 35 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 36 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 37 38 for (i=0; i<n; i++) { 39 nz = bi[i+1] - bi[i]; 40 ajtmp = bj + bi[i]; 41 for (j=0; j<nz; j++) { 42 x = rtmp+16*ajtmp[j]; 43 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 44 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 45 } 46 /* load in initial (unfactored row) */ 47 idx = r[i]; 48 nz = ai[idx+1] - ai[idx]; 49 ajtmpold = aj + ai[idx]; 50 v = aa + 16*ai[idx]; 51 for (j=0; j<nz; j++) { 52 x = rtmp+16*ic[ajtmpold[j]]; 53 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 54 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 55 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 56 x[14] = v[14]; x[15] = v[15]; 57 v += 16; 58 } 59 row = *ajtmp++; 60 while (row < i) { 61 pc = rtmp + 16*row; 62 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 63 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 64 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 65 p15 = pc[14]; p16 = pc[15]; 66 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 67 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 68 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 69 || p16 != 0.0) { 70 pv = ba + 16*diag_offset[row]; 71 pj = bj + diag_offset[row] + 1; 72 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 73 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 74 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 75 x15 = pv[14]; x16 = pv[15]; 76 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 77 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 78 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 79 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 80 81 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 82 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 83 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 84 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 85 86 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 87 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 88 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 89 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 90 91 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 92 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + 
p14*x16; 93 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 94 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 95 96 nz = bi[row+1] - diag_offset[row] - 1; 97 pv += 16; 98 for (j=0; j<nz; j++) { 99 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 100 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 101 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 102 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 103 x = rtmp + 16*pj[j]; 104 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 105 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 106 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 107 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 108 109 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 110 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 111 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 112 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 113 114 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 115 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 116 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 117 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 118 119 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 120 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 121 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 122 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 123 124 pv += 16; 125 } 126 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 127 } 128 row = *ajtmp++; 129 } 130 /* finished row so stick it into b->a */ 131 pv = ba + 16*bi[i]; 132 pj = bj + bi[i]; 133 nz = bi[i+1] - bi[i]; 134 for (j=0; j<nz; j++) { 135 x = rtmp+16*pj[j]; 136 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 137 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 138 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 139 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 140 pv += 16; 141 } 142 /* invert diagonal block */ 143 w = ba + 16*diag_offset[i]; 144 if (pivotinblocks) { 145 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 146 } else { 147 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 148 } 149 } 150 151 ierr = PetscFree(rtmp);CHKERRQ(ierr); 152 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 153 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 154 C->ops->solve = MatSolve_SeqBAIJ_4; 155 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 156 C->assembled = PETSC_TRUE; 157 ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 158 PetscFunctionReturn(0); 159 } 160 161 /* MatLUFactorNumeric_SeqBAIJ_4_newdatastruct - 162 copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented 163 Kernel_A_gets_A_times_B() 164 Kernel_A_gets_A_minus_B_times_C() 165 Kernel_A_gets_inverse_A() 166 */ 167 168 #undef __FUNCT__ 169 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct" 170 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 171 { 172 Mat C=B; 173 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 174 IS isrow = b->row,isicol = b->icol; 175 PetscErrorCode ierr; 176 const PetscInt *r,*ic,*ics; 177 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 178 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 179 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 180 PetscInt bs2 = a->bs2,flg; 181 PetscReal shift = info->shiftinblocks; 182 183 PetscFunctionBegin; 184 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 185 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 186 187 /* generate work space needed by the factorization */ 188 ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 189 
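/* workspace: rtmp holds the current block row expanded to n blocks of bs2 scalars (indexed by block column); the extra bs2 scalars at rtmp+bs2*n (mwork, set below) are scratch for the dense 4x4 block kernels */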
mwork = rtmp + bs2*n; 190 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 191 ics = ic; 192 193 for (i=0; i<n; i++){ 194 /* zero rtmp */ 195 /* L part */ 196 nz = bi[i+1] - bi[i]; 197 bjtmp = bj + bi[i]; 198 for (j=0; j<nz; j++){ 199 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 200 } 201 202 /* U part */ 203 nz = bdiag[i]-bdiag[i+1]; 204 bjtmp = bj + bdiag[i+1]+1; 205 for (j=0; j<nz; j++){ 206 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 207 } 208 209 /* load in initial (unfactored row) */ 210 nz = ai[r[i]+1] - ai[r[i]]; 211 ajtmp = aj + ai[r[i]]; 212 v = aa + bs2*ai[r[i]]; 213 for (j=0; j<nz; j++) { 214 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 215 } 216 217 /* elimination */ 218 bjtmp = bj + bi[i]; 219 nzL = bi[i+1] - bi[i]; 220 for(k=0;k < nzL;k++) { 221 row = bjtmp[k]; 222 pc = rtmp + bs2*row; 223 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 224 if (flg) { 225 pv = b->a + bs2*bdiag[row]; 226 /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 227 ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 228 229 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 230 pv = b->a + bs2*(bdiag[row+1]+1); 231 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */ 232 for (j=0; j<nz; j++) { 233 /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 234 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 235 v = rtmp + bs2*pj[j]; 236 ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 237 pv += bs2; 238 } 239 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */ 240 } 241 } 242 243 /* finished row so stick it into b->a */ 244 /* L part */ 245 pv = b->a + bs2*bi[i] ; 246 pj = b->j + bi[i] ; 247 nz = bi[i+1] - bi[i]; 248 for (j=0; j<nz; j++) { 249 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 250 } 251 252 /* Mark diagonal and invert diagonal for simpler triangular solves */ 253 pv = b->a + bs2*bdiag[i]; 254 pj = b->j + bdiag[i]; 255 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 256 /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 257 ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); 258 259 /* U part */ 260 pv = b->a + bs2*(bdiag[i+1]+1); 261 pj = b->j + bdiag[i+1]+1; 262 nz = bdiag[i] - bdiag[i+1] - 1; 263 for (j=0; j<nz; j++){ 264 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 265 } 266 } 267 268 ierr = PetscFree(rtmp);CHKERRQ(ierr); 269 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 270 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 271 272 C->assembled = PETSC_TRUE; 273 ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 274 PetscFunctionReturn(0); 275 } 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering" 279 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info) 280 { 281 /* 282 Default Version for when blocks are 4 by 4 Using natural ordering 283 */ 284 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 285 PetscErrorCode ierr; 286 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 287 PetscInt *ajtmpold,*ajtmp,nz,row; 288 PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 289 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 290 MatScalar 
p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 291 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 292 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 293 MatScalar m13,m14,m15,m16; 294 MatScalar *ba = b->a,*aa = a->a; 295 PetscTruth pivotinblocks = b->pivotinblocks; 296 PetscReal shift = info->shiftinblocks; 297 298 PetscFunctionBegin; 299 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 300 301 for (i=0; i<n; i++) { 302 nz = bi[i+1] - bi[i]; 303 ajtmp = bj + bi[i]; 304 for (j=0; j<nz; j++) { 305 x = rtmp+16*ajtmp[j]; 306 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 307 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 308 } 309 /* load in initial (unfactored row) */ 310 nz = ai[i+1] - ai[i]; 311 ajtmpold = aj + ai[i]; 312 v = aa + 16*ai[i]; 313 for (j=0; j<nz; j++) { 314 x = rtmp+16*ajtmpold[j]; 315 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 316 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 317 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 318 x[14] = v[14]; x[15] = v[15]; 319 v += 16; 320 } 321 row = *ajtmp++; 322 while (row < i) { 323 pc = rtmp + 16*row; 324 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 325 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 326 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 327 p15 = pc[14]; p16 = pc[15]; 328 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 329 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 330 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 331 || p16 != 0.0) { 332 pv = ba + 16*diag_offset[row]; 333 pj = bj + diag_offset[row] + 1; 334 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 335 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 336 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 337 x15 = pv[14]; x16 = pv[15]; 338 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 339 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 340 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 341 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 342 343 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 344 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 345 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 346 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 347 348 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 349 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 350 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 351 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 352 353 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 354 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 355 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 356 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 357 nz = bi[row+1] - diag_offset[row] - 1; 358 pv += 16; 359 for (j=0; j<nz; j++) { 360 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 361 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 362 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 363 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 364 x = rtmp + 16*pj[j]; 365 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 366 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 367 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 368 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 369 370 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 371 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 372 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 373 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 374 375 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 376 
x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 377 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 378 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 379 380 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 381 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 382 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 383 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 384 385 pv += 16; 386 } 387 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 388 } 389 row = *ajtmp++; 390 } 391 /* finished row so stick it into b->a */ 392 pv = ba + 16*bi[i]; 393 pj = bj + bi[i]; 394 nz = bi[i+1] - bi[i]; 395 for (j=0; j<nz; j++) { 396 x = rtmp+16*pj[j]; 397 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 398 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 399 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 400 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 401 pv += 16; 402 } 403 /* invert diagonal block */ 404 w = ba + 16*diag_offset[i]; 405 if (pivotinblocks) { 406 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 407 } else { 408 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 409 } 410 } 411 412 ierr = PetscFree(rtmp);CHKERRQ(ierr); 413 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 414 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 415 C->assembled = PETSC_TRUE; 416 ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 417 PetscFunctionReturn(0); 418 } 419 420 /* 421 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct - 422 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct() 423 */ 424 #undef __FUNCT__ 425 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct" 426 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 427 { 428 Mat C=B; 429 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 430 PetscErrorCode ierr; 431 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 432 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 433 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 434 PetscInt bs2 = a->bs2,flg; 435 PetscReal shift = info->shiftinblocks; 436 437 PetscFunctionBegin; 438 /* generate work space needed by the factorization */ 439 ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 440 mwork = rtmp + bs2*n; 441 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 442 443 for (i=0; i<n; i++){ 444 /* zero rtmp */ 445 /* L part */ 446 nz = bi[i+1] - bi[i]; 447 bjtmp = bj + bi[i]; 448 for (j=0; j<nz; j++){ 449 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 450 } 451 452 /* U part */ 453 nz = bdiag[i] - bdiag[i+1]; 454 bjtmp = bj + bdiag[i+1]+1; 455 for (j=0; j<nz; j++){ 456 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 457 } 458 459 /* load in initial (unfactored row) */ 460 nz = ai[i+1] - ai[i]; 461 ajtmp = aj + ai[i]; 462 v = aa + bs2*ai[i]; 463 for (j=0; j<nz; j++) { 464 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 465 } 466 467 /* elimination */ 468 bjtmp = bj + bi[i]; 469 nzL = bi[i+1] - bi[i]; 470 for(k=0;k < nzL;k++) { 471 row = bjtmp[k]; 472 pc = rtmp + bs2*row; 473 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 474 if (flg) { 475 pv = b->a + bs2*bdiag[row]; 476 /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 477 ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 478 479 pj = b->j + 
bdiag[row+1]+1; /* beginning of U(row,:) */ 480 pv = b->a + bs2*(bdiag[row+1]+1); 481 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */ 482 for (j=0; j<nz; j++) { 483 /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 484 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 485 v = rtmp + bs2*pj[j]; 486 ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 487 pv += bs2; 488 } 489 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */ 490 } 491 } 492 493 /* finished row so stick it into b->a */ 494 /* L part */ 495 pv = b->a + bs2*bi[i] ; 496 pj = b->j + bi[i] ; 497 nz = bi[i+1] - bi[i]; 498 for (j=0; j<nz; j++) { 499 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 500 } 501 502 /* Mark diagonal and invert diagonal for simpler triangular solves */ 503 pv = b->a + bs2*bdiag[i]; 504 pj = b->j + bdiag[i]; 505 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 506 /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 507 ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); 508 509 /* U part */ 510 pv = b->a + bs2*(bdiag[i+1]+1); 511 pj = b->j + bdiag[i+1]+1; 512 nz = bdiag[i] - bdiag[i+1] - 1; 513 for (j=0; j<nz; j++){ 514 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 515 } 516 } 517 518 ierr = PetscFree(rtmp);CHKERRQ(ierr); 519 C->assembled = PETSC_TRUE; 520 ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 521 PetscFunctionReturn(0); 522 } 523 524 #if defined(PETSC_HAVE_SSE) 525 526 #include PETSC_HAVE_SSE 527 528 /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 529 #undef __FUNCT__ 530 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE" 531 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info) 532 { 533 Mat C=B; Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 534 PetscErrorCode ierr; 535 int i,j,n = a->mbs; 536 int *bj = b->j,*bjtmp,*pj; 537 int row; 538 int *ajtmpold,nz,*bi=b->i; 539 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 540 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 541 MatScalar *ba = b->a,*aa = a->a; 542 int nonzero=0; 543 /* int nonzero=0,colscale = 16; */ 544 PetscTruth pivotinblocks = b->pivotinblocks; 545 PetscReal shift = info->shiftinblocks; 546 547 PetscFunctionBegin; 548 SSE_SCOPE_BEGIN; 549 550 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 551 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 552 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 553 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 554 /* if ((unsigned long)bj==(unsigned long)aj) { */ 555 /* colscale = 4; */ 556 /* } */ 557 for (i=0; i<n; i++) { 558 nz = bi[i+1] - bi[i]; 559 bjtmp = bj + bi[i]; 560 /* zero out the 4x4 block accumulators */ 561 /* zero out one register */ 562 XOR_PS(XMM7,XMM7); 563 for (j=0; j<nz; j++) { 564 x = rtmp+16*bjtmp[j]; 565 /* x = rtmp+4*bjtmp[j]; */ 566 SSE_INLINE_BEGIN_1(x) 567 /* Copy zero register to memory locations */ 568 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 569 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 570 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 571 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 572 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 573 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 574 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 575 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 576 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 577 SSE_INLINE_END_1; 578 } 579 /* load in initial (unfactored row) */ 580 nz = ai[i+1] - ai[i]; 581 ajtmpold = aj + ai[i]; 582 v = aa + 16*ai[i]; 583 for (j=0; j<nz; j++) { 584 x = rtmp+16*ajtmpold[j]; 585 /* x = rtmp+colscale*ajtmpold[j]; */ 586 /* Copy v block into x block */ 587 SSE_INLINE_BEGIN_2(v,x) 588 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 589 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 590 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 591 592 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 593 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 594 595 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 596 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 597 598 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 599 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 600 601 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 602 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 603 604 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 605 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 606 607 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 608 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 609 610 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 611 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 612 SSE_INLINE_END_2; 613 614 v += 16; 615 } 616 /* row = (*bjtmp++)/4; */ 617 row = *bjtmp++; 618 while (row < i) { 619 pc = rtmp + 16*row; 620 SSE_INLINE_BEGIN_1(pc) 621 /* Load block from lower triangle */ 622 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 623 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 624 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 625 626 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 627 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 628 629 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 630 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 631 632 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 633 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 634 635 /* Compare block to zero block */ 636 637 SSE_COPY_PS(XMM4,XMM7) 638 SSE_CMPNEQ_PS(XMM4,XMM0) 639 640 SSE_COPY_PS(XMM5,XMM7) 641 SSE_CMPNEQ_PS(XMM5,XMM1) 642 643 SSE_COPY_PS(XMM6,XMM7) 644 SSE_CMPNEQ_PS(XMM6,XMM2) 645 646 SSE_CMPNEQ_PS(XMM7,XMM3) 647 648 /* Reduce the comparisons to one SSE register */ 649 SSE_OR_PS(XMM6,XMM7) 650 SSE_OR_PS(XMM5,XMM4) 651 SSE_OR_PS(XMM5,XMM6) 652 SSE_INLINE_END_1; 653 654 /* Reduce the one SSE register to an integer register for branching */ 655 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 656 MOVEMASK(nonzero,XMM5); 657 658 /* If block is nonzero ... 
*/ 659 if (nonzero) { 660 pv = ba + 16*diag_offset[row]; 661 PREFETCH_L1(&pv[16]); 662 pj = bj + diag_offset[row] + 1; 663 664 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 665 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 666 /* but the diagonal was inverted already */ 667 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 668 669 SSE_INLINE_BEGIN_2(pv,pc) 670 /* Column 0, product is accumulated in XMM4 */ 671 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 672 SSE_SHUFFLE(XMM4,XMM4,0x00) 673 SSE_MULT_PS(XMM4,XMM0) 674 675 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 676 SSE_SHUFFLE(XMM5,XMM5,0x00) 677 SSE_MULT_PS(XMM5,XMM1) 678 SSE_ADD_PS(XMM4,XMM5) 679 680 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 681 SSE_SHUFFLE(XMM6,XMM6,0x00) 682 SSE_MULT_PS(XMM6,XMM2) 683 SSE_ADD_PS(XMM4,XMM6) 684 685 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 686 SSE_SHUFFLE(XMM7,XMM7,0x00) 687 SSE_MULT_PS(XMM7,XMM3) 688 SSE_ADD_PS(XMM4,XMM7) 689 690 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 691 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 692 693 /* Column 1, product is accumulated in XMM5 */ 694 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 695 SSE_SHUFFLE(XMM5,XMM5,0x00) 696 SSE_MULT_PS(XMM5,XMM0) 697 698 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 699 SSE_SHUFFLE(XMM6,XMM6,0x00) 700 SSE_MULT_PS(XMM6,XMM1) 701 SSE_ADD_PS(XMM5,XMM6) 702 703 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 704 SSE_SHUFFLE(XMM7,XMM7,0x00) 705 SSE_MULT_PS(XMM7,XMM2) 706 SSE_ADD_PS(XMM5,XMM7) 707 708 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 709 SSE_SHUFFLE(XMM6,XMM6,0x00) 710 SSE_MULT_PS(XMM6,XMM3) 711 SSE_ADD_PS(XMM5,XMM6) 712 713 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 714 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 715 716 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 717 718 /* Column 2, product is accumulated in XMM6 */ 719 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 720 SSE_SHUFFLE(XMM6,XMM6,0x00) 721 SSE_MULT_PS(XMM6,XMM0) 722 723 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 724 SSE_SHUFFLE(XMM7,XMM7,0x00) 725 SSE_MULT_PS(XMM7,XMM1) 726 SSE_ADD_PS(XMM6,XMM7) 727 728 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 729 SSE_SHUFFLE(XMM7,XMM7,0x00) 730 SSE_MULT_PS(XMM7,XMM2) 731 SSE_ADD_PS(XMM6,XMM7) 732 733 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 734 SSE_SHUFFLE(XMM7,XMM7,0x00) 735 SSE_MULT_PS(XMM7,XMM3) 736 SSE_ADD_PS(XMM6,XMM7) 737 738 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 739 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 740 741 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 742 /* Column 3, product is accumulated in XMM0 */ 743 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 744 SSE_SHUFFLE(XMM7,XMM7,0x00) 745 SSE_MULT_PS(XMM0,XMM7) 746 747 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 748 SSE_SHUFFLE(XMM7,XMM7,0x00) 749 SSE_MULT_PS(XMM1,XMM7) 750 SSE_ADD_PS(XMM0,XMM1) 751 752 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 753 SSE_SHUFFLE(XMM1,XMM1,0x00) 754 SSE_MULT_PS(XMM1,XMM2) 755 SSE_ADD_PS(XMM0,XMM1) 756 757 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 758 SSE_SHUFFLE(XMM7,XMM7,0x00) 759 SSE_MULT_PS(XMM3,XMM7) 760 SSE_ADD_PS(XMM0,XMM3) 761 762 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 763 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 764 765 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 766 /* This is code to be maintained and read by humans afterall. 
*/ 767 /* Copy Multiplier Col 3 into XMM3 */ 768 SSE_COPY_PS(XMM3,XMM0) 769 /* Copy Multiplier Col 2 into XMM2 */ 770 SSE_COPY_PS(XMM2,XMM6) 771 /* Copy Multiplier Col 1 into XMM1 */ 772 SSE_COPY_PS(XMM1,XMM5) 773 /* Copy Multiplier Col 0 into XMM0 */ 774 SSE_COPY_PS(XMM0,XMM4) 775 SSE_INLINE_END_2; 776 777 /* Update the row: */ 778 nz = bi[row+1] - diag_offset[row] - 1; 779 pv += 16; 780 for (j=0; j<nz; j++) { 781 PREFETCH_L1(&pv[16]); 782 x = rtmp + 16*pj[j]; 783 /* x = rtmp + 4*pj[j]; */ 784 785 /* X:=X-M*PV, One column at a time */ 786 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 787 SSE_INLINE_BEGIN_2(x,pv) 788 /* Load First Column of X*/ 789 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 790 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 791 792 /* Matrix-Vector Product: */ 793 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 794 SSE_SHUFFLE(XMM5,XMM5,0x00) 795 SSE_MULT_PS(XMM5,XMM0) 796 SSE_SUB_PS(XMM4,XMM5) 797 798 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 799 SSE_SHUFFLE(XMM6,XMM6,0x00) 800 SSE_MULT_PS(XMM6,XMM1) 801 SSE_SUB_PS(XMM4,XMM6) 802 803 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 804 SSE_SHUFFLE(XMM7,XMM7,0x00) 805 SSE_MULT_PS(XMM7,XMM2) 806 SSE_SUB_PS(XMM4,XMM7) 807 808 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 809 SSE_SHUFFLE(XMM5,XMM5,0x00) 810 SSE_MULT_PS(XMM5,XMM3) 811 SSE_SUB_PS(XMM4,XMM5) 812 813 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 814 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 815 816 /* Second Column */ 817 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 818 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 819 820 /* Matrix-Vector Product: */ 821 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 822 SSE_SHUFFLE(XMM6,XMM6,0x00) 823 SSE_MULT_PS(XMM6,XMM0) 824 SSE_SUB_PS(XMM5,XMM6) 825 826 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 827 SSE_SHUFFLE(XMM7,XMM7,0x00) 828 SSE_MULT_PS(XMM7,XMM1) 829 SSE_SUB_PS(XMM5,XMM7) 830 831 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 832 SSE_SHUFFLE(XMM4,XMM4,0x00) 833 SSE_MULT_PS(XMM4,XMM2) 834 SSE_SUB_PS(XMM5,XMM4) 835 836 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 837 SSE_SHUFFLE(XMM6,XMM6,0x00) 838 SSE_MULT_PS(XMM6,XMM3) 839 SSE_SUB_PS(XMM5,XMM6) 840 841 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 842 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 843 844 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 845 846 /* Third Column */ 847 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 848 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 849 850 /* Matrix-Vector Product: */ 851 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 852 SSE_SHUFFLE(XMM7,XMM7,0x00) 853 SSE_MULT_PS(XMM7,XMM0) 854 SSE_SUB_PS(XMM6,XMM7) 855 856 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 857 SSE_SHUFFLE(XMM4,XMM4,0x00) 858 SSE_MULT_PS(XMM4,XMM1) 859 SSE_SUB_PS(XMM6,XMM4) 860 861 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 862 SSE_SHUFFLE(XMM5,XMM5,0x00) 863 SSE_MULT_PS(XMM5,XMM2) 864 SSE_SUB_PS(XMM6,XMM5) 865 866 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 867 SSE_SHUFFLE(XMM7,XMM7,0x00) 868 SSE_MULT_PS(XMM7,XMM3) 869 SSE_SUB_PS(XMM6,XMM7) 870 871 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 872 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 873 874 /* Fourth Column */ 875 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 876 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 877 878 /* Matrix-Vector Product: */ 879 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 880 SSE_SHUFFLE(XMM5,XMM5,0x00) 881 SSE_MULT_PS(XMM5,XMM0) 882 SSE_SUB_PS(XMM4,XMM5) 883 884 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 885 SSE_SHUFFLE(XMM6,XMM6,0x00) 886 SSE_MULT_PS(XMM6,XMM1) 887 SSE_SUB_PS(XMM4,XMM6) 888 889 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 890 SSE_SHUFFLE(XMM7,XMM7,0x00) 891 SSE_MULT_PS(XMM7,XMM2) 892 SSE_SUB_PS(XMM4,XMM7) 893 894 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 895 SSE_SHUFFLE(XMM5,XMM5,0x00) 896 
SSE_MULT_PS(XMM5,XMM3) 897 SSE_SUB_PS(XMM4,XMM5) 898 899 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 900 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 901 SSE_INLINE_END_2; 902 pv += 16; 903 } 904 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 905 } 906 row = *bjtmp++; 907 /* row = (*bjtmp++)/4; */ 908 } 909 /* finished row so stick it into b->a */ 910 pv = ba + 16*bi[i]; 911 pj = bj + bi[i]; 912 nz = bi[i+1] - bi[i]; 913 914 /* Copy x block back into pv block */ 915 for (j=0; j<nz; j++) { 916 x = rtmp+16*pj[j]; 917 /* x = rtmp+4*pj[j]; */ 918 919 SSE_INLINE_BEGIN_2(x,pv) 920 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 921 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 922 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 923 924 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 925 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 926 927 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 928 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 929 930 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 931 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 932 933 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 934 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 935 936 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 937 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 938 939 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 940 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 941 942 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 943 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 944 SSE_INLINE_END_2; 945 pv += 16; 946 } 947 /* invert diagonal block */ 948 w = ba + 16*diag_offset[i]; 949 if (pivotinblocks) { 950 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 951 } else { 952 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 953 } 954 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 955 /* Note: Using Cramer's rule, flop count below might be unfairly high or low? */ 956 } 957 958 ierr = PetscFree(rtmp);CHKERRQ(ierr); 959 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 960 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 961 C->assembled = PETSC_TRUE; 962 ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); 963 /* Flop Count from inverting diagonal blocks */ 964 SSE_SCOPE_END; 965 PetscFunctionReturn(0); 966 } 967 968 #undef __FUNCT__ 969 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace" 970 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 971 { 972 Mat A=C; 973 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 974 PetscErrorCode ierr; 975 int i,j,n = a->mbs; 976 unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 977 unsigned short *aj = (unsigned short *)(a->j),*ajtmp; 978 unsigned int row; 979 int nz,*bi=b->i; 980 int *diag_offset = b->diag,*ai=a->i; 981 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 982 MatScalar *ba = b->a,*aa = a->a; 983 int nonzero=0; 984 /* int nonzero=0,colscale = 16; */ 985 PetscTruth pivotinblocks = b->pivotinblocks; 986 PetscReal shift = 0.0; /* no MatFactorInfo is passed to this in-place variant, so no shift is applied in the blocks (assumption) */ 987 988 PetscFunctionBegin; 989 SSE_SCOPE_BEGIN; 990 991 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 992 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 993 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 994 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 995 /* if ((unsigned long)bj==(unsigned long)aj) { */ 996 /* colscale = 4; */ 997 /* } */ 998 999 for (i=0; i<n; i++) { 1000 nz = bi[i+1] - bi[i]; 1001 bjtmp = bj + bi[i]; 1002 /* zero out the 4x4 block accumulators */ 1003 /* zero out one register */ 1004 XOR_PS(XMM7,XMM7); 1005 for (j=0; j<nz; j++) { 1006 x = rtmp+16*((unsigned int)bjtmp[j]); 1007 /* x = rtmp+4*bjtmp[j]; */ 1008 SSE_INLINE_BEGIN_1(x) 1009 /* Copy zero register to memory locations */ 1010 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1011 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1012 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1013 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1014 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1015 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1016 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1017 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1018 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1019 SSE_INLINE_END_1; 1020 } 1021 /* load in initial (unfactored row) */ 1022 nz = ai[i+1] - ai[i]; 1023 ajtmp = aj + ai[i]; 1024 v = aa + 16*ai[i]; 1025 for (j=0; j<nz; j++) { 1026 x = rtmp+16*((unsigned int)ajtmp[j]); 1027 /* x = rtmp+colscale*ajtmp[j]; */ 1028 /* Copy v block into x block */ 1029 SSE_INLINE_BEGIN_2(v,x) 1030 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1031 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1032 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1033 1034 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1035 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1036 1037 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1038 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1039 1040 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1041 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1042 1043 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1044 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1045 1046 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1047 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1048 1049 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1050 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1051 1052 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1053 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1054 SSE_INLINE_END_2; 1055 1056 v += 16; 1057 } 1058 /* row = (*bjtmp++)/4; */ 1059 row = (unsigned int)(*bjtmp++); 1060 while (row < i) { 1061 pc = rtmp + 16*row; 1062 SSE_INLINE_BEGIN_1(pc) 1063 /* Load block from lower triangle */ 1064 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1065 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1066 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1067 1068 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1069 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1070 1071 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1072 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1073 1074 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1075 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1076 1077 /* Compare block to zero block */ 1078 1079 SSE_COPY_PS(XMM4,XMM7) 1080 SSE_CMPNEQ_PS(XMM4,XMM0) 1081 1082 SSE_COPY_PS(XMM5,XMM7) 1083 SSE_CMPNEQ_PS(XMM5,XMM1) 1084 1085 SSE_COPY_PS(XMM6,XMM7) 1086 SSE_CMPNEQ_PS(XMM6,XMM2) 1087 1088 SSE_CMPNEQ_PS(XMM7,XMM3) 1089 1090 /* Reduce the comparisons to one SSE register */ 1091 SSE_OR_PS(XMM6,XMM7) 1092 SSE_OR_PS(XMM5,XMM4) 1093 SSE_OR_PS(XMM5,XMM6) 1094 SSE_INLINE_END_1; 1095 1096 /* Reduce the one SSE register to an integer register for branching */ 1097 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1098 MOVEMASK(nonzero,XMM5); 1099 1100 /* If block is nonzero ... 
*/ 1101 if (nonzero) { 1102 pv = ba + 16*diag_offset[row]; 1103 PREFETCH_L1(&pv[16]); 1104 pj = bj + diag_offset[row] + 1; 1105 1106 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1107 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1108 /* but the diagonal was inverted already */ 1109 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1110 1111 SSE_INLINE_BEGIN_2(pv,pc) 1112 /* Column 0, product is accumulated in XMM4 */ 1113 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1114 SSE_SHUFFLE(XMM4,XMM4,0x00) 1115 SSE_MULT_PS(XMM4,XMM0) 1116 1117 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1118 SSE_SHUFFLE(XMM5,XMM5,0x00) 1119 SSE_MULT_PS(XMM5,XMM1) 1120 SSE_ADD_PS(XMM4,XMM5) 1121 1122 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1123 SSE_SHUFFLE(XMM6,XMM6,0x00) 1124 SSE_MULT_PS(XMM6,XMM2) 1125 SSE_ADD_PS(XMM4,XMM6) 1126 1127 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1128 SSE_SHUFFLE(XMM7,XMM7,0x00) 1129 SSE_MULT_PS(XMM7,XMM3) 1130 SSE_ADD_PS(XMM4,XMM7) 1131 1132 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1133 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1134 1135 /* Column 1, product is accumulated in XMM5 */ 1136 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1137 SSE_SHUFFLE(XMM5,XMM5,0x00) 1138 SSE_MULT_PS(XMM5,XMM0) 1139 1140 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1141 SSE_SHUFFLE(XMM6,XMM6,0x00) 1142 SSE_MULT_PS(XMM6,XMM1) 1143 SSE_ADD_PS(XMM5,XMM6) 1144 1145 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1146 SSE_SHUFFLE(XMM7,XMM7,0x00) 1147 SSE_MULT_PS(XMM7,XMM2) 1148 SSE_ADD_PS(XMM5,XMM7) 1149 1150 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1151 SSE_SHUFFLE(XMM6,XMM6,0x00) 1152 SSE_MULT_PS(XMM6,XMM3) 1153 SSE_ADD_PS(XMM5,XMM6) 1154 1155 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1156 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1157 1158 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1159 1160 /* Column 2, product is accumulated in XMM6 */ 1161 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1162 SSE_SHUFFLE(XMM6,XMM6,0x00) 1163 SSE_MULT_PS(XMM6,XMM0) 1164 1165 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1166 SSE_SHUFFLE(XMM7,XMM7,0x00) 1167 SSE_MULT_PS(XMM7,XMM1) 1168 SSE_ADD_PS(XMM6,XMM7) 1169 1170 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1171 SSE_SHUFFLE(XMM7,XMM7,0x00) 1172 SSE_MULT_PS(XMM7,XMM2) 1173 SSE_ADD_PS(XMM6,XMM7) 1174 1175 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1176 SSE_SHUFFLE(XMM7,XMM7,0x00) 1177 SSE_MULT_PS(XMM7,XMM3) 1178 SSE_ADD_PS(XMM6,XMM7) 1179 1180 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1181 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1182 1183 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1184 /* Column 3, product is accumulated in XMM0 */ 1185 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1186 SSE_SHUFFLE(XMM7,XMM7,0x00) 1187 SSE_MULT_PS(XMM0,XMM7) 1188 1189 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1190 SSE_SHUFFLE(XMM7,XMM7,0x00) 1191 SSE_MULT_PS(XMM1,XMM7) 1192 SSE_ADD_PS(XMM0,XMM1) 1193 1194 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1195 SSE_SHUFFLE(XMM1,XMM1,0x00) 1196 SSE_MULT_PS(XMM1,XMM2) 1197 SSE_ADD_PS(XMM0,XMM1) 1198 1199 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1200 SSE_SHUFFLE(XMM7,XMM7,0x00) 1201 SSE_MULT_PS(XMM3,XMM7) 1202 SSE_ADD_PS(XMM0,XMM3) 1203 1204 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1205 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1206 1207 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1208 /* This is code to be maintained and read by humans afterall. 
*/ 1209 /* Copy Multiplier Col 3 into XMM3 */ 1210 SSE_COPY_PS(XMM3,XMM0) 1211 /* Copy Multiplier Col 2 into XMM2 */ 1212 SSE_COPY_PS(XMM2,XMM6) 1213 /* Copy Multiplier Col 1 into XMM1 */ 1214 SSE_COPY_PS(XMM1,XMM5) 1215 /* Copy Multiplier Col 0 into XMM0 */ 1216 SSE_COPY_PS(XMM0,XMM4) 1217 SSE_INLINE_END_2; 1218 1219 /* Update the row: */ 1220 nz = bi[row+1] - diag_offset[row] - 1; 1221 pv += 16; 1222 for (j=0; j<nz; j++) { 1223 PREFETCH_L1(&pv[16]); 1224 x = rtmp + 16*((unsigned int)pj[j]); 1225 /* x = rtmp + 4*pj[j]; */ 1226 1227 /* X:=X-M*PV, One column at a time */ 1228 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1229 SSE_INLINE_BEGIN_2(x,pv) 1230 /* Load First Column of X*/ 1231 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1232 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1233 1234 /* Matrix-Vector Product: */ 1235 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1236 SSE_SHUFFLE(XMM5,XMM5,0x00) 1237 SSE_MULT_PS(XMM5,XMM0) 1238 SSE_SUB_PS(XMM4,XMM5) 1239 1240 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1241 SSE_SHUFFLE(XMM6,XMM6,0x00) 1242 SSE_MULT_PS(XMM6,XMM1) 1243 SSE_SUB_PS(XMM4,XMM6) 1244 1245 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1246 SSE_SHUFFLE(XMM7,XMM7,0x00) 1247 SSE_MULT_PS(XMM7,XMM2) 1248 SSE_SUB_PS(XMM4,XMM7) 1249 1250 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1251 SSE_SHUFFLE(XMM5,XMM5,0x00) 1252 SSE_MULT_PS(XMM5,XMM3) 1253 SSE_SUB_PS(XMM4,XMM5) 1254 1255 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1256 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1257 1258 /* Second Column */ 1259 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1260 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1261 1262 /* Matrix-Vector Product: */ 1263 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1264 SSE_SHUFFLE(XMM6,XMM6,0x00) 1265 SSE_MULT_PS(XMM6,XMM0) 1266 SSE_SUB_PS(XMM5,XMM6) 1267 1268 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1269 SSE_SHUFFLE(XMM7,XMM7,0x00) 1270 SSE_MULT_PS(XMM7,XMM1) 1271 SSE_SUB_PS(XMM5,XMM7) 1272 1273 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1274 SSE_SHUFFLE(XMM4,XMM4,0x00) 1275 SSE_MULT_PS(XMM4,XMM2) 1276 SSE_SUB_PS(XMM5,XMM4) 1277 1278 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1279 SSE_SHUFFLE(XMM6,XMM6,0x00) 1280 SSE_MULT_PS(XMM6,XMM3) 1281 SSE_SUB_PS(XMM5,XMM6) 1282 1283 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1284 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1285 1286 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1287 1288 /* Third Column */ 1289 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1290 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1291 1292 /* Matrix-Vector Product: */ 1293 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1294 SSE_SHUFFLE(XMM7,XMM7,0x00) 1295 SSE_MULT_PS(XMM7,XMM0) 1296 SSE_SUB_PS(XMM6,XMM7) 1297 1298 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1299 SSE_SHUFFLE(XMM4,XMM4,0x00) 1300 SSE_MULT_PS(XMM4,XMM1) 1301 SSE_SUB_PS(XMM6,XMM4) 1302 1303 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1304 SSE_SHUFFLE(XMM5,XMM5,0x00) 1305 SSE_MULT_PS(XMM5,XMM2) 1306 SSE_SUB_PS(XMM6,XMM5) 1307 1308 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1309 SSE_SHUFFLE(XMM7,XMM7,0x00) 1310 SSE_MULT_PS(XMM7,XMM3) 1311 SSE_SUB_PS(XMM6,XMM7) 1312 1313 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1314 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1315 1316 /* Fourth Column */ 1317 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1318 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1319 1320 /* Matrix-Vector Product: */ 1321 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1322 SSE_SHUFFLE(XMM5,XMM5,0x00) 1323 SSE_MULT_PS(XMM5,XMM0) 1324 SSE_SUB_PS(XMM4,XMM5) 1325 1326 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1327 SSE_SHUFFLE(XMM6,XMM6,0x00) 1328 SSE_MULT_PS(XMM6,XMM1) 1329 SSE_SUB_PS(XMM4,XMM6) 1330 1331 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1332 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1333 SSE_MULT_PS(XMM7,XMM2) 1334 SSE_SUB_PS(XMM4,XMM7) 1335 1336 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1337 SSE_SHUFFLE(XMM5,XMM5,0x00) 1338 SSE_MULT_PS(XMM5,XMM3) 1339 SSE_SUB_PS(XMM4,XMM5) 1340 1341 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1342 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1343 SSE_INLINE_END_2; 1344 pv += 16; 1345 } 1346 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1347 } 1348 row = (unsigned int)(*bjtmp++); 1349 /* row = (*bjtmp++)/4; */ 1350 /* bjtmp++; */ 1351 } 1352 /* finished row so stick it into b->a */ 1353 pv = ba + 16*bi[i]; 1354 pj = bj + bi[i]; 1355 nz = bi[i+1] - bi[i]; 1356 1357 /* Copy x block back into pv block */ 1358 for (j=0; j<nz; j++) { 1359 x = rtmp+16*((unsigned int)pj[j]); 1360 /* x = rtmp+4*pj[j]; */ 1361 1362 SSE_INLINE_BEGIN_2(x,pv) 1363 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1364 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1365 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1366 1367 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1368 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1369 1370 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1371 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1372 1373 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1374 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1375 1376 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1377 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1378 1379 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1380 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1381 1382 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1383 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1384 1385 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1386 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1387 SSE_INLINE_END_2; 1388 pv += 16; 1389 } 1390 /* invert diagonal block */ 1391 w = ba + 16*diag_offset[i]; 1392 if (pivotinblocks) { 1393 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1394 } else { 1395 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1396 } 1397 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1398 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1399 } 1400 1401 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1402 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1403 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1404 C->assembled = PETSC_TRUE; 1405 ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); 1406 /* Flop Count from inverting diagonal blocks */ 1407 SSE_SCOPE_END; 1408 PetscFunctionReturn(0); 1409 } 1410 1411 #undef __FUNCT__ 1412 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" 1413 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1414 { 1415 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1416 PetscErrorCode ierr; 1417 int i,j,n = a->mbs; 1418 unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 1419 unsigned int row; 1420 int *ajtmpold,nz,*bi=b->i; 1421 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1422 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1423 MatScalar *ba = b->a,*aa = a->a; 1424 int nonzero=0; 1425 /* int nonzero=0,colscale = 16; */ 1426 PetscTruth pivotinblocks = b->pivotinblocks; 1427 PetscReal shift = info->shiftinblocks; 1428 1429 PetscFunctionBegin; 1430 SSE_SCOPE_BEGIN; 1431 1432 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1433 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. 
SSE will not work."); 1434 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1435 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1436 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1437 /* colscale = 4; */ 1438 /* } */ 1439 if ((unsigned long)bj==(unsigned long)aj) { 1440 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1441 } 1442 1443 for (i=0; i<n; i++) { 1444 nz = bi[i+1] - bi[i]; 1445 bjtmp = bj + bi[i]; 1446 /* zero out the 4x4 block accumulators */ 1447 /* zero out one register */ 1448 XOR_PS(XMM7,XMM7); 1449 for (j=0; j<nz; j++) { 1450 x = rtmp+16*((unsigned int)bjtmp[j]); 1451 /* x = rtmp+4*bjtmp[j]; */ 1452 SSE_INLINE_BEGIN_1(x) 1453 /* Copy zero register to memory locations */ 1454 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1455 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1456 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1457 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1458 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1459 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1460 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1461 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1462 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1463 SSE_INLINE_END_1; 1464 } 1465 /* load in initial (unfactored row) */ 1466 nz = ai[i+1] - ai[i]; 1467 ajtmpold = aj + ai[i]; 1468 v = aa + 16*ai[i]; 1469 for (j=0; j<nz; j++) { 1470 x = rtmp+16*ajtmpold[j]; 1471 /* x = rtmp+colscale*ajtmpold[j]; */ 1472 /* Copy v block into x block */ 1473 SSE_INLINE_BEGIN_2(v,x) 1474 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1475 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1476 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1477 1478 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1479 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1480 1481 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1482 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1483 1484 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1485 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1486 1487 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1488 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1489 1490 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1491 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1492 1493 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1494 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1495 1496 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1497 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1498 SSE_INLINE_END_2; 1499 1500 v += 16; 1501 } 1502 /* row = (*bjtmp++)/4; */ 1503 row = (unsigned int)(*bjtmp++); 1504 while (row < i) { 1505 pc = rtmp + 16*row; 1506 SSE_INLINE_BEGIN_1(pc) 1507 /* Load block from lower triangle */ 1508 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1509 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1510 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1511 1512 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1513 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1514 1515 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1516 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1517 1518 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1519 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1520 1521 /* Compare block to zero block */ 1522 1523 SSE_COPY_PS(XMM4,XMM7) 1524 SSE_CMPNEQ_PS(XMM4,XMM0) 1525 1526 SSE_COPY_PS(XMM5,XMM7) 1527 SSE_CMPNEQ_PS(XMM5,XMM1) 1528 1529 SSE_COPY_PS(XMM6,XMM7) 1530 SSE_CMPNEQ_PS(XMM6,XMM2) 1531 1532 SSE_CMPNEQ_PS(XMM7,XMM3) 1533 1534 /* Reduce the comparisons to one SSE register */ 1535 SSE_OR_PS(XMM6,XMM7) 1536 SSE_OR_PS(XMM5,XMM4) 1537 SSE_OR_PS(XMM5,XMM6) 1538 SSE_INLINE_END_1; 1539 1540 /* Reduce the one SSE register to an integer register 
for branching */ 1541 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1542 MOVEMASK(nonzero,XMM5); 1543 1544 /* If block is nonzero ... */ 1545 if (nonzero) { 1546 pv = ba + 16*diag_offset[row]; 1547 PREFETCH_L1(&pv[16]); 1548 pj = bj + diag_offset[row] + 1; 1549 1550 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1551 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1552 /* but the diagonal was inverted already */ 1553 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1554 1555 SSE_INLINE_BEGIN_2(pv,pc) 1556 /* Column 0, product is accumulated in XMM4 */ 1557 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1558 SSE_SHUFFLE(XMM4,XMM4,0x00) 1559 SSE_MULT_PS(XMM4,XMM0) 1560 1561 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1562 SSE_SHUFFLE(XMM5,XMM5,0x00) 1563 SSE_MULT_PS(XMM5,XMM1) 1564 SSE_ADD_PS(XMM4,XMM5) 1565 1566 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1567 SSE_SHUFFLE(XMM6,XMM6,0x00) 1568 SSE_MULT_PS(XMM6,XMM2) 1569 SSE_ADD_PS(XMM4,XMM6) 1570 1571 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1572 SSE_SHUFFLE(XMM7,XMM7,0x00) 1573 SSE_MULT_PS(XMM7,XMM3) 1574 SSE_ADD_PS(XMM4,XMM7) 1575 1576 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1577 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1578 1579 /* Column 1, product is accumulated in XMM5 */ 1580 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1581 SSE_SHUFFLE(XMM5,XMM5,0x00) 1582 SSE_MULT_PS(XMM5,XMM0) 1583 1584 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1585 SSE_SHUFFLE(XMM6,XMM6,0x00) 1586 SSE_MULT_PS(XMM6,XMM1) 1587 SSE_ADD_PS(XMM5,XMM6) 1588 1589 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1590 SSE_SHUFFLE(XMM7,XMM7,0x00) 1591 SSE_MULT_PS(XMM7,XMM2) 1592 SSE_ADD_PS(XMM5,XMM7) 1593 1594 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1595 SSE_SHUFFLE(XMM6,XMM6,0x00) 1596 SSE_MULT_PS(XMM6,XMM3) 1597 SSE_ADD_PS(XMM5,XMM6) 1598 1599 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1600 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1601 1602 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1603 1604 /* Column 2, product is accumulated in XMM6 */ 1605 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1606 SSE_SHUFFLE(XMM6,XMM6,0x00) 1607 SSE_MULT_PS(XMM6,XMM0) 1608 1609 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1610 SSE_SHUFFLE(XMM7,XMM7,0x00) 1611 SSE_MULT_PS(XMM7,XMM1) 1612 SSE_ADD_PS(XMM6,XMM7) 1613 1614 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1615 SSE_SHUFFLE(XMM7,XMM7,0x00) 1616 SSE_MULT_PS(XMM7,XMM2) 1617 SSE_ADD_PS(XMM6,XMM7) 1618 1619 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1620 SSE_SHUFFLE(XMM7,XMM7,0x00) 1621 SSE_MULT_PS(XMM7,XMM3) 1622 SSE_ADD_PS(XMM6,XMM7) 1623 1624 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1625 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1626 1627 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1628 /* Column 3, product is accumulated in XMM0 */ 1629 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1630 SSE_SHUFFLE(XMM7,XMM7,0x00) 1631 SSE_MULT_PS(XMM0,XMM7) 1632 1633 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1634 SSE_SHUFFLE(XMM7,XMM7,0x00) 1635 SSE_MULT_PS(XMM1,XMM7) 1636 SSE_ADD_PS(XMM0,XMM1) 1637 1638 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1639 SSE_SHUFFLE(XMM1,XMM1,0x00) 1640 SSE_MULT_PS(XMM1,XMM2) 1641 SSE_ADD_PS(XMM0,XMM1) 1642 1643 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1644 SSE_SHUFFLE(XMM7,XMM7,0x00) 1645 SSE_MULT_PS(XMM3,XMM7) 1646 SSE_ADD_PS(XMM0,XMM3) 1647 1648 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1649 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1650 1651 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1652 /* This is code to be maintained and read by humans afterall. 
*/ 1653 /* Copy Multiplier Col 3 into XMM3 */ 1654 SSE_COPY_PS(XMM3,XMM0) 1655 /* Copy Multiplier Col 2 into XMM2 */ 1656 SSE_COPY_PS(XMM2,XMM6) 1657 /* Copy Multiplier Col 1 into XMM1 */ 1658 SSE_COPY_PS(XMM1,XMM5) 1659 /* Copy Multiplier Col 0 into XMM0 */ 1660 SSE_COPY_PS(XMM0,XMM4) 1661 SSE_INLINE_END_2; 1662 1663 /* Update the row: */ 1664 nz = bi[row+1] - diag_offset[row] - 1; 1665 pv += 16; 1666 for (j=0; j<nz; j++) { 1667 PREFETCH_L1(&pv[16]); 1668 x = rtmp + 16*((unsigned int)pj[j]); 1669 /* x = rtmp + 4*pj[j]; */ 1670 1671 /* X:=X-M*PV, One column at a time */ 1672 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1673 SSE_INLINE_BEGIN_2(x,pv) 1674 /* Load First Column of X*/ 1675 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1676 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1677 1678 /* Matrix-Vector Product: */ 1679 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1680 SSE_SHUFFLE(XMM5,XMM5,0x00) 1681 SSE_MULT_PS(XMM5,XMM0) 1682 SSE_SUB_PS(XMM4,XMM5) 1683 1684 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1685 SSE_SHUFFLE(XMM6,XMM6,0x00) 1686 SSE_MULT_PS(XMM6,XMM1) 1687 SSE_SUB_PS(XMM4,XMM6) 1688 1689 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1690 SSE_SHUFFLE(XMM7,XMM7,0x00) 1691 SSE_MULT_PS(XMM7,XMM2) 1692 SSE_SUB_PS(XMM4,XMM7) 1693 1694 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1695 SSE_SHUFFLE(XMM5,XMM5,0x00) 1696 SSE_MULT_PS(XMM5,XMM3) 1697 SSE_SUB_PS(XMM4,XMM5) 1698 1699 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1700 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1701 1702 /* Second Column */ 1703 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1704 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1705 1706 /* Matrix-Vector Product: */ 1707 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1708 SSE_SHUFFLE(XMM6,XMM6,0x00) 1709 SSE_MULT_PS(XMM6,XMM0) 1710 SSE_SUB_PS(XMM5,XMM6) 1711 1712 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1713 SSE_SHUFFLE(XMM7,XMM7,0x00) 1714 SSE_MULT_PS(XMM7,XMM1) 1715 SSE_SUB_PS(XMM5,XMM7) 1716 1717 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1718 SSE_SHUFFLE(XMM4,XMM4,0x00) 1719 SSE_MULT_PS(XMM4,XMM2) 1720 SSE_SUB_PS(XMM5,XMM4) 1721 1722 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1723 SSE_SHUFFLE(XMM6,XMM6,0x00) 1724 SSE_MULT_PS(XMM6,XMM3) 1725 SSE_SUB_PS(XMM5,XMM6) 1726 1727 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1728 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1729 1730 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1731 1732 /* Third Column */ 1733 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1734 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1735 1736 /* Matrix-Vector Product: */ 1737 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1738 SSE_SHUFFLE(XMM7,XMM7,0x00) 1739 SSE_MULT_PS(XMM7,XMM0) 1740 SSE_SUB_PS(XMM6,XMM7) 1741 1742 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1743 SSE_SHUFFLE(XMM4,XMM4,0x00) 1744 SSE_MULT_PS(XMM4,XMM1) 1745 SSE_SUB_PS(XMM6,XMM4) 1746 1747 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1748 SSE_SHUFFLE(XMM5,XMM5,0x00) 1749 SSE_MULT_PS(XMM5,XMM2) 1750 SSE_SUB_PS(XMM6,XMM5) 1751 1752 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1753 SSE_SHUFFLE(XMM7,XMM7,0x00) 1754 SSE_MULT_PS(XMM7,XMM3) 1755 SSE_SUB_PS(XMM6,XMM7) 1756 1757 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1758 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1759 1760 /* Fourth Column */ 1761 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1762 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1763 1764 /* Matrix-Vector Product: */ 1765 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1766 SSE_SHUFFLE(XMM5,XMM5,0x00) 1767 SSE_MULT_PS(XMM5,XMM0) 1768 SSE_SUB_PS(XMM4,XMM5) 1769 1770 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1771 SSE_SHUFFLE(XMM6,XMM6,0x00) 1772 SSE_MULT_PS(XMM6,XMM1) 1773 SSE_SUB_PS(XMM4,XMM6) 1774 1775 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1776 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1777 SSE_MULT_PS(XMM7,XMM2) 1778 SSE_SUB_PS(XMM4,XMM7) 1779 1780 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1781 SSE_SHUFFLE(XMM5,XMM5,0x00) 1782 SSE_MULT_PS(XMM5,XMM3) 1783 SSE_SUB_PS(XMM4,XMM5) 1784 1785 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1786 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1787 SSE_INLINE_END_2; 1788 pv += 16; 1789 } 1790 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1791 } 1792 row = (unsigned int)(*bjtmp++); 1793 /* row = (*bjtmp++)/4; */ 1794 /* bjtmp++; */ 1795 } 1796 /* finished row so stick it into b->a */ 1797 pv = ba + 16*bi[i]; 1798 pj = bj + bi[i]; 1799 nz = bi[i+1] - bi[i]; 1800 1801 /* Copy x block back into pv block */ 1802 for (j=0; j<nz; j++) { 1803 x = rtmp+16*((unsigned int)pj[j]); 1804 /* x = rtmp+4*pj[j]; */ 1805 1806 SSE_INLINE_BEGIN_2(x,pv) 1807 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1808 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1809 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1810 1811 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1812 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1813 1814 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1815 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1816 1817 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1818 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1819 1820 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1821 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1822 1823 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1824 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1825 1826 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1827 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1828 1829 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1830 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1831 SSE_INLINE_END_2; 1832 pv += 16; 1833 } 1834 /* invert diagonal block */ 1835 w = ba + 16*diag_offset[i]; 1836 if (pivotinblocks) { 1837 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1838 } else { 1839 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1840 } 1841 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1842 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1843 } 1844 1845 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1846 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1847 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1848 C->assembled = PETSC_TRUE; 1849 ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); 1850 /* Flop Count from inverting diagonal blocks */ 1851 SSE_SCOPE_END; 1852 PetscFunctionReturn(0); 1853 } 1854 1855 #endif 1856
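/*
   Reference sketch (documentation only, not compiled): the hand-unrolled expressions and the
   SSE blocks above all implement two dense 4x4 block operations on column-major blocks, where
   entry (r,c) of a block is stored at index 4*c+r.  The helpers below, with illustrative names
   invented here (BlockSketch_*), are a plain-C sketch of what Kernel_A_gets_A_times_B_4() and
   Kernel_A_gets_A_minus_B_times_C_4() are assumed to compute; they are not the kernels PETSc
   actually uses (those live in ../src/mat/blockinvert.h).
*/
#if 0
static void BlockSketch_A_gets_A_times_B_4(MatScalar *a,const MatScalar *b,MatScalar *w)
{
  PetscInt r,c,k;
  /* w is a 16-entry scratch block (the role mwork plays above): w = a*b, then copy back into a */
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      w[4*c+r] = 0.0;
      for (k=0; k<4; k++) w[4*c+r] += a[4*k+r]*b[4*c+k];
    }
  }
  for (k=0; k<16; k++) a[k] = w[k];
}

static void BlockSketch_A_gets_A_minus_B_times_C_4(MatScalar *a,const MatScalar *b,const MatScalar *c)
{
  PetscInt r,col,k;
  /* a -= b*c, matching the x[..] -= m..*x.. updates in the unrolled loops */
  for (col=0; col<4; col++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) a[4*col+r] -= b[4*k+r]*c[4*col+k];
    }
  }
}
#endif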