#define PETSCMAT_DLL

/*
    Factorization code for BAIJ format.
*/
#include "../src/mat/impls/baij/seq/baij.h"
#include "../src/mat/blockinvert.h"

/* ------------------------------------------------------------*/
/*
   Version for when blocks are 4 by 4
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0]; x[1]  = v[1]; x[2]  = v[2]; x[3]  = v[3];
      x[4]  = v[4]; x[5]  = v[5]; x[6]  = v[6]; x[7]  = v[7]; x[8] = v[8];
      x[9]  = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0]; p2  = pc[1]; p3  = pc[2]; p4  = pc[3];
      p5  = pc[4]; p6  = pc[5]; p7  = pc[6]; p8  = pc[7]; p9 = pc[8];
      p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0]; x2  = pv[1]; x3  = pv[2]; x4  = pv[3];
        x5  = pv[4]; x6  = pv[5]; x7  = pv[6]; x8  = pv[7]; x9 = pv[8];
        x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0]; x2  = pv[1]; x3  = pv[2]; x4  = pv[3];
          x5  = pv[4]; x6  = pv[5]; x7  = pv[6]; x8  = pv[7]; x9 = pv[8];
          x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0]; pv[1]  = x[1]; pv[2]  = x[2]; pv[3]  = x[3];
      pv[4]  = x[4]; pv[5]  = x[5]; pv[6]  = x[6]; pv[7]  = x[7]; pv[8] = x[8];
      pv[9]  = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
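/*
   For reference: each 4x4 block is stored contiguously in column-major order,
   so entry (r,c) of the block starting at pc is pc[4*c+r].  The m1..m16
   statements above compute the multiplier L(i,row) = L(i,row)*inv(U(row,row))
   one column at a time in that layout.  A scalar sketch of the same block
   product follows (illustrative only; block4_AgetsAB is a hypothetical name,
   not a PETSc kernel):
*/
#if 0
static void block4_AgetsAB(MatScalar *A,const MatScalar *B)
{
  MatScalar T[16];
  PetscInt  r,c,k;
  for (c=0; c<4; c++) {                                  /* columns of the product */
    for (r=0; r<4; r++) {
      T[4*c+r] = 0.0;
      for (k=0; k<4; k++) T[4*c+r] += A[4*k+r]*B[4*c+k]; /* A(r,k)*B(k,c) */
    }
  }
  for (k=0; k<16; k++) A[k] = T[k];                      /* A <- A*B */
}
#endif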
/*
    MatLUFactorNumeric_SeqBAIJ_4_newdatastruct -
      copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented
        Kernel_A_gets_A_times_B()
        Kernel_A_gets_A_minus_B_times_C()
        Kernel_A_gets_inverse_A()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       bs2 = a->bs2,flg;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  /* generate work space needed by the factorization */
  ierr  = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  mwork = rtmp + bs2*n;
  ierr  = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bi[2*n-i+1] - bi[2*n-i];
    bjtmp = bj + bi[2*n-i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bi[2*n-row];              /* beginning of U(row,:) */
        pv = b->a + bs2*bi[2*n-row];
        nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2*bdiag[i];
    pj = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*bi[2*n-i];
    pj = b->j + bi[2*n-i];
    nz = bi[2*n-i+1] - bi[2*n-i] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
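/*
   Layout note for the _newdatastruct routines in this file: the L part of
   block row i lives at bi[i]..bi[i+1]-1 as usual, while the U part is kept
   separately at bi[2*n-i]..bi[2*n-i+1]-1, with the (already inverted)
   diagonal block indexed by bdiag[i].  A minimal sketch of walking U(i,:)
   under that assumption (walk_U_row is a hypothetical helper, shown for
   illustration only):
*/
#if 0
static void walk_U_row(Mat_SeqBAIJ *b,PetscInt n,PetscInt i)
{
  PetscInt  bs2 = b->bs2;
  PetscInt  nz  = b->i[2*n-i+1] - b->i[2*n-i] - 1; /* excludes the diagonal block */
  PetscInt  *pj = b->j + b->i[2*n-i];              /* block columns of U(i,:) */
  MatScalar *pv = b->a + bs2*b->i[2*n-i];          /* dense blocks, bs2 apart */
  PetscInt  j;
  for (j=0; j<nz; j++) { /* block (i,pj[j]) is pv+bs2*j */ }
}
#endif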
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
{
/*
    Default Version for when blocks are 4 by 4 Using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmpold[j];
      x[0]  = v[0]; x[1]  = v[1]; x[2]  = v[2]; x[3]  = v[3];
      x[4]  = v[4]; x[5]  = v[5]; x[6]  = v[6]; x[7]  = v[7]; x[8] = v[8];
      x[9]  = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0]; p2  = pc[1]; p3  = pc[2]; p4  = pc[3];
      p5  = pc[4]; p6  = pc[5]; p7  = pc[6]; p8  = pc[7]; p9 = pc[8];
      p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0]; x2  = pv[1]; x3  = pv[2]; x4  = pv[3];
        x5  = pv[4]; x6  = pv[5]; x7  = pv[6]; x8  = pv[7]; x9 = pv[8];
        x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0]; x2  = pv[1]; x3  = pv[2]; x4  = pv[3];
          x5  = pv[4]; x6  = pv[5]; x7  = pv[6]; x8  = pv[7]; x9 = pv[8];
          x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0]; pv[1]  = x[1]; pv[2]  = x[2]; pv[3]  = x[3];
      pv[4]  = x[4]; pv[5]  = x[5]; pv[6]  = x[6]; pv[7]  = x[7]; pv[8] = x[8];
      pv[9]  = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
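/*
   Flop accounting used above: forming the 4x4 multiplier against the
   inverted diagonal costs 2*4^3 - 4^2 = 112 flops, and each of the nz
   trailing block updates costs 2*4^3 = 128 flops, hence
   PetscLogFlops(128.0*nz+112.0) per eliminated block row entry.
*/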
/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       bs2 = a->bs2,flg;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr  = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  mwork = rtmp + bs2*n;
  ierr  = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bi[2*n-i+1] - bi[2*n-i];
    bjtmp = bj + bi[2*n-i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bi[2*n-row];              /* beginning of U(row,:) */
        pv = b->a + bs2*bi[2*n-row];
        nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2*bdiag[i];
    pj = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*bi[2*n-i];
    pj = b->j + bi[2*n-i];
    nz = bi[2*n-i+1] - bi[2*n-i] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
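/*
   The _v2 variant below differs from the routine above only in how U is
   addressed: instead of bi[2*n-i], row i of U starts at bdiag[i+1]+1 and
   holds bdiag[i]-bdiag[i+1]-1 off-diagonal blocks, so the decreasing
   bdiag[] array doubles as the U row pointer.  For a concrete example with
   n=3 and bdiag = {8,5,2,-1} (hypothetical values), U(0,:) occupies entries
   6..7 and its inverted diagonal block sits at entry 8.
*/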
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       bs2 = a->bs2,flg;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr  = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  mwork = rtmp + bs2*n;
  ierr  = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2*bdiag[i];
    pj = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B; /* C was used throughout but never declared; alias it to B as the other kernels do */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int         nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*pj[j];
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X*/
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
      /* row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule here, the flop count below may be unfairly high or low */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
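/*
   The MOVEMASK-based test in these SSE kernels is the vector analogue of the
   scalar zero check used in the non-SSE routines: the candidate block is
   loaded columnwise into XMM0-XMM3, each column is compared against zero
   with CMPNEQ, the comparison masks are OR-ed together, and MOVEMASK
   collapses the result into an int.  The scalar equivalent, for reference:

       for (flg=0,j=0; j<16; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
*/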
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
{
  Mat            A=C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int         nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = 0.0; /* no MatFactorInfo reaches this in-place variant, so assume no shift in blocks */

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)ajtmp[j]);
      /* x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X*/
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule here, the flop count below may be unfairly high or low */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
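/*
   The _usj variants reinterpret the factor's column index array as unsigned
   short, presumably to halve index bandwidth; every index is cast back
   through unsigned int before being scaled by 16.  The _Inplace version
   above exists for the case where b->j aliases a->j (in-place
   factorization), so the unfactored row must also be loaded through the
   shortened index array; the routine below dispatches to it when it detects
   that aliasing.
*/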
SSE will not work."); 1532 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1533 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1534 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1535 /* colscale = 4; */ 1536 /* } */ 1537 if ((unsigned long)bj==(unsigned long)aj) { 1538 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1539 } 1540 1541 for (i=0; i<n; i++) { 1542 nz = bi[i+1] - bi[i]; 1543 bjtmp = bj + bi[i]; 1544 /* zero out the 4x4 block accumulators */ 1545 /* zero out one register */ 1546 XOR_PS(XMM7,XMM7); 1547 for (j=0; j<nz; j++) { 1548 x = rtmp+16*((unsigned int)bjtmp[j]); 1549 /* x = rtmp+4*bjtmp[j]; */ 1550 SSE_INLINE_BEGIN_1(x) 1551 /* Copy zero register to memory locations */ 1552 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1553 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1554 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1555 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1556 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1557 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1558 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1559 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1560 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1561 SSE_INLINE_END_1; 1562 } 1563 /* load in initial (unfactored row) */ 1564 nz = ai[i+1] - ai[i]; 1565 ajtmpold = aj + ai[i]; 1566 v = aa + 16*ai[i]; 1567 for (j=0; j<nz; j++) { 1568 x = rtmp+16*ajtmpold[j]; 1569 /* x = rtmp+colscale*ajtmpold[j]; */ 1570 /* Copy v block into x block */ 1571 SSE_INLINE_BEGIN_2(v,x) 1572 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1573 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1574 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1575 1576 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1577 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1578 1579 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1580 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1581 1582 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1583 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1584 1585 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1586 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1587 1588 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1589 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1590 1591 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1592 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1593 1594 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1595 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1596 SSE_INLINE_END_2; 1597 1598 v += 16; 1599 } 1600 /* row = (*bjtmp++)/4; */ 1601 row = (unsigned int)(*bjtmp++); 1602 while (row < i) { 1603 pc = rtmp + 16*row; 1604 SSE_INLINE_BEGIN_1(pc) 1605 /* Load block from lower triangle */ 1606 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1607 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1608 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1609 1610 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1611 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1612 1613 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1614 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1615 1616 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1617 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1618 1619 /* Compare block to zero block */ 1620 1621 SSE_COPY_PS(XMM4,XMM7) 1622 SSE_CMPNEQ_PS(XMM4,XMM0) 1623 1624 SSE_COPY_PS(XMM5,XMM7) 1625 SSE_CMPNEQ_PS(XMM5,XMM1) 1626 1627 SSE_COPY_PS(XMM6,XMM7) 1628 SSE_CMPNEQ_PS(XMM6,XMM2) 1629 1630 SSE_CMPNEQ_PS(XMM7,XMM3) 1631 1632 /* Reduce the comparisons to one SSE register */ 1633 SSE_OR_PS(XMM6,XMM7) 1634 SSE_OR_PS(XMM5,XMM4) 1635 SSE_OR_PS(XMM5,XMM6) 1636 SSE_INLINE_END_1; 1637 1638 /* Reduce the one SSE register to an integer register 
for branching */ 1639 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1640 MOVEMASK(nonzero,XMM5); 1641 1642 /* If block is nonzero ... */ 1643 if (nonzero) { 1644 pv = ba + 16*diag_offset[row]; 1645 PREFETCH_L1(&pv[16]); 1646 pj = bj + diag_offset[row] + 1; 1647 1648 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1649 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1650 /* but the diagonal was inverted already */ 1651 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1652 1653 SSE_INLINE_BEGIN_2(pv,pc) 1654 /* Column 0, product is accumulated in XMM4 */ 1655 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1656 SSE_SHUFFLE(XMM4,XMM4,0x00) 1657 SSE_MULT_PS(XMM4,XMM0) 1658 1659 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1660 SSE_SHUFFLE(XMM5,XMM5,0x00) 1661 SSE_MULT_PS(XMM5,XMM1) 1662 SSE_ADD_PS(XMM4,XMM5) 1663 1664 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1665 SSE_SHUFFLE(XMM6,XMM6,0x00) 1666 SSE_MULT_PS(XMM6,XMM2) 1667 SSE_ADD_PS(XMM4,XMM6) 1668 1669 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1670 SSE_SHUFFLE(XMM7,XMM7,0x00) 1671 SSE_MULT_PS(XMM7,XMM3) 1672 SSE_ADD_PS(XMM4,XMM7) 1673 1674 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1675 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1676 1677 /* Column 1, product is accumulated in XMM5 */ 1678 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1679 SSE_SHUFFLE(XMM5,XMM5,0x00) 1680 SSE_MULT_PS(XMM5,XMM0) 1681 1682 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1683 SSE_SHUFFLE(XMM6,XMM6,0x00) 1684 SSE_MULT_PS(XMM6,XMM1) 1685 SSE_ADD_PS(XMM5,XMM6) 1686 1687 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1688 SSE_SHUFFLE(XMM7,XMM7,0x00) 1689 SSE_MULT_PS(XMM7,XMM2) 1690 SSE_ADD_PS(XMM5,XMM7) 1691 1692 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1693 SSE_SHUFFLE(XMM6,XMM6,0x00) 1694 SSE_MULT_PS(XMM6,XMM3) 1695 SSE_ADD_PS(XMM5,XMM6) 1696 1697 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1698 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1699 1700 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1701 1702 /* Column 2, product is accumulated in XMM6 */ 1703 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1704 SSE_SHUFFLE(XMM6,XMM6,0x00) 1705 SSE_MULT_PS(XMM6,XMM0) 1706 1707 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1708 SSE_SHUFFLE(XMM7,XMM7,0x00) 1709 SSE_MULT_PS(XMM7,XMM1) 1710 SSE_ADD_PS(XMM6,XMM7) 1711 1712 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1713 SSE_SHUFFLE(XMM7,XMM7,0x00) 1714 SSE_MULT_PS(XMM7,XMM2) 1715 SSE_ADD_PS(XMM6,XMM7) 1716 1717 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1718 SSE_SHUFFLE(XMM7,XMM7,0x00) 1719 SSE_MULT_PS(XMM7,XMM3) 1720 SSE_ADD_PS(XMM6,XMM7) 1721 1722 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1723 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1724 1725 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1726 /* Column 3, product is accumulated in XMM0 */ 1727 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1728 SSE_SHUFFLE(XMM7,XMM7,0x00) 1729 SSE_MULT_PS(XMM0,XMM7) 1730 1731 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1732 SSE_SHUFFLE(XMM7,XMM7,0x00) 1733 SSE_MULT_PS(XMM1,XMM7) 1734 SSE_ADD_PS(XMM0,XMM1) 1735 1736 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1737 SSE_SHUFFLE(XMM1,XMM1,0x00) 1738 SSE_MULT_PS(XMM1,XMM2) 1739 SSE_ADD_PS(XMM0,XMM1) 1740 1741 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1742 SSE_SHUFFLE(XMM7,XMM7,0x00) 1743 SSE_MULT_PS(XMM3,XMM7) 1744 SSE_ADD_PS(XMM0,XMM3) 1745 1746 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1747 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1748 1749 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1750 /* This is code to be maintained and read by humans afterall. 
          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
/*          x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
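        /* Flop accounting for the update loop above: each of the nz blocks
           costs 16 entries x (4 mults + 4 adds/subtracts) = 128 flops, and
           forming the multiplier M costs 16 x (4 mults + 3 adds) = 112,
           which gives the 128.0*nz+112.0 logged below. */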
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
/*      row = (*bjtmp++)/4; */
/*      bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
/*      x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
/*    ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: if Cramer's rule is used, the flop count below might be unfairly high or low */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif