#define PETSCMAT_DLL

/*
    Factorization code for BAIJ format.
*/
#include "../src/mat/impls/baij/seq/baij.h"
#include "../src/mat/blockinvert.h"

/* ------------------------------------------------------------*/
/*
      Version for when blocks are 4 by 4
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6]  = x[7]  = x[8]  = x[9]  = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv = ba + 16*diag_offset[row];
        pj = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
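/*
   The fully unrolled arithmetic above amounts to two dense kernels applied to 4x4 blocks
   stored column-major (entry (r,c) at index 4*c+r): forming the multiplier pc <- pc*pv,
   where pv is the already-inverted diagonal block of the pivot row, and the update
   x <- x - m*pv for every block to the right of the pivot.  The sketch below is
   illustrative only; the helper names are hypothetical and are not the PETSc Kernel_*
   routines.
*/
#if 0
static void blk4_A_gets_A_times_B(MatScalar *A,const MatScalar *B)
{
  MatScalar T[16];
  int       r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += A[4*k+r]*B[4*c+k];   /* (A*B)(r,c), column-major blocks */
      T[4*c+r] = s;
    }
  }
  for (k=0; k<16; k++) A[k] = T[k];
}
static void blk4_A_gets_A_minus_B_times_C(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  int r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += B[4*k+r]*C[4*c+k];   /* (B*C)(r,c) */
      A[4*c+r] -= s;
    }
  }
}
#endif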
/*
   MatLUFactorNumeric_SeqBAIJ_4_newdatastruct -
     copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented
       Kernel_A_gets_A_times_B()
       Kernel_A_gets_A_minus_B_times_C()
       Kernel_A_gets_inverse_A()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic,*ics;
  PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       bs2 = a->bs2,flg;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  /* generate work space needed by the factorization */
  ierr  = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  mwork = rtmp + bs2*n;
  ierr  = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
  ics   = ic;

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bi[2*n-i+1] - bi[2*n-i];
    bjtmp = bj + bi[2*n-i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bi[2*n-row];              /* beginning of U(row,:) */
        pv = b->a + bs2*bi[2*n-row];
        nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*bi[2*n-i];
    pj = b->j + bi[2*n-i];
    nz = bi[2*n-i+1] - bi[2*n-i] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
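/*
   In the "newdatastruct" layout used above, factored row i is kept in three pieces: the
   L blocks of row i at bi[i] .. bi[i+1]-1, the inverted diagonal block at bdiag[i], and the
   strictly upper triangular blocks of row i mirrored at bi[2*n-i], of which
   bi[2*n-i+1]-bi[2*n-i]-1 are off-diagonal.  A minimal sketch of walking one factored row
   under that reading of the index arrays (the helper name is hypothetical):
*/
#if 0
static void WalkFactoredRow_newdatastruct(PetscInt i,PetscInt n,const PetscInt *bi,
                                          const PetscInt *bj,const PetscInt *bdiag,
                                          const MatScalar *ba,PetscInt bs2)
{
  PetscInt        j,nz;
  const MatScalar *blk;

  nz = bi[i+1] - bi[i];                 /* L(i,:) */
  for (j=0; j<nz; j++) {
    blk = ba + bs2*(bi[i]+j);           /* block column bj[bi[i]+j] */
  }
  blk = ba + bs2*bdiag[i];              /* inv(D(i,i)), inverted by the factorization */
  nz  = bi[2*n-i+1] - bi[2*n-i] - 1;    /* U(i,:) excluding the diagonal */
  for (j=0; j<nz; j++) {
    blk = ba + bs2*(bi[2*n-i]+j);       /* block column bj[bi[2*n-i]+j] */
  }
}
#endif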
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct_v2"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic,*ics;
  PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       bs2 = a->bs2,flg;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  /* generate work space needed by the factorization */
  ierr  = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  mwork = rtmp + bs2*n;
  ierr  = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
  ics   = ic;

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i]-bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
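/*
   The _v2 variant above differs only in how U is addressed: instead of mirroring row i of U
   at bi[2*n-i], it is reached through bdiag, running from bdiag[i+1]+1 for
   bdiag[i]-bdiag[i+1]-1 off-diagonal blocks, with the inverted diagonal block kept at
   bdiag[i].  A hedged fragment of that addressing, reusing the names from the function above:
*/
#if 0
  {
    PetscInt        ustart = bdiag[i+1] + 1;            /* first off-diagonal U block of row i         */
    PetscInt        unz    = bdiag[i] - bdiag[i+1] - 1; /* number of off-diagonal U blocks in row i    */
    const MatScalar *uv    = b->a + bs2*ustart;         /* their values; column indices at b->j+ustart */
    const MatScalar *dinv  = b->a + bs2*bdiag[i];       /* already-inverted diagonal block             */
  }
#endif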
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
{
/*
    Default Version for when blocks are 4 by 4 Using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6]  = x[7]  = x[8]  = x[9]  = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv = ba + 16*diag_offset[row];
        pj = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
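/*
   None of these kernels is called directly; they are reached when a sequential BAIJ matrix
   with block size 4 is LU-factored.  A minimal usage sketch (error handling trimmed; m and
   nnz stand for the block dimension and per-block-row nonzero counts, and the ordering/type
   names follow the PETSc release this file belongs to, so they may differ elsewhere):
*/
#if 0
  Mat           A;
  IS            rowperm,colperm;
  MatFactorInfo info;

  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,4,m,m,0,nnz,&A);CHKERRQ(ierr);
  /* ... MatSetValuesBlocked(), MatAssemblyBegin(), MatAssemblyEnd() ... */
  ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
  ierr = MatGetOrdering(A,MATORDERING_NATURAL,&rowperm,&colperm);CHKERRQ(ierr);
  ierr = MatLUFactor(A,rowperm,colperm,&info);CHKERRQ(ierr); /* natural ordering + bs=4 is intended to select the kernels in this file */
#endif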
/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       bs2 = a->bs2,flg;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr  = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  mwork = rtmp + bs2*n;
  ierr  = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bi[2*n-i+1] - bi[2*n-i];
    bjtmp = bj + bi[2*n-i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bi[2*n-row];              /* beginning of U(row,:) */
        pv = b->a + bs2*bi[2*n-row];
        nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*bi[2*n-i];
    pj = b->j + bi[2*n-i];
    nz = bi[2*n-i+1] - bi[2*n-i] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
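/*
   The PetscLogFlops(128*nz+112) calls above come from counting one 4x4 block operation
   entry by entry (bs = 4, bs^2 = 16 entries per block):

     multiplier  A <- A*B     : each entry costs 4 multiplies + 3 adds = 7 flops,
                                16*7 = 112 = 2*bs^3 - bs^2
     update      A <- A - B*C : each entry costs 4 multiplies + 4 add/subtracts = 8 flops,
                                16*8 = 128 = 2*bs^3 per off-diagonal block

   so eliminating one pivot block from a row costs 112 + 128*nz flops, nz being the number
   of U blocks in the pivot row excluding the diagonal.
*/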
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       bs2 = a->bs2,flg;
  PetscReal      shift = info->shiftinblocks;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr  = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  mwork = rtmp + bs2*n;
  ierr  = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
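/*
   The kernels below use PETSc's SSE macro layer (SSE_INLINE_BEGIN_*, SSE_LOADL_PS, ...),
   so they are compiled only when PETSC_HAVE_SSE is defined; they assume a single-precision
   MatScalar (one 4x4 block is exactly four XMM registers) and 16-byte aligned value arrays,
   which is what the SETERRQ checks below enforce.  The zero-block test that guards each
   elimination step reduces a whole 4x4 block to a single branch; an illustrative equivalent
   written with raw compiler intrinsics (not the macro layer used in this file) would be:
*/
#if 0
#include <xmmintrin.h>
static int blk4_is_nonzero(const float *pc)  /* pc: one 4x4 block = 16 contiguous floats */
{
  __m128 zero = _mm_setzero_ps();
  __m128 acc  = _mm_cmpneq_ps(_mm_loadu_ps(pc+ 0),zero);
  acc = _mm_or_ps(acc,_mm_cmpneq_ps(_mm_loadu_ps(pc+ 4),zero));
  acc = _mm_or_ps(acc,_mm_cmpneq_ps(_mm_loadu_ps(pc+ 8),zero));
  acc = _mm_or_ps(acc,_mm_cmpneq_ps(_mm_loadu_ps(pc+12),zero));
  return _mm_movemask_ps(acc) != 0;          /* any nonzero lane -> eliminate this block */
}
#endif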
#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat         C = B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int         i,j,n = a->mbs;
  int         *bj = b->j,*bjtmp,*pj;
  int         row;
  int         *ajtmpold,nz,*bi=b->i;
  int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar   *ba = b->a,*aa = a->a;
  int         nonzero=0;
  /* int         nonzero=0,colscale = 16; */
  PetscTruth  pivotinblocks = b->pivotinblocks;
  PetscReal   shift = info->shiftinblocks;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
/*        x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
/*      row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ...
*/ 866 if (nonzero) { 867 pv = ba + 16*diag_offset[row]; 868 PREFETCH_L1(&pv[16]); 869 pj = bj + diag_offset[row] + 1; 870 871 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 872 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 873 /* but the diagonal was inverted already */ 874 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 875 876 SSE_INLINE_BEGIN_2(pv,pc) 877 /* Column 0, product is accumulated in XMM4 */ 878 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 879 SSE_SHUFFLE(XMM4,XMM4,0x00) 880 SSE_MULT_PS(XMM4,XMM0) 881 882 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 883 SSE_SHUFFLE(XMM5,XMM5,0x00) 884 SSE_MULT_PS(XMM5,XMM1) 885 SSE_ADD_PS(XMM4,XMM5) 886 887 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 888 SSE_SHUFFLE(XMM6,XMM6,0x00) 889 SSE_MULT_PS(XMM6,XMM2) 890 SSE_ADD_PS(XMM4,XMM6) 891 892 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 893 SSE_SHUFFLE(XMM7,XMM7,0x00) 894 SSE_MULT_PS(XMM7,XMM3) 895 SSE_ADD_PS(XMM4,XMM7) 896 897 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 898 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 899 900 /* Column 1, product is accumulated in XMM5 */ 901 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 902 SSE_SHUFFLE(XMM5,XMM5,0x00) 903 SSE_MULT_PS(XMM5,XMM0) 904 905 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 906 SSE_SHUFFLE(XMM6,XMM6,0x00) 907 SSE_MULT_PS(XMM6,XMM1) 908 SSE_ADD_PS(XMM5,XMM6) 909 910 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 911 SSE_SHUFFLE(XMM7,XMM7,0x00) 912 SSE_MULT_PS(XMM7,XMM2) 913 SSE_ADD_PS(XMM5,XMM7) 914 915 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 916 SSE_SHUFFLE(XMM6,XMM6,0x00) 917 SSE_MULT_PS(XMM6,XMM3) 918 SSE_ADD_PS(XMM5,XMM6) 919 920 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 921 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 922 923 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 924 925 /* Column 2, product is accumulated in XMM6 */ 926 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 927 SSE_SHUFFLE(XMM6,XMM6,0x00) 928 SSE_MULT_PS(XMM6,XMM0) 929 930 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 931 SSE_SHUFFLE(XMM7,XMM7,0x00) 932 SSE_MULT_PS(XMM7,XMM1) 933 SSE_ADD_PS(XMM6,XMM7) 934 935 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 936 SSE_SHUFFLE(XMM7,XMM7,0x00) 937 SSE_MULT_PS(XMM7,XMM2) 938 SSE_ADD_PS(XMM6,XMM7) 939 940 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 941 SSE_SHUFFLE(XMM7,XMM7,0x00) 942 SSE_MULT_PS(XMM7,XMM3) 943 SSE_ADD_PS(XMM6,XMM7) 944 945 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 946 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 947 948 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 949 /* Column 3, product is accumulated in XMM0 */ 950 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 951 SSE_SHUFFLE(XMM7,XMM7,0x00) 952 SSE_MULT_PS(XMM0,XMM7) 953 954 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 955 SSE_SHUFFLE(XMM7,XMM7,0x00) 956 SSE_MULT_PS(XMM1,XMM7) 957 SSE_ADD_PS(XMM0,XMM1) 958 959 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 960 SSE_SHUFFLE(XMM1,XMM1,0x00) 961 SSE_MULT_PS(XMM1,XMM2) 962 SSE_ADD_PS(XMM0,XMM1) 963 964 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 965 SSE_SHUFFLE(XMM7,XMM7,0x00) 966 SSE_MULT_PS(XMM3,XMM7) 967 SSE_ADD_PS(XMM0,XMM3) 968 969 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 970 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 971 972 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 973 /* This is code to be maintained and read by humans afterall. 
*/ 974 /* Copy Multiplier Col 3 into XMM3 */ 975 SSE_COPY_PS(XMM3,XMM0) 976 /* Copy Multiplier Col 2 into XMM2 */ 977 SSE_COPY_PS(XMM2,XMM6) 978 /* Copy Multiplier Col 1 into XMM1 */ 979 SSE_COPY_PS(XMM1,XMM5) 980 /* Copy Multiplier Col 0 into XMM0 */ 981 SSE_COPY_PS(XMM0,XMM4) 982 SSE_INLINE_END_2; 983 984 /* Update the row: */ 985 nz = bi[row+1] - diag_offset[row] - 1; 986 pv += 16; 987 for (j=0; j<nz; j++) { 988 PREFETCH_L1(&pv[16]); 989 x = rtmp + 16*pj[j]; 990 /* x = rtmp + 4*pj[j]; */ 991 992 /* X:=X-M*PV, One column at a time */ 993 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 994 SSE_INLINE_BEGIN_2(x,pv) 995 /* Load First Column of X*/ 996 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 997 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 998 999 /* Matrix-Vector Product: */ 1000 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1001 SSE_SHUFFLE(XMM5,XMM5,0x00) 1002 SSE_MULT_PS(XMM5,XMM0) 1003 SSE_SUB_PS(XMM4,XMM5) 1004 1005 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1006 SSE_SHUFFLE(XMM6,XMM6,0x00) 1007 SSE_MULT_PS(XMM6,XMM1) 1008 SSE_SUB_PS(XMM4,XMM6) 1009 1010 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1011 SSE_SHUFFLE(XMM7,XMM7,0x00) 1012 SSE_MULT_PS(XMM7,XMM2) 1013 SSE_SUB_PS(XMM4,XMM7) 1014 1015 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1016 SSE_SHUFFLE(XMM5,XMM5,0x00) 1017 SSE_MULT_PS(XMM5,XMM3) 1018 SSE_SUB_PS(XMM4,XMM5) 1019 1020 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1021 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1022 1023 /* Second Column */ 1024 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1025 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1026 1027 /* Matrix-Vector Product: */ 1028 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1029 SSE_SHUFFLE(XMM6,XMM6,0x00) 1030 SSE_MULT_PS(XMM6,XMM0) 1031 SSE_SUB_PS(XMM5,XMM6) 1032 1033 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1034 SSE_SHUFFLE(XMM7,XMM7,0x00) 1035 SSE_MULT_PS(XMM7,XMM1) 1036 SSE_SUB_PS(XMM5,XMM7) 1037 1038 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1039 SSE_SHUFFLE(XMM4,XMM4,0x00) 1040 SSE_MULT_PS(XMM4,XMM2) 1041 SSE_SUB_PS(XMM5,XMM4) 1042 1043 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1044 SSE_SHUFFLE(XMM6,XMM6,0x00) 1045 SSE_MULT_PS(XMM6,XMM3) 1046 SSE_SUB_PS(XMM5,XMM6) 1047 1048 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1049 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1050 1051 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1052 1053 /* Third Column */ 1054 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1055 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1056 1057 /* Matrix-Vector Product: */ 1058 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1059 SSE_SHUFFLE(XMM7,XMM7,0x00) 1060 SSE_MULT_PS(XMM7,XMM0) 1061 SSE_SUB_PS(XMM6,XMM7) 1062 1063 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1064 SSE_SHUFFLE(XMM4,XMM4,0x00) 1065 SSE_MULT_PS(XMM4,XMM1) 1066 SSE_SUB_PS(XMM6,XMM4) 1067 1068 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1069 SSE_SHUFFLE(XMM5,XMM5,0x00) 1070 SSE_MULT_PS(XMM5,XMM2) 1071 SSE_SUB_PS(XMM6,XMM5) 1072 1073 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1074 SSE_SHUFFLE(XMM7,XMM7,0x00) 1075 SSE_MULT_PS(XMM7,XMM3) 1076 SSE_SUB_PS(XMM6,XMM7) 1077 1078 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1079 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1080 1081 /* Fourth Column */ 1082 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1083 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1084 1085 /* Matrix-Vector Product: */ 1086 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1087 SSE_SHUFFLE(XMM5,XMM5,0x00) 1088 SSE_MULT_PS(XMM5,XMM0) 1089 SSE_SUB_PS(XMM4,XMM5) 1090 1091 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1092 SSE_SHUFFLE(XMM6,XMM6,0x00) 1093 SSE_MULT_PS(XMM6,XMM1) 1094 SSE_SUB_PS(XMM4,XMM6) 1095 1096 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1097 SSE_SHUFFLE(XMM7,XMM7,0x00) 1098 SSE_MULT_PS(XMM7,XMM2) 1099 
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
/*        row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
/*        x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
/*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
{
  Mat            A=C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int            nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = 0.0; /* no MatFactorInfo is passed to this in-place variant, so no shift is applied */

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.
SSE will not work."); 1200 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1201 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1202 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1203 /* colscale = 4; */ 1204 /* } */ 1205 1206 for (i=0; i<n; i++) { 1207 nz = bi[i+1] - bi[i]; 1208 bjtmp = bj + bi[i]; 1209 /* zero out the 4x4 block accumulators */ 1210 /* zero out one register */ 1211 XOR_PS(XMM7,XMM7); 1212 for (j=0; j<nz; j++) { 1213 x = rtmp+16*((unsigned int)bjtmp[j]); 1214 /* x = rtmp+4*bjtmp[j]; */ 1215 SSE_INLINE_BEGIN_1(x) 1216 /* Copy zero register to memory locations */ 1217 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1218 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1219 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1220 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1221 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1222 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1223 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1224 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1225 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1226 SSE_INLINE_END_1; 1227 } 1228 /* load in initial (unfactored row) */ 1229 nz = ai[i+1] - ai[i]; 1230 ajtmp = aj + ai[i]; 1231 v = aa + 16*ai[i]; 1232 for (j=0; j<nz; j++) { 1233 x = rtmp+16*((unsigned int)ajtmp[j]); 1234 /* x = rtmp+colscale*ajtmp[j]; */ 1235 /* Copy v block into x block */ 1236 SSE_INLINE_BEGIN_2(v,x) 1237 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1238 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1239 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1240 1241 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1242 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1243 1244 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1245 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1246 1247 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1248 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1249 1250 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1251 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1252 1253 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1254 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1255 1256 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1257 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1258 1259 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1260 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1261 SSE_INLINE_END_2; 1262 1263 v += 16; 1264 } 1265 /* row = (*bjtmp++)/4; */ 1266 row = (unsigned int)(*bjtmp++); 1267 while (row < i) { 1268 pc = rtmp + 16*row; 1269 SSE_INLINE_BEGIN_1(pc) 1270 /* Load block from lower triangle */ 1271 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1272 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1273 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1274 1275 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1276 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1277 1278 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1279 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1280 1281 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1282 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1283 1284 /* Compare block to zero block */ 1285 1286 SSE_COPY_PS(XMM4,XMM7) 1287 SSE_CMPNEQ_PS(XMM4,XMM0) 1288 1289 SSE_COPY_PS(XMM5,XMM7) 1290 SSE_CMPNEQ_PS(XMM5,XMM1) 1291 1292 SSE_COPY_PS(XMM6,XMM7) 1293 SSE_CMPNEQ_PS(XMM6,XMM2) 1294 1295 SSE_CMPNEQ_PS(XMM7,XMM3) 1296 1297 /* Reduce the comparisons to one SSE register */ 1298 SSE_OR_PS(XMM6,XMM7) 1299 SSE_OR_PS(XMM5,XMM4) 1300 SSE_OR_PS(XMM5,XMM6) 1301 SSE_INLINE_END_1; 1302 1303 /* Reduce the one SSE register to an integer register for branching */ 1304 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1305 
MOVEMASK(nonzero,XMM5); 1306 1307 /* If block is nonzero ... */ 1308 if (nonzero) { 1309 pv = ba + 16*diag_offset[row]; 1310 PREFETCH_L1(&pv[16]); 1311 pj = bj + diag_offset[row] + 1; 1312 1313 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1314 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1315 /* but the diagonal was inverted already */ 1316 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1317 1318 SSE_INLINE_BEGIN_2(pv,pc) 1319 /* Column 0, product is accumulated in XMM4 */ 1320 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1321 SSE_SHUFFLE(XMM4,XMM4,0x00) 1322 SSE_MULT_PS(XMM4,XMM0) 1323 1324 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1325 SSE_SHUFFLE(XMM5,XMM5,0x00) 1326 SSE_MULT_PS(XMM5,XMM1) 1327 SSE_ADD_PS(XMM4,XMM5) 1328 1329 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1330 SSE_SHUFFLE(XMM6,XMM6,0x00) 1331 SSE_MULT_PS(XMM6,XMM2) 1332 SSE_ADD_PS(XMM4,XMM6) 1333 1334 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1335 SSE_SHUFFLE(XMM7,XMM7,0x00) 1336 SSE_MULT_PS(XMM7,XMM3) 1337 SSE_ADD_PS(XMM4,XMM7) 1338 1339 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1340 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1341 1342 /* Column 1, product is accumulated in XMM5 */ 1343 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1344 SSE_SHUFFLE(XMM5,XMM5,0x00) 1345 SSE_MULT_PS(XMM5,XMM0) 1346 1347 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1348 SSE_SHUFFLE(XMM6,XMM6,0x00) 1349 SSE_MULT_PS(XMM6,XMM1) 1350 SSE_ADD_PS(XMM5,XMM6) 1351 1352 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1353 SSE_SHUFFLE(XMM7,XMM7,0x00) 1354 SSE_MULT_PS(XMM7,XMM2) 1355 SSE_ADD_PS(XMM5,XMM7) 1356 1357 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1358 SSE_SHUFFLE(XMM6,XMM6,0x00) 1359 SSE_MULT_PS(XMM6,XMM3) 1360 SSE_ADD_PS(XMM5,XMM6) 1361 1362 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1363 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1364 1365 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1366 1367 /* Column 2, product is accumulated in XMM6 */ 1368 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1369 SSE_SHUFFLE(XMM6,XMM6,0x00) 1370 SSE_MULT_PS(XMM6,XMM0) 1371 1372 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1373 SSE_SHUFFLE(XMM7,XMM7,0x00) 1374 SSE_MULT_PS(XMM7,XMM1) 1375 SSE_ADD_PS(XMM6,XMM7) 1376 1377 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1378 SSE_SHUFFLE(XMM7,XMM7,0x00) 1379 SSE_MULT_PS(XMM7,XMM2) 1380 SSE_ADD_PS(XMM6,XMM7) 1381 1382 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1383 SSE_SHUFFLE(XMM7,XMM7,0x00) 1384 SSE_MULT_PS(XMM7,XMM3) 1385 SSE_ADD_PS(XMM6,XMM7) 1386 1387 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1388 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1389 1390 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1391 /* Column 3, product is accumulated in XMM0 */ 1392 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1393 SSE_SHUFFLE(XMM7,XMM7,0x00) 1394 SSE_MULT_PS(XMM0,XMM7) 1395 1396 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1397 SSE_SHUFFLE(XMM7,XMM7,0x00) 1398 SSE_MULT_PS(XMM1,XMM7) 1399 SSE_ADD_PS(XMM0,XMM1) 1400 1401 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1402 SSE_SHUFFLE(XMM1,XMM1,0x00) 1403 SSE_MULT_PS(XMM1,XMM2) 1404 SSE_ADD_PS(XMM0,XMM1) 1405 1406 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1407 SSE_SHUFFLE(XMM7,XMM7,0x00) 1408 SSE_MULT_PS(XMM3,XMM7) 1409 SSE_ADD_PS(XMM0,XMM3) 1410 1411 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1412 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1413 1414 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1415 /* This is code to be maintained and read by humans afterall. 
*/ 1416 /* Copy Multiplier Col 3 into XMM3 */ 1417 SSE_COPY_PS(XMM3,XMM0) 1418 /* Copy Multiplier Col 2 into XMM2 */ 1419 SSE_COPY_PS(XMM2,XMM6) 1420 /* Copy Multiplier Col 1 into XMM1 */ 1421 SSE_COPY_PS(XMM1,XMM5) 1422 /* Copy Multiplier Col 0 into XMM0 */ 1423 SSE_COPY_PS(XMM0,XMM4) 1424 SSE_INLINE_END_2; 1425 1426 /* Update the row: */ 1427 nz = bi[row+1] - diag_offset[row] - 1; 1428 pv += 16; 1429 for (j=0; j<nz; j++) { 1430 PREFETCH_L1(&pv[16]); 1431 x = rtmp + 16*((unsigned int)pj[j]); 1432 /* x = rtmp + 4*pj[j]; */ 1433 1434 /* X:=X-M*PV, One column at a time */ 1435 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1436 SSE_INLINE_BEGIN_2(x,pv) 1437 /* Load First Column of X*/ 1438 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1439 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1440 1441 /* Matrix-Vector Product: */ 1442 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1443 SSE_SHUFFLE(XMM5,XMM5,0x00) 1444 SSE_MULT_PS(XMM5,XMM0) 1445 SSE_SUB_PS(XMM4,XMM5) 1446 1447 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1448 SSE_SHUFFLE(XMM6,XMM6,0x00) 1449 SSE_MULT_PS(XMM6,XMM1) 1450 SSE_SUB_PS(XMM4,XMM6) 1451 1452 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1453 SSE_SHUFFLE(XMM7,XMM7,0x00) 1454 SSE_MULT_PS(XMM7,XMM2) 1455 SSE_SUB_PS(XMM4,XMM7) 1456 1457 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1458 SSE_SHUFFLE(XMM5,XMM5,0x00) 1459 SSE_MULT_PS(XMM5,XMM3) 1460 SSE_SUB_PS(XMM4,XMM5) 1461 1462 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1463 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1464 1465 /* Second Column */ 1466 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1467 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1468 1469 /* Matrix-Vector Product: */ 1470 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1471 SSE_SHUFFLE(XMM6,XMM6,0x00) 1472 SSE_MULT_PS(XMM6,XMM0) 1473 SSE_SUB_PS(XMM5,XMM6) 1474 1475 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1476 SSE_SHUFFLE(XMM7,XMM7,0x00) 1477 SSE_MULT_PS(XMM7,XMM1) 1478 SSE_SUB_PS(XMM5,XMM7) 1479 1480 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1481 SSE_SHUFFLE(XMM4,XMM4,0x00) 1482 SSE_MULT_PS(XMM4,XMM2) 1483 SSE_SUB_PS(XMM5,XMM4) 1484 1485 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1486 SSE_SHUFFLE(XMM6,XMM6,0x00) 1487 SSE_MULT_PS(XMM6,XMM3) 1488 SSE_SUB_PS(XMM5,XMM6) 1489 1490 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1491 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1492 1493 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1494 1495 /* Third Column */ 1496 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1497 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1498 1499 /* Matrix-Vector Product: */ 1500 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1501 SSE_SHUFFLE(XMM7,XMM7,0x00) 1502 SSE_MULT_PS(XMM7,XMM0) 1503 SSE_SUB_PS(XMM6,XMM7) 1504 1505 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1506 SSE_SHUFFLE(XMM4,XMM4,0x00) 1507 SSE_MULT_PS(XMM4,XMM1) 1508 SSE_SUB_PS(XMM6,XMM4) 1509 1510 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1511 SSE_SHUFFLE(XMM5,XMM5,0x00) 1512 SSE_MULT_PS(XMM5,XMM2) 1513 SSE_SUB_PS(XMM6,XMM5) 1514 1515 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1516 SSE_SHUFFLE(XMM7,XMM7,0x00) 1517 SSE_MULT_PS(XMM7,XMM3) 1518 SSE_SUB_PS(XMM6,XMM7) 1519 1520 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1521 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1522 1523 /* Fourth Column */ 1524 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1525 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1526 1527 /* Matrix-Vector Product: */ 1528 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1529 SSE_SHUFFLE(XMM5,XMM5,0x00) 1530 SSE_MULT_PS(XMM5,XMM0) 1531 SSE_SUB_PS(XMM4,XMM5) 1532 1533 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1534 SSE_SHUFFLE(XMM6,XMM6,0x00) 1535 SSE_MULT_PS(XMM6,XMM1) 1536 SSE_SUB_PS(XMM4,XMM6) 1537 1538 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1539 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1540 SSE_MULT_PS(XMM7,XMM2) 1541 SSE_SUB_PS(XMM4,XMM7) 1542 1543 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1544 SSE_SHUFFLE(XMM5,XMM5,0x00) 1545 SSE_MULT_PS(XMM5,XMM3) 1546 SSE_SUB_PS(XMM4,XMM5) 1547 1548 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1549 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1550 SSE_INLINE_END_2; 1551 pv += 16; 1552 } 1553 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1554 } 1555 row = (unsigned int)(*bjtmp++); 1556 /* row = (*bjtmp++)/4; */ 1557 /* bjtmp++; */ 1558 } 1559 /* finished row so stick it into b->a */ 1560 pv = ba + 16*bi[i]; 1561 pj = bj + bi[i]; 1562 nz = bi[i+1] - bi[i]; 1563 1564 /* Copy x block back into pv block */ 1565 for (j=0; j<nz; j++) { 1566 x = rtmp+16*((unsigned int)pj[j]); 1567 /* x = rtmp+4*pj[j]; */ 1568 1569 SSE_INLINE_BEGIN_2(x,pv) 1570 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1571 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1572 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1573 1574 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1575 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1576 1577 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1578 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1579 1580 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1581 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1582 1583 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1584 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1585 1586 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1587 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1588 1589 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1590 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1591 1592 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1593 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1594 SSE_INLINE_END_2; 1595 pv += 16; 1596 } 1597 /* invert diagonal block */ 1598 w = ba + 16*diag_offset[i]; 1599 if (pivotinblocks) { 1600 ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1601 } else { 1602 ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1603 } 1604 /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1605 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1606 } 1607 1608 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1609 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1610 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1611 C->assembled = PETSC_TRUE; 1612 ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); 1613 /* Flop Count from inverting diagonal blocks */ 1614 SSE_SCOPE_END; 1615 PetscFunctionReturn(0); 1616 } 1617 1618 #undef __FUNCT__ 1619 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" 1620 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1621 { 1622 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1623 PetscErrorCode ierr; 1624 int i,j,n = a->mbs; 1625 unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 1626 unsigned int row; 1627 int *ajtmpold,nz,*bi=b->i; 1628 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1629 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1630 MatScalar *ba = b->a,*aa = a->a; 1631 int nonzero=0; 1632 /* int nonzero=0,colscale = 16; */ 1633 PetscTruth pivotinblocks = b->pivotinblocks; 1634 PetscReal shift = info->shiftinblocks; 1635 1636 PetscFunctionBegin; 1637 SSE_SCOPE_BEGIN; 1638 1639 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1640 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. 
SSE will not work."); 1641 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1642 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1643 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1644 /* colscale = 4; */ 1645 /* } */ 1646 if ((unsigned long)bj==(unsigned long)aj) { 1647 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1648 } 1649 1650 for (i=0; i<n; i++) { 1651 nz = bi[i+1] - bi[i]; 1652 bjtmp = bj + bi[i]; 1653 /* zero out the 4x4 block accumulators */ 1654 /* zero out one register */ 1655 XOR_PS(XMM7,XMM7); 1656 for (j=0; j<nz; j++) { 1657 x = rtmp+16*((unsigned int)bjtmp[j]); 1658 /* x = rtmp+4*bjtmp[j]; */ 1659 SSE_INLINE_BEGIN_1(x) 1660 /* Copy zero register to memory locations */ 1661 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1662 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1663 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1664 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1665 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1666 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1667 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1668 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1669 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1670 SSE_INLINE_END_1; 1671 } 1672 /* load in initial (unfactored row) */ 1673 nz = ai[i+1] - ai[i]; 1674 ajtmpold = aj + ai[i]; 1675 v = aa + 16*ai[i]; 1676 for (j=0; j<nz; j++) { 1677 x = rtmp+16*ajtmpold[j]; 1678 /* x = rtmp+colscale*ajtmpold[j]; */ 1679 /* Copy v block into x block */ 1680 SSE_INLINE_BEGIN_2(v,x) 1681 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1682 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1683 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1684 1685 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1686 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1687 1688 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1689 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1690 1691 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1692 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1693 1694 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1695 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1696 1697 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1698 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1699 1700 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1701 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1702 1703 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1704 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1705 SSE_INLINE_END_2; 1706 1707 v += 16; 1708 } 1709 /* row = (*bjtmp++)/4; */ 1710 row = (unsigned int)(*bjtmp++); 1711 while (row < i) { 1712 pc = rtmp + 16*row; 1713 SSE_INLINE_BEGIN_1(pc) 1714 /* Load block from lower triangle */ 1715 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1716 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1717 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1718 1719 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1720 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1721 1722 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1723 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1724 1725 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1726 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1727 1728 /* Compare block to zero block */ 1729 1730 SSE_COPY_PS(XMM4,XMM7) 1731 SSE_CMPNEQ_PS(XMM4,XMM0) 1732 1733 SSE_COPY_PS(XMM5,XMM7) 1734 SSE_CMPNEQ_PS(XMM5,XMM1) 1735 1736 SSE_COPY_PS(XMM6,XMM7) 1737 SSE_CMPNEQ_PS(XMM6,XMM2) 1738 1739 SSE_CMPNEQ_PS(XMM7,XMM3) 1740 1741 /* Reduce the comparisons to one SSE register */ 1742 SSE_OR_PS(XMM6,XMM7) 1743 SSE_OR_PS(XMM5,XMM4) 1744 SSE_OR_PS(XMM5,XMM6) 1745 SSE_INLINE_END_1; 1746 1747 /* Reduce the one SSE register to an integer register 
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans, after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;
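
        /* Flop accounting for this elimination step: forming the 4x4 multiplier above
           costs 64 multiplies + 48 adds = 112 flops, and each of the nz blocks updated
           below costs 64 multiplies + 64 subtractions = 128 flops, hence the
           PetscLogFlops(128.0*nz+112.0) logged after the update loop. */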
        /* Update the row: */
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
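    /* Block row i is now complete in rtmp; it is copied into the factor storage below
       and its diagonal block is inverted in place.  The inversion cost is logged once
       at the end of the routine as PetscLogFlops(1.3333*64*b->mbs), roughly the
       (4/3)*4^3 flops-per-block estimate. */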
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif