1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 383287d42SBarry Smith /* 483287d42SBarry Smith Factorization code for BAIJ format. 583287d42SBarry Smith */ 67c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 7c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 883287d42SBarry Smith 983287d42SBarry Smith /* ------------------------------------------------------------*/ 1083287d42SBarry Smith /* 1183287d42SBarry Smith Version for when blocks are 4 by 4 1283287d42SBarry Smith */ 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4" 150481f469SBarry Smith PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info) 1683287d42SBarry Smith { 1783287d42SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1883287d42SBarry Smith IS isrow = b->row,isicol = b->icol; 196849ba73SBarry Smith PetscErrorCode ierr; 205d0c19d7SBarry Smith const PetscInt *r,*ic; 215d0c19d7SBarry Smith PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 22690b6cddSBarry Smith PetscInt *ajtmpold,*ajtmp,nz,row; 23690b6cddSBarry Smith PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 2483287d42SBarry Smith MatScalar *pv,*v,*rtmp,*pc,*w,*x; 2583287d42SBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 2683287d42SBarry Smith MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 2783287d42SBarry Smith MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 2883287d42SBarry Smith MatScalar m13,m14,m15,m16; 2983287d42SBarry Smith MatScalar *ba = b->a,*aa = a->a; 30bcd9e38bSBarry Smith PetscTruth pivotinblocks = b->pivotinblocks; 3162bba022SBarry Smith PetscReal shift = info->shiftinblocks; 3283287d42SBarry Smith 3383287d42SBarry Smith PetscFunctionBegin; 3483287d42SBarry Smith ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3583287d42SBarry Smith ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 36b0a32e0cSBarry Smith ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 3783287d42SBarry Smith 3883287d42SBarry Smith for (i=0; i<n; i++) { 3983287d42SBarry Smith nz = bi[i+1] - bi[i]; 4083287d42SBarry Smith ajtmp = bj + bi[i]; 4183287d42SBarry Smith for (j=0; j<nz; j++) { 4283287d42SBarry Smith x = rtmp+16*ajtmp[j]; 4383287d42SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 4483287d42SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 4583287d42SBarry Smith } 4683287d42SBarry Smith /* load in initial (unfactored row) */ 4783287d42SBarry Smith idx = r[i]; 4883287d42SBarry Smith nz = ai[idx+1] - ai[idx]; 4983287d42SBarry Smith ajtmpold = aj + ai[idx]; 5083287d42SBarry Smith v = aa + 16*ai[idx]; 5183287d42SBarry Smith for (j=0; j<nz; j++) { 5283287d42SBarry Smith x = rtmp+16*ic[ajtmpold[j]]; 5383287d42SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 5483287d42SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 5583287d42SBarry Smith x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 5683287d42SBarry Smith x[14] = v[14]; x[15] = v[15]; 5783287d42SBarry Smith v += 16; 5883287d42SBarry Smith } 5983287d42SBarry Smith row = *ajtmp++; 6083287d42SBarry Smith while (row < i) { 6183287d42SBarry Smith pc = rtmp + 16*row; 6283287d42SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 6383287d42SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 6483287d42SBarry Smith p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 6583287d42SBarry Smith p15 = pc[14]; p16 = pc[15]; 6683287d42SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 6783287d42SBarry Smith p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 6883287d42SBarry Smith p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 6983287d42SBarry Smith || p16 != 0.0) { 7083287d42SBarry Smith pv = ba + 16*diag_offset[row]; 7183287d42SBarry Smith pj = bj + diag_offset[row] + 1; 7283287d42SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 7383287d42SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 7483287d42SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 7583287d42SBarry Smith x15 = pv[14]; x16 = pv[15]; 7683287d42SBarry Smith pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 7783287d42SBarry Smith pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 7883287d42SBarry Smith pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 7983287d42SBarry Smith pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 8083287d42SBarry Smith 8183287d42SBarry Smith pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 8283287d42SBarry Smith pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 8383287d42SBarry Smith pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 8483287d42SBarry Smith pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 8583287d42SBarry Smith 8683287d42SBarry Smith pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 8783287d42SBarry Smith pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 8883287d42SBarry Smith pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 8983287d42SBarry Smith pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 9083287d42SBarry Smith 9183287d42SBarry Smith pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 9283287d42SBarry Smith pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 9383287d42SBarry Smith pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 9483287d42SBarry Smith pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 9583287d42SBarry Smith 9683287d42SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 9783287d42SBarry Smith pv += 16; 9883287d42SBarry Smith for (j=0; j<nz; j++) { 9983287d42SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 10083287d42SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 10183287d42SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 10283287d42SBarry Smith x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 10383287d42SBarry Smith x = rtmp + 16*pj[j]; 10483287d42SBarry Smith x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 10583287d42SBarry Smith x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 10683287d42SBarry Smith x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 10783287d42SBarry Smith x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 10883287d42SBarry Smith 10983287d42SBarry Smith x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 11083287d42SBarry Smith x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 11183287d42SBarry Smith x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 11283287d42SBarry Smith x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 11383287d42SBarry Smith 11483287d42SBarry Smith x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 11583287d42SBarry Smith x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 11683287d42SBarry Smith x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 11783287d42SBarry Smith x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 11883287d42SBarry Smith 11983287d42SBarry Smith x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 12083287d42SBarry Smith x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 12183287d42SBarry Smith x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 12283287d42SBarry Smith x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 12383287d42SBarry Smith 12483287d42SBarry Smith pv += 16; 12583287d42SBarry Smith } 126dc0b31edSSatish Balay ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 12783287d42SBarry Smith } 12883287d42SBarry Smith row = *ajtmp++; 12983287d42SBarry Smith } 13083287d42SBarry Smith /* finished row so stick it into b->a */ 13183287d42SBarry Smith pv = ba + 16*bi[i]; 13283287d42SBarry Smith pj = bj + bi[i]; 13383287d42SBarry Smith nz = bi[i+1] - bi[i]; 13483287d42SBarry Smith for (j=0; j<nz; j++) { 13583287d42SBarry Smith x = rtmp+16*pj[j]; 13683287d42SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 13783287d42SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 13883287d42SBarry Smith pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 13983287d42SBarry Smith pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 14083287d42SBarry Smith pv += 16; 14183287d42SBarry Smith } 14283287d42SBarry Smith /* invert diagonal block */ 14383287d42SBarry Smith w = ba + 16*diag_offset[i]; 144bcd9e38bSBarry Smith if (pivotinblocks) { 14562bba022SBarry Smith ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 146bcd9e38bSBarry Smith } else { 147bcd9e38bSBarry Smith ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 148bcd9e38bSBarry Smith } 14983287d42SBarry Smith } 15083287d42SBarry Smith 15183287d42SBarry Smith ierr = PetscFree(rtmp);CHKERRQ(ierr); 15283287d42SBarry Smith ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 15383287d42SBarry Smith ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 154db4efbfdSBarry Smith C->ops->solve = MatSolve_SeqBAIJ_4; 155db4efbfdSBarry Smith C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 15683287d42SBarry Smith C->assembled = PETSC_TRUE; 157efee365bSSatish Balay ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 15883287d42SBarry Smith PetscFunctionReturn(0); 15983287d42SBarry Smith } 160e6580ceeSShri Abhyankar 161209027a4SShri Abhyankar /* MatLUFactorNumeric_SeqBAIJ_4_newdatastruct - 162209027a4SShri Abhyankar copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented 163209027a4SShri Abhyankar Kernel_A_gets_A_times_B() 164209027a4SShri Abhyankar Kernel_A_gets_A_minus_B_times_C() 165209027a4SShri Abhyankar Kernel_A_gets_inverse_A() 166209027a4SShri Abhyankar */ 167209027a4SShri Abhyankar #undef __FUNCT__ 168209027a4SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct" 169209027a4SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 170209027a4SShri Abhyankar { 171209027a4SShri Abhyankar Mat C=B; 172209027a4SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 173209027a4SShri Abhyankar IS isrow = b->row,isicol = b->icol; 174209027a4SShri Abhyankar PetscErrorCode ierr; 175209027a4SShri Abhyankar const PetscInt *r,*ic,*ics; 176209027a4SShri Abhyankar PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 177209027a4SShri Abhyankar PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 178209027a4SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 179209027a4SShri Abhyankar PetscInt bs2 = a->bs2,flg; 180209027a4SShri Abhyankar PetscReal shift = info->shiftinblocks; 181209027a4SShri Abhyankar 182209027a4SShri Abhyankar PetscFunctionBegin; 183209027a4SShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 184209027a4SShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 185209027a4SShri Abhyankar 186209027a4SShri Abhyankar /* generate work space needed by the factorization */ 187209027a4SShri Abhyankar ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 188209027a4SShri Abhyankar mwork = rtmp + bs2*n; 189209027a4SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 190209027a4SShri Abhyankar ics = ic; 191209027a4SShri Abhyankar 192209027a4SShri Abhyankar for (i=0; i<n; i++){ 193209027a4SShri Abhyankar /* zero rtmp */ 194209027a4SShri Abhyankar /* L part */ 195209027a4SShri Abhyankar nz = bi[i+1] - bi[i]; 196209027a4SShri Abhyankar bjtmp = bj + bi[i]; 197209027a4SShri Abhyankar for (j=0; j<nz; j++){ 198209027a4SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 199209027a4SShri Abhyankar } 200209027a4SShri Abhyankar 201209027a4SShri Abhyankar /* U part */ 202209027a4SShri Abhyankar nz = bi[2*n-i+1] - bi[2*n-i]; 203209027a4SShri Abhyankar bjtmp = bj + bi[2*n-i]; 204209027a4SShri Abhyankar for (j=0; j<nz; j++){ 205209027a4SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 206209027a4SShri Abhyankar } 207209027a4SShri Abhyankar 208209027a4SShri Abhyankar /* load in initial (unfactored row) */ 209209027a4SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 210209027a4SShri Abhyankar ajtmp = aj + ai[r[i]]; 211209027a4SShri Abhyankar v = aa + bs2*ai[r[i]]; 212209027a4SShri Abhyankar for (j=0; j<nz; j++) { 213209027a4SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 214209027a4SShri Abhyankar } 215209027a4SShri Abhyankar 216209027a4SShri Abhyankar /* elimination */ 217209027a4SShri Abhyankar bjtmp = bj + bi[i]; 218209027a4SShri Abhyankar nzL = bi[i+1] - bi[i]; 219b1646270SShri Abhyankar for(k=0;k < nzL;k++) { 220b1646270SShri Abhyankar row = bjtmp[k]; 221209027a4SShri Abhyankar pc = rtmp + bs2*row; 222209027a4SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 223209027a4SShri Abhyankar if (flg) { 224209027a4SShri Abhyankar pv = b->a + bs2*bdiag[row]; 225209027a4SShri Abhyankar /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 226209027a4SShri Abhyankar ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 227209027a4SShri Abhyankar 228209027a4SShri Abhyankar pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 229209027a4SShri Abhyankar pv = b->a + bs2*bi[2*n-row]; 230209027a4SShri Abhyankar nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 231209027a4SShri Abhyankar for (j=0; j<nz; j++) { 232209027a4SShri Abhyankar /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 233209027a4SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 234209027a4SShri Abhyankar v = rtmp + bs2*pj[j]; 235209027a4SShri Abhyankar ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 236209027a4SShri Abhyankar pv += bs2; 237209027a4SShri Abhyankar } 238209027a4SShri Abhyankar ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 239209027a4SShri Abhyankar } 240209027a4SShri Abhyankar } 241209027a4SShri Abhyankar 242209027a4SShri Abhyankar /* finished row so stick it into b->a */ 243209027a4SShri Abhyankar /* L part */ 244209027a4SShri Abhyankar pv = b->a + bs2*bi[i] ; 245209027a4SShri Abhyankar pj = b->j + bi[i] ; 246209027a4SShri Abhyankar nz = bi[i+1] - bi[i]; 247209027a4SShri Abhyankar for (j=0; j<nz; j++) { 248209027a4SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 249209027a4SShri Abhyankar } 250209027a4SShri Abhyankar 251209027a4SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 252209027a4SShri Abhyankar pv = b->a + bs2*bdiag[i]; 253209027a4SShri Abhyankar pj = b->j + bdiag[i]; 254209027a4SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 255209027a4SShri Abhyankar /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 256209027a4SShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); 257209027a4SShri Abhyankar 258209027a4SShri Abhyankar /* U part */ 259209027a4SShri Abhyankar pv = b->a + bs2*bi[2*n-i]; 260209027a4SShri Abhyankar pj = b->j + bi[2*n-i]; 261209027a4SShri Abhyankar nz = bi[2*n-i+1] - bi[2*n-i] - 1; 262209027a4SShri Abhyankar for (j=0; j<nz; j++){ 263209027a4SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 264209027a4SShri Abhyankar } 265209027a4SShri Abhyankar } 266209027a4SShri Abhyankar 267209027a4SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 268209027a4SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 269209027a4SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 270209027a4SShri Abhyankar 271209027a4SShri Abhyankar C->assembled = PETSC_TRUE; 272209027a4SShri Abhyankar ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 273209027a4SShri Abhyankar PetscFunctionReturn(0); 274209027a4SShri Abhyankar } 275209027a4SShri Abhyankar 276e6580ceeSShri Abhyankar #undef __FUNCT__ 277*78bb4007SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct_v2" 278*78bb4007SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info) 279*78bb4007SShri Abhyankar { 280*78bb4007SShri Abhyankar Mat C=B; 281*78bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 282*78bb4007SShri Abhyankar IS isrow = b->row,isicol = b->icol; 283*78bb4007SShri Abhyankar PetscErrorCode ierr; 284*78bb4007SShri Abhyankar const PetscInt *r,*ic,*ics; 285*78bb4007SShri Abhyankar PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 286*78bb4007SShri Abhyankar PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 287*78bb4007SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 288*78bb4007SShri Abhyankar PetscInt bs2 = a->bs2,flg; 289*78bb4007SShri Abhyankar PetscReal shift = info->shiftinblocks; 290*78bb4007SShri Abhyankar 291*78bb4007SShri Abhyankar PetscFunctionBegin; 292*78bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 293*78bb4007SShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 294*78bb4007SShri Abhyankar 295*78bb4007SShri Abhyankar /* generate work space needed by the factorization */ 296*78bb4007SShri Abhyankar ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 297*78bb4007SShri Abhyankar mwork = rtmp + bs2*n; 298*78bb4007SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 299*78bb4007SShri Abhyankar ics = ic; 300*78bb4007SShri Abhyankar 301*78bb4007SShri Abhyankar for (i=0; i<n; i++){ 302*78bb4007SShri Abhyankar /* zero rtmp */ 303*78bb4007SShri Abhyankar /* L part */ 304*78bb4007SShri Abhyankar nz = bi[i+1] - bi[i]; 305*78bb4007SShri Abhyankar bjtmp = bj + bi[i]; 306*78bb4007SShri Abhyankar for (j=0; j<nz; j++){ 307*78bb4007SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 308*78bb4007SShri Abhyankar } 309*78bb4007SShri Abhyankar 310*78bb4007SShri Abhyankar /* U part */ 311*78bb4007SShri Abhyankar nz = bdiag[i]-bdiag[i+1]; 312*78bb4007SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 313*78bb4007SShri Abhyankar for (j=0; j<nz; j++){ 314*78bb4007SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 315*78bb4007SShri Abhyankar } 316*78bb4007SShri Abhyankar 317*78bb4007SShri Abhyankar /* load in initial (unfactored row) */ 318*78bb4007SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 319*78bb4007SShri Abhyankar ajtmp = aj + ai[r[i]]; 320*78bb4007SShri Abhyankar v = aa + bs2*ai[r[i]]; 321*78bb4007SShri Abhyankar for (j=0; j<nz; j++) { 322*78bb4007SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 323*78bb4007SShri Abhyankar } 324*78bb4007SShri Abhyankar 325*78bb4007SShri Abhyankar /* elimination */ 326*78bb4007SShri Abhyankar bjtmp = bj + bi[i]; 327*78bb4007SShri Abhyankar nzL = bi[i+1] - bi[i]; 328*78bb4007SShri Abhyankar for(k=0;k < nzL;k++) { 329*78bb4007SShri Abhyankar row = bjtmp[k]; 330*78bb4007SShri Abhyankar pc = rtmp + bs2*row; 331*78bb4007SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 332*78bb4007SShri Abhyankar if (flg) { 333*78bb4007SShri Abhyankar pv = b->a + bs2*bdiag[row]; 334*78bb4007SShri Abhyankar /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 335*78bb4007SShri Abhyankar ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 336*78bb4007SShri Abhyankar 337*78bb4007SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 338*78bb4007SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 339*78bb4007SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 340*78bb4007SShri Abhyankar for (j=0; j<nz; j++) { 341*78bb4007SShri Abhyankar /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 342*78bb4007SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 343*78bb4007SShri Abhyankar v = rtmp + bs2*pj[j]; 344*78bb4007SShri Abhyankar ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 345*78bb4007SShri Abhyankar pv += bs2; 346*78bb4007SShri Abhyankar } 347*78bb4007SShri Abhyankar ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 348*78bb4007SShri Abhyankar } 349*78bb4007SShri Abhyankar } 350*78bb4007SShri Abhyankar 351*78bb4007SShri Abhyankar /* finished row so stick it into b->a */ 352*78bb4007SShri Abhyankar /* L part */ 353*78bb4007SShri Abhyankar pv = b->a + bs2*bi[i] ; 354*78bb4007SShri Abhyankar pj = b->j + bi[i] ; 355*78bb4007SShri Abhyankar nz = bi[i+1] - bi[i]; 356*78bb4007SShri Abhyankar for (j=0; j<nz; j++) { 357*78bb4007SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 358*78bb4007SShri Abhyankar } 359*78bb4007SShri Abhyankar 360*78bb4007SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 361*78bb4007SShri Abhyankar pv = b->a + bs2*bdiag[i]; 362*78bb4007SShri Abhyankar pj = b->j + bdiag[i]; 363*78bb4007SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 364*78bb4007SShri Abhyankar /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 365*78bb4007SShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); 366*78bb4007SShri Abhyankar 367*78bb4007SShri Abhyankar /* U part */ 368*78bb4007SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 369*78bb4007SShri Abhyankar pj = b->j + bdiag[i+1]+1; 370*78bb4007SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 371*78bb4007SShri Abhyankar for (j=0; j<nz; j++){ 372*78bb4007SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 373*78bb4007SShri Abhyankar } 374*78bb4007SShri Abhyankar } 375*78bb4007SShri Abhyankar 376*78bb4007SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 377*78bb4007SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 378*78bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 379*78bb4007SShri Abhyankar 380*78bb4007SShri Abhyankar C->assembled = PETSC_TRUE; 381*78bb4007SShri Abhyankar ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 382*78bb4007SShri Abhyankar PetscFunctionReturn(0); 383*78bb4007SShri Abhyankar } 384*78bb4007SShri Abhyankar 385*78bb4007SShri Abhyankar #undef __FUNCT__ 386e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering" 387e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info) 388e6580ceeSShri Abhyankar { 389e6580ceeSShri Abhyankar /* 390e6580ceeSShri Abhyankar Default Version for when blocks are 4 by 4 Using natural ordering 391e6580ceeSShri Abhyankar */ 392e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 393e6580ceeSShri Abhyankar PetscErrorCode ierr; 394e6580ceeSShri Abhyankar PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 395e6580ceeSShri Abhyankar PetscInt *ajtmpold,*ajtmp,nz,row; 396e6580ceeSShri Abhyankar PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 397e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 398e6580ceeSShri Abhyankar MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 399e6580ceeSShri Abhyankar MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 400e6580ceeSShri Abhyankar MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 401e6580ceeSShri Abhyankar MatScalar m13,m14,m15,m16; 402e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 403e6580ceeSShri Abhyankar PetscTruth pivotinblocks = b->pivotinblocks; 404e6580ceeSShri Abhyankar PetscReal shift = info->shiftinblocks; 405e6580ceeSShri Abhyankar 406e6580ceeSShri Abhyankar PetscFunctionBegin; 407e6580ceeSShri Abhyankar ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 408e6580ceeSShri Abhyankar 409e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 410e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 411e6580ceeSShri Abhyankar ajtmp = bj + bi[i]; 412e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 413e6580ceeSShri Abhyankar x = rtmp+16*ajtmp[j]; 414e6580ceeSShri Abhyankar x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 415e6580ceeSShri Abhyankar x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 416e6580ceeSShri Abhyankar } 417e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 418e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 419e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 420e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 421e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 422e6580ceeSShri Abhyankar x = rtmp+16*ajtmpold[j]; 423e6580ceeSShri Abhyankar x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 424e6580ceeSShri Abhyankar x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 425e6580ceeSShri Abhyankar x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 426e6580ceeSShri Abhyankar x[14] = v[14]; x[15] = v[15]; 427e6580ceeSShri Abhyankar v += 16; 428e6580ceeSShri Abhyankar } 429e6580ceeSShri Abhyankar row = *ajtmp++; 430e6580ceeSShri Abhyankar while (row < i) { 431e6580ceeSShri Abhyankar pc = rtmp + 16*row; 432e6580ceeSShri Abhyankar p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 433e6580ceeSShri Abhyankar p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 434e6580ceeSShri Abhyankar p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 435e6580ceeSShri Abhyankar p15 = pc[14]; p16 = pc[15]; 436e6580ceeSShri Abhyankar if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 437e6580ceeSShri Abhyankar p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 438e6580ceeSShri Abhyankar p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 439e6580ceeSShri Abhyankar || p16 != 0.0) { 440e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 441e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 442e6580ceeSShri Abhyankar x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 443e6580ceeSShri Abhyankar x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 444e6580ceeSShri Abhyankar x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 445e6580ceeSShri Abhyankar x15 = pv[14]; x16 = pv[15]; 446e6580ceeSShri Abhyankar pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 447e6580ceeSShri Abhyankar pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 448e6580ceeSShri Abhyankar pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 449e6580ceeSShri Abhyankar pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 450e6580ceeSShri Abhyankar 451e6580ceeSShri Abhyankar pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 452e6580ceeSShri Abhyankar pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 453e6580ceeSShri Abhyankar pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 454e6580ceeSShri Abhyankar pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 455e6580ceeSShri Abhyankar 456e6580ceeSShri Abhyankar pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 457e6580ceeSShri Abhyankar pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 458e6580ceeSShri Abhyankar pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 459e6580ceeSShri Abhyankar pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 460e6580ceeSShri Abhyankar 461e6580ceeSShri Abhyankar pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 462e6580ceeSShri Abhyankar pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 463e6580ceeSShri Abhyankar pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 464e6580ceeSShri Abhyankar pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 465e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 466e6580ceeSShri Abhyankar pv += 16; 467e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 468e6580ceeSShri Abhyankar x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 469e6580ceeSShri Abhyankar x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 470e6580ceeSShri Abhyankar x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 471e6580ceeSShri Abhyankar x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 472e6580ceeSShri Abhyankar x = rtmp + 16*pj[j]; 473e6580ceeSShri Abhyankar x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 474e6580ceeSShri Abhyankar x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 475e6580ceeSShri Abhyankar x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 476e6580ceeSShri Abhyankar x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 477e6580ceeSShri Abhyankar 478e6580ceeSShri Abhyankar x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 479e6580ceeSShri Abhyankar x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 480e6580ceeSShri Abhyankar x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 481e6580ceeSShri Abhyankar x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 482e6580ceeSShri Abhyankar 483e6580ceeSShri Abhyankar x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 484e6580ceeSShri Abhyankar x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 485e6580ceeSShri Abhyankar x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 486e6580ceeSShri Abhyankar x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 487e6580ceeSShri Abhyankar 488e6580ceeSShri Abhyankar x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 489e6580ceeSShri Abhyankar x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 490e6580ceeSShri Abhyankar x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 491e6580ceeSShri Abhyankar x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 492e6580ceeSShri Abhyankar 493e6580ceeSShri Abhyankar pv += 16; 494e6580ceeSShri Abhyankar } 495e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 496e6580ceeSShri Abhyankar } 497e6580ceeSShri Abhyankar row = *ajtmp++; 498e6580ceeSShri Abhyankar } 499e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 500e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 501e6580ceeSShri Abhyankar pj = bj + bi[i]; 502e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 503e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 504e6580ceeSShri Abhyankar x = rtmp+16*pj[j]; 505e6580ceeSShri Abhyankar pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 506e6580ceeSShri Abhyankar pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 507e6580ceeSShri Abhyankar pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 508e6580ceeSShri Abhyankar pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 509e6580ceeSShri Abhyankar pv += 16; 510e6580ceeSShri Abhyankar } 511e6580ceeSShri Abhyankar /* invert diagonal block */ 512e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 513e6580ceeSShri Abhyankar if (pivotinblocks) { 514e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 515e6580ceeSShri Abhyankar } else { 516e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 517e6580ceeSShri Abhyankar } 518e6580ceeSShri Abhyankar } 519e6580ceeSShri Abhyankar 520e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 521e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 522e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 523e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 524e6580ceeSShri Abhyankar ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 525e6580ceeSShri Abhyankar PetscFunctionReturn(0); 526e6580ceeSShri Abhyankar } 527209027a4SShri Abhyankar /* 528209027a4SShri Abhyankar MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct - 529209027a4SShri Abhyankar copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct() 530209027a4SShri Abhyankar */ 531209027a4SShri Abhyankar #undef __FUNCT__ 532209027a4SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct" 533209027a4SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 534209027a4SShri Abhyankar { 535209027a4SShri Abhyankar Mat C=B; 536209027a4SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 537209027a4SShri Abhyankar PetscErrorCode ierr; 538209027a4SShri Abhyankar PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 539209027a4SShri Abhyankar PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 540209027a4SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 541209027a4SShri Abhyankar PetscInt bs2 = a->bs2,flg; 542209027a4SShri Abhyankar PetscReal shift = info->shiftinblocks; 543e6580ceeSShri Abhyankar 544209027a4SShri Abhyankar PetscFunctionBegin; 545209027a4SShri Abhyankar /* generate work space needed by the factorization */ 546209027a4SShri Abhyankar ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 547209027a4SShri Abhyankar mwork = rtmp + bs2*n; 548209027a4SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 549209027a4SShri Abhyankar 550209027a4SShri Abhyankar for (i=0; i<n; i++){ 551209027a4SShri Abhyankar /* zero rtmp */ 552209027a4SShri Abhyankar /* L part */ 553209027a4SShri Abhyankar nz = bi[i+1] - bi[i]; 554209027a4SShri Abhyankar bjtmp = bj + bi[i]; 555209027a4SShri Abhyankar for (j=0; j<nz; j++){ 556209027a4SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 557209027a4SShri Abhyankar } 558209027a4SShri Abhyankar 559209027a4SShri Abhyankar /* U part */ 560209027a4SShri Abhyankar nz = bi[2*n-i+1] - bi[2*n-i]; 561209027a4SShri Abhyankar bjtmp = bj + bi[2*n-i]; 562209027a4SShri Abhyankar for (j=0; j<nz; j++){ 563209027a4SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 564209027a4SShri Abhyankar } 565209027a4SShri Abhyankar 566209027a4SShri Abhyankar /* load in initial (unfactored row) */ 567209027a4SShri Abhyankar nz = ai[i+1] - ai[i]; 568209027a4SShri Abhyankar ajtmp = aj + ai[i]; 569209027a4SShri Abhyankar v = aa + bs2*ai[i]; 570209027a4SShri Abhyankar for (j=0; j<nz; j++) { 571209027a4SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 572209027a4SShri Abhyankar } 573209027a4SShri Abhyankar 574209027a4SShri Abhyankar /* elimination */ 575209027a4SShri Abhyankar bjtmp = bj + bi[i]; 576209027a4SShri Abhyankar nzL = bi[i+1] - bi[i]; 577b1646270SShri Abhyankar for(k=0;k < nzL;k++) { 578b1646270SShri Abhyankar row = bjtmp[k]; 579209027a4SShri Abhyankar pc = rtmp + bs2*row; 580209027a4SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 581209027a4SShri Abhyankar if (flg) { 582209027a4SShri Abhyankar pv = b->a + bs2*bdiag[row]; 583209027a4SShri Abhyankar /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 584209027a4SShri Abhyankar ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 585209027a4SShri Abhyankar 586209027a4SShri Abhyankar pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 587209027a4SShri Abhyankar pv = b->a + bs2*bi[2*n-row]; 588209027a4SShri Abhyankar nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 589209027a4SShri Abhyankar for (j=0; j<nz; j++) { 590209027a4SShri Abhyankar /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 591209027a4SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 592209027a4SShri Abhyankar v = rtmp + bs2*pj[j]; 593209027a4SShri Abhyankar ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 594209027a4SShri Abhyankar pv += bs2; 595209027a4SShri Abhyankar } 596209027a4SShri Abhyankar ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 597209027a4SShri Abhyankar } 598209027a4SShri Abhyankar } 599209027a4SShri Abhyankar 600209027a4SShri Abhyankar /* finished row so stick it into b->a */ 601209027a4SShri Abhyankar /* L part */ 602209027a4SShri Abhyankar pv = b->a + bs2*bi[i] ; 603209027a4SShri Abhyankar pj = b->j + bi[i] ; 604209027a4SShri Abhyankar nz = bi[i+1] - bi[i]; 605209027a4SShri Abhyankar for (j=0; j<nz; j++) { 606209027a4SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 607209027a4SShri Abhyankar } 608209027a4SShri Abhyankar 609209027a4SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 610209027a4SShri Abhyankar pv = b->a + bs2*bdiag[i]; 611209027a4SShri Abhyankar pj = b->j + bdiag[i]; 612209027a4SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 613209027a4SShri Abhyankar /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 614209027a4SShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); 615209027a4SShri Abhyankar 616209027a4SShri Abhyankar /* U part */ 617209027a4SShri Abhyankar pv = b->a + bs2*bi[2*n-i]; 618209027a4SShri Abhyankar pj = b->j + bi[2*n-i]; 619209027a4SShri Abhyankar nz = bi[2*n-i+1] - bi[2*n-i] - 1; 620209027a4SShri Abhyankar for (j=0; j<nz; j++){ 621209027a4SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 622209027a4SShri Abhyankar } 623209027a4SShri Abhyankar } 624209027a4SShri Abhyankar 625209027a4SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 626209027a4SShri Abhyankar C->assembled = PETSC_TRUE; 627209027a4SShri Abhyankar ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 628209027a4SShri Abhyankar PetscFunctionReturn(0); 629209027a4SShri Abhyankar } 630e6580ceeSShri Abhyankar 631b2b2dd24SShri Abhyankar #undef __FUNCT__ 632b2b2dd24SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2" 633b2b2dd24SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info) 634b2b2dd24SShri Abhyankar { 635b2b2dd24SShri Abhyankar Mat C=B; 636b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 637b2b2dd24SShri Abhyankar PetscErrorCode ierr; 638b2b2dd24SShri Abhyankar PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 639b2b2dd24SShri Abhyankar PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 640b2b2dd24SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 641b2b2dd24SShri Abhyankar PetscInt bs2 = a->bs2,flg; 642b2b2dd24SShri Abhyankar PetscReal shift = info->shiftinblocks; 643b2b2dd24SShri Abhyankar 644b2b2dd24SShri Abhyankar PetscFunctionBegin; 645b2b2dd24SShri Abhyankar /* generate work space needed by the factorization */ 646b2b2dd24SShri Abhyankar ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 647b2b2dd24SShri Abhyankar mwork = rtmp + bs2*n; 648b2b2dd24SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 649b2b2dd24SShri Abhyankar 650b2b2dd24SShri Abhyankar for (i=0; i<n; i++){ 651b2b2dd24SShri Abhyankar /* zero rtmp */ 652b2b2dd24SShri Abhyankar /* L part */ 653b2b2dd24SShri Abhyankar nz = bi[i+1] - bi[i]; 654b2b2dd24SShri Abhyankar bjtmp = bj + bi[i]; 655b2b2dd24SShri Abhyankar for (j=0; j<nz; j++){ 656b2b2dd24SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 657b2b2dd24SShri Abhyankar } 658b2b2dd24SShri Abhyankar 659b2b2dd24SShri Abhyankar /* U part */ 660b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 661b2b2dd24SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 662b2b2dd24SShri Abhyankar for (j=0; j<nz; j++){ 663b2b2dd24SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 664b2b2dd24SShri Abhyankar } 665b2b2dd24SShri Abhyankar 666b2b2dd24SShri Abhyankar /* load in initial (unfactored row) */ 667b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 668b2b2dd24SShri Abhyankar ajtmp = aj + ai[i]; 669b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 670b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 671b2b2dd24SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 672b2b2dd24SShri Abhyankar } 673b2b2dd24SShri Abhyankar 674b2b2dd24SShri Abhyankar /* elimination */ 675b2b2dd24SShri Abhyankar bjtmp = bj + bi[i]; 676b2b2dd24SShri Abhyankar nzL = bi[i+1] - bi[i]; 677b2b2dd24SShri Abhyankar for(k=0;k < nzL;k++) { 678b2b2dd24SShri Abhyankar row = bjtmp[k]; 679b2b2dd24SShri Abhyankar pc = rtmp + bs2*row; 680b2b2dd24SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 681b2b2dd24SShri Abhyankar if (flg) { 682b2b2dd24SShri Abhyankar pv = b->a + bs2*bdiag[row]; 683b2b2dd24SShri Abhyankar /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 684b2b2dd24SShri Abhyankar ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 685b2b2dd24SShri Abhyankar 686b2b2dd24SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 687b2b2dd24SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 688b2b2dd24SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 689b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 690b2b2dd24SShri Abhyankar /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 691b2b2dd24SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 692b2b2dd24SShri Abhyankar v = rtmp + bs2*pj[j]; 693b2b2dd24SShri Abhyankar ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 694b2b2dd24SShri Abhyankar pv += bs2; 695b2b2dd24SShri Abhyankar } 696b2b2dd24SShri Abhyankar ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 697b2b2dd24SShri Abhyankar } 698b2b2dd24SShri Abhyankar } 699b2b2dd24SShri Abhyankar 700b2b2dd24SShri Abhyankar /* finished row so stick it into b->a */ 701b2b2dd24SShri Abhyankar /* L part */ 702b2b2dd24SShri Abhyankar pv = b->a + bs2*bi[i] ; 703b2b2dd24SShri Abhyankar pj = b->j + bi[i] ; 704b2b2dd24SShri Abhyankar nz = bi[i+1] - bi[i]; 705b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 706b2b2dd24SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 707b2b2dd24SShri Abhyankar } 708b2b2dd24SShri Abhyankar 709b2b2dd24SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 710b2b2dd24SShri Abhyankar pv = b->a + bs2*bdiag[i]; 711b2b2dd24SShri Abhyankar pj = b->j + bdiag[i]; 712b2b2dd24SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 713b2b2dd24SShri Abhyankar /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 714b2b2dd24SShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); 715b2b2dd24SShri Abhyankar 716b2b2dd24SShri Abhyankar /* U part */ 717b2b2dd24SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 718b2b2dd24SShri Abhyankar pj = b->j + bdiag[i+1]+1; 719b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 720b2b2dd24SShri Abhyankar for (j=0; j<nz; j++){ 721b2b2dd24SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 722b2b2dd24SShri Abhyankar } 723b2b2dd24SShri Abhyankar } 724b2b2dd24SShri Abhyankar 725b2b2dd24SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 726b2b2dd24SShri Abhyankar C->assembled = PETSC_TRUE; 727b2b2dd24SShri Abhyankar ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 728b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 729b2b2dd24SShri Abhyankar } 730b2b2dd24SShri Abhyankar 731e6580ceeSShri Abhyankar #if defined(PETSC_HAVE_SSE) 732e6580ceeSShri Abhyankar 733e6580ceeSShri Abhyankar #include PETSC_HAVE_SSE 734e6580ceeSShri Abhyankar 735e6580ceeSShri Abhyankar /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 736e6580ceeSShri Abhyankar #undef __FUNCT__ 737e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE" 738e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info) 739e6580ceeSShri Abhyankar { 740e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 741e6580ceeSShri Abhyankar PetscErrorCode ierr; 742e6580ceeSShri Abhyankar int i,j,n = a->mbs; 743e6580ceeSShri Abhyankar int *bj = b->j,*bjtmp,*pj; 744e6580ceeSShri Abhyankar int row; 745e6580ceeSShri Abhyankar int *ajtmpold,nz,*bi=b->i; 746e6580ceeSShri Abhyankar int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 747e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 748e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 749e6580ceeSShri Abhyankar int nonzero=0; 750e6580ceeSShri Abhyankar /* int nonzero=0,colscale = 16; */ 751e6580ceeSShri Abhyankar PetscTruth pivotinblocks = b->pivotinblocks; 752e6580ceeSShri Abhyankar PetscReal shift = info->shiftinblocks; 753e6580ceeSShri Abhyankar 754e6580ceeSShri Abhyankar PetscFunctionBegin; 755e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 756e6580ceeSShri Abhyankar 757e6580ceeSShri Abhyankar if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 758e6580ceeSShri Abhyankar if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 759e6580ceeSShri Abhyankar ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 760e6580ceeSShri Abhyankar if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 761e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 762e6580ceeSShri Abhyankar /* colscale = 4; */ 763e6580ceeSShri Abhyankar /* } */ 764e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 765e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 766e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 767e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 768e6580ceeSShri Abhyankar /* zero out one register */ 769e6580ceeSShri Abhyankar XOR_PS(XMM7,XMM7); 770e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 771e6580ceeSShri Abhyankar x = rtmp+16*bjtmp[j]; 772e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 773e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 774e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 775e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 776e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 777e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 778e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 779e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 780e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 781e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 782e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 783e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 784e6580ceeSShri Abhyankar SSE_INLINE_END_1; 785e6580ceeSShri Abhyankar } 786e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 787e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 788e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 789e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 790e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 791e6580ceeSShri Abhyankar x = rtmp+16*ajtmpold[j]; 792e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmpold[j]; */ 793e6580ceeSShri Abhyankar /* Copy v block into x block */ 794e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v,x) 795e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 796e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 797e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 798e6580ceeSShri Abhyankar 799e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 800e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 801e6580ceeSShri Abhyankar 802e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 803e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 804e6580ceeSShri Abhyankar 805e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 806e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 807e6580ceeSShri Abhyankar 808e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 809e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 810e6580ceeSShri Abhyankar 811e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 812e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 813e6580ceeSShri Abhyankar 814e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 815e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 816e6580ceeSShri Abhyankar 817e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 818e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 819e6580ceeSShri Abhyankar SSE_INLINE_END_2; 820e6580ceeSShri Abhyankar 821e6580ceeSShri Abhyankar v += 16; 822e6580ceeSShri Abhyankar } 823e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 824e6580ceeSShri Abhyankar row = *bjtmp++; 825e6580ceeSShri Abhyankar while (row < i) { 826e6580ceeSShri Abhyankar pc = rtmp + 16*row; 827e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 828e6580ceeSShri Abhyankar /* Load block from lower triangle */ 829e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 830e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 831e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 832e6580ceeSShri Abhyankar 833e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 834e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 835e6580ceeSShri Abhyankar 836e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 837e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 838e6580ceeSShri Abhyankar 839e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 840e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 841e6580ceeSShri Abhyankar 842e6580ceeSShri Abhyankar /* Compare block to zero block */ 843e6580ceeSShri Abhyankar 844e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4,XMM7) 845e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4,XMM0) 846e6580ceeSShri Abhyankar 847e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5,XMM7) 848e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5,XMM1) 849e6580ceeSShri Abhyankar 850e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6,XMM7) 851e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6,XMM2) 852e6580ceeSShri Abhyankar 853e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7,XMM3) 854e6580ceeSShri Abhyankar 855e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 856e6580ceeSShri Abhyankar SSE_OR_PS(XMM6,XMM7) 857e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM4) 858e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM6) 859e6580ceeSShri Abhyankar SSE_INLINE_END_1; 860e6580ceeSShri Abhyankar 861e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 862e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 863e6580ceeSShri Abhyankar MOVEMASK(nonzero,XMM5); 864e6580ceeSShri Abhyankar 865e6580ceeSShri Abhyankar /* If block is nonzero ... */ 866e6580ceeSShri Abhyankar if (nonzero) { 867e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 868e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 869e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 870e6580ceeSShri Abhyankar 871e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 872e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 873e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 874e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 875e6580ceeSShri Abhyankar 876e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv,pc) 877e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 878e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 879e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 880e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM0) 881e6580ceeSShri Abhyankar 882e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 883e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 884e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM1) 885e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM5) 886e6580ceeSShri Abhyankar 887e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 888e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 889e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM2) 890e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM6) 891e6580ceeSShri Abhyankar 892e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 893e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 894e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 895e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM7) 896e6580ceeSShri Abhyankar 897e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 898e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 899e6580ceeSShri Abhyankar 900e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 901e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 902e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 903e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 904e6580ceeSShri Abhyankar 905e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 906e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 907e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 908e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 909e6580ceeSShri Abhyankar 910e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 911e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 912e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 913e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM7) 914e6580ceeSShri Abhyankar 915e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 916e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 917e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 918e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 919e6580ceeSShri Abhyankar 920e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 921e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 922e6580ceeSShri Abhyankar 923e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 924e6580ceeSShri Abhyankar 925e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 926e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 927e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 928e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 929e6580ceeSShri Abhyankar 930e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 931e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 932e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 933e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 934e6580ceeSShri Abhyankar 935e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 936e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 937e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 938e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 939e6580ceeSShri Abhyankar 940e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 941e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 942e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 943e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 944e6580ceeSShri Abhyankar 945e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 946e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 947e6580ceeSShri Abhyankar 948e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 949e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 950e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 951e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 952e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0,XMM7) 953e6580ceeSShri Abhyankar 954e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 955e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 956e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM7) 957e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 958e6580ceeSShri Abhyankar 959e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 960e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1,XMM1,0x00) 961e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM2) 962e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 963e6580ceeSShri Abhyankar 964e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 965e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 966e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3,XMM7) 967e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM3) 968e6580ceeSShri Abhyankar 969e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 970e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 971e6580ceeSShri Abhyankar 972e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 973e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans afterall. */ 974e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 975e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3,XMM0) 976e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 977e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2,XMM6) 978e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 979e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1,XMM5) 980e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 981e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0,XMM4) 982e6580ceeSShri Abhyankar SSE_INLINE_END_2; 983e6580ceeSShri Abhyankar 984e6580ceeSShri Abhyankar /* Update the row: */ 985e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 986e6580ceeSShri Abhyankar pv += 16; 987e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 988e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 989e6580ceeSShri Abhyankar x = rtmp + 16*pj[j]; 990e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 991e6580ceeSShri Abhyankar 992e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 993e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 994e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 995e6580ceeSShri Abhyankar /* Load First Column of X*/ 996e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 997e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 998e6580ceeSShri Abhyankar 999e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1000e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1001e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1002e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1003e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1004e6580ceeSShri Abhyankar 1005e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1006e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1007e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1008e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1009e6580ceeSShri Abhyankar 1010e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1011e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1012e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1013e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1014e6580ceeSShri Abhyankar 1015e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1016e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1017e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1018e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1019e6580ceeSShri Abhyankar 1020e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1021e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1022e6580ceeSShri Abhyankar 1023e6580ceeSShri Abhyankar /* Second Column */ 1024e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1025e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1026e6580ceeSShri Abhyankar 1027e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1028e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1029e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1030e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1031e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1032e6580ceeSShri Abhyankar 1033e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1034e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1035e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1036e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM7) 1037e6580ceeSShri Abhyankar 1038e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1039e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1040e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM2) 1041e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM4) 1042e6580ceeSShri Abhyankar 1043e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1044e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1045e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1046e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1047e6580ceeSShri Abhyankar 1048e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1049e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1050e6580ceeSShri Abhyankar 1051e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1052e6580ceeSShri Abhyankar 1053e6580ceeSShri Abhyankar /* Third Column */ 1054e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1055e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1056e6580ceeSShri Abhyankar 1057e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1058e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1059e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1060e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM0) 1061e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1062e6580ceeSShri Abhyankar 1063e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1064e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1065e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM1) 1066e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM4) 1067e6580ceeSShri Abhyankar 1068e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1069e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1070e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM2) 1071e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM5) 1072e6580ceeSShri Abhyankar 1073e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1074e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1075e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1076e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1077e6580ceeSShri Abhyankar 1078e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1079e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1080e6580ceeSShri Abhyankar 1081e6580ceeSShri Abhyankar /* Fourth Column */ 1082e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1083e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1084e6580ceeSShri Abhyankar 1085e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1086e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1087e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1088e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1089e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1090e6580ceeSShri Abhyankar 1091e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1092e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1093e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1094e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1095e6580ceeSShri Abhyankar 1096e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1097e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1098e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1099e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1100e6580ceeSShri Abhyankar 1101e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1102e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1103e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1104e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1105e6580ceeSShri Abhyankar 1106e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1107e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1108e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1109e6580ceeSShri Abhyankar pv += 16; 1110e6580ceeSShri Abhyankar } 1111e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1112e6580ceeSShri Abhyankar } 1113e6580ceeSShri Abhyankar row = *bjtmp++; 1114e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1115e6580ceeSShri Abhyankar } 1116e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1117e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 1118e6580ceeSShri Abhyankar pj = bj + bi[i]; 1119e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1120e6580ceeSShri Abhyankar 1121e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1122e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1123e6580ceeSShri Abhyankar x = rtmp+16*pj[j]; 1124e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1125e6580ceeSShri Abhyankar 1126e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1127e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1128e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1129e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1130e6580ceeSShri Abhyankar 1131e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1132e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1133e6580ceeSShri Abhyankar 1134e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1135e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1136e6580ceeSShri Abhyankar 1137e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1138e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1139e6580ceeSShri Abhyankar 1140e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1141e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1142e6580ceeSShri Abhyankar 1143e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1144e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1145e6580ceeSShri Abhyankar 1146e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1147e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1148e6580ceeSShri Abhyankar 1149e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1150e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1151e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1152e6580ceeSShri Abhyankar pv += 16; 1153e6580ceeSShri Abhyankar } 1154e6580ceeSShri Abhyankar /* invert diagonal block */ 1155e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 1156e6580ceeSShri Abhyankar if (pivotinblocks) { 1157e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1158e6580ceeSShri Abhyankar } else { 1159e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1160e6580ceeSShri Abhyankar } 1161e6580ceeSShri Abhyankar /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1162e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1163e6580ceeSShri Abhyankar } 1164e6580ceeSShri Abhyankar 1165e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 1166e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1167e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1168e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 1169e6580ceeSShri Abhyankar ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); 1170e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1171e6580ceeSShri Abhyankar SSE_SCOPE_END; 1172e6580ceeSShri Abhyankar PetscFunctionReturn(0); 1173e6580ceeSShri Abhyankar } 1174e6580ceeSShri Abhyankar 1175e6580ceeSShri Abhyankar #undef __FUNCT__ 1176e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace" 1177e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 1178e6580ceeSShri Abhyankar { 1179e6580ceeSShri Abhyankar Mat A=C; 1180e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1181e6580ceeSShri Abhyankar PetscErrorCode ierr; 1182e6580ceeSShri Abhyankar int i,j,n = a->mbs; 1183e6580ceeSShri Abhyankar unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 1184e6580ceeSShri Abhyankar unsigned short *aj = (unsigned short *)(a->j),*ajtmp; 1185e6580ceeSShri Abhyankar unsigned int row; 1186e6580ceeSShri Abhyankar int nz,*bi=b->i; 1187e6580ceeSShri Abhyankar int *diag_offset = b->diag,*ai=a->i; 1188e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1189e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 1190e6580ceeSShri Abhyankar int nonzero=0; 1191e6580ceeSShri Abhyankar /* int nonzero=0,colscale = 16; */ 1192e6580ceeSShri Abhyankar PetscTruth pivotinblocks = b->pivotinblocks; 1193e6580ceeSShri Abhyankar PetscReal shift = info->shiftinblocks; 1194e6580ceeSShri Abhyankar 1195e6580ceeSShri Abhyankar PetscFunctionBegin; 1196e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 1197e6580ceeSShri Abhyankar 1198e6580ceeSShri Abhyankar if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1199e6580ceeSShri Abhyankar if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1200e6580ceeSShri Abhyankar ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1201e6580ceeSShri Abhyankar if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1202e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 1203e6580ceeSShri Abhyankar /* colscale = 4; */ 1204e6580ceeSShri Abhyankar /* } */ 1205e6580ceeSShri Abhyankar 1206e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 1207e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1208e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 1209e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 1210e6580ceeSShri Abhyankar /* zero out one register */ 1211e6580ceeSShri Abhyankar XOR_PS(XMM7,XMM7); 1212e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1213e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)bjtmp[j]); 1214e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 1215e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 1216e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 1217e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1218e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1219e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1220e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1221e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1222e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1223e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1224e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1225e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1226e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1227e6580ceeSShri Abhyankar } 1228e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 1229e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 1230e6580ceeSShri Abhyankar ajtmp = aj + ai[i]; 1231e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 1232e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1233e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)ajtmp[j]); 1234e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmp[j]; */ 1235e6580ceeSShri Abhyankar /* Copy v block into x block */ 1236e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v,x) 1237e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1238e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1239e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1240e6580ceeSShri Abhyankar 1241e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1242e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1243e6580ceeSShri Abhyankar 1244e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1245e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1246e6580ceeSShri Abhyankar 1247e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1248e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1249e6580ceeSShri Abhyankar 1250e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1251e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1252e6580ceeSShri Abhyankar 1253e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1254e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1255e6580ceeSShri Abhyankar 1256e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1257e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1258e6580ceeSShri Abhyankar 1259e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1260e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1261e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1262e6580ceeSShri Abhyankar 1263e6580ceeSShri Abhyankar v += 16; 1264e6580ceeSShri Abhyankar } 1265e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1266e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1267e6580ceeSShri Abhyankar while (row < i) { 1268e6580ceeSShri Abhyankar pc = rtmp + 16*row; 1269e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 1270e6580ceeSShri Abhyankar /* Load block from lower triangle */ 1271e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1272e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1273e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1274e6580ceeSShri Abhyankar 1275e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1276e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1277e6580ceeSShri Abhyankar 1278e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1279e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1280e6580ceeSShri Abhyankar 1281e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1282e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1283e6580ceeSShri Abhyankar 1284e6580ceeSShri Abhyankar /* Compare block to zero block */ 1285e6580ceeSShri Abhyankar 1286e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4,XMM7) 1287e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4,XMM0) 1288e6580ceeSShri Abhyankar 1289e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5,XMM7) 1290e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5,XMM1) 1291e6580ceeSShri Abhyankar 1292e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6,XMM7) 1293e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6,XMM2) 1294e6580ceeSShri Abhyankar 1295e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7,XMM3) 1296e6580ceeSShri Abhyankar 1297e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 1298e6580ceeSShri Abhyankar SSE_OR_PS(XMM6,XMM7) 1299e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM4) 1300e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM6) 1301e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1302e6580ceeSShri Abhyankar 1303e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 1304e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1305e6580ceeSShri Abhyankar MOVEMASK(nonzero,XMM5); 1306e6580ceeSShri Abhyankar 1307e6580ceeSShri Abhyankar /* If block is nonzero ... */ 1308e6580ceeSShri Abhyankar if (nonzero) { 1309e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 1310e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1311e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 1312e6580ceeSShri Abhyankar 1313e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1314e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1315e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 1316e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1317e6580ceeSShri Abhyankar 1318e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv,pc) 1319e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 1320e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1321e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1322e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM0) 1323e6580ceeSShri Abhyankar 1324e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1325e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1326e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM1) 1327e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM5) 1328e6580ceeSShri Abhyankar 1329e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1330e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1331e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM2) 1332e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM6) 1333e6580ceeSShri Abhyankar 1334e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1335e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1336e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1337e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM7) 1338e6580ceeSShri Abhyankar 1339e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1340e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1341e6580ceeSShri Abhyankar 1342e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 1343e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1344e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1345e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1346e6580ceeSShri Abhyankar 1347e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1348e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1349e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1350e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 1351e6580ceeSShri Abhyankar 1352e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1353e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1354e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1355e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM7) 1356e6580ceeSShri Abhyankar 1357e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1358e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1359e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1360e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 1361e6580ceeSShri Abhyankar 1362e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1363e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1364e6580ceeSShri Abhyankar 1365e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1366e6580ceeSShri Abhyankar 1367e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 1368e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1369e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1370e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1371e6580ceeSShri Abhyankar 1372e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1373e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1374e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1375e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1376e6580ceeSShri Abhyankar 1377e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1378e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1379e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1380e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1381e6580ceeSShri Abhyankar 1382e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1383e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1384e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1385e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1386e6580ceeSShri Abhyankar 1387e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1388e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1389e6580ceeSShri Abhyankar 1390e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1391e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 1392e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1393e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1394e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0,XMM7) 1395e6580ceeSShri Abhyankar 1396e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1397e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1398e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM7) 1399e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 1400e6580ceeSShri Abhyankar 1401e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1402e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1,XMM1,0x00) 1403e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM2) 1404e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 1405e6580ceeSShri Abhyankar 1406e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1407e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1408e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3,XMM7) 1409e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM3) 1410e6580ceeSShri Abhyankar 1411e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1412e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1413e6580ceeSShri Abhyankar 1414e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1415e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans afterall. */ 1416e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 1417e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3,XMM0) 1418e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 1419e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2,XMM6) 1420e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 1421e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1,XMM5) 1422e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 1423e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0,XMM4) 1424e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1425e6580ceeSShri Abhyankar 1426e6580ceeSShri Abhyankar /* Update the row: */ 1427e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 1428e6580ceeSShri Abhyankar pv += 16; 1429e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1430e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1431e6580ceeSShri Abhyankar x = rtmp + 16*((unsigned int)pj[j]); 1432e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 1433e6580ceeSShri Abhyankar 1434e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 1435e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1436e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1437e6580ceeSShri Abhyankar /* Load First Column of X*/ 1438e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1439e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1440e6580ceeSShri Abhyankar 1441e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1442e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1443e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1444e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1445e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1446e6580ceeSShri Abhyankar 1447e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1448e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1449e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1450e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1451e6580ceeSShri Abhyankar 1452e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1453e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1454e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1455e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1456e6580ceeSShri Abhyankar 1457e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1458e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1459e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1460e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1461e6580ceeSShri Abhyankar 1462e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1463e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1464e6580ceeSShri Abhyankar 1465e6580ceeSShri Abhyankar /* Second Column */ 1466e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1467e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1468e6580ceeSShri Abhyankar 1469e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1470e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1471e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1472e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1473e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1474e6580ceeSShri Abhyankar 1475e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1476e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1477e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1478e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM7) 1479e6580ceeSShri Abhyankar 1480e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1481e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1482e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM2) 1483e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM4) 1484e6580ceeSShri Abhyankar 1485e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1486e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1487e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1488e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1489e6580ceeSShri Abhyankar 1490e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1491e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1492e6580ceeSShri Abhyankar 1493e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1494e6580ceeSShri Abhyankar 1495e6580ceeSShri Abhyankar /* Third Column */ 1496e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1497e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1498e6580ceeSShri Abhyankar 1499e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1500e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1501e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1502e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM0) 1503e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1504e6580ceeSShri Abhyankar 1505e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1506e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1507e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM1) 1508e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM4) 1509e6580ceeSShri Abhyankar 1510e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1511e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1512e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM2) 1513e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM5) 1514e6580ceeSShri Abhyankar 1515e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1516e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1517e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1518e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1519e6580ceeSShri Abhyankar 1520e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1521e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1522e6580ceeSShri Abhyankar 1523e6580ceeSShri Abhyankar /* Fourth Column */ 1524e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1525e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1526e6580ceeSShri Abhyankar 1527e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1528e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1529e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1530e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1531e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1532e6580ceeSShri Abhyankar 1533e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1534e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1535e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1536e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1537e6580ceeSShri Abhyankar 1538e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1539e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1540e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1541e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1542e6580ceeSShri Abhyankar 1543e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1544e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1545e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1546e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1547e6580ceeSShri Abhyankar 1548e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1549e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1550e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1551e6580ceeSShri Abhyankar pv += 16; 1552e6580ceeSShri Abhyankar } 1553e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1554e6580ceeSShri Abhyankar } 1555e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1556e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1557e6580ceeSShri Abhyankar /* bjtmp++; */ 1558e6580ceeSShri Abhyankar } 1559e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1560e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 1561e6580ceeSShri Abhyankar pj = bj + bi[i]; 1562e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1563e6580ceeSShri Abhyankar 1564e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1565e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1566e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)pj[j]); 1567e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1568e6580ceeSShri Abhyankar 1569e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1570e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1571e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1572e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1573e6580ceeSShri Abhyankar 1574e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1575e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1576e6580ceeSShri Abhyankar 1577e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1578e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1579e6580ceeSShri Abhyankar 1580e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1581e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1582e6580ceeSShri Abhyankar 1583e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1584e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1585e6580ceeSShri Abhyankar 1586e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1587e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1588e6580ceeSShri Abhyankar 1589e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1590e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1591e6580ceeSShri Abhyankar 1592e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1593e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1594e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1595e6580ceeSShri Abhyankar pv += 16; 1596e6580ceeSShri Abhyankar } 1597e6580ceeSShri Abhyankar /* invert diagonal block */ 1598e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 1599e6580ceeSShri Abhyankar if (pivotinblocks) { 1600e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1601e6580ceeSShri Abhyankar } else { 1602e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1603e6580ceeSShri Abhyankar } 1604e6580ceeSShri Abhyankar /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1605e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1606e6580ceeSShri Abhyankar } 1607e6580ceeSShri Abhyankar 1608e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 1609e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1610e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1611e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 1612e6580ceeSShri Abhyankar ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); 1613e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1614e6580ceeSShri Abhyankar SSE_SCOPE_END; 1615e6580ceeSShri Abhyankar PetscFunctionReturn(0); 1616e6580ceeSShri Abhyankar } 1617e6580ceeSShri Abhyankar 1618e6580ceeSShri Abhyankar #undef __FUNCT__ 1619e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" 1620e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1621e6580ceeSShri Abhyankar { 1622e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1623e6580ceeSShri Abhyankar PetscErrorCode ierr; 1624e6580ceeSShri Abhyankar int i,j,n = a->mbs; 1625e6580ceeSShri Abhyankar unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 1626e6580ceeSShri Abhyankar unsigned int row; 1627e6580ceeSShri Abhyankar int *ajtmpold,nz,*bi=b->i; 1628e6580ceeSShri Abhyankar int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1629e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1630e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 1631e6580ceeSShri Abhyankar int nonzero=0; 1632e6580ceeSShri Abhyankar /* int nonzero=0,colscale = 16; */ 1633e6580ceeSShri Abhyankar PetscTruth pivotinblocks = b->pivotinblocks; 1634e6580ceeSShri Abhyankar PetscReal shift = info->shiftinblocks; 1635e6580ceeSShri Abhyankar 1636e6580ceeSShri Abhyankar PetscFunctionBegin; 1637e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 1638e6580ceeSShri Abhyankar 1639e6580ceeSShri Abhyankar if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1640e6580ceeSShri Abhyankar if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1641e6580ceeSShri Abhyankar ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1642e6580ceeSShri Abhyankar if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1643e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 1644e6580ceeSShri Abhyankar /* colscale = 4; */ 1645e6580ceeSShri Abhyankar /* } */ 1646e6580ceeSShri Abhyankar if ((unsigned long)bj==(unsigned long)aj) { 1647e6580ceeSShri Abhyankar return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1648e6580ceeSShri Abhyankar } 1649e6580ceeSShri Abhyankar 1650e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 1651e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1652e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 1653e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 1654e6580ceeSShri Abhyankar /* zero out one register */ 1655e6580ceeSShri Abhyankar XOR_PS(XMM7,XMM7); 1656e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1657e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)bjtmp[j]); 1658e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 1659e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 1660e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 1661e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1662e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1663e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1664e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1665e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1666e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1667e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1668e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1669e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1670e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1671e6580ceeSShri Abhyankar } 1672e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 1673e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 1674e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 1675e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 1676e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1677e6580ceeSShri Abhyankar x = rtmp+16*ajtmpold[j]; 1678e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmpold[j]; */ 1679e6580ceeSShri Abhyankar /* Copy v block into x block */ 1680e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v,x) 1681e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1682e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1683e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1684e6580ceeSShri Abhyankar 1685e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1686e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1687e6580ceeSShri Abhyankar 1688e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1689e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1690e6580ceeSShri Abhyankar 1691e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1692e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1693e6580ceeSShri Abhyankar 1694e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1695e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1696e6580ceeSShri Abhyankar 1697e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1698e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1699e6580ceeSShri Abhyankar 1700e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1701e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1702e6580ceeSShri Abhyankar 1703e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1704e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1705e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1706e6580ceeSShri Abhyankar 1707e6580ceeSShri Abhyankar v += 16; 1708e6580ceeSShri Abhyankar } 1709e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1710e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1711e6580ceeSShri Abhyankar while (row < i) { 1712e6580ceeSShri Abhyankar pc = rtmp + 16*row; 1713e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 1714e6580ceeSShri Abhyankar /* Load block from lower triangle */ 1715e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1716e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1717e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1718e6580ceeSShri Abhyankar 1719e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1720e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1721e6580ceeSShri Abhyankar 1722e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1723e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1724e6580ceeSShri Abhyankar 1725e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1726e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1727e6580ceeSShri Abhyankar 1728e6580ceeSShri Abhyankar /* Compare block to zero block */ 1729e6580ceeSShri Abhyankar 1730e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4,XMM7) 1731e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4,XMM0) 1732e6580ceeSShri Abhyankar 1733e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5,XMM7) 1734e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5,XMM1) 1735e6580ceeSShri Abhyankar 1736e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6,XMM7) 1737e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6,XMM2) 1738e6580ceeSShri Abhyankar 1739e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7,XMM3) 1740e6580ceeSShri Abhyankar 1741e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 1742e6580ceeSShri Abhyankar SSE_OR_PS(XMM6,XMM7) 1743e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM4) 1744e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM6) 1745e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1746e6580ceeSShri Abhyankar 1747e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 1748e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1749e6580ceeSShri Abhyankar MOVEMASK(nonzero,XMM5); 1750e6580ceeSShri Abhyankar 1751e6580ceeSShri Abhyankar /* If block is nonzero ... */ 1752e6580ceeSShri Abhyankar if (nonzero) { 1753e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 1754e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1755e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 1756e6580ceeSShri Abhyankar 1757e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1758e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1759e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 1760e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1761e6580ceeSShri Abhyankar 1762e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv,pc) 1763e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 1764e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1765e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1766e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM0) 1767e6580ceeSShri Abhyankar 1768e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1769e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1770e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM1) 1771e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM5) 1772e6580ceeSShri Abhyankar 1773e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1774e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1775e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM2) 1776e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM6) 1777e6580ceeSShri Abhyankar 1778e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1779e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1780e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1781e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM7) 1782e6580ceeSShri Abhyankar 1783e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1784e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1785e6580ceeSShri Abhyankar 1786e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 1787e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1788e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1789e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1790e6580ceeSShri Abhyankar 1791e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1792e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1793e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1794e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 1795e6580ceeSShri Abhyankar 1796e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1797e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1798e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1799e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM7) 1800e6580ceeSShri Abhyankar 1801e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1802e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1803e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1804e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 1805e6580ceeSShri Abhyankar 1806e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1807e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1808e6580ceeSShri Abhyankar 1809e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1810e6580ceeSShri Abhyankar 1811e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 1812e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1813e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1814e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1815e6580ceeSShri Abhyankar 1816e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1817e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1818e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1819e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1820e6580ceeSShri Abhyankar 1821e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1822e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1823e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1824e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1825e6580ceeSShri Abhyankar 1826e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1827e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1828e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1829e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1830e6580ceeSShri Abhyankar 1831e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1832e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1833e6580ceeSShri Abhyankar 1834e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1835e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 1836e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1837e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1838e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0,XMM7) 1839e6580ceeSShri Abhyankar 1840e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1841e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1842e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM7) 1843e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 1844e6580ceeSShri Abhyankar 1845e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1846e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1,XMM1,0x00) 1847e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM2) 1848e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 1849e6580ceeSShri Abhyankar 1850e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1851e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1852e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3,XMM7) 1853e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM3) 1854e6580ceeSShri Abhyankar 1855e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1856e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1857e6580ceeSShri Abhyankar 1858e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1859e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans afterall. */ 1860e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 1861e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3,XMM0) 1862e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 1863e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2,XMM6) 1864e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 1865e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1,XMM5) 1866e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 1867e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0,XMM4) 1868e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1869e6580ceeSShri Abhyankar 1870e6580ceeSShri Abhyankar /* Update the row: */ 1871e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 1872e6580ceeSShri Abhyankar pv += 16; 1873e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1874e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1875e6580ceeSShri Abhyankar x = rtmp + 16*((unsigned int)pj[j]); 1876e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 1877e6580ceeSShri Abhyankar 1878e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 1879e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1880e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1881e6580ceeSShri Abhyankar /* Load First Column of X*/ 1882e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1883e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1884e6580ceeSShri Abhyankar 1885e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1886e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1887e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1888e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1889e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1890e6580ceeSShri Abhyankar 1891e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1892e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1893e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1894e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1895e6580ceeSShri Abhyankar 1896e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1897e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1898e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1899e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1900e6580ceeSShri Abhyankar 1901e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1902e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1903e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1904e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1905e6580ceeSShri Abhyankar 1906e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1907e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1908e6580ceeSShri Abhyankar 1909e6580ceeSShri Abhyankar /* Second Column */ 1910e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1911e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1912e6580ceeSShri Abhyankar 1913e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1914e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1915e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1916e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1917e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1918e6580ceeSShri Abhyankar 1919e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1920e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1921e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1922e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM7) 1923e6580ceeSShri Abhyankar 1924e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1925e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1926e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM2) 1927e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM4) 1928e6580ceeSShri Abhyankar 1929e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1930e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1931e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1932e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1933e6580ceeSShri Abhyankar 1934e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1935e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1936e6580ceeSShri Abhyankar 1937e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1938e6580ceeSShri Abhyankar 1939e6580ceeSShri Abhyankar /* Third Column */ 1940e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1941e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1942e6580ceeSShri Abhyankar 1943e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1944e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1945e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1946e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM0) 1947e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1948e6580ceeSShri Abhyankar 1949e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1950e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1951e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM1) 1952e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM4) 1953e6580ceeSShri Abhyankar 1954e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1955e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1956e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM2) 1957e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM5) 1958e6580ceeSShri Abhyankar 1959e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1960e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1961e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1962e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1963e6580ceeSShri Abhyankar 1964e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1965e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1966e6580ceeSShri Abhyankar 1967e6580ceeSShri Abhyankar /* Fourth Column */ 1968e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1969e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1970e6580ceeSShri Abhyankar 1971e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1972e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1973e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1974e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1975e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1976e6580ceeSShri Abhyankar 1977e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1978e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1979e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1980e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1981e6580ceeSShri Abhyankar 1982e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1983e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1984e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1985e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1986e6580ceeSShri Abhyankar 1987e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1988e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1989e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1990e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1991e6580ceeSShri Abhyankar 1992e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1993e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1994e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1995e6580ceeSShri Abhyankar pv += 16; 1996e6580ceeSShri Abhyankar } 1997e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1998e6580ceeSShri Abhyankar } 1999e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 2000e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 2001e6580ceeSShri Abhyankar /* bjtmp++; */ 2002e6580ceeSShri Abhyankar } 2003e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 2004e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 2005e6580ceeSShri Abhyankar pj = bj + bi[i]; 2006e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 2007e6580ceeSShri Abhyankar 2008e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 2009e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 2010e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)pj[j]); 2011e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 2012e6580ceeSShri Abhyankar 2013e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 2014e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 2015e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 2016e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 2017e6580ceeSShri Abhyankar 2018e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 2019e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 2020e6580ceeSShri Abhyankar 2021e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 2022e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 2023e6580ceeSShri Abhyankar 2024e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 2025e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 2026e6580ceeSShri Abhyankar 2027e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 2028e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 2029e6580ceeSShri Abhyankar 2030e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 2031e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 2032e6580ceeSShri Abhyankar 2033e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 2034e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 2035e6580ceeSShri Abhyankar 2036e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 2037e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 2038e6580ceeSShri Abhyankar SSE_INLINE_END_2; 2039e6580ceeSShri Abhyankar pv += 16; 2040e6580ceeSShri Abhyankar } 2041e6580ceeSShri Abhyankar /* invert diagonal block */ 2042e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 2043e6580ceeSShri Abhyankar if (pivotinblocks) { 2044e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 2045e6580ceeSShri Abhyankar } else { 2046e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 2047e6580ceeSShri Abhyankar } 2048e6580ceeSShri Abhyankar /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 2049e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 2050e6580ceeSShri Abhyankar } 2051e6580ceeSShri Abhyankar 2052e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 2053e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 2054e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 2055e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 2056e6580ceeSShri Abhyankar ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); 2057e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 2058e6580ceeSShri Abhyankar SSE_SCOPE_END; 2059e6580ceeSShri Abhyankar PetscFunctionReturn(0); 2060e6580ceeSShri Abhyankar } 2061e6580ceeSShri Abhyankar 2062e6580ceeSShri Abhyankar #endif 2063