1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 383287d42SBarry Smith /* 483287d42SBarry Smith Factorization code for BAIJ format. 583287d42SBarry Smith */ 67c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 7c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 883287d42SBarry Smith 983287d42SBarry Smith /* ------------------------------------------------------------*/ 1083287d42SBarry Smith /* 1183287d42SBarry Smith Version for when blocks are 4 by 4 1283287d42SBarry Smith */ 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4" 150481f469SBarry Smith PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info) 1683287d42SBarry Smith { 1783287d42SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1883287d42SBarry Smith IS isrow = b->row,isicol = b->icol; 196849ba73SBarry Smith PetscErrorCode ierr; 205d0c19d7SBarry Smith const PetscInt *r,*ic; 215d0c19d7SBarry Smith PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 22690b6cddSBarry Smith PetscInt *ajtmpold,*ajtmp,nz,row; 23690b6cddSBarry Smith PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 2483287d42SBarry Smith MatScalar *pv,*v,*rtmp,*pc,*w,*x; 2583287d42SBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 2683287d42SBarry Smith MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 2783287d42SBarry Smith MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 2883287d42SBarry Smith MatScalar m13,m14,m15,m16; 2983287d42SBarry Smith MatScalar *ba = b->a,*aa = a->a; 30bcd9e38bSBarry Smith PetscTruth pivotinblocks = b->pivotinblocks; 3162bba022SBarry Smith PetscReal shift = info->shiftinblocks; 3283287d42SBarry Smith 3383287d42SBarry Smith PetscFunctionBegin; 3483287d42SBarry Smith ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3583287d42SBarry Smith ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 36b0a32e0cSBarry Smith ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 3783287d42SBarry Smith 3883287d42SBarry Smith for (i=0; i<n; i++) { 3983287d42SBarry Smith nz = bi[i+1] - bi[i]; 4083287d42SBarry Smith ajtmp = bj + bi[i]; 4183287d42SBarry Smith for (j=0; j<nz; j++) { 4283287d42SBarry Smith x = rtmp+16*ajtmp[j]; 4383287d42SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 4483287d42SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 4583287d42SBarry Smith } 4683287d42SBarry Smith /* load in initial (unfactored row) */ 4783287d42SBarry Smith idx = r[i]; 4883287d42SBarry Smith nz = ai[idx+1] - ai[idx]; 4983287d42SBarry Smith ajtmpold = aj + ai[idx]; 5083287d42SBarry Smith v = aa + 16*ai[idx]; 5183287d42SBarry Smith for (j=0; j<nz; j++) { 5283287d42SBarry Smith x = rtmp+16*ic[ajtmpold[j]]; 5383287d42SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 5483287d42SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 5583287d42SBarry Smith x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 5683287d42SBarry Smith x[14] = v[14]; x[15] = v[15]; 5783287d42SBarry Smith v += 16; 5883287d42SBarry Smith } 5983287d42SBarry Smith row = *ajtmp++; 6083287d42SBarry Smith while (row < i) { 6183287d42SBarry Smith pc = rtmp + 16*row; 6283287d42SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 6383287d42SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 6483287d42SBarry Smith p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 
= pc[12]; p14 = pc[13]; 6583287d42SBarry Smith p15 = pc[14]; p16 = pc[15]; 6683287d42SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 6783287d42SBarry Smith p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 6883287d42SBarry Smith p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 6983287d42SBarry Smith || p16 != 0.0) { 7083287d42SBarry Smith pv = ba + 16*diag_offset[row]; 7183287d42SBarry Smith pj = bj + diag_offset[row] + 1; 7283287d42SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 7383287d42SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 7483287d42SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 7583287d42SBarry Smith x15 = pv[14]; x16 = pv[15]; 7683287d42SBarry Smith pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 7783287d42SBarry Smith pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 7883287d42SBarry Smith pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 7983287d42SBarry Smith pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 8083287d42SBarry Smith 8183287d42SBarry Smith pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 8283287d42SBarry Smith pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 8383287d42SBarry Smith pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 8483287d42SBarry Smith pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 8583287d42SBarry Smith 8683287d42SBarry Smith pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 8783287d42SBarry Smith pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 8883287d42SBarry Smith pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 8983287d42SBarry Smith pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 9083287d42SBarry Smith 9183287d42SBarry Smith pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 9283287d42SBarry Smith pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 9383287d42SBarry Smith pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 9483287d42SBarry Smith pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 9583287d42SBarry Smith 9683287d42SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 9783287d42SBarry Smith pv += 16; 9883287d42SBarry Smith for (j=0; j<nz; j++) { 9983287d42SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 10083287d42SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 10183287d42SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 10283287d42SBarry Smith x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 10383287d42SBarry Smith x = rtmp + 16*pj[j]; 10483287d42SBarry Smith x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 10583287d42SBarry Smith x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 10683287d42SBarry Smith x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 10783287d42SBarry Smith x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 10883287d42SBarry Smith 10983287d42SBarry Smith x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 11083287d42SBarry Smith x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 11183287d42SBarry Smith x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 11283287d42SBarry Smith x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 11383287d42SBarry Smith 11483287d42SBarry Smith x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 11583287d42SBarry Smith x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 11683287d42SBarry Smith x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 11783287d42SBarry Smith x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 11883287d42SBarry Smith 11983287d42SBarry Smith x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 12083287d42SBarry Smith x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 12183287d42SBarry Smith x[14] -= m3*x13 + 
m7*x14 + m11*x15 + m15*x16; 12283287d42SBarry Smith x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 12383287d42SBarry Smith 12483287d42SBarry Smith pv += 16; 12583287d42SBarry Smith } 126dc0b31edSSatish Balay ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 12783287d42SBarry Smith } 12883287d42SBarry Smith row = *ajtmp++; 12983287d42SBarry Smith } 13083287d42SBarry Smith /* finished row so stick it into b->a */ 13183287d42SBarry Smith pv = ba + 16*bi[i]; 13283287d42SBarry Smith pj = bj + bi[i]; 13383287d42SBarry Smith nz = bi[i+1] - bi[i]; 13483287d42SBarry Smith for (j=0; j<nz; j++) { 13583287d42SBarry Smith x = rtmp+16*pj[j]; 13683287d42SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 13783287d42SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 13883287d42SBarry Smith pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 13983287d42SBarry Smith pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 14083287d42SBarry Smith pv += 16; 14183287d42SBarry Smith } 14283287d42SBarry Smith /* invert diagonal block */ 14383287d42SBarry Smith w = ba + 16*diag_offset[i]; 144bcd9e38bSBarry Smith if (pivotinblocks) { 14562bba022SBarry Smith ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 146bcd9e38bSBarry Smith } else { 147bcd9e38bSBarry Smith ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 148bcd9e38bSBarry Smith } 14983287d42SBarry Smith } 15083287d42SBarry Smith 15183287d42SBarry Smith ierr = PetscFree(rtmp);CHKERRQ(ierr); 15283287d42SBarry Smith ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 15383287d42SBarry Smith ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 154db4efbfdSBarry Smith C->ops->solve = MatSolve_SeqBAIJ_4; 155db4efbfdSBarry Smith C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 15683287d42SBarry Smith C->assembled = PETSC_TRUE; 157efee365bSSatish Balay ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 15883287d42SBarry Smith PetscFunctionReturn(0); 15983287d42SBarry Smith } 160*e6580ceeSShri Abhyankar 161*e6580ceeSShri Abhyankar #undef __FUNCT__ 162*e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering" 163*e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info) 164*e6580ceeSShri Abhyankar { 165*e6580ceeSShri Abhyankar /* 166*e6580ceeSShri Abhyankar Default Version for when blocks are 4 by 4 Using natural ordering 167*e6580ceeSShri Abhyankar */ 168*e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 169*e6580ceeSShri Abhyankar PetscErrorCode ierr; 170*e6580ceeSShri Abhyankar PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 171*e6580ceeSShri Abhyankar PetscInt *ajtmpold,*ajtmp,nz,row; 172*e6580ceeSShri Abhyankar PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 173*e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 174*e6580ceeSShri Abhyankar MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 175*e6580ceeSShri Abhyankar MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 176*e6580ceeSShri Abhyankar MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 177*e6580ceeSShri Abhyankar MatScalar m13,m14,m15,m16; 178*e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 179*e6580ceeSShri Abhyankar PetscTruth pivotinblocks = b->pivotinblocks; 180*e6580ceeSShri Abhyankar PetscReal shift = info->shiftinblocks; 181*e6580ceeSShri Abhyankar 182*e6580ceeSShri Abhyankar PetscFunctionBegin; 
183*e6580ceeSShri Abhyankar ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 184*e6580ceeSShri Abhyankar 185*e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 186*e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 187*e6580ceeSShri Abhyankar ajtmp = bj + bi[i]; 188*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 189*e6580ceeSShri Abhyankar x = rtmp+16*ajtmp[j]; 190*e6580ceeSShri Abhyankar x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 191*e6580ceeSShri Abhyankar x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 192*e6580ceeSShri Abhyankar } 193*e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 194*e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 195*e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 196*e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 197*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 198*e6580ceeSShri Abhyankar x = rtmp+16*ajtmpold[j]; 199*e6580ceeSShri Abhyankar x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 200*e6580ceeSShri Abhyankar x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 201*e6580ceeSShri Abhyankar x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 202*e6580ceeSShri Abhyankar x[14] = v[14]; x[15] = v[15]; 203*e6580ceeSShri Abhyankar v += 16; 204*e6580ceeSShri Abhyankar } 205*e6580ceeSShri Abhyankar row = *ajtmp++; 206*e6580ceeSShri Abhyankar while (row < i) { 207*e6580ceeSShri Abhyankar pc = rtmp + 16*row; 208*e6580ceeSShri Abhyankar p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 209*e6580ceeSShri Abhyankar p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 210*e6580ceeSShri Abhyankar p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 211*e6580ceeSShri Abhyankar p15 = pc[14]; p16 = pc[15]; 212*e6580ceeSShri Abhyankar if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 213*e6580ceeSShri Abhyankar p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 214*e6580ceeSShri Abhyankar p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 215*e6580ceeSShri Abhyankar || p16 != 0.0) { 216*e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 217*e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 218*e6580ceeSShri Abhyankar x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 219*e6580ceeSShri Abhyankar x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 220*e6580ceeSShri Abhyankar x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 221*e6580ceeSShri Abhyankar x15 = pv[14]; x16 = pv[15]; 222*e6580ceeSShri Abhyankar pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 223*e6580ceeSShri Abhyankar pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 224*e6580ceeSShri Abhyankar pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 225*e6580ceeSShri Abhyankar pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 226*e6580ceeSShri Abhyankar 227*e6580ceeSShri Abhyankar pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 228*e6580ceeSShri Abhyankar pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 229*e6580ceeSShri Abhyankar pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 230*e6580ceeSShri Abhyankar pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 231*e6580ceeSShri Abhyankar 232*e6580ceeSShri Abhyankar pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 233*e6580ceeSShri Abhyankar pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 234*e6580ceeSShri Abhyankar pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 235*e6580ceeSShri Abhyankar pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 236*e6580ceeSShri Abhyankar 237*e6580ceeSShri Abhyankar pc[12] = m13 
= p1*x13 + p5*x14 + p9*x15 + p13*x16; 238*e6580ceeSShri Abhyankar pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 239*e6580ceeSShri Abhyankar pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 240*e6580ceeSShri Abhyankar pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 241*e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 242*e6580ceeSShri Abhyankar pv += 16; 243*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 244*e6580ceeSShri Abhyankar x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 245*e6580ceeSShri Abhyankar x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 246*e6580ceeSShri Abhyankar x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 247*e6580ceeSShri Abhyankar x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 248*e6580ceeSShri Abhyankar x = rtmp + 16*pj[j]; 249*e6580ceeSShri Abhyankar x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 250*e6580ceeSShri Abhyankar x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 251*e6580ceeSShri Abhyankar x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 252*e6580ceeSShri Abhyankar x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 253*e6580ceeSShri Abhyankar 254*e6580ceeSShri Abhyankar x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 255*e6580ceeSShri Abhyankar x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 256*e6580ceeSShri Abhyankar x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 257*e6580ceeSShri Abhyankar x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 258*e6580ceeSShri Abhyankar 259*e6580ceeSShri Abhyankar x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 260*e6580ceeSShri Abhyankar x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 261*e6580ceeSShri Abhyankar x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 262*e6580ceeSShri Abhyankar x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 263*e6580ceeSShri Abhyankar 264*e6580ceeSShri Abhyankar x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 265*e6580ceeSShri Abhyankar x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 266*e6580ceeSShri Abhyankar x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 267*e6580ceeSShri Abhyankar x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 268*e6580ceeSShri Abhyankar 269*e6580ceeSShri Abhyankar pv += 16; 270*e6580ceeSShri Abhyankar } 271*e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 272*e6580ceeSShri Abhyankar } 273*e6580ceeSShri Abhyankar row = *ajtmp++; 274*e6580ceeSShri Abhyankar } 275*e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 276*e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 277*e6580ceeSShri Abhyankar pj = bj + bi[i]; 278*e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 279*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 280*e6580ceeSShri Abhyankar x = rtmp+16*pj[j]; 281*e6580ceeSShri Abhyankar pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 282*e6580ceeSShri Abhyankar pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 283*e6580ceeSShri Abhyankar pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 284*e6580ceeSShri Abhyankar pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 285*e6580ceeSShri Abhyankar pv += 16; 286*e6580ceeSShri Abhyankar } 287*e6580ceeSShri Abhyankar /* invert diagonal block */ 288*e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 289*e6580ceeSShri Abhyankar if (pivotinblocks) { 290*e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 291*e6580ceeSShri Abhyankar } else { 292*e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 293*e6580ceeSShri Abhyankar } 294*e6580ceeSShri Abhyankar } 295*e6580ceeSShri Abhyankar 296*e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 
297*e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 298*e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 299*e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 300*e6580ceeSShri Abhyankar ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 301*e6580ceeSShri Abhyankar PetscFunctionReturn(0); 302*e6580ceeSShri Abhyankar } 303*e6580ceeSShri Abhyankar 304*e6580ceeSShri Abhyankar 305*e6580ceeSShri Abhyankar #if defined(PETSC_HAVE_SSE) 306*e6580ceeSShri Abhyankar 307*e6580ceeSShri Abhyankar #include PETSC_HAVE_SSE 308*e6580ceeSShri Abhyankar 309*e6580ceeSShri Abhyankar /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 310*e6580ceeSShri Abhyankar #undef __FUNCT__ 311*e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE" 312*e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info) 313*e6580ceeSShri Abhyankar { 314*e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 315*e6580ceeSShri Abhyankar PetscErrorCode ierr; 316*e6580ceeSShri Abhyankar int i,j,n = a->mbs; 317*e6580ceeSShri Abhyankar int *bj = b->j,*bjtmp,*pj; 318*e6580ceeSShri Abhyankar int row; 319*e6580ceeSShri Abhyankar int *ajtmpold,nz,*bi=b->i; 320*e6580ceeSShri Abhyankar int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 321*e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 322*e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 323*e6580ceeSShri Abhyankar int nonzero=0; 324*e6580ceeSShri Abhyankar /* int nonzero=0,colscale = 16; */ 325*e6580ceeSShri Abhyankar PetscTruth pivotinblocks = b->pivotinblocks; 326*e6580ceeSShri Abhyankar PetscReal shift = info->shiftinblocks; 327*e6580ceeSShri Abhyankar 328*e6580ceeSShri Abhyankar PetscFunctionBegin; 329*e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 330*e6580ceeSShri Abhyankar 331*e6580ceeSShri Abhyankar if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 332*e6580ceeSShri Abhyankar if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 333*e6580ceeSShri Abhyankar ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 334*e6580ceeSShri Abhyankar if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 335*e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 336*e6580ceeSShri Abhyankar /* colscale = 4; */ 337*e6580ceeSShri Abhyankar /* } */ 338*e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 339*e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 340*e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 341*e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 342*e6580ceeSShri Abhyankar /* zero out one register */ 343*e6580ceeSShri Abhyankar XOR_PS(XMM7,XMM7); 344*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 345*e6580ceeSShri Abhyankar x = rtmp+16*bjtmp[j]; 346*e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 347*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 348*e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 349*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 350*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 351*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 352*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 353*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 354*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 355*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 356*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 357*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 358*e6580ceeSShri Abhyankar SSE_INLINE_END_1; 359*e6580ceeSShri Abhyankar } 360*e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 361*e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 362*e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 363*e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 364*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 365*e6580ceeSShri Abhyankar x = rtmp+16*ajtmpold[j]; 366*e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmpold[j]; */ 367*e6580ceeSShri Abhyankar /* Copy v block into x block */ 368*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v,x) 369*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 370*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 371*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 372*e6580ceeSShri Abhyankar 373*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 374*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 375*e6580ceeSShri Abhyankar 376*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 377*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 378*e6580ceeSShri Abhyankar 379*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 380*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 381*e6580ceeSShri Abhyankar 382*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 383*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 384*e6580ceeSShri Abhyankar 385*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 386*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 387*e6580ceeSShri Abhyankar 388*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 389*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 390*e6580ceeSShri Abhyankar 391*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 392*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 393*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 394*e6580ceeSShri Abhyankar 395*e6580ceeSShri Abhyankar v += 16; 396*e6580ceeSShri Abhyankar } 397*e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; 
*/ 398*e6580ceeSShri Abhyankar row = *bjtmp++; 399*e6580ceeSShri Abhyankar while (row < i) { 400*e6580ceeSShri Abhyankar pc = rtmp + 16*row; 401*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 402*e6580ceeSShri Abhyankar /* Load block from lower triangle */ 403*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 404*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 405*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 406*e6580ceeSShri Abhyankar 407*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 408*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 409*e6580ceeSShri Abhyankar 410*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 411*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 412*e6580ceeSShri Abhyankar 413*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 414*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 415*e6580ceeSShri Abhyankar 416*e6580ceeSShri Abhyankar /* Compare block to zero block */ 417*e6580ceeSShri Abhyankar 418*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4,XMM7) 419*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4,XMM0) 420*e6580ceeSShri Abhyankar 421*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5,XMM7) 422*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5,XMM1) 423*e6580ceeSShri Abhyankar 424*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6,XMM7) 425*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6,XMM2) 426*e6580ceeSShri Abhyankar 427*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7,XMM3) 428*e6580ceeSShri Abhyankar 429*e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 430*e6580ceeSShri Abhyankar SSE_OR_PS(XMM6,XMM7) 431*e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM4) 432*e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM6) 433*e6580ceeSShri Abhyankar SSE_INLINE_END_1; 434*e6580ceeSShri Abhyankar 435*e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 436*e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 437*e6580ceeSShri Abhyankar MOVEMASK(nonzero,XMM5); 438*e6580ceeSShri Abhyankar 439*e6580ceeSShri Abhyankar /* If block is nonzero ... 
*/ 440*e6580ceeSShri Abhyankar if (nonzero) { 441*e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 442*e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 443*e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 444*e6580ceeSShri Abhyankar 445*e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 446*e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 447*e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 448*e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 449*e6580ceeSShri Abhyankar 450*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv,pc) 451*e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 452*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 453*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 454*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM0) 455*e6580ceeSShri Abhyankar 456*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 457*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 458*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM1) 459*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM5) 460*e6580ceeSShri Abhyankar 461*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 462*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 463*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM2) 464*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM6) 465*e6580ceeSShri Abhyankar 466*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 467*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 468*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 469*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM7) 470*e6580ceeSShri Abhyankar 471*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 472*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 473*e6580ceeSShri Abhyankar 474*e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 475*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 476*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 477*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 478*e6580ceeSShri Abhyankar 479*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 480*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 481*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 482*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 483*e6580ceeSShri Abhyankar 484*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 485*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 486*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 487*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM7) 488*e6580ceeSShri Abhyankar 489*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 490*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 491*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 492*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 493*e6580ceeSShri Abhyankar 494*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 495*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 496*e6580ceeSShri Abhyankar 497*e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 498*e6580ceeSShri Abhyankar 499*e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 500*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 501*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 502*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 503*e6580ceeSShri Abhyankar 504*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 505*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 506*e6580ceeSShri Abhyankar 
SSE_MULT_PS(XMM7,XMM1) 507*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 508*e6580ceeSShri Abhyankar 509*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 510*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 511*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 512*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 513*e6580ceeSShri Abhyankar 514*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 515*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 516*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 517*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 518*e6580ceeSShri Abhyankar 519*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 520*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 521*e6580ceeSShri Abhyankar 522*e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 523*e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 524*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 525*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 526*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0,XMM7) 527*e6580ceeSShri Abhyankar 528*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 529*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 530*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM7) 531*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 532*e6580ceeSShri Abhyankar 533*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 534*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1,XMM1,0x00) 535*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM2) 536*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 537*e6580ceeSShri Abhyankar 538*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 539*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 540*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3,XMM7) 541*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM3) 542*e6580ceeSShri Abhyankar 543*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 544*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 545*e6580ceeSShri Abhyankar 546*e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 547*e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans afterall. 
*/ 548*e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 549*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3,XMM0) 550*e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 551*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2,XMM6) 552*e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 553*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1,XMM5) 554*e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 555*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0,XMM4) 556*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 557*e6580ceeSShri Abhyankar 558*e6580ceeSShri Abhyankar /* Update the row: */ 559*e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 560*e6580ceeSShri Abhyankar pv += 16; 561*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 562*e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 563*e6580ceeSShri Abhyankar x = rtmp + 16*pj[j]; 564*e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 565*e6580ceeSShri Abhyankar 566*e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 567*e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 568*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 569*e6580ceeSShri Abhyankar /* Load First Column of X*/ 570*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 571*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 572*e6580ceeSShri Abhyankar 573*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 574*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 575*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 576*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 577*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 578*e6580ceeSShri Abhyankar 579*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 580*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 581*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 582*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 583*e6580ceeSShri Abhyankar 584*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 585*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 586*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 587*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 588*e6580ceeSShri Abhyankar 589*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 590*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 591*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 592*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 593*e6580ceeSShri Abhyankar 594*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 595*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 596*e6580ceeSShri Abhyankar 597*e6580ceeSShri Abhyankar /* Second Column */ 598*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 599*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 600*e6580ceeSShri Abhyankar 601*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 602*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 603*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 604*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 605*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 606*e6580ceeSShri Abhyankar 607*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 608*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 609*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 610*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM7) 611*e6580ceeSShri Abhyankar 612*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 613*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 614*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM2) 615*e6580ceeSShri Abhyankar 
SSE_SUB_PS(XMM5,XMM4) 616*e6580ceeSShri Abhyankar 617*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 618*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 619*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 620*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 621*e6580ceeSShri Abhyankar 622*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 623*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 624*e6580ceeSShri Abhyankar 625*e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 626*e6580ceeSShri Abhyankar 627*e6580ceeSShri Abhyankar /* Third Column */ 628*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 629*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 630*e6580ceeSShri Abhyankar 631*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 632*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 633*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 634*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM0) 635*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 636*e6580ceeSShri Abhyankar 637*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 638*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 639*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM1) 640*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM4) 641*e6580ceeSShri Abhyankar 642*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 643*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 644*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM2) 645*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM5) 646*e6580ceeSShri Abhyankar 647*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 648*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 649*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 650*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 651*e6580ceeSShri Abhyankar 652*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 653*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 654*e6580ceeSShri Abhyankar 655*e6580ceeSShri Abhyankar /* Fourth Column */ 656*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 657*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 658*e6580ceeSShri Abhyankar 659*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 660*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 661*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 662*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 663*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 664*e6580ceeSShri Abhyankar 665*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 666*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 667*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 668*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 669*e6580ceeSShri Abhyankar 670*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 671*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 672*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 673*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 674*e6580ceeSShri Abhyankar 675*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 676*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 677*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 678*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 679*e6580ceeSShri Abhyankar 680*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 681*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 682*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 683*e6580ceeSShri Abhyankar pv += 16; 684*e6580ceeSShri Abhyankar } 685*e6580ceeSShri Abhyankar ierr = 
PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 686*e6580ceeSShri Abhyankar } 687*e6580ceeSShri Abhyankar row = *bjtmp++; 688*e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 689*e6580ceeSShri Abhyankar } 690*e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 691*e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 692*e6580ceeSShri Abhyankar pj = bj + bi[i]; 693*e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 694*e6580ceeSShri Abhyankar 695*e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 696*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 697*e6580ceeSShri Abhyankar x = rtmp+16*pj[j]; 698*e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 699*e6580ceeSShri Abhyankar 700*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 701*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 702*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 703*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 704*e6580ceeSShri Abhyankar 705*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 706*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 707*e6580ceeSShri Abhyankar 708*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 709*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 710*e6580ceeSShri Abhyankar 711*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 712*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 713*e6580ceeSShri Abhyankar 714*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 715*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 716*e6580ceeSShri Abhyankar 717*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 718*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 719*e6580ceeSShri Abhyankar 720*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 721*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 722*e6580ceeSShri Abhyankar 723*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 724*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 725*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 726*e6580ceeSShri Abhyankar pv += 16; 727*e6580ceeSShri Abhyankar } 728*e6580ceeSShri Abhyankar /* invert diagonal block */ 729*e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 730*e6580ceeSShri Abhyankar if (pivotinblocks) { 731*e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 732*e6580ceeSShri Abhyankar } else { 733*e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 734*e6580ceeSShri Abhyankar } 735*e6580ceeSShri Abhyankar /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 736*e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? 
*/ 737*e6580ceeSShri Abhyankar } 738*e6580ceeSShri Abhyankar 739*e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 740*e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 741*e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 742*e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 743*e6580ceeSShri Abhyankar ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); 744*e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 745*e6580ceeSShri Abhyankar SSE_SCOPE_END; 746*e6580ceeSShri Abhyankar PetscFunctionReturn(0); 747*e6580ceeSShri Abhyankar } 748*e6580ceeSShri Abhyankar 749*e6580ceeSShri Abhyankar #undef __FUNCT__ 750*e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace" 751*e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 752*e6580ceeSShri Abhyankar { 753*e6580ceeSShri Abhyankar Mat A=C; 754*e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 755*e6580ceeSShri Abhyankar PetscErrorCode ierr; 756*e6580ceeSShri Abhyankar int i,j,n = a->mbs; 757*e6580ceeSShri Abhyankar unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 758*e6580ceeSShri Abhyankar unsigned short *aj = (unsigned short *)(a->j),*ajtmp; 759*e6580ceeSShri Abhyankar unsigned int row; 760*e6580ceeSShri Abhyankar int nz,*bi=b->i; 761*e6580ceeSShri Abhyankar int *diag_offset = b->diag,*ai=a->i; 762*e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 763*e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 764*e6580ceeSShri Abhyankar int nonzero=0; 765*e6580ceeSShri Abhyankar /* int nonzero=0,colscale = 16; */ 766*e6580ceeSShri Abhyankar PetscTruth pivotinblocks = b->pivotinblocks; 767*e6580ceeSShri Abhyankar PetscReal shift = info->shiftinblocks; 768*e6580ceeSShri Abhyankar 769*e6580ceeSShri Abhyankar PetscFunctionBegin; 770*e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 771*e6580ceeSShri Abhyankar 772*e6580ceeSShri Abhyankar if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 773*e6580ceeSShri Abhyankar if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 774*e6580ceeSShri Abhyankar ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 775*e6580ceeSShri Abhyankar if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 776*e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 777*e6580ceeSShri Abhyankar /* colscale = 4; */ 778*e6580ceeSShri Abhyankar /* } */ 779*e6580ceeSShri Abhyankar 780*e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 781*e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 782*e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 783*e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 784*e6580ceeSShri Abhyankar /* zero out one register */ 785*e6580ceeSShri Abhyankar XOR_PS(XMM7,XMM7); 786*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 787*e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)bjtmp[j]); 788*e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 789*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 790*e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 791*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 792*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 793*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 794*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 795*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 796*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 797*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 798*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 799*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 800*e6580ceeSShri Abhyankar SSE_INLINE_END_1; 801*e6580ceeSShri Abhyankar } 802*e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 803*e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 804*e6580ceeSShri Abhyankar ajtmp = aj + ai[i]; 805*e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 806*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 807*e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)ajtmp[j]); 808*e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmp[j]; */ 809*e6580ceeSShri Abhyankar /* Copy v block into x block */ 810*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v,x) 811*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 812*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 813*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 814*e6580ceeSShri Abhyankar 815*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 816*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 817*e6580ceeSShri Abhyankar 818*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 819*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 820*e6580ceeSShri Abhyankar 821*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 822*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 823*e6580ceeSShri Abhyankar 824*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 825*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 826*e6580ceeSShri Abhyankar 827*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 828*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 829*e6580ceeSShri Abhyankar 830*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 831*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 832*e6580ceeSShri Abhyankar 833*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 834*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 835*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 836*e6580ceeSShri Abhyankar 837*e6580ceeSShri Abhyankar v += 16; 838*e6580ceeSShri Abhyankar } 
839*e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 840*e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 841*e6580ceeSShri Abhyankar while (row < i) { 842*e6580ceeSShri Abhyankar pc = rtmp + 16*row; 843*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 844*e6580ceeSShri Abhyankar /* Load block from lower triangle */ 845*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 846*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 847*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 848*e6580ceeSShri Abhyankar 849*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 850*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 851*e6580ceeSShri Abhyankar 852*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 853*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 854*e6580ceeSShri Abhyankar 855*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 856*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 857*e6580ceeSShri Abhyankar 858*e6580ceeSShri Abhyankar /* Compare block to zero block */ 859*e6580ceeSShri Abhyankar 860*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4,XMM7) 861*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4,XMM0) 862*e6580ceeSShri Abhyankar 863*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5,XMM7) 864*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5,XMM1) 865*e6580ceeSShri Abhyankar 866*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6,XMM7) 867*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6,XMM2) 868*e6580ceeSShri Abhyankar 869*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7,XMM3) 870*e6580ceeSShri Abhyankar 871*e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 872*e6580ceeSShri Abhyankar SSE_OR_PS(XMM6,XMM7) 873*e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM4) 874*e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM6) 875*e6580ceeSShri Abhyankar SSE_INLINE_END_1; 876*e6580ceeSShri Abhyankar 877*e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 878*e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 879*e6580ceeSShri Abhyankar MOVEMASK(nonzero,XMM5); 880*e6580ceeSShri Abhyankar 881*e6580ceeSShri Abhyankar /* If block is nonzero ... 
*/ 882*e6580ceeSShri Abhyankar if (nonzero) { 883*e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 884*e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 885*e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 886*e6580ceeSShri Abhyankar 887*e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 888*e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 889*e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 890*e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 891*e6580ceeSShri Abhyankar 892*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv,pc) 893*e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 894*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 895*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 896*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM0) 897*e6580ceeSShri Abhyankar 898*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 899*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 900*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM1) 901*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM5) 902*e6580ceeSShri Abhyankar 903*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 904*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 905*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM2) 906*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM6) 907*e6580ceeSShri Abhyankar 908*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 909*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 910*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 911*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM7) 912*e6580ceeSShri Abhyankar 913*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 914*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 915*e6580ceeSShri Abhyankar 916*e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 917*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 918*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 919*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 920*e6580ceeSShri Abhyankar 921*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 922*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 923*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 924*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 925*e6580ceeSShri Abhyankar 926*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 927*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 928*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 929*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM7) 930*e6580ceeSShri Abhyankar 931*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 932*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 933*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 934*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 935*e6580ceeSShri Abhyankar 936*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 937*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 938*e6580ceeSShri Abhyankar 939*e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 940*e6580ceeSShri Abhyankar 941*e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 942*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 943*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 944*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 945*e6580ceeSShri Abhyankar 946*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 947*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 948*e6580ceeSShri Abhyankar 
SSE_MULT_PS(XMM7,XMM1) 949*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 950*e6580ceeSShri Abhyankar 951*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 952*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 953*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 954*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 955*e6580ceeSShri Abhyankar 956*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 957*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 958*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 959*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 960*e6580ceeSShri Abhyankar 961*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 962*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 963*e6580ceeSShri Abhyankar 964*e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 965*e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 966*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 967*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 968*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0,XMM7) 969*e6580ceeSShri Abhyankar 970*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 971*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 972*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM7) 973*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 974*e6580ceeSShri Abhyankar 975*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 976*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1,XMM1,0x00) 977*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM2) 978*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 979*e6580ceeSShri Abhyankar 980*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 981*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 982*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3,XMM7) 983*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM3) 984*e6580ceeSShri Abhyankar 985*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 986*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 987*e6580ceeSShri Abhyankar 988*e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 989*e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans afterall. 
*/ 990*e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 991*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3,XMM0) 992*e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 993*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2,XMM6) 994*e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 995*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1,XMM5) 996*e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 997*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0,XMM4) 998*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 999*e6580ceeSShri Abhyankar 1000*e6580ceeSShri Abhyankar /* Update the row: */ 1001*e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 1002*e6580ceeSShri Abhyankar pv += 16; 1003*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1004*e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1005*e6580ceeSShri Abhyankar x = rtmp + 16*((unsigned int)pj[j]); 1006*e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 1007*e6580ceeSShri Abhyankar 1008*e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 1009*e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1010*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1011*e6580ceeSShri Abhyankar /* Load First Column of X*/ 1012*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1013*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1014*e6580ceeSShri Abhyankar 1015*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1016*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1017*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1018*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1019*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1020*e6580ceeSShri Abhyankar 1021*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1022*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1023*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1024*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1025*e6580ceeSShri Abhyankar 1026*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1027*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1028*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1029*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1030*e6580ceeSShri Abhyankar 1031*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1032*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1033*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1034*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1035*e6580ceeSShri Abhyankar 1036*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1037*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1038*e6580ceeSShri Abhyankar 1039*e6580ceeSShri Abhyankar /* Second Column */ 1040*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1041*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1042*e6580ceeSShri Abhyankar 1043*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1044*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1045*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1046*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1047*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1048*e6580ceeSShri Abhyankar 1049*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1050*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1051*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1052*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM7) 1053*e6580ceeSShri Abhyankar 1054*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1055*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1056*e6580ceeSShri 
Abhyankar SSE_MULT_PS(XMM4,XMM2) 1057*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM4) 1058*e6580ceeSShri Abhyankar 1059*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1060*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1061*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1062*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1063*e6580ceeSShri Abhyankar 1064*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1065*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1066*e6580ceeSShri Abhyankar 1067*e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1068*e6580ceeSShri Abhyankar 1069*e6580ceeSShri Abhyankar /* Third Column */ 1070*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1071*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1072*e6580ceeSShri Abhyankar 1073*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1074*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1075*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1076*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM0) 1077*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1078*e6580ceeSShri Abhyankar 1079*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1080*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1081*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM1) 1082*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM4) 1083*e6580ceeSShri Abhyankar 1084*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1085*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1086*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM2) 1087*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM5) 1088*e6580ceeSShri Abhyankar 1089*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1090*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1091*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1092*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1093*e6580ceeSShri Abhyankar 1094*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1095*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1096*e6580ceeSShri Abhyankar 1097*e6580ceeSShri Abhyankar /* Fourth Column */ 1098*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1099*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1100*e6580ceeSShri Abhyankar 1101*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1102*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1103*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1104*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1105*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1106*e6580ceeSShri Abhyankar 1107*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1108*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1109*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1110*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1111*e6580ceeSShri Abhyankar 1112*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1113*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1114*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1115*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1116*e6580ceeSShri Abhyankar 1117*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1118*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1119*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1120*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1121*e6580ceeSShri Abhyankar 1122*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1123*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1124*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 
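/* Readability aid (added commentary, not compiled): a minimal scalar sketch of
   what the inline block above computes for one 4x4 block of the row being
   factored, assuming the column-major block layout used throughout this file
   (pv[4*c+r] is entry (r,c)) and with m[] standing for the multiplier block
   held columnwise in XMM0-XMM3:

       for (c=0; c<4; c++) {
         for (r=0; r<4; r++) {
           x[4*c+r] -= m[r]*pv[4*c]     + m[4+r]*pv[4*c+1]
                     + m[8+r]*pv[4*c+2] + m[12+r]*pv[4*c+3];
         }
       }

   i.e. X := X - M*PV, as the comment before SSE_INLINE_BEGIN_2 states. */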
1125*e6580ceeSShri Abhyankar pv += 16; 1126*e6580ceeSShri Abhyankar } 1127*e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1128*e6580ceeSShri Abhyankar } 1129*e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1130*e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1131*e6580ceeSShri Abhyankar /* bjtmp++; */ 1132*e6580ceeSShri Abhyankar } 1133*e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1134*e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 1135*e6580ceeSShri Abhyankar pj = bj + bi[i]; 1136*e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1137*e6580ceeSShri Abhyankar 1138*e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1139*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1140*e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)pj[j]); 1141*e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1142*e6580ceeSShri Abhyankar 1143*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1144*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1145*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1146*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1147*e6580ceeSShri Abhyankar 1148*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1149*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1150*e6580ceeSShri Abhyankar 1151*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1152*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1153*e6580ceeSShri Abhyankar 1154*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1155*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1156*e6580ceeSShri Abhyankar 1157*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1158*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1159*e6580ceeSShri Abhyankar 1160*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1161*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1162*e6580ceeSShri Abhyankar 1163*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1164*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1165*e6580ceeSShri Abhyankar 1166*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1167*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1168*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1169*e6580ceeSShri Abhyankar pv += 16; 1170*e6580ceeSShri Abhyankar } 1171*e6580ceeSShri Abhyankar /* invert diagonal block */ 1172*e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 1173*e6580ceeSShri Abhyankar if (pivotinblocks) { 1174*e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1175*e6580ceeSShri Abhyankar } else { 1176*e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1177*e6580ceeSShri Abhyankar } 1178*e6580ceeSShri Abhyankar /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1179*e6580ceeSShri Abhyankar /* Note: Using Cramer's rule, flop count below might be unfairly high or low?
*/ 1180*e6580ceeSShri Abhyankar } 1181*e6580ceeSShri Abhyankar 1182*e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 1183*e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1184*e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1185*e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 1186*e6580ceeSShri Abhyankar ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); 1187*e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1188*e6580ceeSShri Abhyankar SSE_SCOPE_END; 1189*e6580ceeSShri Abhyankar PetscFunctionReturn(0); 1190*e6580ceeSShri Abhyankar } 1191*e6580ceeSShri Abhyankar 1192*e6580ceeSShri Abhyankar #undef __FUNCT__ 1193*e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" 1194*e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1195*e6580ceeSShri Abhyankar { 1196*e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1197*e6580ceeSShri Abhyankar PetscErrorCode ierr; 1198*e6580ceeSShri Abhyankar int i,j,n = a->mbs; 1199*e6580ceeSShri Abhyankar unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 1200*e6580ceeSShri Abhyankar unsigned int row; 1201*e6580ceeSShri Abhyankar int *ajtmpold,nz,*bi=b->i; 1202*e6580ceeSShri Abhyankar int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1203*e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1204*e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 1205*e6580ceeSShri Abhyankar int nonzero=0; 1206*e6580ceeSShri Abhyankar /* int nonzero=0,colscale = 16; */ 1207*e6580ceeSShri Abhyankar PetscTruth pivotinblocks = b->pivotinblocks; 1208*e6580ceeSShri Abhyankar PetscReal shift = info->shiftinblocks; 1209*e6580ceeSShri Abhyankar 1210*e6580ceeSShri Abhyankar PetscFunctionBegin; 1211*e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 1212*e6580ceeSShri Abhyankar 1213*e6580ceeSShri Abhyankar if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1214*e6580ceeSShri Abhyankar if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1215*e6580ceeSShri Abhyankar ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1216*e6580ceeSShri Abhyankar if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. 
SSE will not work."); 1217*e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 1218*e6580ceeSShri Abhyankar /* colscale = 4; */ 1219*e6580ceeSShri Abhyankar /* } */ 1220*e6580ceeSShri Abhyankar if ((unsigned long)bj==(unsigned long)aj) { 1221*e6580ceeSShri Abhyankar return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1222*e6580ceeSShri Abhyankar } 1223*e6580ceeSShri Abhyankar 1224*e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 1225*e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1226*e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 1227*e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 1228*e6580ceeSShri Abhyankar /* zero out one register */ 1229*e6580ceeSShri Abhyankar XOR_PS(XMM7,XMM7); 1230*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1231*e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)bjtmp[j]); 1232*e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 1233*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 1234*e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 1235*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1236*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1237*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1238*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1239*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1240*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1241*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1242*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1243*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1244*e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1245*e6580ceeSShri Abhyankar } 1246*e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 1247*e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 1248*e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 1249*e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 1250*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1251*e6580ceeSShri Abhyankar x = rtmp+16*ajtmpold[j]; 1252*e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmpold[j]; */ 1253*e6580ceeSShri Abhyankar /* Copy v block into x block */ 1254*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v,x) 1255*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1256*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1257*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1258*e6580ceeSShri Abhyankar 1259*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1260*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1261*e6580ceeSShri Abhyankar 1262*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1263*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1264*e6580ceeSShri Abhyankar 1265*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1266*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1267*e6580ceeSShri Abhyankar 1268*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1269*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1270*e6580ceeSShri Abhyankar 1271*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1272*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1273*e6580ceeSShri Abhyankar 1274*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1275*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1276*e6580ceeSShri Abhyankar 
1277*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1278*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1279*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1280*e6580ceeSShri Abhyankar 1281*e6580ceeSShri Abhyankar v += 16; 1282*e6580ceeSShri Abhyankar } 1283*e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1284*e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1285*e6580ceeSShri Abhyankar while (row < i) { 1286*e6580ceeSShri Abhyankar pc = rtmp + 16*row; 1287*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 1288*e6580ceeSShri Abhyankar /* Load block from lower triangle */ 1289*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1290*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1291*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1292*e6580ceeSShri Abhyankar 1293*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1294*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1295*e6580ceeSShri Abhyankar 1296*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1297*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1298*e6580ceeSShri Abhyankar 1299*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1300*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1301*e6580ceeSShri Abhyankar 1302*e6580ceeSShri Abhyankar /* Compare block to zero block */ 1303*e6580ceeSShri Abhyankar 1304*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4,XMM7) 1305*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4,XMM0) 1306*e6580ceeSShri Abhyankar 1307*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5,XMM7) 1308*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5,XMM1) 1309*e6580ceeSShri Abhyankar 1310*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6,XMM7) 1311*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6,XMM2) 1312*e6580ceeSShri Abhyankar 1313*e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7,XMM3) 1314*e6580ceeSShri Abhyankar 1315*e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 1316*e6580ceeSShri Abhyankar SSE_OR_PS(XMM6,XMM7) 1317*e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM4) 1318*e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM6) 1319*e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1320*e6580ceeSShri Abhyankar 1321*e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 1322*e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1323*e6580ceeSShri Abhyankar MOVEMASK(nonzero,XMM5); 1324*e6580ceeSShri Abhyankar 1325*e6580ceeSShri Abhyankar /* If block is nonzero ... 
*/ 1326*e6580ceeSShri Abhyankar if (nonzero) { 1327*e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 1328*e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1329*e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 1330*e6580ceeSShri Abhyankar 1331*e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1332*e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1333*e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 1334*e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1335*e6580ceeSShri Abhyankar 1336*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv,pc) 1337*e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 1338*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1339*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1340*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM0) 1341*e6580ceeSShri Abhyankar 1342*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1343*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1344*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM1) 1345*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM5) 1346*e6580ceeSShri Abhyankar 1347*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1348*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1349*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM2) 1350*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM6) 1351*e6580ceeSShri Abhyankar 1352*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1353*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1354*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1355*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM7) 1356*e6580ceeSShri Abhyankar 1357*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1358*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1359*e6580ceeSShri Abhyankar 1360*e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 1361*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1362*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1363*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1364*e6580ceeSShri Abhyankar 1365*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1366*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1367*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1368*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 1369*e6580ceeSShri Abhyankar 1370*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1371*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1372*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1373*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM7) 1374*e6580ceeSShri Abhyankar 1375*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1376*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1377*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1378*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 1379*e6580ceeSShri Abhyankar 1380*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1381*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1382*e6580ceeSShri Abhyankar 1383*e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1384*e6580ceeSShri Abhyankar 1385*e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 1386*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1387*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1388*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1389*e6580ceeSShri Abhyankar 1390*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1391*e6580ceeSShri Abhyankar 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1392*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1393*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1394*e6580ceeSShri Abhyankar 1395*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1396*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1397*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1398*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1399*e6580ceeSShri Abhyankar 1400*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1401*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1402*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1403*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1404*e6580ceeSShri Abhyankar 1405*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1406*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1407*e6580ceeSShri Abhyankar 1408*e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1409*e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 1410*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1411*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1412*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0,XMM7) 1413*e6580ceeSShri Abhyankar 1414*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1415*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1416*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM7) 1417*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 1418*e6580ceeSShri Abhyankar 1419*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1420*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1,XMM1,0x00) 1421*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM2) 1422*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 1423*e6580ceeSShri Abhyankar 1424*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1425*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1426*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3,XMM7) 1427*e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM3) 1428*e6580ceeSShri Abhyankar 1429*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1430*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1431*e6580ceeSShri Abhyankar 1432*e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1433*e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans after all.
*/ 1434*e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 1435*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3,XMM0) 1436*e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 1437*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2,XMM6) 1438*e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 1439*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1,XMM5) 1440*e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 1441*e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0,XMM4) 1442*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1443*e6580ceeSShri Abhyankar 1444*e6580ceeSShri Abhyankar /* Update the row: */ 1445*e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 1446*e6580ceeSShri Abhyankar pv += 16; 1447*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1448*e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1449*e6580ceeSShri Abhyankar x = rtmp + 16*((unsigned int)pj[j]); 1450*e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 1451*e6580ceeSShri Abhyankar 1452*e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 1453*e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1454*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1455*e6580ceeSShri Abhyankar /* Load First Column of X*/ 1456*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1457*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1458*e6580ceeSShri Abhyankar 1459*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1460*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1461*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1462*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1463*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1464*e6580ceeSShri Abhyankar 1465*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1466*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1467*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1468*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1469*e6580ceeSShri Abhyankar 1470*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1471*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1472*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1473*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1474*e6580ceeSShri Abhyankar 1475*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1476*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1477*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1478*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1479*e6580ceeSShri Abhyankar 1480*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1481*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1482*e6580ceeSShri Abhyankar 1483*e6580ceeSShri Abhyankar /* Second Column */ 1484*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1485*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1486*e6580ceeSShri Abhyankar 1487*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1488*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1489*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1490*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1491*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1492*e6580ceeSShri Abhyankar 1493*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1494*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1495*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1496*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM7) 1497*e6580ceeSShri Abhyankar 1498*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1499*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 
1500*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM2) 1501*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM4) 1502*e6580ceeSShri Abhyankar 1503*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1504*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1505*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1506*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1507*e6580ceeSShri Abhyankar 1508*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1509*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1510*e6580ceeSShri Abhyankar 1511*e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1512*e6580ceeSShri Abhyankar 1513*e6580ceeSShri Abhyankar /* Third Column */ 1514*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1515*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1516*e6580ceeSShri Abhyankar 1517*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1518*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1519*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1520*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM0) 1521*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1522*e6580ceeSShri Abhyankar 1523*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1524*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1525*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM1) 1526*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM4) 1527*e6580ceeSShri Abhyankar 1528*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1529*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1530*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM2) 1531*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM5) 1532*e6580ceeSShri Abhyankar 1533*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1534*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1535*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1536*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1537*e6580ceeSShri Abhyankar 1538*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1539*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1540*e6580ceeSShri Abhyankar 1541*e6580ceeSShri Abhyankar /* Fourth Column */ 1542*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1543*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1544*e6580ceeSShri Abhyankar 1545*e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1546*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1547*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1548*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1549*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1550*e6580ceeSShri Abhyankar 1551*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1552*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1553*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1554*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1555*e6580ceeSShri Abhyankar 1556*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1557*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1558*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1559*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1560*e6580ceeSShri Abhyankar 1561*e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1562*e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1563*e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1564*e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1565*e6580ceeSShri Abhyankar 1566*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1567*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1568*e6580ceeSShri Abhyankar 
SSE_INLINE_END_2; 1569*e6580ceeSShri Abhyankar pv += 16; 1570*e6580ceeSShri Abhyankar } 1571*e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1572*e6580ceeSShri Abhyankar } 1573*e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1574*e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1575*e6580ceeSShri Abhyankar /* bjtmp++; */ 1576*e6580ceeSShri Abhyankar } 1577*e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1578*e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 1579*e6580ceeSShri Abhyankar pj = bj + bi[i]; 1580*e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1581*e6580ceeSShri Abhyankar 1582*e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1583*e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1584*e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)pj[j]); 1585*e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1586*e6580ceeSShri Abhyankar 1587*e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1588*e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1589*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1590*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1591*e6580ceeSShri Abhyankar 1592*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1593*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1594*e6580ceeSShri Abhyankar 1595*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1596*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1597*e6580ceeSShri Abhyankar 1598*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1599*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1600*e6580ceeSShri Abhyankar 1601*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1602*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1603*e6580ceeSShri Abhyankar 1604*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1605*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1606*e6580ceeSShri Abhyankar 1607*e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1608*e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1609*e6580ceeSShri Abhyankar 1610*e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1611*e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1612*e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1613*e6580ceeSShri Abhyankar pv += 16; 1614*e6580ceeSShri Abhyankar } 1615*e6580ceeSShri Abhyankar /* invert diagonal block */ 1616*e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 1617*e6580ceeSShri Abhyankar if (pivotinblocks) { 1618*e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1619*e6580ceeSShri Abhyankar } else { 1620*e6580ceeSShri Abhyankar ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1621*e6580ceeSShri Abhyankar } 1622*e6580ceeSShri Abhyankar /* ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1623*e6580ceeSShri Abhyankar /* Note: Using Cramer's rule, flop count below might be unfairly high or low?
*/ 1624*e6580ceeSShri Abhyankar } 1625*e6580ceeSShri Abhyankar 1626*e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 1627*e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1628*e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1629*e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 1630*e6580ceeSShri Abhyankar ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); 1631*e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1632*e6580ceeSShri Abhyankar SSE_SCOPE_END; 1633*e6580ceeSShri Abhyankar PetscFunctionReturn(0); 1634*e6580ceeSShri Abhyankar } 1635*e6580ceeSShri Abhyankar 1636*e6580ceeSShri Abhyankar #endif 1637
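/* Added for exposition only: a plain-C sketch of the two per-block kernels that
   the SSE inline blocks above implement.  It is kept inside #if 0 so it is never
   compiled.  Blocks are 4x4 and stored column-major in 16 consecutive MatScalars,
   as elsewhere in the SeqBAIJ code; the helper names below are illustrative and
   are not part of the PETSc API. */
#if 0
/* Multiplier formation: pc := pc * pv_diag, where pv_diag already holds the
   inverted diagonal block, i.e. L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)). */
static void Sketch_FormMultiplier_4(MatScalar *pc,const MatScalar *pv_diag)
{
  MatScalar m[16];
  int       r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      m[4*c+r] = 0.0;
      for (k=0; k<4; k++) m[4*c+r] += pc[4*k+r]*pv_diag[4*c+k];
    }
  }
  for (k=0; k<16; k++) pc[k] = m[k];
}

/* Row update for one block to the right of the pivot: x := x - m*pv,
   where m is the multiplier block formed above. */
static void Sketch_BlockUpdate_4(MatScalar *x,const MatScalar *m,const MatScalar *pv)
{
  int r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) x[4*c+r] -= m[4*k+r]*pv[4*c+k];
    }
  }
}
#endif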