1be1d678aSKris Buschelman 283287d42SBarry Smith /* 383287d42SBarry Smith Factorization code for BAIJ format. 483287d42SBarry Smith */ 5c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 6af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 783287d42SBarry Smith 883287d42SBarry Smith /* ------------------------------------------------------------*/ 983287d42SBarry Smith /* 1083287d42SBarry Smith Version for when blocks are 4 by 4 1183287d42SBarry Smith */ 1206e38f1dSHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info) 1383287d42SBarry Smith { 1483287d42SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1583287d42SBarry Smith IS isrow = b->row,isicol = b->icol; 166849ba73SBarry Smith PetscErrorCode ierr; 175d0c19d7SBarry Smith const PetscInt *r,*ic; 185d0c19d7SBarry Smith PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 19690b6cddSBarry Smith PetscInt *ajtmpold,*ajtmp,nz,row; 20690b6cddSBarry Smith PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 2183287d42SBarry Smith MatScalar *pv,*v,*rtmp,*pc,*w,*x; 2283287d42SBarry Smith MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 2383287d42SBarry Smith MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 2483287d42SBarry Smith MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 2583287d42SBarry Smith MatScalar m13,m14,m15,m16; 2683287d42SBarry Smith MatScalar *ba = b->a,*aa = a->a; 27a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 28182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 290164db54SHong Zhang PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 3083287d42SBarry Smith 3183287d42SBarry Smith PetscFunctionBegin; 3283287d42SBarry Smith ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3383287d42SBarry Smith ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 34785e854fSJed Brown ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 350164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 3683287d42SBarry Smith 3783287d42SBarry Smith for (i=0; i<n; i++) { 3883287d42SBarry Smith nz = bi[i+1] - bi[i]; 3983287d42SBarry Smith ajtmp = bj + bi[i]; 4083287d42SBarry Smith for (j=0; j<nz; j++) { 4183287d42SBarry Smith x = rtmp+16*ajtmp[j]; 4283287d42SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 4383287d42SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 4483287d42SBarry Smith } 4583287d42SBarry Smith /* load in initial (unfactored row) */ 4683287d42SBarry Smith idx = r[i]; 4783287d42SBarry Smith nz = ai[idx+1] - ai[idx]; 4883287d42SBarry Smith ajtmpold = aj + ai[idx]; 4983287d42SBarry Smith v = aa + 16*ai[idx]; 5083287d42SBarry Smith for (j=0; j<nz; j++) { 5183287d42SBarry Smith x = rtmp+16*ic[ajtmpold[j]]; 5283287d42SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 5383287d42SBarry Smith x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 5483287d42SBarry Smith x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 5583287d42SBarry Smith x[14] = v[14]; x[15] = v[15]; 5683287d42SBarry Smith v += 16; 5783287d42SBarry Smith } 5883287d42SBarry Smith row = *ajtmp++; 5983287d42SBarry Smith while (row < i) { 6083287d42SBarry Smith pc = rtmp + 16*row; 6183287d42SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 6283287d42SBarry Smith p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 6383287d42SBarry Smith p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 6483287d42SBarry Smith p15 = pc[14]; p16 = pc[15]; 6583287d42SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 6683287d42SBarry Smith p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 6783287d42SBarry Smith p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 6883287d42SBarry Smith || p16 != 0.0) { 6983287d42SBarry Smith pv = ba + 16*diag_offset[row]; 7083287d42SBarry Smith pj = bj + diag_offset[row] + 1; 7183287d42SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 7283287d42SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 7383287d42SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 7483287d42SBarry Smith x15 = pv[14]; x16 = pv[15]; 7583287d42SBarry Smith pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 7683287d42SBarry Smith pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 7783287d42SBarry Smith pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 7883287d42SBarry Smith pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 7983287d42SBarry Smith 8083287d42SBarry Smith pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 8183287d42SBarry Smith pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 8283287d42SBarry Smith pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 8383287d42SBarry Smith pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 8483287d42SBarry Smith 8583287d42SBarry Smith pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 8683287d42SBarry Smith pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 8783287d42SBarry Smith pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 8883287d42SBarry Smith pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 8983287d42SBarry Smith 9083287d42SBarry Smith pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 9183287d42SBarry Smith pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 9283287d42SBarry Smith pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 9383287d42SBarry Smith pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 9483287d42SBarry Smith 9583287d42SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 9683287d42SBarry Smith pv += 16; 9783287d42SBarry Smith for (j=0; j<nz; j++) { 9883287d42SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 9983287d42SBarry Smith x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 10083287d42SBarry Smith x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 10183287d42SBarry Smith x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 10283287d42SBarry Smith x = rtmp + 16*pj[j]; 10383287d42SBarry Smith x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 10483287d42SBarry Smith x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 10583287d42SBarry Smith x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 10683287d42SBarry Smith x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 10783287d42SBarry Smith 10883287d42SBarry Smith x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 10983287d42SBarry Smith x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 11083287d42SBarry Smith x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 11183287d42SBarry Smith x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 11283287d42SBarry Smith 11383287d42SBarry Smith x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 11483287d42SBarry Smith x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 11583287d42SBarry Smith x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 11683287d42SBarry Smith x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 11783287d42SBarry Smith 11883287d42SBarry Smith x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 11983287d42SBarry Smith x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 12083287d42SBarry Smith x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 12183287d42SBarry Smith x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 12283287d42SBarry Smith 12383287d42SBarry Smith pv += 16; 12483287d42SBarry Smith } 125dc0b31edSSatish Balay ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 12683287d42SBarry Smith } 12783287d42SBarry Smith row = *ajtmp++; 12883287d42SBarry Smith } 12983287d42SBarry Smith /* finished row so stick it into b->a */ 13083287d42SBarry Smith pv = ba + 16*bi[i]; 13183287d42SBarry Smith pj = bj + bi[i]; 13283287d42SBarry Smith nz = bi[i+1] - bi[i]; 13383287d42SBarry Smith for (j=0; j<nz; j++) { 13483287d42SBarry Smith x = rtmp+16*pj[j]; 13583287d42SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 13683287d42SBarry Smith pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 13783287d42SBarry Smith pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 13883287d42SBarry Smith pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 13983287d42SBarry Smith pv += 16; 14083287d42SBarry Smith } 14183287d42SBarry Smith /* invert diagonal block */ 14283287d42SBarry Smith w = ba + 16*diag_offset[i]; 143bcd9e38bSBarry Smith if (pivotinblocks) { 144a455e926SHong Zhang ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 1457b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 146bcd9e38bSBarry Smith } else { 14796b95a6bSBarry Smith ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 148bcd9e38bSBarry Smith } 14983287d42SBarry Smith } 15083287d42SBarry Smith 15183287d42SBarry Smith ierr = PetscFree(rtmp);CHKERRQ(ierr); 15283287d42SBarry Smith ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 15383287d42SBarry Smith ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 15426fbe8dcSKarl Rupp 15506e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 15606e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 15783287d42SBarry Smith C->assembled = PETSC_TRUE; 15826fbe8dcSKarl Rupp 159766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 16083287d42SBarry Smith PetscFunctionReturn(0); 16183287d42SBarry Smith } 162e6580ceeSShri Abhyankar 1634dd39f65SShri Abhyankar /* MatLUFactorNumeric_SeqBAIJ_4 - 1644dd39f65SShri Abhyankar copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 16596b95a6bSBarry Smith PetscKernel_A_gets_A_times_B() 16696b95a6bSBarry Smith PetscKernel_A_gets_A_minus_B_times_C() 16796b95a6bSBarry Smith PetscKernel_A_gets_inverse_A() 168209027a4SShri Abhyankar */ 169c0c7eb62SShri Abhyankar 1704dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info) 171209027a4SShri Abhyankar { 172209027a4SShri Abhyankar Mat C = B; 173209027a4SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 174209027a4SShri Abhyankar IS isrow = b->row,isicol = b->icol; 175209027a4SShri Abhyankar PetscErrorCode ierr; 1765a586d82SBarry Smith const PetscInt *r,*ic; 177bbd65245SShri Abhyankar PetscInt i,j,k,nz,nzL,row; 178bbd65245SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 179bbd65245SShri Abhyankar const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 180209027a4SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 181bbd65245SShri Abhyankar PetscInt flg; 1826ba06ab7SHong Zhang PetscReal shift; 183a455e926SHong Zhang PetscBool allowzeropivot,zeropivotdetected; 184209027a4SShri Abhyankar 185209027a4SShri Abhyankar PetscFunctionBegin; 1860164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 187209027a4SShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 188209027a4SShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 189209027a4SShri Abhyankar 1906ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONE) { 1916ba06ab7SHong Zhang shift = 0; 1926ba06ab7SHong Zhang } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 1936ba06ab7SHong Zhang shift = info->shiftamount; 1946ba06ab7SHong Zhang } 1956ba06ab7SHong Zhang 196209027a4SShri Abhyankar /* generate work space needed by the factorization */ 197dcca6d9dSJed Brown ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); 198580bdb30SBarry Smith ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr); 199209027a4SShri Abhyankar 200209027a4SShri Abhyankar for (i=0; i<n; i++) { 201209027a4SShri Abhyankar /* zero rtmp */ 202209027a4SShri Abhyankar /* L part */ 203209027a4SShri Abhyankar nz = bi[i+1] - bi[i]; 204209027a4SShri Abhyankar bjtmp = bj + bi[i]; 205209027a4SShri Abhyankar for (j=0; j<nz; j++) { 206580bdb30SBarry Smith ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 207209027a4SShri Abhyankar } 208209027a4SShri Abhyankar 209209027a4SShri Abhyankar /* U part */ 21078bb4007SShri Abhyankar nz = bdiag[i]-bdiag[i+1]; 21178bb4007SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 21278bb4007SShri Abhyankar for (j=0; j<nz; j++) { 213580bdb30SBarry Smith ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 21478bb4007SShri Abhyankar } 21578bb4007SShri Abhyankar 21678bb4007SShri Abhyankar /* load in initial (unfactored row) */ 21778bb4007SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 21878bb4007SShri Abhyankar ajtmp = aj + ai[r[i]]; 21978bb4007SShri Abhyankar v = aa + bs2*ai[r[i]]; 22078bb4007SShri Abhyankar for (j=0; j<nz; j++) { 221580bdb30SBarry Smith ierr = PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);CHKERRQ(ierr); 22278bb4007SShri Abhyankar } 22378bb4007SShri Abhyankar 22478bb4007SShri Abhyankar /* elimination */ 22578bb4007SShri Abhyankar bjtmp = bj + bi[i]; 22678bb4007SShri Abhyankar nzL = bi[i+1] - bi[i]; 22778bb4007SShri Abhyankar for (k=0; k < nzL; k++) { 22878bb4007SShri Abhyankar row = bjtmp[k]; 22978bb4007SShri Abhyankar pc = rtmp + bs2*row; 230c35f09e5SBarry Smith for (flg=0,j=0; j<bs2; j++) { 231c35f09e5SBarry Smith if (pc[j]!=0.0) { 232c35f09e5SBarry Smith flg = 1; 233c35f09e5SBarry Smith break; 234c35f09e5SBarry Smith } 235c35f09e5SBarry Smith } 23678bb4007SShri Abhyankar if (flg) { 23778bb4007SShri Abhyankar pv = b->a + bs2*bdiag[row]; 23896b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 23996b95a6bSBarry Smith ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 24078bb4007SShri Abhyankar 24178bb4007SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 24278bb4007SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 24378bb4007SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 24478bb4007SShri Abhyankar for (j=0; j<nz; j++) { 24596b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 24678bb4007SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 24778bb4007SShri Abhyankar v = rtmp + bs2*pj[j]; 24896b95a6bSBarry Smith ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 24978bb4007SShri Abhyankar pv += bs2; 25078bb4007SShri Abhyankar } 251*ca0c957dSBarry Smith ierr = PetscLogFlops(128.0*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 25278bb4007SShri Abhyankar } 25378bb4007SShri Abhyankar } 25478bb4007SShri Abhyankar 25578bb4007SShri Abhyankar /* finished row so stick it into b->a */ 25678bb4007SShri Abhyankar /* L part */ 25778bb4007SShri Abhyankar pv = b->a + bs2*bi[i]; 25878bb4007SShri Abhyankar pj = b->j + bi[i]; 25978bb4007SShri Abhyankar nz = bi[i+1] - bi[i]; 26078bb4007SShri Abhyankar for (j=0; j<nz; j++) { 261580bdb30SBarry Smith ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); 26278bb4007SShri Abhyankar } 26378bb4007SShri Abhyankar 26478bb4007SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 26578bb4007SShri Abhyankar pv = b->a + bs2*bdiag[i]; 26678bb4007SShri Abhyankar pj = b->j + bdiag[i]; 267580bdb30SBarry Smith ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr); 268a455e926SHong Zhang ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2697b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 27078bb4007SShri Abhyankar 27178bb4007SShri Abhyankar /* U part */ 27278bb4007SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 27378bb4007SShri Abhyankar pj = b->j + bdiag[i+1]+1; 27478bb4007SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 27578bb4007SShri Abhyankar for (j=0; j<nz; j++) { 276580bdb30SBarry Smith ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); 27778bb4007SShri Abhyankar } 27878bb4007SShri Abhyankar } 27978bb4007SShri Abhyankar 280fca92195SBarry Smith ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 28178bb4007SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 28278bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 28326fbe8dcSKarl Rupp 2844dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4; 2854dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 28678bb4007SShri Abhyankar C->assembled = PETSC_TRUE; 28726fbe8dcSKarl Rupp 288766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 28978bb4007SShri Abhyankar PetscFunctionReturn(0); 29078bb4007SShri Abhyankar } 29178bb4007SShri Abhyankar 29206e38f1dSHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info) 293e6580ceeSShri Abhyankar { 294e6580ceeSShri Abhyankar /* 295e6580ceeSShri Abhyankar Default Version for when blocks are 4 by 4 Using natural ordering 296e6580ceeSShri Abhyankar */ 297e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 298e6580ceeSShri Abhyankar PetscErrorCode ierr; 299e6580ceeSShri Abhyankar PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 300e6580ceeSShri Abhyankar PetscInt *ajtmpold,*ajtmp,nz,row; 301e6580ceeSShri Abhyankar PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 302e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 303e6580ceeSShri Abhyankar MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 304e6580ceeSShri Abhyankar MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 305e6580ceeSShri Abhyankar MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 306e6580ceeSShri Abhyankar MatScalar m13,m14,m15,m16; 307e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 308a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 309182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 3100164db54SHong Zhang PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 311e6580ceeSShri Abhyankar 312e6580ceeSShri Abhyankar PetscFunctionBegin; 3130164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 314785e854fSJed Brown ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 315e6580ceeSShri Abhyankar 316e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 317e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 318e6580ceeSShri Abhyankar ajtmp = bj + bi[i]; 319e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 320e6580ceeSShri Abhyankar x = rtmp+16*ajtmp[j]; 321e6580ceeSShri Abhyankar x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 322e6580ceeSShri Abhyankar x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 323e6580ceeSShri Abhyankar } 324e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 325e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 326e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 327e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 328e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 329e6580ceeSShri Abhyankar x = rtmp+16*ajtmpold[j]; 330e6580ceeSShri Abhyankar x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 331e6580ceeSShri Abhyankar x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 332e6580ceeSShri Abhyankar x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 333e6580ceeSShri Abhyankar x[14] = v[14]; x[15] = v[15]; 334e6580ceeSShri Abhyankar v += 16; 335e6580ceeSShri Abhyankar } 336e6580ceeSShri Abhyankar row = *ajtmp++; 337e6580ceeSShri Abhyankar while (row < i) { 338e6580ceeSShri Abhyankar pc = rtmp + 16*row; 339e6580ceeSShri Abhyankar p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 340e6580ceeSShri Abhyankar p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 341e6580ceeSShri Abhyankar p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 342e6580ceeSShri Abhyankar p15 = pc[14]; p16 = pc[15]; 343e6580ceeSShri Abhyankar if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 344e6580ceeSShri Abhyankar p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 345e6580ceeSShri Abhyankar p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 346e6580ceeSShri Abhyankar || p16 != 0.0) { 347e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 348e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 349e6580ceeSShri Abhyankar x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 350e6580ceeSShri Abhyankar x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 351e6580ceeSShri Abhyankar x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 352e6580ceeSShri Abhyankar x15 = pv[14]; x16 = pv[15]; 353e6580ceeSShri Abhyankar pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 354e6580ceeSShri Abhyankar pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 355e6580ceeSShri Abhyankar pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 356e6580ceeSShri Abhyankar pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 357e6580ceeSShri Abhyankar 358e6580ceeSShri Abhyankar pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 359e6580ceeSShri Abhyankar pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 360e6580ceeSShri Abhyankar pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 361e6580ceeSShri Abhyankar pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 362e6580ceeSShri Abhyankar 363e6580ceeSShri Abhyankar pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 364e6580ceeSShri Abhyankar pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 365e6580ceeSShri Abhyankar pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 366e6580ceeSShri Abhyankar pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 367e6580ceeSShri Abhyankar 368e6580ceeSShri Abhyankar pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 369e6580ceeSShri Abhyankar pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 370e6580ceeSShri Abhyankar pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 371e6580ceeSShri Abhyankar pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 372e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 373e6580ceeSShri Abhyankar pv += 16; 374e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 375e6580ceeSShri Abhyankar x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 376e6580ceeSShri Abhyankar x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 377e6580ceeSShri Abhyankar x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 378e6580ceeSShri Abhyankar x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 379e6580ceeSShri Abhyankar x = rtmp + 16*pj[j]; 380e6580ceeSShri Abhyankar x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 381e6580ceeSShri Abhyankar x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 382e6580ceeSShri Abhyankar x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 383e6580ceeSShri Abhyankar x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 384e6580ceeSShri Abhyankar 385e6580ceeSShri Abhyankar x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 386e6580ceeSShri Abhyankar x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 387e6580ceeSShri Abhyankar x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 388e6580ceeSShri Abhyankar x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 389e6580ceeSShri Abhyankar 390e6580ceeSShri Abhyankar x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 391e6580ceeSShri Abhyankar x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 392e6580ceeSShri Abhyankar x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 393e6580ceeSShri Abhyankar x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 394e6580ceeSShri Abhyankar 395e6580ceeSShri Abhyankar x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 396e6580ceeSShri Abhyankar x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 397e6580ceeSShri Abhyankar x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 398e6580ceeSShri Abhyankar x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 399e6580ceeSShri Abhyankar 400e6580ceeSShri Abhyankar pv += 16; 401e6580ceeSShri Abhyankar } 402e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 403e6580ceeSShri Abhyankar } 404e6580ceeSShri Abhyankar row = *ajtmp++; 405e6580ceeSShri Abhyankar } 406e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 407e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 408e6580ceeSShri Abhyankar pj = bj + bi[i]; 409e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 410e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 411e6580ceeSShri Abhyankar x = rtmp+16*pj[j]; 412e6580ceeSShri Abhyankar pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 413e6580ceeSShri Abhyankar pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 414e6580ceeSShri Abhyankar pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 415e6580ceeSShri Abhyankar pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 416e6580ceeSShri Abhyankar pv += 16; 417e6580ceeSShri Abhyankar } 418e6580ceeSShri Abhyankar /* invert diagonal block */ 419e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 420e6580ceeSShri Abhyankar if (pivotinblocks) { 421a455e926SHong Zhang ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 4227b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 423e6580ceeSShri Abhyankar } else { 42496b95a6bSBarry Smith ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 425e6580ceeSShri Abhyankar } 426e6580ceeSShri Abhyankar } 427e6580ceeSShri Abhyankar 428e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 42926fbe8dcSKarl Rupp 43006e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 43106e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 432e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 43326fbe8dcSKarl Rupp 434766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 435e6580ceeSShri Abhyankar PetscFunctionReturn(0); 436e6580ceeSShri Abhyankar } 437c0c7eb62SShri Abhyankar 438209027a4SShri Abhyankar /* 4394dd39f65SShri Abhyankar MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 4404dd39f65SShri Abhyankar copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 441209027a4SShri Abhyankar */ 4424dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 443209027a4SShri Abhyankar { 444209027a4SShri Abhyankar Mat C =B; 445209027a4SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 446209027a4SShri Abhyankar PetscErrorCode ierr; 447bbd65245SShri Abhyankar PetscInt i,j,k,nz,nzL,row; 448bbd65245SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 449bbd65245SShri Abhyankar const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 450209027a4SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 451bbd65245SShri Abhyankar PetscInt flg; 4526ba06ab7SHong Zhang PetscReal shift; 453a455e926SHong Zhang PetscBool allowzeropivot,zeropivotdetected; 454e6580ceeSShri Abhyankar 455209027a4SShri Abhyankar PetscFunctionBegin; 4560164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 4570164db54SHong Zhang 458209027a4SShri Abhyankar /* generate work space needed by the factorization */ 459dcca6d9dSJed Brown ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); 460580bdb30SBarry Smith ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr); 461209027a4SShri Abhyankar 4626ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONE) { 4636ba06ab7SHong Zhang shift = 0; 4646ba06ab7SHong Zhang } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 4656ba06ab7SHong Zhang shift = info->shiftamount; 4666ba06ab7SHong Zhang } 4676ba06ab7SHong Zhang 468209027a4SShri Abhyankar for (i=0; i<n; i++) { 469209027a4SShri Abhyankar /* zero rtmp */ 470209027a4SShri Abhyankar /* L part */ 471209027a4SShri Abhyankar nz = bi[i+1] - bi[i]; 472209027a4SShri Abhyankar bjtmp = bj + bi[i]; 473209027a4SShri Abhyankar for (j=0; j<nz; j++) { 474580bdb30SBarry Smith ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 475209027a4SShri Abhyankar } 476209027a4SShri Abhyankar 477209027a4SShri Abhyankar /* U part */ 478b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 479b2b2dd24SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 480b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 481580bdb30SBarry Smith ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 482b2b2dd24SShri Abhyankar } 483b2b2dd24SShri Abhyankar 484b2b2dd24SShri Abhyankar /* load in initial (unfactored row) */ 485b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 486b2b2dd24SShri Abhyankar ajtmp = aj + ai[i]; 487b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 488b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 489580bdb30SBarry Smith ierr = PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);CHKERRQ(ierr); 490b2b2dd24SShri Abhyankar } 491b2b2dd24SShri Abhyankar 492b2b2dd24SShri Abhyankar /* elimination */ 493b2b2dd24SShri Abhyankar bjtmp = bj + bi[i]; 494b2b2dd24SShri Abhyankar nzL = bi[i+1] - bi[i]; 495b2b2dd24SShri Abhyankar for (k=0; k < nzL; k++) { 496b2b2dd24SShri Abhyankar row = bjtmp[k]; 497b2b2dd24SShri Abhyankar pc = rtmp + bs2*row; 498c35f09e5SBarry Smith for (flg=0,j=0; j<bs2; j++) { 499c35f09e5SBarry Smith if (pc[j]!=0.0) { 500c35f09e5SBarry Smith flg = 1; 501c35f09e5SBarry Smith break; 502c35f09e5SBarry Smith } 503c35f09e5SBarry Smith } 504b2b2dd24SShri Abhyankar if (flg) { 505b2b2dd24SShri Abhyankar pv = b->a + bs2*bdiag[row]; 50696b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 50796b95a6bSBarry Smith ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 508b2b2dd24SShri Abhyankar 509b2b2dd24SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 510b2b2dd24SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 511b2b2dd24SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 512b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 51396b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 514b2b2dd24SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 515b2b2dd24SShri Abhyankar v = rtmp + bs2*pj[j]; 51696b95a6bSBarry Smith ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 517b2b2dd24SShri Abhyankar pv += bs2; 518b2b2dd24SShri Abhyankar } 519*ca0c957dSBarry Smith ierr = PetscLogFlops(128.0*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 520b2b2dd24SShri Abhyankar } 521b2b2dd24SShri Abhyankar } 522b2b2dd24SShri Abhyankar 523b2b2dd24SShri Abhyankar /* finished row so stick it into b->a */ 524b2b2dd24SShri Abhyankar /* L part */ 525b2b2dd24SShri Abhyankar pv = b->a + bs2*bi[i]; 526b2b2dd24SShri Abhyankar pj = b->j + bi[i]; 527b2b2dd24SShri Abhyankar nz = bi[i+1] - bi[i]; 528b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 529580bdb30SBarry Smith ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); 530b2b2dd24SShri Abhyankar } 531b2b2dd24SShri Abhyankar 532b2b2dd24SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 533b2b2dd24SShri Abhyankar pv = b->a + bs2*bdiag[i]; 534b2b2dd24SShri Abhyankar pj = b->j + bdiag[i]; 535580bdb30SBarry Smith ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr); 536a455e926SHong Zhang ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 5377b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 538b2b2dd24SShri Abhyankar 539b2b2dd24SShri Abhyankar /* U part */ 540b2b2dd24SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 541b2b2dd24SShri Abhyankar pj = b->j + bdiag[i+1]+1; 542b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 543b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 544580bdb30SBarry Smith ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); 545b2b2dd24SShri Abhyankar } 546b2b2dd24SShri Abhyankar } 547fca92195SBarry Smith ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 54826fbe8dcSKarl Rupp 5494dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 5504dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 551b2b2dd24SShri Abhyankar C->assembled = PETSC_TRUE; 55226fbe8dcSKarl Rupp 553766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 554b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 555b2b2dd24SShri Abhyankar } 556b2b2dd24SShri Abhyankar 557e6580ceeSShri Abhyankar #if defined(PETSC_HAVE_SSE) 558e6580ceeSShri Abhyankar 559e6580ceeSShri Abhyankar #include PETSC_HAVE_SSE 560e6580ceeSShri Abhyankar 561e6580ceeSShri Abhyankar /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 562e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info) 563e6580ceeSShri Abhyankar { 564e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 565e6580ceeSShri Abhyankar PetscErrorCode ierr; 566e6580ceeSShri Abhyankar int i,j,n = a->mbs; 567e6580ceeSShri Abhyankar int *bj = b->j,*bjtmp,*pj; 568e6580ceeSShri Abhyankar int row; 569e6580ceeSShri Abhyankar int *ajtmpold,nz,*bi=b->i; 570e6580ceeSShri Abhyankar int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 571e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 572e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 573e6580ceeSShri Abhyankar int nonzero=0; 574a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 575182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 5760164db54SHong Zhang PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 577e6580ceeSShri Abhyankar 578e6580ceeSShri Abhyankar PetscFunctionBegin; 5790164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 580e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 581e6580ceeSShri Abhyankar 582e32f2f54SBarry Smith if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 583e32f2f54SBarry Smith if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 584785e854fSJed Brown ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 585e32f2f54SBarry Smith if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 586e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 587e6580ceeSShri Abhyankar /* colscale = 4; */ 588e6580ceeSShri Abhyankar /* } */ 589e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 590e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 591e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 592e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 593e6580ceeSShri Abhyankar /* zero out one register */ 594e6580ceeSShri Abhyankar XOR_PS(XMM7,XMM7); 595e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 596e6580ceeSShri Abhyankar x = rtmp+16*bjtmp[j]; 597e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 598e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 599e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 600e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 601e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 602e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 603e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 604e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 605e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 606e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 607e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 608e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 609e6580ceeSShri Abhyankar SSE_INLINE_END_1; 610e6580ceeSShri Abhyankar } 611e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 612e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 613e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 614e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 615e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 616e6580ceeSShri Abhyankar x = rtmp+16*ajtmpold[j]; 617e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmpold[j]; */ 618e6580ceeSShri Abhyankar /* Copy v block into x block */ 619e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v,x) 620e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 621e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 622e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 623e6580ceeSShri Abhyankar 624e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 625e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 626e6580ceeSShri Abhyankar 627e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 628e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 629e6580ceeSShri Abhyankar 630e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 631e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 632e6580ceeSShri Abhyankar 633e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 634e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 635e6580ceeSShri Abhyankar 636e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 637e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 638e6580ceeSShri Abhyankar 639e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 640e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 641e6580ceeSShri Abhyankar 642e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 643e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 644e6580ceeSShri Abhyankar SSE_INLINE_END_2; 645e6580ceeSShri Abhyankar 646e6580ceeSShri Abhyankar v += 16; 647e6580ceeSShri Abhyankar } 648e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 649e6580ceeSShri Abhyankar row = *bjtmp++; 650e6580ceeSShri Abhyankar while (row < i) { 651e6580ceeSShri Abhyankar pc = rtmp + 16*row; 652e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 653e6580ceeSShri Abhyankar /* Load block from lower triangle */ 654e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 655e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 656e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 657e6580ceeSShri Abhyankar 658e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 659e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 660e6580ceeSShri Abhyankar 661e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 662e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 663e6580ceeSShri Abhyankar 664e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 665e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 666e6580ceeSShri Abhyankar 667e6580ceeSShri Abhyankar /* Compare block to zero block */ 668e6580ceeSShri Abhyankar 669e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4,XMM7) 670e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4,XMM0) 671e6580ceeSShri Abhyankar 672e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5,XMM7) 673e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5,XMM1) 674e6580ceeSShri Abhyankar 675e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6,XMM7) 676e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6,XMM2) 677e6580ceeSShri Abhyankar 678e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7,XMM3) 679e6580ceeSShri Abhyankar 680e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 681e6580ceeSShri Abhyankar SSE_OR_PS(XMM6,XMM7) 682e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM4) 683e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM6) 684e6580ceeSShri Abhyankar SSE_INLINE_END_1; 685e6580ceeSShri Abhyankar 686e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 687e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 688e6580ceeSShri Abhyankar MOVEMASK(nonzero,XMM5); 689e6580ceeSShri Abhyankar 690e6580ceeSShri Abhyankar /* If block is nonzero ... */ 691e6580ceeSShri Abhyankar if (nonzero) { 692e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 693e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 694e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 695e6580ceeSShri Abhyankar 696e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 697e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 698e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 699e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 700e6580ceeSShri Abhyankar 701e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv,pc) 702e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 703e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 704e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 705e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM0) 706e6580ceeSShri Abhyankar 707e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 708e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 709e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM1) 710e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM5) 711e6580ceeSShri Abhyankar 712e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 713e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 714e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM2) 715e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM6) 716e6580ceeSShri Abhyankar 717e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 718e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 719e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 720e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM7) 721e6580ceeSShri Abhyankar 722e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 723e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 724e6580ceeSShri Abhyankar 725e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 726e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 727e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 728e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 729e6580ceeSShri Abhyankar 730e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 731e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 732e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 733e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 734e6580ceeSShri Abhyankar 735e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 736e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 737e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 738e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM7) 739e6580ceeSShri Abhyankar 740e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 741e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 742e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 743e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 744e6580ceeSShri Abhyankar 745e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 746e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 747e6580ceeSShri Abhyankar 748e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 749e6580ceeSShri Abhyankar 750e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 751e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 752e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 753e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 754e6580ceeSShri Abhyankar 755e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 756e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 757e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 758e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 759e6580ceeSShri Abhyankar 760e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 761e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 762e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 763e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 764e6580ceeSShri Abhyankar 765e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 766e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 767e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 768e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 769e6580ceeSShri Abhyankar 770e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 771e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 772e6580ceeSShri Abhyankar 773e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 774e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 775e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 776e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 777e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0,XMM7) 778e6580ceeSShri Abhyankar 779e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 780e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 781e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM7) 782e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 783e6580ceeSShri Abhyankar 784e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 785e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1,XMM1,0x00) 786e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM2) 787e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 788e6580ceeSShri Abhyankar 789e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 790e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 791e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3,XMM7) 792e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM3) 793e6580ceeSShri Abhyankar 794e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 795e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 796e6580ceeSShri Abhyankar 797e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 798e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans afterall. */ 799e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 800e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3,XMM0) 801e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 802e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2,XMM6) 803e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 804e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1,XMM5) 805e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 806e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0,XMM4) 807e6580ceeSShri Abhyankar SSE_INLINE_END_2; 808e6580ceeSShri Abhyankar 809e6580ceeSShri Abhyankar /* Update the row: */ 810e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 811e6580ceeSShri Abhyankar pv += 16; 812e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 813e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 814e6580ceeSShri Abhyankar x = rtmp + 16*pj[j]; 815e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 816e6580ceeSShri Abhyankar 817e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 818e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 819e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 820e6580ceeSShri Abhyankar /* Load First Column of X*/ 821e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 822e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 823e6580ceeSShri Abhyankar 824e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 825e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 826e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 827e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 828e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 829e6580ceeSShri Abhyankar 830e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 831e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 832e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 833e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 834e6580ceeSShri Abhyankar 835e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 836e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 837e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 838e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 839e6580ceeSShri Abhyankar 840e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 841e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 842e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 843e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 844e6580ceeSShri Abhyankar 845e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 846e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 847e6580ceeSShri Abhyankar 848e6580ceeSShri Abhyankar /* Second Column */ 849e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 850e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 851e6580ceeSShri Abhyankar 852e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 853e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 854e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 855e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 856e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 857e6580ceeSShri Abhyankar 858e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 859e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 860e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 861e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM7) 862e6580ceeSShri Abhyankar 863e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 864e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 865e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM2) 866e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM4) 867e6580ceeSShri Abhyankar 868e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 869e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 870e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 871e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 872e6580ceeSShri Abhyankar 873e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 874e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 875e6580ceeSShri Abhyankar 876e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 877e6580ceeSShri Abhyankar 878e6580ceeSShri Abhyankar /* Third Column */ 879e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 880e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 881e6580ceeSShri Abhyankar 882e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 883e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 884e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 885e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM0) 886e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 887e6580ceeSShri Abhyankar 888e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 889e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 890e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM1) 891e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM4) 892e6580ceeSShri Abhyankar 893e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 894e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 895e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM2) 896e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM5) 897e6580ceeSShri Abhyankar 898e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 899e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 900e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 901e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 902e6580ceeSShri Abhyankar 903e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 904e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 905e6580ceeSShri Abhyankar 906e6580ceeSShri Abhyankar /* Fourth Column */ 907e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 908e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 909e6580ceeSShri Abhyankar 910e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 911e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 912e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 913e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 914e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 915e6580ceeSShri Abhyankar 916e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 917e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 918e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 919e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 920e6580ceeSShri Abhyankar 921e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 922e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 923e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 924e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 925e6580ceeSShri Abhyankar 926e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 927e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 928e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 929e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 930e6580ceeSShri Abhyankar 931e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 932e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 933e6580ceeSShri Abhyankar SSE_INLINE_END_2; 934e6580ceeSShri Abhyankar pv += 16; 935e6580ceeSShri Abhyankar } 936e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 937e6580ceeSShri Abhyankar } 938e6580ceeSShri Abhyankar row = *bjtmp++; 939e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 940e6580ceeSShri Abhyankar } 941e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 942e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 943e6580ceeSShri Abhyankar pj = bj + bi[i]; 944e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 945e6580ceeSShri Abhyankar 946e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 947e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 948e6580ceeSShri Abhyankar x = rtmp+16*pj[j]; 949e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 950e6580ceeSShri Abhyankar 951e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 952e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 953e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 954e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 955e6580ceeSShri Abhyankar 956e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 957e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 958e6580ceeSShri Abhyankar 959e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 960e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 961e6580ceeSShri Abhyankar 962e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 963e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 964e6580ceeSShri Abhyankar 965e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 966e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 967e6580ceeSShri Abhyankar 968e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 969e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 970e6580ceeSShri Abhyankar 971e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 972e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 973e6580ceeSShri Abhyankar 974e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 975e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 976e6580ceeSShri Abhyankar SSE_INLINE_END_2; 977e6580ceeSShri Abhyankar pv += 16; 978e6580ceeSShri Abhyankar } 979e6580ceeSShri Abhyankar /* invert diagonal block */ 980e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 981e6580ceeSShri Abhyankar if (pivotinblocks) { 982a455e926SHong Zhang ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 983603e8f96SBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 984e6580ceeSShri Abhyankar } else { 98596b95a6bSBarry Smith ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 986e6580ceeSShri Abhyankar } 98796b95a6bSBarry Smith /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 988e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 989e6580ceeSShri Abhyankar } 990e6580ceeSShri Abhyankar 991e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 99226fbe8dcSKarl Rupp 993e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 994e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 995e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 99626fbe8dcSKarl Rupp 997766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 998e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 999e6580ceeSShri Abhyankar SSE_SCOPE_END; 1000e6580ceeSShri Abhyankar PetscFunctionReturn(0); 1001e6580ceeSShri Abhyankar } 1002e6580ceeSShri Abhyankar 1003e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 1004e6580ceeSShri Abhyankar { 1005e6580ceeSShri Abhyankar Mat A =C; 1006e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1007e6580ceeSShri Abhyankar PetscErrorCode ierr; 1008e6580ceeSShri Abhyankar int i,j,n = a->mbs; 1009e6580ceeSShri Abhyankar unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; 1010e6580ceeSShri Abhyankar unsigned short *aj = (unsigned short*)(a->j),*ajtmp; 1011e6580ceeSShri Abhyankar unsigned int row; 1012e6580ceeSShri Abhyankar int nz,*bi=b->i; 1013e6580ceeSShri Abhyankar int *diag_offset = b->diag,*ai=a->i; 1014e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1015e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 1016e6580ceeSShri Abhyankar int nonzero=0; 1017a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 1018182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 10190164db54SHong Zhang PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 1020e6580ceeSShri Abhyankar 1021e6580ceeSShri Abhyankar PetscFunctionBegin; 10220164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 1023e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 1024e6580ceeSShri Abhyankar 1025e32f2f54SBarry Smith if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1026e32f2f54SBarry Smith if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1027785e854fSJed Brown ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 1028e32f2f54SBarry Smith if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1029e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 1030e6580ceeSShri Abhyankar /* colscale = 4; */ 1031e6580ceeSShri Abhyankar /* } */ 1032e6580ceeSShri Abhyankar 1033e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 1034e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1035e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 1036e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 1037e6580ceeSShri Abhyankar /* zero out one register */ 1038e6580ceeSShri Abhyankar XOR_PS(XMM7,XMM7); 1039e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1040e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)bjtmp[j]); 1041e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 1042e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 1043e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 1044e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1045e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1046e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1047e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1048e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1049e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1050e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1051e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1052e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1053e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1054e6580ceeSShri Abhyankar } 1055e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 1056e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 1057e6580ceeSShri Abhyankar ajtmp = aj + ai[i]; 1058e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 1059e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1060e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)ajtmp[j]); 1061e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmp[j]; */ 1062e6580ceeSShri Abhyankar /* Copy v block into x block */ 1063e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v,x) 1064e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1065e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1066e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1067e6580ceeSShri Abhyankar 1068e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1069e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1070e6580ceeSShri Abhyankar 1071e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1072e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1073e6580ceeSShri Abhyankar 1074e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1075e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1076e6580ceeSShri Abhyankar 1077e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1078e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1079e6580ceeSShri Abhyankar 1080e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1081e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1082e6580ceeSShri Abhyankar 1083e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1084e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1085e6580ceeSShri Abhyankar 1086e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1087e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1088e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1089e6580ceeSShri Abhyankar 1090e6580ceeSShri Abhyankar v += 16; 1091e6580ceeSShri Abhyankar } 1092e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1093e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1094e6580ceeSShri Abhyankar while (row < i) { 1095e6580ceeSShri Abhyankar pc = rtmp + 16*row; 1096e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 1097e6580ceeSShri Abhyankar /* Load block from lower triangle */ 1098e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1099e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1100e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1101e6580ceeSShri Abhyankar 1102e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1103e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1104e6580ceeSShri Abhyankar 1105e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1106e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1107e6580ceeSShri Abhyankar 1108e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1109e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1110e6580ceeSShri Abhyankar 1111e6580ceeSShri Abhyankar /* Compare block to zero block */ 1112e6580ceeSShri Abhyankar 1113e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4,XMM7) 1114e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4,XMM0) 1115e6580ceeSShri Abhyankar 1116e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5,XMM7) 1117e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5,XMM1) 1118e6580ceeSShri Abhyankar 1119e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6,XMM7) 1120e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6,XMM2) 1121e6580ceeSShri Abhyankar 1122e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7,XMM3) 1123e6580ceeSShri Abhyankar 1124e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 1125e6580ceeSShri Abhyankar SSE_OR_PS(XMM6,XMM7) 1126e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM4) 1127e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM6) 1128e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1129e6580ceeSShri Abhyankar 1130e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 1131e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1132e6580ceeSShri Abhyankar MOVEMASK(nonzero,XMM5); 1133e6580ceeSShri Abhyankar 1134e6580ceeSShri Abhyankar /* If block is nonzero ... */ 1135e6580ceeSShri Abhyankar if (nonzero) { 1136e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 1137e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1138e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 1139e6580ceeSShri Abhyankar 1140e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1141e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1142e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 1143e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1144e6580ceeSShri Abhyankar 1145e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv,pc) 1146e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 1147e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1148e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1149e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM0) 1150e6580ceeSShri Abhyankar 1151e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1152e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1153e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM1) 1154e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM5) 1155e6580ceeSShri Abhyankar 1156e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1157e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1158e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM2) 1159e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM6) 1160e6580ceeSShri Abhyankar 1161e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1162e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1163e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1164e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM7) 1165e6580ceeSShri Abhyankar 1166e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1167e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1168e6580ceeSShri Abhyankar 1169e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 1170e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1171e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1172e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1173e6580ceeSShri Abhyankar 1174e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1175e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1176e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1177e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 1178e6580ceeSShri Abhyankar 1179e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1180e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1181e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1182e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM7) 1183e6580ceeSShri Abhyankar 1184e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1185e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1186e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1187e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 1188e6580ceeSShri Abhyankar 1189e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1190e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1191e6580ceeSShri Abhyankar 1192e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1193e6580ceeSShri Abhyankar 1194e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 1195e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1196e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1197e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1198e6580ceeSShri Abhyankar 1199e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1200e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1201e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1202e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1203e6580ceeSShri Abhyankar 1204e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1205e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1206e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1207e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1208e6580ceeSShri Abhyankar 1209e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1210e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1211e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1212e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1213e6580ceeSShri Abhyankar 1214e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1215e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1216e6580ceeSShri Abhyankar 1217e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1218e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 1219e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1220e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1221e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0,XMM7) 1222e6580ceeSShri Abhyankar 1223e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1224e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1225e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM7) 1226e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 1227e6580ceeSShri Abhyankar 1228e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1229e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1,XMM1,0x00) 1230e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM2) 1231e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 1232e6580ceeSShri Abhyankar 1233e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1234e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1235e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3,XMM7) 1236e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM3) 1237e6580ceeSShri Abhyankar 1238e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1239e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1240e6580ceeSShri Abhyankar 1241e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1242e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans afterall. */ 1243e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 1244e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3,XMM0) 1245e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 1246e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2,XMM6) 1247e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 1248e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1,XMM5) 1249e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 1250e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0,XMM4) 1251e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1252e6580ceeSShri Abhyankar 1253e6580ceeSShri Abhyankar /* Update the row: */ 1254e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 1255e6580ceeSShri Abhyankar pv += 16; 1256e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1257e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1258e6580ceeSShri Abhyankar x = rtmp + 16*((unsigned int)pj[j]); 1259e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 1260e6580ceeSShri Abhyankar 1261e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 1262e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1263e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1264e6580ceeSShri Abhyankar /* Load First Column of X*/ 1265e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1266e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1267e6580ceeSShri Abhyankar 1268e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1269e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1270e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1271e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1272e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1273e6580ceeSShri Abhyankar 1274e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1275e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1276e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1277e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1278e6580ceeSShri Abhyankar 1279e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1280e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1281e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1282e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1283e6580ceeSShri Abhyankar 1284e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1285e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1286e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1287e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1288e6580ceeSShri Abhyankar 1289e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1290e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1291e6580ceeSShri Abhyankar 1292e6580ceeSShri Abhyankar /* Second Column */ 1293e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1294e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1295e6580ceeSShri Abhyankar 1296e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1297e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1298e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1299e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1300e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1301e6580ceeSShri Abhyankar 1302e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1303e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1304e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1305e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM7) 1306e6580ceeSShri Abhyankar 1307e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1308e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1309e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM2) 1310e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM4) 1311e6580ceeSShri Abhyankar 1312e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1313e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1314e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1315e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1316e6580ceeSShri Abhyankar 1317e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1318e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1319e6580ceeSShri Abhyankar 1320e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1321e6580ceeSShri Abhyankar 1322e6580ceeSShri Abhyankar /* Third Column */ 1323e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1324e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1325e6580ceeSShri Abhyankar 1326e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1327e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1328e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1329e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM0) 1330e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1331e6580ceeSShri Abhyankar 1332e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1333e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1334e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM1) 1335e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM4) 1336e6580ceeSShri Abhyankar 1337e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1338e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1339e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM2) 1340e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM5) 1341e6580ceeSShri Abhyankar 1342e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1343e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1344e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1345e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1346e6580ceeSShri Abhyankar 1347e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1348e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1349e6580ceeSShri Abhyankar 1350e6580ceeSShri Abhyankar /* Fourth Column */ 1351e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1352e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1353e6580ceeSShri Abhyankar 1354e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1355e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1356e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1357e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1358e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1359e6580ceeSShri Abhyankar 1360e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1361e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1362e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1363e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1364e6580ceeSShri Abhyankar 1365e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1366e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1367e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1368e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1369e6580ceeSShri Abhyankar 1370e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1371e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1372e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1373e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1374e6580ceeSShri Abhyankar 1375e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1376e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1377e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1378e6580ceeSShri Abhyankar pv += 16; 1379e6580ceeSShri Abhyankar } 1380e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1381e6580ceeSShri Abhyankar } 1382e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1383e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1384e6580ceeSShri Abhyankar /* bjtmp++; */ 1385e6580ceeSShri Abhyankar } 1386e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1387e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 1388e6580ceeSShri Abhyankar pj = bj + bi[i]; 1389e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1390e6580ceeSShri Abhyankar 1391e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1392e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1393e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)pj[j]); 1394e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1395e6580ceeSShri Abhyankar 1396e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1397e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1398e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1399e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1400e6580ceeSShri Abhyankar 1401e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1402e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1403e6580ceeSShri Abhyankar 1404e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1405e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1406e6580ceeSShri Abhyankar 1407e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1408e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1409e6580ceeSShri Abhyankar 1410e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1411e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1412e6580ceeSShri Abhyankar 1413e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1414e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1415e6580ceeSShri Abhyankar 1416e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1417e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1418e6580ceeSShri Abhyankar 1419e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1420e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1421e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1422e6580ceeSShri Abhyankar pv += 16; 1423e6580ceeSShri Abhyankar } 1424e6580ceeSShri Abhyankar /* invert diagonal block */ 1425e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 1426e6580ceeSShri Abhyankar if (pivotinblocks) { 1427a455e926SHong Zhang ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 1428603e8f96SBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1429e6580ceeSShri Abhyankar } else { 143096b95a6bSBarry Smith ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1431e6580ceeSShri Abhyankar } 1432e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1433e6580ceeSShri Abhyankar } 1434e6580ceeSShri Abhyankar 1435e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 143626fbe8dcSKarl Rupp 1437e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1438e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1439e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 144026fbe8dcSKarl Rupp 1441766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1442e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1443e6580ceeSShri Abhyankar SSE_SCOPE_END; 1444e6580ceeSShri Abhyankar PetscFunctionReturn(0); 1445e6580ceeSShri Abhyankar } 1446e6580ceeSShri Abhyankar 1447e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1448e6580ceeSShri Abhyankar { 1449e6580ceeSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1450e6580ceeSShri Abhyankar PetscErrorCode ierr; 1451e6580ceeSShri Abhyankar int i,j,n = a->mbs; 1452e6580ceeSShri Abhyankar unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; 1453e6580ceeSShri Abhyankar unsigned int row; 1454e6580ceeSShri Abhyankar int *ajtmpold,nz,*bi=b->i; 1455e6580ceeSShri Abhyankar int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1456e6580ceeSShri Abhyankar MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1457e6580ceeSShri Abhyankar MatScalar *ba = b->a,*aa = a->a; 1458e6580ceeSShri Abhyankar int nonzero=0; 1459a455e926SHong Zhang PetscBool pivotinblocks = b->pivotinblocks; 1460182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 14610164db54SHong Zhang PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 1462e6580ceeSShri Abhyankar 1463e6580ceeSShri Abhyankar PetscFunctionBegin; 14640164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 1465e6580ceeSShri Abhyankar SSE_SCOPE_BEGIN; 1466e6580ceeSShri Abhyankar 1467e32f2f54SBarry Smith if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1468e32f2f54SBarry Smith if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1469785e854fSJed Brown ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 1470e32f2f54SBarry Smith if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1471e6580ceeSShri Abhyankar /* if ((unsigned long)bj==(unsigned long)aj) { */ 1472e6580ceeSShri Abhyankar /* colscale = 4; */ 1473e6580ceeSShri Abhyankar /* } */ 1474e6580ceeSShri Abhyankar if ((unsigned long)bj==(unsigned long)aj) { 1475e6580ceeSShri Abhyankar return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1476e6580ceeSShri Abhyankar } 1477e6580ceeSShri Abhyankar 1478e6580ceeSShri Abhyankar for (i=0; i<n; i++) { 1479e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1480e6580ceeSShri Abhyankar bjtmp = bj + bi[i]; 1481e6580ceeSShri Abhyankar /* zero out the 4x4 block accumulators */ 1482e6580ceeSShri Abhyankar /* zero out one register */ 1483e6580ceeSShri Abhyankar XOR_PS(XMM7,XMM7); 1484e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1485e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)bjtmp[j]); 1486e6580ceeSShri Abhyankar /* x = rtmp+4*bjtmp[j]; */ 1487e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(x) 1488e6580ceeSShri Abhyankar /* Copy zero register to memory locations */ 1489e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1490e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1491e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1492e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1493e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1494e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1495e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1496e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1497e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1498e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1499e6580ceeSShri Abhyankar } 1500e6580ceeSShri Abhyankar /* load in initial (unfactored row) */ 1501e6580ceeSShri Abhyankar nz = ai[i+1] - ai[i]; 1502e6580ceeSShri Abhyankar ajtmpold = aj + ai[i]; 1503e6580ceeSShri Abhyankar v = aa + 16*ai[i]; 1504e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1505e6580ceeSShri Abhyankar x = rtmp+16*ajtmpold[j]; 1506e6580ceeSShri Abhyankar /* x = rtmp+colscale*ajtmpold[j]; */ 1507e6580ceeSShri Abhyankar /* Copy v block into x block */ 1508e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(v,x) 1509e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1510e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1511e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1512e6580ceeSShri Abhyankar 1513e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1514e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1515e6580ceeSShri Abhyankar 1516e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1517e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1518e6580ceeSShri Abhyankar 1519e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1520e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1521e6580ceeSShri Abhyankar 1522e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1523e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1524e6580ceeSShri Abhyankar 1525e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1526e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1527e6580ceeSShri Abhyankar 1528e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1529e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1530e6580ceeSShri Abhyankar 1531e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1532e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1533e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1534e6580ceeSShri Abhyankar 1535e6580ceeSShri Abhyankar v += 16; 1536e6580ceeSShri Abhyankar } 1537e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1538e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1539e6580ceeSShri Abhyankar while (row < i) { 1540e6580ceeSShri Abhyankar pc = rtmp + 16*row; 1541e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_1(pc) 1542e6580ceeSShri Abhyankar /* Load block from lower triangle */ 1543e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1544e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1545e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1546e6580ceeSShri Abhyankar 1547e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1548e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1549e6580ceeSShri Abhyankar 1550e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1551e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1552e6580ceeSShri Abhyankar 1553e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1554e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1555e6580ceeSShri Abhyankar 1556e6580ceeSShri Abhyankar /* Compare block to zero block */ 1557e6580ceeSShri Abhyankar 1558e6580ceeSShri Abhyankar SSE_COPY_PS(XMM4,XMM7) 1559e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM4,XMM0) 1560e6580ceeSShri Abhyankar 1561e6580ceeSShri Abhyankar SSE_COPY_PS(XMM5,XMM7) 1562e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM5,XMM1) 1563e6580ceeSShri Abhyankar 1564e6580ceeSShri Abhyankar SSE_COPY_PS(XMM6,XMM7) 1565e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM6,XMM2) 1566e6580ceeSShri Abhyankar 1567e6580ceeSShri Abhyankar SSE_CMPNEQ_PS(XMM7,XMM3) 1568e6580ceeSShri Abhyankar 1569e6580ceeSShri Abhyankar /* Reduce the comparisons to one SSE register */ 1570e6580ceeSShri Abhyankar SSE_OR_PS(XMM6,XMM7) 1571e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM4) 1572e6580ceeSShri Abhyankar SSE_OR_PS(XMM5,XMM6) 1573e6580ceeSShri Abhyankar SSE_INLINE_END_1; 1574e6580ceeSShri Abhyankar 1575e6580ceeSShri Abhyankar /* Reduce the one SSE register to an integer register for branching */ 1576e6580ceeSShri Abhyankar /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1577e6580ceeSShri Abhyankar MOVEMASK(nonzero,XMM5); 1578e6580ceeSShri Abhyankar 1579e6580ceeSShri Abhyankar /* If block is nonzero ... */ 1580e6580ceeSShri Abhyankar if (nonzero) { 1581e6580ceeSShri Abhyankar pv = ba + 16*diag_offset[row]; 1582e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1583e6580ceeSShri Abhyankar pj = bj + diag_offset[row] + 1; 1584e6580ceeSShri Abhyankar 1585e6580ceeSShri Abhyankar /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1586e6580ceeSShri Abhyankar /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1587e6580ceeSShri Abhyankar /* but the diagonal was inverted already */ 1588e6580ceeSShri Abhyankar /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1589e6580ceeSShri Abhyankar 1590e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(pv,pc) 1591e6580ceeSShri Abhyankar /* Column 0, product is accumulated in XMM4 */ 1592e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1593e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1594e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM0) 1595e6580ceeSShri Abhyankar 1596e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1597e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1598e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM1) 1599e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM5) 1600e6580ceeSShri Abhyankar 1601e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1602e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1603e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM2) 1604e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM6) 1605e6580ceeSShri Abhyankar 1606e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1607e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1608e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1609e6580ceeSShri Abhyankar SSE_ADD_PS(XMM4,XMM7) 1610e6580ceeSShri Abhyankar 1611e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1612e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1613e6580ceeSShri Abhyankar 1614e6580ceeSShri Abhyankar /* Column 1, product is accumulated in XMM5 */ 1615e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1616e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1617e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1618e6580ceeSShri Abhyankar 1619e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1620e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1621e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1622e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 1623e6580ceeSShri Abhyankar 1624e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1625e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1626e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1627e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM7) 1628e6580ceeSShri Abhyankar 1629e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1630e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1631e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1632e6580ceeSShri Abhyankar SSE_ADD_PS(XMM5,XMM6) 1633e6580ceeSShri Abhyankar 1634e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1635e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1636e6580ceeSShri Abhyankar 1637e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1638e6580ceeSShri Abhyankar 1639e6580ceeSShri Abhyankar /* Column 2, product is accumulated in XMM6 */ 1640e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1641e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1642e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1643e6580ceeSShri Abhyankar 1644e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1645e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1646e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1647e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1648e6580ceeSShri Abhyankar 1649e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1650e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1651e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1652e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1653e6580ceeSShri Abhyankar 1654e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1655e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1656e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1657e6580ceeSShri Abhyankar SSE_ADD_PS(XMM6,XMM7) 1658e6580ceeSShri Abhyankar 1659e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1660e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1661e6580ceeSShri Abhyankar 1662e6580ceeSShri Abhyankar /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1663e6580ceeSShri Abhyankar /* Column 3, product is accumulated in XMM0 */ 1664e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1665e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1666e6580ceeSShri Abhyankar SSE_MULT_PS(XMM0,XMM7) 1667e6580ceeSShri Abhyankar 1668e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1669e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1670e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM7) 1671e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 1672e6580ceeSShri Abhyankar 1673e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1674e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM1,XMM1,0x00) 1675e6580ceeSShri Abhyankar SSE_MULT_PS(XMM1,XMM2) 1676e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM1) 1677e6580ceeSShri Abhyankar 1678e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1679e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1680e6580ceeSShri Abhyankar SSE_MULT_PS(XMM3,XMM7) 1681e6580ceeSShri Abhyankar SSE_ADD_PS(XMM0,XMM3) 1682e6580ceeSShri Abhyankar 1683e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1684e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1685e6580ceeSShri Abhyankar 1686e6580ceeSShri Abhyankar /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1687e6580ceeSShri Abhyankar /* This is code to be maintained and read by humans afterall. */ 1688e6580ceeSShri Abhyankar /* Copy Multiplier Col 3 into XMM3 */ 1689e6580ceeSShri Abhyankar SSE_COPY_PS(XMM3,XMM0) 1690e6580ceeSShri Abhyankar /* Copy Multiplier Col 2 into XMM2 */ 1691e6580ceeSShri Abhyankar SSE_COPY_PS(XMM2,XMM6) 1692e6580ceeSShri Abhyankar /* Copy Multiplier Col 1 into XMM1 */ 1693e6580ceeSShri Abhyankar SSE_COPY_PS(XMM1,XMM5) 1694e6580ceeSShri Abhyankar /* Copy Multiplier Col 0 into XMM0 */ 1695e6580ceeSShri Abhyankar SSE_COPY_PS(XMM0,XMM4) 1696e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1697e6580ceeSShri Abhyankar 1698e6580ceeSShri Abhyankar /* Update the row: */ 1699e6580ceeSShri Abhyankar nz = bi[row+1] - diag_offset[row] - 1; 1700e6580ceeSShri Abhyankar pv += 16; 1701e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1702e6580ceeSShri Abhyankar PREFETCH_L1(&pv[16]); 1703e6580ceeSShri Abhyankar x = rtmp + 16*((unsigned int)pj[j]); 1704e6580ceeSShri Abhyankar /* x = rtmp + 4*pj[j]; */ 1705e6580ceeSShri Abhyankar 1706e6580ceeSShri Abhyankar /* X:=X-M*PV, One column at a time */ 1707e6580ceeSShri Abhyankar /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1708e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1709e6580ceeSShri Abhyankar /* Load First Column of X*/ 1710e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1711e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1712e6580ceeSShri Abhyankar 1713e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1714e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1715e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1716e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1717e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1718e6580ceeSShri Abhyankar 1719e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1720e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1721e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1722e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1723e6580ceeSShri Abhyankar 1724e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1725e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1726e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1727e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1728e6580ceeSShri Abhyankar 1729e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1730e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1731e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1732e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1733e6580ceeSShri Abhyankar 1734e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1735e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1736e6580ceeSShri Abhyankar 1737e6580ceeSShri Abhyankar /* Second Column */ 1738e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1739e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1740e6580ceeSShri Abhyankar 1741e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1742e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1743e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1744e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM0) 1745e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1746e6580ceeSShri Abhyankar 1747e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1748e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1749e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM1) 1750e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM7) 1751e6580ceeSShri Abhyankar 1752e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1753e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1754e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM2) 1755e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM4) 1756e6580ceeSShri Abhyankar 1757e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1758e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1759e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM3) 1760e6580ceeSShri Abhyankar SSE_SUB_PS(XMM5,XMM6) 1761e6580ceeSShri Abhyankar 1762e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1763e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1764e6580ceeSShri Abhyankar 1765e6580ceeSShri Abhyankar SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1766e6580ceeSShri Abhyankar 1767e6580ceeSShri Abhyankar /* Third Column */ 1768e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1769e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1770e6580ceeSShri Abhyankar 1771e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1772e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1773e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1774e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM0) 1775e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1776e6580ceeSShri Abhyankar 1777e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1778e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM4,XMM4,0x00) 1779e6580ceeSShri Abhyankar SSE_MULT_PS(XMM4,XMM1) 1780e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM4) 1781e6580ceeSShri Abhyankar 1782e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1783e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1784e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM2) 1785e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM5) 1786e6580ceeSShri Abhyankar 1787e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1788e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1789e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM3) 1790e6580ceeSShri Abhyankar SSE_SUB_PS(XMM6,XMM7) 1791e6580ceeSShri Abhyankar 1792e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1793e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1794e6580ceeSShri Abhyankar 1795e6580ceeSShri Abhyankar /* Fourth Column */ 1796e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1797e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1798e6580ceeSShri Abhyankar 1799e6580ceeSShri Abhyankar /* Matrix-Vector Product: */ 1800e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1801e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1802e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM0) 1803e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1804e6580ceeSShri Abhyankar 1805e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1806e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM6,XMM6,0x00) 1807e6580ceeSShri Abhyankar SSE_MULT_PS(XMM6,XMM1) 1808e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM6) 1809e6580ceeSShri Abhyankar 1810e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1811e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM7,XMM7,0x00) 1812e6580ceeSShri Abhyankar SSE_MULT_PS(XMM7,XMM2) 1813e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM7) 1814e6580ceeSShri Abhyankar 1815e6580ceeSShri Abhyankar SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1816e6580ceeSShri Abhyankar SSE_SHUFFLE(XMM5,XMM5,0x00) 1817e6580ceeSShri Abhyankar SSE_MULT_PS(XMM5,XMM3) 1818e6580ceeSShri Abhyankar SSE_SUB_PS(XMM4,XMM5) 1819e6580ceeSShri Abhyankar 1820e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1821e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1822e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1823e6580ceeSShri Abhyankar pv += 16; 1824e6580ceeSShri Abhyankar } 1825e6580ceeSShri Abhyankar ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1826e6580ceeSShri Abhyankar } 1827e6580ceeSShri Abhyankar row = (unsigned int)(*bjtmp++); 1828e6580ceeSShri Abhyankar /* row = (*bjtmp++)/4; */ 1829e6580ceeSShri Abhyankar /* bjtmp++; */ 1830e6580ceeSShri Abhyankar } 1831e6580ceeSShri Abhyankar /* finished row so stick it into b->a */ 1832e6580ceeSShri Abhyankar pv = ba + 16*bi[i]; 1833e6580ceeSShri Abhyankar pj = bj + bi[i]; 1834e6580ceeSShri Abhyankar nz = bi[i+1] - bi[i]; 1835e6580ceeSShri Abhyankar 1836e6580ceeSShri Abhyankar /* Copy x block back into pv block */ 1837e6580ceeSShri Abhyankar for (j=0; j<nz; j++) { 1838e6580ceeSShri Abhyankar x = rtmp+16*((unsigned int)pj[j]); 1839e6580ceeSShri Abhyankar /* x = rtmp+4*pj[j]; */ 1840e6580ceeSShri Abhyankar 1841e6580ceeSShri Abhyankar SSE_INLINE_BEGIN_2(x,pv) 1842e6580ceeSShri Abhyankar /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1843e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1844e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1845e6580ceeSShri Abhyankar 1846e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1847e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1848e6580ceeSShri Abhyankar 1849e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1850e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1851e6580ceeSShri Abhyankar 1852e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1853e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1854e6580ceeSShri Abhyankar 1855e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1856e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1857e6580ceeSShri Abhyankar 1858e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1859e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1860e6580ceeSShri Abhyankar 1861e6580ceeSShri Abhyankar SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1862e6580ceeSShri Abhyankar SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1863e6580ceeSShri Abhyankar 1864e6580ceeSShri Abhyankar SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1865e6580ceeSShri Abhyankar SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1866e6580ceeSShri Abhyankar SSE_INLINE_END_2; 1867e6580ceeSShri Abhyankar pv += 16; 1868e6580ceeSShri Abhyankar } 1869e6580ceeSShri Abhyankar /* invert diagonal block */ 1870e6580ceeSShri Abhyankar w = ba + 16*diag_offset[i]; 1871e6580ceeSShri Abhyankar if (pivotinblocks) { 1872a455e926SHong Zhang ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,,&zeropivotdetected);CHKERRQ(ierr); 1873603e8f96SBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1874e6580ceeSShri Abhyankar } else { 187596b95a6bSBarry Smith ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1876e6580ceeSShri Abhyankar } 187796b95a6bSBarry Smith /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1878e6580ceeSShri Abhyankar /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1879e6580ceeSShri Abhyankar } 1880e6580ceeSShri Abhyankar 1881e6580ceeSShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 188226fbe8dcSKarl Rupp 1883e6580ceeSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1884e6580ceeSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1885e6580ceeSShri Abhyankar C->assembled = PETSC_TRUE; 188626fbe8dcSKarl Rupp 1887766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1888e6580ceeSShri Abhyankar /* Flop Count from inverting diagonal blocks */ 1889e6580ceeSShri Abhyankar SSE_SCOPE_END; 1890e6580ceeSShri Abhyankar PetscFunctionReturn(0); 1891e6580ceeSShri Abhyankar } 1892e6580ceeSShri Abhyankar 1893e6580ceeSShri Abhyankar #endif 1894