1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 14*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 15*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 
44f1af5d2fSBarry Smith v = aa + diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 59*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 60*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 
85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1186929473cSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct" 1196929473cSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1206929473cSShri Abhyankar { 1216929473cSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1226929473cSShri Abhyankar PetscErrorCode ierr; 1236929473cSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1246929473cSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 1256929473cSShri Abhyankar PetscInt 
bs=A->rmap->bs,bs2=a->bs2; 1266929473cSShri Abhyankar MatScalar *aa=a->a,*v; 1276929473cSShri Abhyankar PetscScalar s1,s2,x1,x2; 1286929473cSShri Abhyankar PetscScalar *x,*b; 1296929473cSShri Abhyankar 1306929473cSShri Abhyankar PetscFunctionBegin; 1316929473cSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1326929473cSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1336929473cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1346929473cSShri Abhyankar 1356929473cSShri Abhyankar /* forward solve the U^T */ 1366929473cSShri Abhyankar idx = 0; 1376929473cSShri Abhyankar for (i=0; i<n; i++) { 1386929473cSShri Abhyankar v = aa + bs2*diag[i]; 1396929473cSShri Abhyankar /* multiply by the inverse of the block diagonal */ 1406929473cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 1416929473cSShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 1426929473cSShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 1436929473cSShri Abhyankar v -= bs2; 1446929473cSShri Abhyankar 1456929473cSShri Abhyankar vi = aj + diag[i] - 1; 1466929473cSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 1476929473cSShri Abhyankar for(j=0;j>-nz;j--){ 1486929473cSShri Abhyankar oidx = bs*vi[j]; 1496929473cSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2; 1506929473cSShri Abhyankar x[oidx+1] -= v[2]*s1 + v[3]*s2; 1516929473cSShri Abhyankar v -= bs2; 1526929473cSShri Abhyankar } 1536929473cSShri Abhyankar x[idx] = s1;x[1+idx] = s2; 1546929473cSShri Abhyankar idx += bs; 1556929473cSShri Abhyankar } 1566929473cSShri Abhyankar /* backward solve the L^T */ 1576929473cSShri Abhyankar for (i=n-1; i>=0; i--){ 1586929473cSShri Abhyankar v = aa + bs2*ai[i]; 1596929473cSShri Abhyankar vi = aj + ai[i]; 1606929473cSShri Abhyankar nz = ai[i+1] - ai[i]; 1616929473cSShri Abhyankar idt = bs*i; 1626929473cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 1636929473cSShri Abhyankar for(j=0;j<nz;j++){ 1646929473cSShri Abhyankar idx = bs*vi[j]; 1656929473cSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2; 1666929473cSShri Abhyankar x[idx+1] -= v[2]*s1 
+ v[3]*s2; 1676929473cSShri Abhyankar v += bs2; 1686929473cSShri Abhyankar } 1696929473cSShri Abhyankar } 1706929473cSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1716929473cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1726929473cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1736929473cSShri Abhyankar PetscFunctionReturn(0); 1746929473cSShri Abhyankar } 1756929473cSShri Abhyankar 1766929473cSShri Abhyankar #undef __FUNCT__ 177*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 178*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 179f1af5d2fSBarry Smith { 180f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181dfbe8321SBarry Smith PetscErrorCode ierr; 182690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 183690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 184f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18587828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 18687828ca2SBarry Smith PetscScalar *x,*b; 187f1af5d2fSBarry Smith 188f1af5d2fSBarry Smith PetscFunctionBegin; 189ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1901ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192f1af5d2fSBarry Smith 193f1af5d2fSBarry Smith /* forward solve the U^T */ 194f1af5d2fSBarry Smith idx = 0; 195f1af5d2fSBarry Smith for (i=0; i<n; i++) { 196f1af5d2fSBarry Smith 197f1af5d2fSBarry Smith v = aa + 9*diag[i]; 198f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 199ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 200f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 201f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 202f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 203f1af5d2fSBarry Smith v += 9; 204f1af5d2fSBarry Smith 205f1af5d2fSBarry Smith vi = 
aj + diag[i] + 1; 206f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 207f1af5d2fSBarry Smith while (nz--) { 208f1af5d2fSBarry Smith oidx = 3*(*vi++); 209f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 210f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 211f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 212f1af5d2fSBarry Smith v += 9; 213f1af5d2fSBarry Smith } 214f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 215f1af5d2fSBarry Smith idx += 3; 216f1af5d2fSBarry Smith } 217f1af5d2fSBarry Smith /* backward solve the L^T */ 218f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 219f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 220f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 221f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 222f1af5d2fSBarry Smith idt = 3*i; 223f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 224f1af5d2fSBarry Smith while (nz--) { 225f1af5d2fSBarry Smith idx = 3*(*vi--); 226f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 227f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 228f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 229f1af5d2fSBarry Smith v -= 9; 230f1af5d2fSBarry Smith } 231f1af5d2fSBarry Smith } 2321ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 235f1af5d2fSBarry Smith PetscFunctionReturn(0); 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith 2384a2ae208SSatish Balay #undef __FUNCT__ 2398499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct" 2408499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2418499736aSShri Abhyankar { 2428499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2438499736aSShri Abhyankar PetscErrorCode ierr; 2448499736aSShri Abhyankar 
PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2458499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 2468499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 2478499736aSShri Abhyankar MatScalar *aa=a->a,*v; 2488499736aSShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 2498499736aSShri Abhyankar PetscScalar *x,*b; 2508499736aSShri Abhyankar 2518499736aSShri Abhyankar PetscFunctionBegin; 2528499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2538499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2548499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2558499736aSShri Abhyankar 2568499736aSShri Abhyankar /* forward solve the U^T */ 2578499736aSShri Abhyankar idx = 0; 2588499736aSShri Abhyankar for (i=0; i<n; i++) { 2598499736aSShri Abhyankar v = aa + bs2*diag[i]; 2608499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 2618499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2628499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 2638499736aSShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 2648499736aSShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 2658499736aSShri Abhyankar v -= bs2; 2668499736aSShri Abhyankar 2678499736aSShri Abhyankar vi = aj + diag[i] - 1; 2688499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 2698499736aSShri Abhyankar for(j=0;j>-nz;j--){ 2708499736aSShri Abhyankar oidx = bs*vi[j]; 2718499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 2728499736aSShri Abhyankar x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 2738499736aSShri Abhyankar x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 2748499736aSShri Abhyankar v -= bs2; 2758499736aSShri Abhyankar } 2768499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 2778499736aSShri Abhyankar idx += bs; 2788499736aSShri Abhyankar } 2798499736aSShri Abhyankar /* backward solve the L^T */ 2808499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 2818499736aSShri Abhyankar v = aa + bs2*ai[i]; 2828499736aSShri 
Abhyankar vi = aj + ai[i]; 2838499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 2848499736aSShri Abhyankar idt = bs*i; 2858499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 2868499736aSShri Abhyankar for(j=0;j<nz;j++){ 2878499736aSShri Abhyankar idx = bs*vi[j]; 2888499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 2898499736aSShri Abhyankar x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 2908499736aSShri Abhyankar x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 2918499736aSShri Abhyankar v += bs2; 2928499736aSShri Abhyankar } 2938499736aSShri Abhyankar } 2948499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2958499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2968499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2978499736aSShri Abhyankar PetscFunctionReturn(0); 2988499736aSShri Abhyankar } 2998499736aSShri Abhyankar 3008499736aSShri Abhyankar #undef __FUNCT__ 301*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 302*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 303f1af5d2fSBarry Smith { 304f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305dfbe8321SBarry Smith PetscErrorCode ierr; 306690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 307690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 308f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 30987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 31087828ca2SBarry Smith PetscScalar *x,*b; 311f1af5d2fSBarry Smith 312f1af5d2fSBarry Smith PetscFunctionBegin; 313ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3141ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3151ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316f1af5d2fSBarry Smith 317f1af5d2fSBarry Smith /* forward solve the U^T */ 318f1af5d2fSBarry Smith idx = 0; 319f1af5d2fSBarry Smith for (i=0; i<n; i++) { 
320f1af5d2fSBarry Smith 321f1af5d2fSBarry Smith v = aa + 16*diag[i]; 322f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 323ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 324f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 325f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 326f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 327f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 328f1af5d2fSBarry Smith v += 16; 329f1af5d2fSBarry Smith 330f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 331f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 332f1af5d2fSBarry Smith while (nz--) { 333f1af5d2fSBarry Smith oidx = 4*(*vi++); 334f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 335f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 336f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 337f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 338f1af5d2fSBarry Smith v += 16; 339f1af5d2fSBarry Smith } 340f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 341f1af5d2fSBarry Smith idx += 4; 342f1af5d2fSBarry Smith } 343f1af5d2fSBarry Smith /* backward solve the L^T */ 344f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 345f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 346f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 347f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 348f1af5d2fSBarry Smith idt = 4*i; 349f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 350f1af5d2fSBarry Smith while (nz--) { 351f1af5d2fSBarry Smith idx = 4*(*vi--); 352f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 353f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 354f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 355f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 
356f1af5d2fSBarry Smith v -= 16; 357f1af5d2fSBarry Smith } 358f1af5d2fSBarry Smith } 3591ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3601ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 361dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 362f1af5d2fSBarry Smith PetscFunctionReturn(0); 363f1af5d2fSBarry Smith } 364f1af5d2fSBarry Smith 3654a2ae208SSatish Balay #undef __FUNCT__ 3668499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3678499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3688499736aSShri Abhyankar { 3698499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3708499736aSShri Abhyankar PetscErrorCode ierr; 3718499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 3728499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 3738499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 3748499736aSShri Abhyankar MatScalar *aa=a->a,*v; 3758499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3768499736aSShri Abhyankar PetscScalar *x,*b; 3778499736aSShri Abhyankar 3788499736aSShri Abhyankar PetscFunctionBegin; 3798499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3808499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3818499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3828499736aSShri Abhyankar 3838499736aSShri Abhyankar /* forward solve the U^T */ 3848499736aSShri Abhyankar idx = 0; 3858499736aSShri Abhyankar for (i=0; i<n; i++) { 3868499736aSShri Abhyankar v = aa + bs2*diag[i]; 3878499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 3888499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 3898499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 3908499736aSShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 3918499736aSShri 
Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 3928499736aSShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 3938499736aSShri Abhyankar v -= bs2; 3948499736aSShri Abhyankar 3958499736aSShri Abhyankar vi = aj + diag[i] - 1; 3968499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 3978499736aSShri Abhyankar for(j=0;j>-nz;j--){ 3988499736aSShri Abhyankar oidx = bs*vi[j]; 3998499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4008499736aSShri Abhyankar x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4018499736aSShri Abhyankar x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4028499736aSShri Abhyankar x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4038499736aSShri Abhyankar v -= bs2; 4048499736aSShri Abhyankar } 4058499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 4068499736aSShri Abhyankar idx += bs; 4078499736aSShri Abhyankar } 4088499736aSShri Abhyankar /* backward solve the L^T */ 4098499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 4108499736aSShri Abhyankar v = aa + bs2*ai[i]; 4118499736aSShri Abhyankar vi = aj + ai[i]; 4128499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 4138499736aSShri Abhyankar idt = bs*i; 4148499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 4158499736aSShri Abhyankar for(j=0;j<nz;j++){ 4168499736aSShri Abhyankar idx = bs*vi[j]; 4178499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4188499736aSShri Abhyankar x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4198499736aSShri Abhyankar x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4208499736aSShri Abhyankar x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4218499736aSShri Abhyankar v += bs2; 4228499736aSShri Abhyankar } 4238499736aSShri Abhyankar } 4248499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4258499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4268499736aSShri Abhyankar ierr = 
PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4278499736aSShri Abhyankar PetscFunctionReturn(0); 4288499736aSShri Abhyankar } 4298499736aSShri Abhyankar 4308499736aSShri Abhyankar #undef __FUNCT__ 431*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 432*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 433f1af5d2fSBarry Smith { 434f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 435dfbe8321SBarry Smith PetscErrorCode ierr; 436690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 437690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 438f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 43987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 44087828ca2SBarry Smith PetscScalar *x,*b; 441f1af5d2fSBarry Smith 442f1af5d2fSBarry Smith PetscFunctionBegin; 443ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4441ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4451ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 446f1af5d2fSBarry Smith 447f1af5d2fSBarry Smith /* forward solve the U^T */ 448f1af5d2fSBarry Smith idx = 0; 449f1af5d2fSBarry Smith for (i=0; i<n; i++) { 450f1af5d2fSBarry Smith 451f1af5d2fSBarry Smith v = aa + 25*diag[i]; 452f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 453ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 454f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 455f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 456f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 457f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 458f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 459f1af5d2fSBarry Smith v += 25; 460f1af5d2fSBarry Smith 461f1af5d2fSBarry Smith vi = aj + 
diag[i] + 1; 462f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 463f1af5d2fSBarry Smith while (nz--) { 464f1af5d2fSBarry Smith oidx = 5*(*vi++); 465f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 466f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 467f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 468f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 469f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 470f1af5d2fSBarry Smith v += 25; 471f1af5d2fSBarry Smith } 472f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 473f1af5d2fSBarry Smith idx += 5; 474f1af5d2fSBarry Smith } 475f1af5d2fSBarry Smith /* backward solve the L^T */ 476f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 477f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 478f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 479f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 480f1af5d2fSBarry Smith idt = 5*i; 481f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 482f1af5d2fSBarry Smith while (nz--) { 483f1af5d2fSBarry Smith idx = 5*(*vi--); 484f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 485f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 486f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 487f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 488f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 489f1af5d2fSBarry Smith v -= 25; 490f1af5d2fSBarry Smith } 491f1af5d2fSBarry Smith } 4921ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 494dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 
5.0*A->cmap->n);CHKERRQ(ierr); 495f1af5d2fSBarry Smith PetscFunctionReturn(0); 496f1af5d2fSBarry Smith } 497f1af5d2fSBarry Smith 4984a2ae208SSatish Balay #undef __FUNCT__ 4998499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct" 5008499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 5018499736aSShri Abhyankar { 5028499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5038499736aSShri Abhyankar PetscErrorCode ierr; 5048499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5058499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 5068499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 5078499736aSShri Abhyankar MatScalar *aa=a->a,*v; 5088499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 5098499736aSShri Abhyankar PetscScalar *x,*b; 5108499736aSShri Abhyankar 5118499736aSShri Abhyankar PetscFunctionBegin; 5128499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 5138499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5148499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5158499736aSShri Abhyankar 5168499736aSShri Abhyankar /* forward solve the U^T */ 5178499736aSShri Abhyankar idx = 0; 5188499736aSShri Abhyankar for (i=0; i<n; i++) { 5198499736aSShri Abhyankar v = aa + bs2*diag[i]; 5208499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 5218499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 5228499736aSShri Abhyankar x5 = x[4+idx]; 5238499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 5248499736aSShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 5258499736aSShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 5268499736aSShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 5278499736aSShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + 
v[22]*x3 + v[23]*x4 + v[24]*x5; 5288499736aSShri Abhyankar v -= bs2; 5298499736aSShri Abhyankar 5308499736aSShri Abhyankar vi = aj + diag[i] - 1; 5318499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 5328499736aSShri Abhyankar for(j=0;j>-nz;j--){ 5338499736aSShri Abhyankar oidx = bs*vi[j]; 5348499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5358499736aSShri Abhyankar x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5368499736aSShri Abhyankar x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5378499736aSShri Abhyankar x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5388499736aSShri Abhyankar x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5398499736aSShri Abhyankar v -= bs2; 5408499736aSShri Abhyankar } 5418499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 5428499736aSShri Abhyankar idx += bs; 5438499736aSShri Abhyankar } 5448499736aSShri Abhyankar /* backward solve the L^T */ 5458499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 5468499736aSShri Abhyankar v = aa + bs2*ai[i]; 5478499736aSShri Abhyankar vi = aj + ai[i]; 5488499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 5498499736aSShri Abhyankar idt = bs*i; 5508499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 5518499736aSShri Abhyankar for(j=0;j<nz;j++){ 5528499736aSShri Abhyankar idx = bs*vi[j]; 5538499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5548499736aSShri Abhyankar x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5558499736aSShri Abhyankar x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5568499736aSShri Abhyankar x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5578499736aSShri Abhyankar x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5588499736aSShri Abhyankar v += bs2; 5598499736aSShri Abhyankar } 5608499736aSShri 
Abhyankar } 5618499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5628499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5638499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5648499736aSShri Abhyankar PetscFunctionReturn(0); 5658499736aSShri Abhyankar } 5668499736aSShri Abhyankar 5678499736aSShri Abhyankar #undef __FUNCT__ 568*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 569*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 570f1af5d2fSBarry Smith { 571f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 572dfbe8321SBarry Smith PetscErrorCode ierr; 573690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 574690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 575f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 57687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 57787828ca2SBarry Smith PetscScalar *x,*b; 578f1af5d2fSBarry Smith 579f1af5d2fSBarry Smith PetscFunctionBegin; 580ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 5811ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 583f1af5d2fSBarry Smith 584f1af5d2fSBarry Smith /* forward solve the U^T */ 585f1af5d2fSBarry Smith idx = 0; 586f1af5d2fSBarry Smith for (i=0; i<n; i++) { 587f1af5d2fSBarry Smith 588f1af5d2fSBarry Smith v = aa + 36*diag[i]; 589f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 590ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 591ef66eb69SBarry Smith x6 = x[5+idx]; 592f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 593f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 594f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + 
v[17]*x6; 595f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 596f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 597f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 598f1af5d2fSBarry Smith v += 36; 599f1af5d2fSBarry Smith 600f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 601f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 602f1af5d2fSBarry Smith while (nz--) { 603f1af5d2fSBarry Smith oidx = 6*(*vi++); 604f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 605f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 606f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 607f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 608f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 609f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 610f1af5d2fSBarry Smith v += 36; 611f1af5d2fSBarry Smith } 612f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 613f1af5d2fSBarry Smith x[5+idx] = s6; 614f1af5d2fSBarry Smith idx += 6; 615f1af5d2fSBarry Smith } 616f1af5d2fSBarry Smith /* backward solve the L^T */ 617f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 618f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 619f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 620f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 621f1af5d2fSBarry Smith idt = 6*i; 622f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 623f1af5d2fSBarry Smith s6 = x[5+idt]; 624f1af5d2fSBarry Smith while (nz--) { 625f1af5d2fSBarry Smith idx = 6*(*vi--); 626f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627f1af5d2fSBarry Smith 
x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632f1af5d2fSBarry Smith v -= 36; 633f1af5d2fSBarry Smith } 634f1af5d2fSBarry Smith } 6351ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6361ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 637dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 638f1af5d2fSBarry Smith PetscFunctionReturn(0); 639f1af5d2fSBarry Smith } 640f1af5d2fSBarry Smith 6414a2ae208SSatish Balay #undef __FUNCT__ 6428499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct" 6438499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 6448499736aSShri Abhyankar { 6458499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 6468499736aSShri Abhyankar PetscErrorCode ierr; 6478499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 6488499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 6498499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 6508499736aSShri Abhyankar MatScalar *aa=a->a,*v; 6518499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 6528499736aSShri Abhyankar PetscScalar *x,*b; 6538499736aSShri Abhyankar 6548499736aSShri Abhyankar PetscFunctionBegin; 6558499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 6568499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6578499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 6588499736aSShri Abhyankar 6598499736aSShri 
Abhyankar /* forward solve the U^T */ 6608499736aSShri Abhyankar idx = 0; 6618499736aSShri Abhyankar for (i=0; i<n; i++) { 6628499736aSShri Abhyankar v = aa + bs2*diag[i]; 6638499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 6648499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 6658499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; 6668499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 6678499736aSShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 6688499736aSShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 6698499736aSShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 6708499736aSShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 6718499736aSShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 6728499736aSShri Abhyankar v -= bs2; 6738499736aSShri Abhyankar 6748499736aSShri Abhyankar vi = aj + diag[i] - 1; 6758499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 6768499736aSShri Abhyankar for(j=0;j>-nz;j--){ 6778499736aSShri Abhyankar oidx = bs*vi[j]; 6788499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 6798499736aSShri Abhyankar x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 6808499736aSShri Abhyankar x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 6818499736aSShri Abhyankar x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 6828499736aSShri Abhyankar x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 6838499736aSShri Abhyankar x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 6848499736aSShri Abhyankar v -= bs2; 6858499736aSShri Abhyankar } 6868499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 
x[4+idx] = s5; 6878499736aSShri Abhyankar x[5+idx] = s6; 6888499736aSShri Abhyankar idx += bs; 6898499736aSShri Abhyankar } 6908499736aSShri Abhyankar /* backward solve the L^T */ 6918499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 6928499736aSShri Abhyankar v = aa + bs2*ai[i]; 6938499736aSShri Abhyankar vi = aj + ai[i]; 6948499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 6958499736aSShri Abhyankar idt = bs*i; 6968499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 6978499736aSShri Abhyankar s6 = x[5+idt]; 6988499736aSShri Abhyankar for(j=0;j<nz;j++){ 6998499736aSShri Abhyankar idx = bs*vi[j]; 7008499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 7018499736aSShri Abhyankar x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 7028499736aSShri Abhyankar x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7038499736aSShri Abhyankar x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7048499736aSShri Abhyankar x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 7058499736aSShri Abhyankar x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7068499736aSShri Abhyankar v += bs2; 7078499736aSShri Abhyankar } 7088499736aSShri Abhyankar } 7098499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7108499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 7118499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 7128499736aSShri Abhyankar PetscFunctionReturn(0); 7138499736aSShri Abhyankar } 7148499736aSShri Abhyankar 7158499736aSShri Abhyankar #undef __FUNCT__ 716*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 717*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 718f1af5d2fSBarry Smith { 719f1af5d2fSBarry Smith 
Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 720dfbe8321SBarry Smith PetscErrorCode ierr; 721690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 722690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 723f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 72487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 72587828ca2SBarry Smith PetscScalar *x,*b; 726f1af5d2fSBarry Smith 727f1af5d2fSBarry Smith PetscFunctionBegin; 728ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 7291ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7301ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 731f1af5d2fSBarry Smith 732f1af5d2fSBarry Smith /* forward solve the U^T */ 733f1af5d2fSBarry Smith idx = 0; 734f1af5d2fSBarry Smith for (i=0; i<n; i++) { 735f1af5d2fSBarry Smith 736f1af5d2fSBarry Smith v = aa + 49*diag[i]; 737f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 738ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 739ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 740f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 741f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 742f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 743f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 744f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 745f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 746f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 747f1af5d2fSBarry Smith v += 49; 748f1af5d2fSBarry Smith 749f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 750f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 751f1af5d2fSBarry Smith 
while (nz--) { 752f1af5d2fSBarry Smith oidx = 7*(*vi++); 753f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 754f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 755f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 756f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 757f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 758f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 759f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 760f1af5d2fSBarry Smith v += 49; 761f1af5d2fSBarry Smith } 762f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 763f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 764f1af5d2fSBarry Smith idx += 7; 765f1af5d2fSBarry Smith } 766f1af5d2fSBarry Smith /* backward solve the L^T */ 767f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 768f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 769f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 770f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 771f1af5d2fSBarry Smith idt = 7*i; 772f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 773f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 774f1af5d2fSBarry Smith while (nz--) { 775f1af5d2fSBarry Smith idx = 7*(*vi--); 776f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 777f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 778f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 779f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + 
v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 780f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 781f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 782f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 783f1af5d2fSBarry Smith v -= 49; 784f1af5d2fSBarry Smith } 785f1af5d2fSBarry Smith } 7861ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7871ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 788dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 789f1af5d2fSBarry Smith PetscFunctionReturn(0); 790f1af5d2fSBarry Smith } 7918499736aSShri Abhyankar #undef __FUNCT__ 7928499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct" 7938499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 7948499736aSShri Abhyankar { 7958499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 7968499736aSShri Abhyankar PetscErrorCode ierr; 7978499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 7988499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 7998499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 8008499736aSShri Abhyankar MatScalar *aa=a->a,*v; 8018499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 8028499736aSShri Abhyankar PetscScalar *x,*b; 8038499736aSShri Abhyankar 8048499736aSShri Abhyankar PetscFunctionBegin; 8058499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 8068499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8078499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 8088499736aSShri Abhyankar 8098499736aSShri Abhyankar /* forward solve the U^T */ 8108499736aSShri Abhyankar idx = 0; 8118499736aSShri Abhyankar for 
(i=0; i<n; i++) { 8128499736aSShri Abhyankar v = aa + bs2*diag[i]; 8138499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 8148499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 8158499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 8168499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 8178499736aSShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 8188499736aSShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 8198499736aSShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 8208499736aSShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 8218499736aSShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 8228499736aSShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 8238499736aSShri Abhyankar v -= bs2; 8248499736aSShri Abhyankar vi = aj + diag[i] - 1; 8258499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 8268499736aSShri Abhyankar for(j=0;j>-nz;j--){ 8278499736aSShri Abhyankar oidx = bs*vi[j]; 8288499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8298499736aSShri Abhyankar x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8308499736aSShri Abhyankar x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8318499736aSShri Abhyankar x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8328499736aSShri Abhyankar x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8338499736aSShri Abhyankar x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8348499736aSShri Abhyankar 
x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8358499736aSShri Abhyankar v -= bs2; 8368499736aSShri Abhyankar } 8378499736aSShri Abhyankar x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 8388499736aSShri Abhyankar x[5+idx] = s6; x[6+idx] = s7; 8398499736aSShri Abhyankar idx += bs; 8408499736aSShri Abhyankar } 8418499736aSShri Abhyankar /* backward solve the L^T */ 8428499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 8438499736aSShri Abhyankar v = aa + bs2*ai[i]; 8448499736aSShri Abhyankar vi = aj + ai[i]; 8458499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 8468499736aSShri Abhyankar idt = bs*i; 8478499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 8488499736aSShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; 8498499736aSShri Abhyankar for(j=0;j<nz;j++){ 8508499736aSShri Abhyankar idx = bs*vi[j]; 8518499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8528499736aSShri Abhyankar x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8538499736aSShri Abhyankar x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8548499736aSShri Abhyankar x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8558499736aSShri Abhyankar x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8568499736aSShri Abhyankar x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8578499736aSShri Abhyankar x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8588499736aSShri Abhyankar v += bs2; 8598499736aSShri Abhyankar } 8608499736aSShri Abhyankar } 8618499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8628499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 8638499736aSShri Abhyankar ierr = 
PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 8648499736aSShri Abhyankar PetscFunctionReturn(0); 8658499736aSShri Abhyankar } 866f1af5d2fSBarry Smith 867f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 8684a2ae208SSatish Balay #undef __FUNCT__ 869*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 870*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 871f1af5d2fSBarry Smith { 872f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 873f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 8746849ba73SBarry Smith PetscErrorCode ierr; 8755d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8765d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 877690b6cddSBarry Smith PetscInt *diag = a->diag; 878f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 87987828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 880f1af5d2fSBarry Smith 881f1af5d2fSBarry Smith PetscFunctionBegin; 8821ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8831ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 884f1af5d2fSBarry Smith t = a->solve_work; 885f1af5d2fSBarry Smith 886f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 887f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 888f1af5d2fSBarry Smith 889f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 890f1af5d2fSBarry Smith for (i=0; i<n; i++) { 891f1af5d2fSBarry Smith t[i] = b[c[i]]; 892f1af5d2fSBarry Smith } 893f1af5d2fSBarry Smith 894f1af5d2fSBarry Smith /* forward solve the U^T */ 895f1af5d2fSBarry Smith for (i=0; i<n; i++) { 896f1af5d2fSBarry Smith 897f1af5d2fSBarry Smith v = aa + diag[i]; 898f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 899f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 900f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 901f1af5d2fSBarry Smith nz = 
ai[i+1] - diag[i] - 1; 902f1af5d2fSBarry Smith while (nz--) { 903f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 904f1af5d2fSBarry Smith } 905f1af5d2fSBarry Smith t[i] = s1; 906f1af5d2fSBarry Smith } 907f1af5d2fSBarry Smith /* backward solve the L^T */ 908f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 909f1af5d2fSBarry Smith v = aa + diag[i] - 1; 910f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 911f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 912f1af5d2fSBarry Smith s1 = t[i]; 913f1af5d2fSBarry Smith while (nz--) { 914f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 915f1af5d2fSBarry Smith } 916f1af5d2fSBarry Smith } 917f1af5d2fSBarry Smith 918f1af5d2fSBarry Smith /* copy t into x according to permutation */ 919f1af5d2fSBarry Smith for (i=0; i<n; i++) { 920f1af5d2fSBarry Smith x[r[i]] = t[i]; 921f1af5d2fSBarry Smith } 922f1af5d2fSBarry Smith 923f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 924f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9251ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 9261ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 927dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 928f1af5d2fSBarry Smith PetscFunctionReturn(0); 929f1af5d2fSBarry Smith } 930f1af5d2fSBarry Smith 9314a2ae208SSatish Balay #undef __FUNCT__ 932*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 933*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 934f1af5d2fSBarry Smith { 935f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 936f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9376849ba73SBarry Smith PetscErrorCode ierr; 9385d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9395d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 940690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 941f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 94287828ca2SBarry Smith PetscScalar 
s1,s2,x1,x2; 94387828ca2SBarry Smith PetscScalar *x,*b,*t; 944f1af5d2fSBarry Smith 945f1af5d2fSBarry Smith PetscFunctionBegin; 9461ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9471ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 948f1af5d2fSBarry Smith t = a->solve_work; 949f1af5d2fSBarry Smith 950f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 951f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 952f1af5d2fSBarry Smith 953f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 954f1af5d2fSBarry Smith ii = 0; 955f1af5d2fSBarry Smith for (i=0; i<n; i++) { 956f1af5d2fSBarry Smith ic = 2*c[i]; 957f1af5d2fSBarry Smith t[ii] = b[ic]; 958f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 959f1af5d2fSBarry Smith ii += 2; 960f1af5d2fSBarry Smith } 961f1af5d2fSBarry Smith 962f1af5d2fSBarry Smith /* forward solve the U^T */ 963f1af5d2fSBarry Smith idx = 0; 964f1af5d2fSBarry Smith for (i=0; i<n; i++) { 965f1af5d2fSBarry Smith 966f1af5d2fSBarry Smith v = aa + 4*diag[i]; 967f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 968f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 969f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 970f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 971f1af5d2fSBarry Smith v += 4; 972f1af5d2fSBarry Smith 973f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 974f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 975f1af5d2fSBarry Smith while (nz--) { 976f1af5d2fSBarry Smith oidx = 2*(*vi++); 977f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 978f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 979f1af5d2fSBarry Smith v += 4; 980f1af5d2fSBarry Smith } 981f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 982f1af5d2fSBarry Smith idx += 2; 983f1af5d2fSBarry Smith } 984f1af5d2fSBarry Smith /* backward solve the L^T */ 985f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 986f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 987f1af5d2fSBarry Smith vi = aj + 
diag[i] - 1; 988f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 989f1af5d2fSBarry Smith idt = 2*i; 990f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 991f1af5d2fSBarry Smith while (nz--) { 992f1af5d2fSBarry Smith idx = 2*(*vi--); 993f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 994f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 995f1af5d2fSBarry Smith v -= 4; 996f1af5d2fSBarry Smith } 997f1af5d2fSBarry Smith } 998f1af5d2fSBarry Smith 999f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1000f1af5d2fSBarry Smith ii = 0; 1001f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1002f1af5d2fSBarry Smith ir = 2*r[i]; 1003f1af5d2fSBarry Smith x[ir] = t[ii]; 1004f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1005f1af5d2fSBarry Smith ii += 2; 1006f1af5d2fSBarry Smith } 1007f1af5d2fSBarry Smith 1008f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1009f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 10101ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10111ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1012dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1013f1af5d2fSBarry Smith PetscFunctionReturn(0); 1014f1af5d2fSBarry Smith } 1015f1af5d2fSBarry Smith 10164a2ae208SSatish Balay #undef __FUNCT__ 101732121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_newdatastruct" 101832121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 101932121132SShri Abhyankar { 102032121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 102132121132SShri Abhyankar PetscErrorCode ierr; 102232121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 102332121132SShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 102432121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 102532121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 102632121132SShri Abhyankar PetscInt 
bs=A->rmap->bs,bs2=a->bs2; 102732121132SShri Abhyankar MatScalar *aa=a->a,*v; 102832121132SShri Abhyankar PetscScalar s1,s2,x1,x2; 102932121132SShri Abhyankar PetscScalar *x,*b,*t; 103032121132SShri Abhyankar 103132121132SShri Abhyankar PetscFunctionBegin; 103232121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 103332121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 103432121132SShri Abhyankar t = a->solve_work; 103532121132SShri Abhyankar 103632121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 103732121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 103832121132SShri Abhyankar 103932121132SShri Abhyankar /* copy b into temp work space according to permutation */ 104032121132SShri Abhyankar for(i=0;i<n;i++){ 104132121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 104232121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; 104332121132SShri Abhyankar } 104432121132SShri Abhyankar 104532121132SShri Abhyankar /* forward solve the U^T */ 104632121132SShri Abhyankar idx = 0; 104732121132SShri Abhyankar for (i=0; i<n; i++) { 104832121132SShri Abhyankar v = aa + bs2*diag[i]; 104932121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 105032121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 105132121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 105232121132SShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 105332121132SShri Abhyankar v -= bs2; 105432121132SShri Abhyankar 105532121132SShri Abhyankar vi = aj + diag[i] - 1; 105632121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 105732121132SShri Abhyankar for(j=0;j>-nz;j--){ 105832121132SShri Abhyankar oidx = bs*vi[j]; 105932121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2; 106032121132SShri Abhyankar t[oidx+1] -= v[2]*s1 + v[3]*s2; 106132121132SShri Abhyankar v -= bs2; 106232121132SShri Abhyankar } 106332121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 106432121132SShri Abhyankar idx += bs; 106532121132SShri Abhyankar } 
106632121132SShri Abhyankar /* backward solve the L^T */ 106732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 106832121132SShri Abhyankar v = aa + bs2*ai[i]; 106932121132SShri Abhyankar vi = aj + ai[i]; 107032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 107132121132SShri Abhyankar idt = bs*i; 107232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 107332121132SShri Abhyankar for(j=0;j<nz;j++){ 107432121132SShri Abhyankar idx = bs*vi[j]; 107532121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2; 107632121132SShri Abhyankar t[idx+1] -= v[2]*s1 + v[3]*s2; 107732121132SShri Abhyankar v += bs2; 107832121132SShri Abhyankar } 107932121132SShri Abhyankar } 108032121132SShri Abhyankar 108132121132SShri Abhyankar /* copy t into x according to permutation */ 108232121132SShri Abhyankar for(i=0;i<n;i++){ 108332121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 108432121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; 108532121132SShri Abhyankar } 108632121132SShri Abhyankar 108732121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 108832121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 108932121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 109032121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109132121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 109232121132SShri Abhyankar PetscFunctionReturn(0); 109332121132SShri Abhyankar } 109432121132SShri Abhyankar 109532121132SShri Abhyankar #undef __FUNCT__ 1096*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 1097*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1098f1af5d2fSBarry Smith { 1099f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1100f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 11016849ba73SBarry Smith PetscErrorCode ierr; 11025d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 11035d0c19d7SBarry Smith PetscInt 
i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1104690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1105f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 110687828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 110787828ca2SBarry Smith PetscScalar *x,*b,*t; 1108f1af5d2fSBarry Smith 1109f1af5d2fSBarry Smith PetscFunctionBegin; 11101ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11111ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1112f1af5d2fSBarry Smith t = a->solve_work; 1113f1af5d2fSBarry Smith 1114f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1115f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1116f1af5d2fSBarry Smith 1117f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1118f1af5d2fSBarry Smith ii = 0; 1119f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1120f1af5d2fSBarry Smith ic = 3*c[i]; 1121f1af5d2fSBarry Smith t[ii] = b[ic]; 1122f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1123f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1124f1af5d2fSBarry Smith ii += 3; 1125f1af5d2fSBarry Smith } 1126f1af5d2fSBarry Smith 1127f1af5d2fSBarry Smith /* forward solve the U^T */ 1128f1af5d2fSBarry Smith idx = 0; 1129f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1130f1af5d2fSBarry Smith 1131f1af5d2fSBarry Smith v = aa + 9*diag[i]; 1132f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1133f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1134f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1135f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1136f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1137f1af5d2fSBarry Smith v += 9; 1138f1af5d2fSBarry Smith 1139f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1140f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1141f1af5d2fSBarry Smith while (nz--) { 1142f1af5d2fSBarry Smith oidx = 3*(*vi++); 1143f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1144f1af5d2fSBarry Smith 
t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1145f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1146f1af5d2fSBarry Smith v += 9; 1147f1af5d2fSBarry Smith } 1148f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1149f1af5d2fSBarry Smith idx += 3; 1150f1af5d2fSBarry Smith } 1151f1af5d2fSBarry Smith /* backward solve the L^T */ 1152f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1153f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 1154f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1155f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1156f1af5d2fSBarry Smith idt = 3*i; 1157f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1158f1af5d2fSBarry Smith while (nz--) { 1159f1af5d2fSBarry Smith idx = 3*(*vi--); 1160f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1161f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1162f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1163f1af5d2fSBarry Smith v -= 9; 1164f1af5d2fSBarry Smith } 1165f1af5d2fSBarry Smith } 1166f1af5d2fSBarry Smith 1167f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1168f1af5d2fSBarry Smith ii = 0; 1169f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1170f1af5d2fSBarry Smith ir = 3*r[i]; 1171f1af5d2fSBarry Smith x[ir] = t[ii]; 1172f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1173f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1174f1af5d2fSBarry Smith ii += 3; 1175f1af5d2fSBarry Smith } 1176f1af5d2fSBarry Smith 1177f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1178f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11791ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11801ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1181dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1182f1af5d2fSBarry Smith PetscFunctionReturn(0); 1183f1af5d2fSBarry Smith } 1184f1af5d2fSBarry Smith 11854a2ae208SSatish Balay #undef __FUNCT__ 118632121132SShri Abhyankar 
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_newdatastruct" 118732121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 118832121132SShri Abhyankar { 118932121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 119032121132SShri Abhyankar PetscErrorCode ierr; 119132121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 119232121132SShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 119332121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 119432121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 119532121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 119632121132SShri Abhyankar MatScalar *aa=a->a,*v; 119732121132SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 119832121132SShri Abhyankar PetscScalar *x,*b,*t; 119932121132SShri Abhyankar 120032121132SShri Abhyankar PetscFunctionBegin; 120132121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 120232121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 120332121132SShri Abhyankar t = a->solve_work; 120432121132SShri Abhyankar 120532121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 120632121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 120732121132SShri Abhyankar 120832121132SShri Abhyankar /* copy b into temp work space according to permutation */ 120932121132SShri Abhyankar for(i=0;i<n;i++){ 121032121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 121132121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 121232121132SShri Abhyankar } 121332121132SShri Abhyankar 121432121132SShri Abhyankar /* forward solve the U^T */ 121532121132SShri Abhyankar idx = 0; 121632121132SShri Abhyankar for (i=0; i<n; i++) { 121732121132SShri Abhyankar v = aa + bs2*diag[i]; 121832121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 121932121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 122032121132SShri Abhyankar s1 = 
v[0]*x1 + v[1]*x2 + v[2]*x3; 122132121132SShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 122232121132SShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 122332121132SShri Abhyankar v -= bs2; 122432121132SShri Abhyankar 122532121132SShri Abhyankar vi = aj + diag[i] - 1; 122632121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 122732121132SShri Abhyankar for(j=0;j>-nz;j--){ 122832121132SShri Abhyankar oidx = bs*vi[j]; 122932121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 123032121132SShri Abhyankar t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 123132121132SShri Abhyankar t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 123232121132SShri Abhyankar v -= bs2; 123332121132SShri Abhyankar } 123432121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 123532121132SShri Abhyankar idx += bs; 123632121132SShri Abhyankar } 123732121132SShri Abhyankar /* backward solve the L^T */ 123832121132SShri Abhyankar for (i=n-1; i>=0; i--){ 123932121132SShri Abhyankar v = aa + bs2*ai[i]; 124032121132SShri Abhyankar vi = aj + ai[i]; 124132121132SShri Abhyankar nz = ai[i+1] - ai[i]; 124232121132SShri Abhyankar idt = bs*i; 124332121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 124432121132SShri Abhyankar for(j=0;j<nz;j++){ 124532121132SShri Abhyankar idx = bs*vi[j]; 124632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 124732121132SShri Abhyankar t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 124832121132SShri Abhyankar t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 124932121132SShri Abhyankar v += bs2; 125032121132SShri Abhyankar } 125132121132SShri Abhyankar } 125232121132SShri Abhyankar 125332121132SShri Abhyankar /* copy t into x according to permutation */ 125432121132SShri Abhyankar for(i=0;i<n;i++){ 125532121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 125632121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 125732121132SShri Abhyankar } 125832121132SShri Abhyankar 125932121132SShri Abhyankar ierr = 
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 126032121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 126132121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 126232121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 126332121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 126432121132SShri Abhyankar PetscFunctionReturn(0); 126532121132SShri Abhyankar } 126632121132SShri Abhyankar 126732121132SShri Abhyankar #undef __FUNCT__ 1268*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 1269*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1270f1af5d2fSBarry Smith { 1271f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1272f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 12736849ba73SBarry Smith PetscErrorCode ierr; 12745d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 12755d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1276690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1277f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 127887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 127987828ca2SBarry Smith PetscScalar *x,*b,*t; 1280f1af5d2fSBarry Smith 1281f1af5d2fSBarry Smith PetscFunctionBegin; 12821ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 12831ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1284f1af5d2fSBarry Smith t = a->solve_work; 1285f1af5d2fSBarry Smith 1286f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1287f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1288f1af5d2fSBarry Smith 1289f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1290f1af5d2fSBarry Smith ii = 0; 1291f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1292f1af5d2fSBarry Smith ic = 4*c[i]; 1293f1af5d2fSBarry Smith t[ii] = b[ic]; 1294f1af5d2fSBarry Smith t[ii+1] = 
b[ic+1]; 1295f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1296f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1297f1af5d2fSBarry Smith ii += 4; 1298f1af5d2fSBarry Smith } 1299f1af5d2fSBarry Smith 1300f1af5d2fSBarry Smith /* forward solve the U^T */ 1301f1af5d2fSBarry Smith idx = 0; 1302f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1303f1af5d2fSBarry Smith 1304f1af5d2fSBarry Smith v = aa + 16*diag[i]; 1305f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1306f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1307f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1308f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1309f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1310f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1311f1af5d2fSBarry Smith v += 16; 1312f1af5d2fSBarry Smith 1313f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1314f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1315f1af5d2fSBarry Smith while (nz--) { 1316f1af5d2fSBarry Smith oidx = 4*(*vi++); 1317f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1318f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1319f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1320f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1321f1af5d2fSBarry Smith v += 16; 1322f1af5d2fSBarry Smith } 1323f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1324f1af5d2fSBarry Smith idx += 4; 1325f1af5d2fSBarry Smith } 1326f1af5d2fSBarry Smith /* backward solve the L^T */ 1327f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1328f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 1329f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1330f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1331f1af5d2fSBarry Smith idt = 4*i; 1332f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1333f1af5d2fSBarry Smith while (nz--) { 
1334f1af5d2fSBarry Smith idx = 4*(*vi--); 1335f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1336f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1337f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1338f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1339f1af5d2fSBarry Smith v -= 16; 1340f1af5d2fSBarry Smith } 1341f1af5d2fSBarry Smith } 1342f1af5d2fSBarry Smith 1343f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1344f1af5d2fSBarry Smith ii = 0; 1345f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1346f1af5d2fSBarry Smith ir = 4*r[i]; 1347f1af5d2fSBarry Smith x[ir] = t[ii]; 1348f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1349f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1350f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1351f1af5d2fSBarry Smith ii += 4; 1352f1af5d2fSBarry Smith } 1353f1af5d2fSBarry Smith 1354f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1355f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 13561ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 13571ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1358dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1359f1af5d2fSBarry Smith PetscFunctionReturn(0); 1360f1af5d2fSBarry Smith } 1361f1af5d2fSBarry Smith 13624a2ae208SSatish Balay #undef __FUNCT__ 136332121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_newdatastruct" 136432121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 136532121132SShri Abhyankar { 136632121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 136732121132SShri Abhyankar PetscErrorCode ierr; 136832121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 136932121132SShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 137032121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 
137132121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 137232121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 137332121132SShri Abhyankar MatScalar *aa=a->a,*v; 137432121132SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 137532121132SShri Abhyankar PetscScalar *x,*b,*t; 137632121132SShri Abhyankar 137732121132SShri Abhyankar PetscFunctionBegin; 137832121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 137932121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 138032121132SShri Abhyankar t = a->solve_work; 138132121132SShri Abhyankar 138232121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 138332121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 138432121132SShri Abhyankar 138532121132SShri Abhyankar /* copy b into temp work space according to permutation */ 138632121132SShri Abhyankar for(i=0;i<n;i++){ 138732121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 138832121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 138932121132SShri Abhyankar } 139032121132SShri Abhyankar 139132121132SShri Abhyankar /* forward solve the U^T */ 139232121132SShri Abhyankar idx = 0; 139332121132SShri Abhyankar for (i=0; i<n; i++) { 139432121132SShri Abhyankar v = aa + bs2*diag[i]; 139532121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 139632121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 139732121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 139832121132SShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 139932121132SShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 140032121132SShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 140132121132SShri Abhyankar v -= bs2; 140232121132SShri Abhyankar 140332121132SShri Abhyankar vi = aj + diag[i] - 1; 140432121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 140532121132SShri Abhyankar 
for(j=0;j>-nz;j--){ 140632121132SShri Abhyankar oidx = bs*vi[j]; 140732121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 140832121132SShri Abhyankar t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 140932121132SShri Abhyankar t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 141032121132SShri Abhyankar t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 141132121132SShri Abhyankar v -= bs2; 141232121132SShri Abhyankar } 141332121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 141432121132SShri Abhyankar idx += bs; 141532121132SShri Abhyankar } 141632121132SShri Abhyankar /* backward solve the L^T */ 141732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 141832121132SShri Abhyankar v = aa + bs2*ai[i]; 141932121132SShri Abhyankar vi = aj + ai[i]; 142032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 142132121132SShri Abhyankar idt = bs*i; 142232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 142332121132SShri Abhyankar for(j=0;j<nz;j++){ 142432121132SShri Abhyankar idx = bs*vi[j]; 142532121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 142632121132SShri Abhyankar t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 142732121132SShri Abhyankar t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 142832121132SShri Abhyankar t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 142932121132SShri Abhyankar v += bs2; 143032121132SShri Abhyankar } 143132121132SShri Abhyankar } 143232121132SShri Abhyankar 143332121132SShri Abhyankar /* copy t into x according to permutation */ 143432121132SShri Abhyankar for(i=0;i<n;i++){ 143532121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 143632121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 143732121132SShri Abhyankar } 143832121132SShri Abhyankar 143932121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 144032121132SShri Abhyankar ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 144132121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 144232121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 144332121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 144432121132SShri Abhyankar PetscFunctionReturn(0); 144532121132SShri Abhyankar } 144632121132SShri Abhyankar 144732121132SShri Abhyankar #undef __FUNCT__ 1448*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 1449*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1450f1af5d2fSBarry Smith { 1451f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1452f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 14536849ba73SBarry Smith PetscErrorCode ierr; 14545d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 14555d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1456690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1457f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 145887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 145987828ca2SBarry Smith PetscScalar *x,*b,*t; 1460f1af5d2fSBarry Smith 1461f1af5d2fSBarry Smith PetscFunctionBegin; 14621ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 14631ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1464f1af5d2fSBarry Smith t = a->solve_work; 1465f1af5d2fSBarry Smith 1466f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1467f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1468f1af5d2fSBarry Smith 1469f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1470f1af5d2fSBarry Smith ii = 0; 1471f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1472f1af5d2fSBarry Smith ic = 5*c[i]; 1473f1af5d2fSBarry Smith t[ii] = b[ic]; 1474f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1475f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1476f1af5d2fSBarry Smith 
t[ii+3] = b[ic+3]; 1477f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1478f1af5d2fSBarry Smith ii += 5; 1479f1af5d2fSBarry Smith } 1480f1af5d2fSBarry Smith 1481f1af5d2fSBarry Smith /* forward solve the U^T */ 1482f1af5d2fSBarry Smith idx = 0; 1483f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1484f1af5d2fSBarry Smith 1485f1af5d2fSBarry Smith v = aa + 25*diag[i]; 1486f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1487f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1488f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1489f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1490f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1491f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1492f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1493f1af5d2fSBarry Smith v += 25; 1494f1af5d2fSBarry Smith 1495f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1496f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1497f1af5d2fSBarry Smith while (nz--) { 1498f1af5d2fSBarry Smith oidx = 5*(*vi++); 1499f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1500f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1501f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1502f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1503f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1504f1af5d2fSBarry Smith v += 25; 1505f1af5d2fSBarry Smith } 1506f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1507f1af5d2fSBarry Smith idx += 5; 1508f1af5d2fSBarry Smith } 1509f1af5d2fSBarry Smith /* backward solve the L^T */ 1510f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1511f1af5d2fSBarry Smith v = aa + 25*diag[i] - 
25; 1512f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1513f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1514f1af5d2fSBarry Smith idt = 5*i; 1515f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1516f1af5d2fSBarry Smith while (nz--) { 1517f1af5d2fSBarry Smith idx = 5*(*vi--); 1518f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1519f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1520f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1521f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1522f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1523f1af5d2fSBarry Smith v -= 25; 1524f1af5d2fSBarry Smith } 1525f1af5d2fSBarry Smith } 1526f1af5d2fSBarry Smith 1527f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1528f1af5d2fSBarry Smith ii = 0; 1529f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1530f1af5d2fSBarry Smith ir = 5*r[i]; 1531f1af5d2fSBarry Smith x[ir] = t[ii]; 1532f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1533f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1534f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1535f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1536f1af5d2fSBarry Smith ii += 5; 1537f1af5d2fSBarry Smith } 1538f1af5d2fSBarry Smith 1539f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1540f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 15411ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 15421ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1543dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1544f1af5d2fSBarry Smith PetscFunctionReturn(0); 1545f1af5d2fSBarry Smith } 1546f1af5d2fSBarry Smith 15474a2ae208SSatish Balay #undef __FUNCT__ 154832121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_newdatastruct" 154932121132SShri 
Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 155032121132SShri Abhyankar { 155132121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 155232121132SShri Abhyankar PetscErrorCode ierr; 155332121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 155432121132SShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 155532121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 155632121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 155732121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 155832121132SShri Abhyankar MatScalar *aa=a->a,*v; 155932121132SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 156032121132SShri Abhyankar PetscScalar *x,*b,*t; 156132121132SShri Abhyankar 156232121132SShri Abhyankar PetscFunctionBegin; 156332121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 156432121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 156532121132SShri Abhyankar t = a->solve_work; 156632121132SShri Abhyankar 156732121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 156832121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 156932121132SShri Abhyankar 157032121132SShri Abhyankar /* copy b into temp work space according to permutation */ 157132121132SShri Abhyankar for(i=0;i<n;i++){ 157232121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 157332121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 157432121132SShri Abhyankar t[ii+4] = b[ic+4]; 157532121132SShri Abhyankar } 157632121132SShri Abhyankar 157732121132SShri Abhyankar /* forward solve the U^T */ 157832121132SShri Abhyankar idx = 0; 157932121132SShri Abhyankar for (i=0; i<n; i++) { 158032121132SShri Abhyankar v = aa + bs2*diag[i]; 158132121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 158232121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 
158332121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 158432121132SShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 158532121132SShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 158632121132SShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 158732121132SShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 158832121132SShri Abhyankar v -= bs2; 158932121132SShri Abhyankar 159032121132SShri Abhyankar vi = aj + diag[i] - 1; 159132121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 159232121132SShri Abhyankar for(j=0;j>-nz;j--){ 159332121132SShri Abhyankar oidx = bs*vi[j]; 159432121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 159532121132SShri Abhyankar t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 159632121132SShri Abhyankar t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 159732121132SShri Abhyankar t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 159832121132SShri Abhyankar t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 159932121132SShri Abhyankar v -= bs2; 160032121132SShri Abhyankar } 160132121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 160232121132SShri Abhyankar idx += bs; 160332121132SShri Abhyankar } 160432121132SShri Abhyankar /* backward solve the L^T */ 160532121132SShri Abhyankar for (i=n-1; i>=0; i--){ 160632121132SShri Abhyankar v = aa + bs2*ai[i]; 160732121132SShri Abhyankar vi = aj + ai[i]; 160832121132SShri Abhyankar nz = ai[i+1] - ai[i]; 160932121132SShri Abhyankar idt = bs*i; 161032121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 161132121132SShri Abhyankar for(j=0;j<nz;j++){ 161232121132SShri Abhyankar idx = bs*vi[j]; 161332121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 161432121132SShri Abhyankar t[idx+1] 
-= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 161532121132SShri Abhyankar t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 161632121132SShri Abhyankar t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 161732121132SShri Abhyankar t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 161832121132SShri Abhyankar v += bs2; 161932121132SShri Abhyankar } 162032121132SShri Abhyankar } 162132121132SShri Abhyankar 162232121132SShri Abhyankar /* copy t into x according to permutation */ 162332121132SShri Abhyankar for(i=0;i<n;i++){ 162432121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 162532121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 162632121132SShri Abhyankar x[ir+4] = t[ii+4]; 162732121132SShri Abhyankar } 162832121132SShri Abhyankar 162932121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 163032121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 163132121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 163232121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 163332121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 163432121132SShri Abhyankar PetscFunctionReturn(0); 163532121132SShri Abhyankar } 163632121132SShri Abhyankar 163732121132SShri Abhyankar #undef __FUNCT__ 1638*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 1639*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1640f1af5d2fSBarry Smith { 1641f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1642f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 16436849ba73SBarry Smith PetscErrorCode ierr; 16445d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 16455d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1646690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1647f1af5d2fSBarry Smith 
MatScalar *aa=a->a,*v; 164887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 164987828ca2SBarry Smith PetscScalar *x,*b,*t; 1650f1af5d2fSBarry Smith 1651f1af5d2fSBarry Smith PetscFunctionBegin; 16521ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 16531ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1654f1af5d2fSBarry Smith t = a->solve_work; 1655f1af5d2fSBarry Smith 1656f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1657f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1658f1af5d2fSBarry Smith 1659f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1660f1af5d2fSBarry Smith ii = 0; 1661f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1662f1af5d2fSBarry Smith ic = 6*c[i]; 1663f1af5d2fSBarry Smith t[ii] = b[ic]; 1664f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1665f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1666f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1667f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1668f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1669f1af5d2fSBarry Smith ii += 6; 1670f1af5d2fSBarry Smith } 1671f1af5d2fSBarry Smith 1672f1af5d2fSBarry Smith /* forward solve the U^T */ 1673f1af5d2fSBarry Smith idx = 0; 1674f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1675f1af5d2fSBarry Smith 1676f1af5d2fSBarry Smith v = aa + 36*diag[i]; 1677f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1678f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1679f1af5d2fSBarry Smith x6 = t[5+idx]; 1680f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1681f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1682f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1683f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1684f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + 
v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1685f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1686f1af5d2fSBarry Smith v += 36; 1687f1af5d2fSBarry Smith 1688f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1689f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1690f1af5d2fSBarry Smith while (nz--) { 1691f1af5d2fSBarry Smith oidx = 6*(*vi++); 1692f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1693f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1694f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1695f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1696f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1697f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1698f1af5d2fSBarry Smith v += 36; 1699f1af5d2fSBarry Smith } 1700f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1701f1af5d2fSBarry Smith t[5+idx] = s6; 1702f1af5d2fSBarry Smith idx += 6; 1703f1af5d2fSBarry Smith } 1704f1af5d2fSBarry Smith /* backward solve the L^T */ 1705f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1706f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 1707f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1708f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1709f1af5d2fSBarry Smith idt = 6*i; 1710f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1711f1af5d2fSBarry Smith s6 = t[5+idt]; 1712f1af5d2fSBarry Smith while (nz--) { 1713f1af5d2fSBarry Smith idx = 6*(*vi--); 1714f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1715f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1716f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 
+ v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1717f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1718f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1719f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1720f1af5d2fSBarry Smith v -= 36; 1721f1af5d2fSBarry Smith } 1722f1af5d2fSBarry Smith } 1723f1af5d2fSBarry Smith 1724f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1725f1af5d2fSBarry Smith ii = 0; 1726f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1727f1af5d2fSBarry Smith ir = 6*r[i]; 1728f1af5d2fSBarry Smith x[ir] = t[ii]; 1729f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1730f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1731f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1732f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1733f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1734f1af5d2fSBarry Smith ii += 6; 1735f1af5d2fSBarry Smith } 1736f1af5d2fSBarry Smith 1737f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1738f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 17391ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 17401ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1741dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1742f1af5d2fSBarry Smith PetscFunctionReturn(0); 1743f1af5d2fSBarry Smith } 1744f1af5d2fSBarry Smith 17454a2ae208SSatish Balay #undef __FUNCT__ 174632121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_newdatastruct" 174732121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 174832121132SShri Abhyankar { 174932121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 175032121132SShri Abhyankar PetscErrorCode ierr; 175132121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 175232121132SShri Abhyankar PetscInt 
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 175332121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 175432121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 175532121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 175632121132SShri Abhyankar MatScalar *aa=a->a,*v; 175732121132SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 175832121132SShri Abhyankar PetscScalar *x,*b,*t; 175932121132SShri Abhyankar 176032121132SShri Abhyankar PetscFunctionBegin; 176132121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 176232121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 176332121132SShri Abhyankar t = a->solve_work; 176432121132SShri Abhyankar 176532121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 176632121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 176732121132SShri Abhyankar 176832121132SShri Abhyankar /* copy b into temp work space according to permutation */ 176932121132SShri Abhyankar for(i=0;i<n;i++){ 177032121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 177132121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 177232121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 177332121132SShri Abhyankar } 177432121132SShri Abhyankar 177532121132SShri Abhyankar /* forward solve the U^T */ 177632121132SShri Abhyankar idx = 0; 177732121132SShri Abhyankar for (i=0; i<n; i++) { 177832121132SShri Abhyankar v = aa + bs2*diag[i]; 177932121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 178032121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 178132121132SShri Abhyankar x6 = t[5+idx]; 178232121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 178332121132SShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 178432121132SShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + 
v[16]*x5 + v[17]*x6; 178532121132SShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 178632121132SShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 178732121132SShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 178832121132SShri Abhyankar v -= bs2; 178932121132SShri Abhyankar 179032121132SShri Abhyankar vi = aj + diag[i] - 1; 179132121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 179232121132SShri Abhyankar for(j=0;j>-nz;j--){ 179332121132SShri Abhyankar oidx = bs*vi[j]; 179432121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 179532121132SShri Abhyankar t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 179632121132SShri Abhyankar t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 179732121132SShri Abhyankar t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 179832121132SShri Abhyankar t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 179932121132SShri Abhyankar t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 180032121132SShri Abhyankar v -= bs2; 180132121132SShri Abhyankar } 180232121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 180332121132SShri Abhyankar t[5+idx] = s6; 180432121132SShri Abhyankar idx += bs; 180532121132SShri Abhyankar } 180632121132SShri Abhyankar /* backward solve the L^T */ 180732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 180832121132SShri Abhyankar v = aa + bs2*ai[i]; 180932121132SShri Abhyankar vi = aj + ai[i]; 181032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 181132121132SShri Abhyankar idt = bs*i; 181232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 181332121132SShri Abhyankar s6 = t[5+idt]; 181432121132SShri Abhyankar for(j=0;j<nz;j++){ 181532121132SShri Abhyankar 
idx = bs*vi[j]; 181632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 181732121132SShri Abhyankar t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 181832121132SShri Abhyankar t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 181932121132SShri Abhyankar t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 182032121132SShri Abhyankar t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 182132121132SShri Abhyankar t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 182232121132SShri Abhyankar v += bs2; 182332121132SShri Abhyankar } 182432121132SShri Abhyankar } 182532121132SShri Abhyankar 182632121132SShri Abhyankar /* copy t into x according to permutation */ 182732121132SShri Abhyankar for(i=0;i<n;i++){ 182832121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 182932121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 183032121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 183132121132SShri Abhyankar } 183232121132SShri Abhyankar 183332121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 183432121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 183532121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 183632121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 183732121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 183832121132SShri Abhyankar PetscFunctionReturn(0); 183932121132SShri Abhyankar } 184032121132SShri Abhyankar 184132121132SShri Abhyankar #undef __FUNCT__ 1842*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 1843*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1844f1af5d2fSBarry Smith { 1845f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 
1846f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 18476849ba73SBarry Smith PetscErrorCode ierr; 18485d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 18495d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1850690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1851f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 185287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 185387828ca2SBarry Smith PetscScalar *x,*b,*t; 1854f1af5d2fSBarry Smith 1855f1af5d2fSBarry Smith PetscFunctionBegin; 18561ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 18571ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1858f1af5d2fSBarry Smith t = a->solve_work; 1859f1af5d2fSBarry Smith 1860f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1861f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1862f1af5d2fSBarry Smith 1863f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1864f1af5d2fSBarry Smith ii = 0; 1865f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1866f1af5d2fSBarry Smith ic = 7*c[i]; 1867f1af5d2fSBarry Smith t[ii] = b[ic]; 1868f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1869f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1870f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1871f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1872f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1873f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1874f1af5d2fSBarry Smith ii += 7; 1875f1af5d2fSBarry Smith } 1876f1af5d2fSBarry Smith 1877f1af5d2fSBarry Smith /* forward solve the U^T */ 1878f1af5d2fSBarry Smith idx = 0; 1879f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1880f1af5d2fSBarry Smith 1881f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1882f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1883f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1884f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1885f1af5d2fSBarry Smith s1 = 
v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1886f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1887f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1888f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1889f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1890f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1891f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1892f1af5d2fSBarry Smith v += 49; 1893f1af5d2fSBarry Smith 1894f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1895f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1896f1af5d2fSBarry Smith while (nz--) { 1897f1af5d2fSBarry Smith oidx = 7*(*vi++); 1898f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1899f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1900f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1901f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1902f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1903f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1904f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1905f1af5d2fSBarry Smith v += 49; 1906f1af5d2fSBarry Smith } 1907f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1908f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1909f1af5d2fSBarry Smith idx += 7; 1910f1af5d2fSBarry 
Smith } 1911f1af5d2fSBarry Smith /* backward solve the L^T */ 1912f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1913f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1914f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1915f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1916f1af5d2fSBarry Smith idt = 7*i; 1917f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1918f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1919f1af5d2fSBarry Smith while (nz--) { 1920f1af5d2fSBarry Smith idx = 7*(*vi--); 1921f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1922f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1923f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1924f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1925f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1926f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1927f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1928f1af5d2fSBarry Smith v -= 49; 1929f1af5d2fSBarry Smith } 1930f1af5d2fSBarry Smith } 1931f1af5d2fSBarry Smith 1932f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1933f1af5d2fSBarry Smith ii = 0; 1934f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1935f1af5d2fSBarry Smith ir = 7*r[i]; 1936f1af5d2fSBarry Smith x[ir] = t[ii]; 1937f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1938f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1939f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1940f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1941f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1942f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1943f1af5d2fSBarry Smith ii += 7; 1944f1af5d2fSBarry Smith } 1945f1af5d2fSBarry Smith 
1946f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1947f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 19481ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 19491ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1950dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1951f1af5d2fSBarry Smith PetscFunctionReturn(0); 1952f1af5d2fSBarry Smith } 195332121132SShri Abhyankar #undef __FUNCT__ 195432121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_newdatastruct" 195532121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 195632121132SShri Abhyankar { 195732121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 195832121132SShri Abhyankar PetscErrorCode ierr; 195932121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 196032121132SShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 196132121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 196232121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 196332121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 196432121132SShri Abhyankar MatScalar *aa=a->a,*v; 196532121132SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 196632121132SShri Abhyankar PetscScalar *x,*b,*t; 196732121132SShri Abhyankar 196832121132SShri Abhyankar PetscFunctionBegin; 196932121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 197032121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 197132121132SShri Abhyankar t = a->solve_work; 197232121132SShri Abhyankar 197332121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 197432121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 197532121132SShri Abhyankar 197632121132SShri Abhyankar /* copy b into temp work space according to permutation */ 197732121132SShri Abhyankar for(i=0;i<n;i++){ 
197832121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 197932121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 198032121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 198132121132SShri Abhyankar } 198232121132SShri Abhyankar 198332121132SShri Abhyankar /* forward solve the U^T */ 198432121132SShri Abhyankar idx = 0; 198532121132SShri Abhyankar for (i=0; i<n; i++) { 198632121132SShri Abhyankar v = aa + bs2*diag[i]; 198732121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 198832121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 198932121132SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 199032121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 199132121132SShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 199232121132SShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 199332121132SShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 199432121132SShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 199532121132SShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 199632121132SShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 199732121132SShri Abhyankar v -= bs2; 199832121132SShri Abhyankar 199932121132SShri Abhyankar vi = aj + diag[i] - 1; 200032121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 200132121132SShri Abhyankar for(j=0;j>-nz;j--){ 200232121132SShri Abhyankar oidx = bs*vi[j]; 200332121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 200432121132SShri Abhyankar t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 200532121132SShri Abhyankar 
t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 200632121132SShri Abhyankar t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 200732121132SShri Abhyankar t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 200832121132SShri Abhyankar t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 200932121132SShri Abhyankar t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 201032121132SShri Abhyankar v -= bs2; 201132121132SShri Abhyankar } 201232121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 201332121132SShri Abhyankar t[5+idx] = s6; t[6+idx] = s7; 201432121132SShri Abhyankar idx += bs; 201532121132SShri Abhyankar } 201632121132SShri Abhyankar /* backward solve the L^T */ 201732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 201832121132SShri Abhyankar v = aa + bs2*ai[i]; 201932121132SShri Abhyankar vi = aj + ai[i]; 202032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 202132121132SShri Abhyankar idt = bs*i; 202232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 202332121132SShri Abhyankar s6 = t[5+idt]; s7 = t[6+idt]; 202432121132SShri Abhyankar for(j=0;j<nz;j++){ 202532121132SShri Abhyankar idx = bs*vi[j]; 202632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 202732121132SShri Abhyankar t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 202832121132SShri Abhyankar t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 202932121132SShri Abhyankar t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 203032121132SShri Abhyankar t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 203132121132SShri Abhyankar t[idx+5] 
-= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 203232121132SShri Abhyankar t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 203332121132SShri Abhyankar v += bs2; 203432121132SShri Abhyankar } 203532121132SShri Abhyankar } 203632121132SShri Abhyankar 203732121132SShri Abhyankar /* copy t into x according to permutation */ 203832121132SShri Abhyankar for(i=0;i<n;i++){ 203932121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 204032121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 204132121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 204232121132SShri Abhyankar } 204332121132SShri Abhyankar 204432121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 204532121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 204632121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 204732121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 204832121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 204932121132SShri Abhyankar PetscFunctionReturn(0); 205032121132SShri Abhyankar } 2051f1af5d2fSBarry Smith 20524e2b4712SSatish Balay /* ----------------------------------------------------------- */ 20534a2ae208SSatish Balay #undef __FUNCT__ 2054*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 2055*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 20564e2b4712SSatish Balay { 20574e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 20584e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 20596849ba73SBarry Smith PetscErrorCode ierr; 20605d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 20615d0c19d7SBarry Smith PetscInt i,n=a->mbs; 20625d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 20633f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 206487828ca2SBarry Smith 
PetscScalar *x,*b,*s,*t,*ls; 20654e2b4712SSatish Balay 20664e2b4712SSatish Balay PetscFunctionBegin; 20671ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 20681ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2069f1af5d2fSBarry Smith t = a->solve_work; 20704e2b4712SSatish Balay 20714e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 20724e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 20734e2b4712SSatish Balay 20744e2b4712SSatish Balay /* forward solve the lower triangular */ 207587828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 20764e2b4712SSatish Balay for (i=1; i<n; i++) { 20774e2b4712SSatish Balay v = aa + bs2*ai[i]; 20784e2b4712SSatish Balay vi = aj + ai[i]; 20794e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 2080f1af5d2fSBarry Smith s = t + bs*i; 208187828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 20824e2b4712SSatish Balay while (nz--) { 2083f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 20844e2b4712SSatish Balay v += bs2; 20854e2b4712SSatish Balay } 20864e2b4712SSatish Balay } 20874e2b4712SSatish Balay /* backward solve the upper triangular */ 2088d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 20894e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 20904e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 20914e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 20924e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 209387828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 20944e2b4712SSatish Balay while (nz--) { 2095f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 20964e2b4712SSatish Balay v += bs2; 20974e2b4712SSatish Balay } 2098f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 209987828ca2SBarry Smith ierr = PetscMemcpy(x + 
bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 21004e2b4712SSatish Balay } 21014e2b4712SSatish Balay 21024e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21034e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21041ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 21051ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2106dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 21074e2b4712SSatish Balay PetscFunctionReturn(0); 21084e2b4712SSatish Balay } 21094e2b4712SSatish Balay 21105c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 21115c42ef9dSBarry Smith #undef __FUNCT__ 2112*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 2113*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 21145c42ef9dSBarry Smith { 21155c42ef9dSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21165c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 21175c42ef9dSBarry Smith PetscErrorCode ierr; 21185c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 21195c42ef9dSBarry Smith PetscInt i,n=a->mbs,j; 21205c42ef9dSBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 21215c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 21225c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 21235c42ef9dSBarry Smith const PetscScalar *b; 21245c42ef9dSBarry Smith PetscFunctionBegin; 21255c42ef9dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21265c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21275c42ef9dSBarry Smith t = a->solve_work; 21285c42ef9dSBarry Smith 21295c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21305c42ef9dSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 21315c42ef9dSBarry Smith 21325c42ef9dSBarry Smith /* copy the b into temp work space according to permutation 
*/ 21335c42ef9dSBarry Smith for (i=0; i<n; i++) { 21345c42ef9dSBarry Smith for (j=0; j<bs; j++) { 21355c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 21365c42ef9dSBarry Smith } 21375c42ef9dSBarry Smith } 21385c42ef9dSBarry Smith 21395c42ef9dSBarry Smith 21405c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 21415c42ef9dSBarry Smith ls = a->solve_work + A->cmap->n; 21425c42ef9dSBarry Smith for (i=0; i<n; i++){ 21435c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 21445c42ef9dSBarry Smith Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 21455c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 21465c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 21475c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 21485c42ef9dSBarry Smith while (nz--) { 21495c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 21505c42ef9dSBarry Smith v += bs2; 21515c42ef9dSBarry Smith } 21525c42ef9dSBarry Smith } 21535c42ef9dSBarry Smith 21545c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 21555c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 21565c42ef9dSBarry Smith v = aa + bs2*ai[i]; 21575c42ef9dSBarry Smith vi = aj + ai[i]; 21585c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 21595c42ef9dSBarry Smith while (nz--) { 21605c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 21615c42ef9dSBarry Smith v += bs2; 21625c42ef9dSBarry Smith } 21635c42ef9dSBarry Smith } 21645c42ef9dSBarry Smith 21655c42ef9dSBarry Smith /* copy t into x according to permutation */ 21665c42ef9dSBarry Smith for (i=0; i<n; i++) { 21675c42ef9dSBarry Smith for (j=0; j<bs; j++) { 21685c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 21695c42ef9dSBarry Smith } 21705c42ef9dSBarry Smith } 21715c42ef9dSBarry Smith 21725c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21735c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21745c42ef9dSBarry Smith 
ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21755c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 21765c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 21775c42ef9dSBarry Smith PetscFunctionReturn(0); 21785c42ef9dSBarry Smith } 21795c42ef9dSBarry Smith 21804a2ae208SSatish Balay #undef __FUNCT__ 21818499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_newdatastruct" 21828499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx) 21838499736aSShri Abhyankar { 21848499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21858499736aSShri Abhyankar IS iscol=a->col,isrow=a->row; 21868499736aSShri Abhyankar PetscErrorCode ierr; 21878499736aSShri Abhyankar const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 21888499736aSShri Abhyankar PetscInt i,n=a->mbs,j; 21898499736aSShri Abhyankar PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 21908499736aSShri Abhyankar const MatScalar *aa=a->a,*v; 21918499736aSShri Abhyankar PetscScalar *x,*t,*ls; 21928499736aSShri Abhyankar const PetscScalar *b; 21938499736aSShri Abhyankar PetscFunctionBegin; 21948499736aSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21958499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21968499736aSShri Abhyankar t = a->solve_work; 21978499736aSShri Abhyankar 21988499736aSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21998499736aSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22008499736aSShri Abhyankar 22018499736aSShri Abhyankar /* copy the b into temp work space according to permutation */ 22028499736aSShri Abhyankar for (i=0; i<n; i++) { 22038499736aSShri Abhyankar for (j=0; j<bs; j++) { 22048499736aSShri Abhyankar t[i*bs+j] = b[c[i]*bs+j]; 22058499736aSShri Abhyankar } 22068499736aSShri Abhyankar } 22078499736aSShri Abhyankar 22088499736aSShri Abhyankar 
22098499736aSShri Abhyankar /* forward solve the upper triangular transpose */ 22108499736aSShri Abhyankar ls = a->solve_work + A->cmap->n; 22118499736aSShri Abhyankar for (i=0; i<n; i++){ 22128499736aSShri Abhyankar ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 22138499736aSShri Abhyankar Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 22148499736aSShri Abhyankar v = aa + bs2*(diag[i] - 1); 22158499736aSShri Abhyankar vi = aj + diag[i] - 1; 22168499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 22178499736aSShri Abhyankar for(j=0;j>-nz;j--){ 22188499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 22198499736aSShri Abhyankar v -= bs2; 22208499736aSShri Abhyankar } 22218499736aSShri Abhyankar } 22228499736aSShri Abhyankar 22238499736aSShri Abhyankar /* backward solve the lower triangular transpose */ 22248499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 22258499736aSShri Abhyankar v = aa + bs2*ai[i]; 22268499736aSShri Abhyankar vi = aj + ai[i]; 22278499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 22288499736aSShri Abhyankar for(j=0;j<nz;j++){ 22298499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 22308499736aSShri Abhyankar v += bs2; 22318499736aSShri Abhyankar } 22328499736aSShri Abhyankar } 22338499736aSShri Abhyankar 22348499736aSShri Abhyankar /* copy t into x according to permutation */ 22358499736aSShri Abhyankar for (i=0; i<n; i++) { 22368499736aSShri Abhyankar for (j=0; j<bs; j++) { 22378499736aSShri Abhyankar x[bs*r[i]+j] = t[bs*i+j]; 22388499736aSShri Abhyankar } 22398499736aSShri Abhyankar } 22408499736aSShri Abhyankar 22418499736aSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22428499736aSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 22438499736aSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22448499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 22458499736aSShri 
Abhyankar ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 22468499736aSShri Abhyankar PetscFunctionReturn(0); 22478499736aSShri Abhyankar } 22488499736aSShri Abhyankar 22498499736aSShri Abhyankar #undef __FUNCT__ 2250*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 2251*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 22524e2b4712SSatish Balay { 22534e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 22544e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 22556849ba73SBarry Smith PetscErrorCode ierr; 22565d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 22575d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 22583f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 225987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 226087828ca2SBarry Smith PetscScalar *x,*b,*t; 22614e2b4712SSatish Balay 22624e2b4712SSatish Balay PetscFunctionBegin; 22631ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 22641ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2265f1af5d2fSBarry Smith t = a->solve_work; 22664e2b4712SSatish Balay 22674e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 22684e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 22694e2b4712SSatish Balay 22704e2b4712SSatish Balay /* forward solve the lower triangular */ 22714e2b4712SSatish Balay idx = 7*(*r++); 2272f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2273f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2274f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 22754e2b4712SSatish Balay 22764e2b4712SSatish Balay for (i=1; i<n; i++) { 22774e2b4712SSatish Balay v = aa + 49*ai[i]; 22784e2b4712SSatish Balay vi = aj + ai[i]; 22794e2b4712SSatish Balay nz = diag[i] - ai[i]; 22804e2b4712SSatish Balay idx = 7*(*r++); 2281f1af5d2fSBarry Smith s1 = 
b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2282f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 22834e2b4712SSatish Balay while (nz--) { 22844e2b4712SSatish Balay idx = 7*(*vi++); 2285f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2286f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2287f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 2288f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2289f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2290f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2291f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2292f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2293f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2294f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 22954e2b4712SSatish Balay v += 49; 22964e2b4712SSatish Balay } 22974e2b4712SSatish Balay idx = 7*i; 2298f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2299f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2300f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 23014e2b4712SSatish Balay } 23024e2b4712SSatish Balay /* backward solve the upper triangular */ 23034e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 23044e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 23054e2b4712SSatish Balay vi = aj + diag[i] + 1; 23064e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 23074e2b4712SSatish Balay idt = 7*i; 2308f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2309f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2310f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 23114e2b4712SSatish Balay while (nz--) { 23124e2b4712SSatish Balay idx = 7*(*vi++); 
2313f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2314f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2315f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 2316f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2317f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2318f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2319f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2320f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2321f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2322f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 23234e2b4712SSatish Balay v += 49; 23244e2b4712SSatish Balay } 23254e2b4712SSatish Balay idc = 7*(*c--); 23264e2b4712SSatish Balay v = aa + 49*diag[i]; 2327f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2328f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2329f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2330f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2331f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2332f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2333f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2334f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2335f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2336f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2337f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2338f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2339f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2340f1af5d2fSBarry Smith 
v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 23414e2b4712SSatish Balay } 23424e2b4712SSatish Balay 23434e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 23444e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 23451ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 23461ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2347dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 23484e2b4712SSatish Balay PetscFunctionReturn(0); 23494e2b4712SSatish Balay } 23504e2b4712SSatish Balay 23518f690400SShri Abhyankar #undef __FUNCT__ 2352a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 2353a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 235435aa4fcfSShri Abhyankar { 235535aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 235635aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 235735aa4fcfSShri Abhyankar PetscErrorCode ierr; 235835aa4fcfSShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 235935aa4fcfSShri Abhyankar PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 236035aa4fcfSShri Abhyankar MatScalar *aa=a->a,*v; 236135aa4fcfSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 236235aa4fcfSShri Abhyankar PetscScalar *x,*b,*t; 236335aa4fcfSShri Abhyankar 236435aa4fcfSShri Abhyankar PetscFunctionBegin; 236535aa4fcfSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 236635aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 236735aa4fcfSShri Abhyankar t = a->solve_work; 236835aa4fcfSShri Abhyankar 236935aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 237035aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 237135aa4fcfSShri Abhyankar 237235aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 237335aa4fcfSShri Abhyankar idx = 7*r[0]; 237435aa4fcfSShri Abhyankar t[0] = 
b[idx]; t[1] = b[1+idx]; 237535aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 237635aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 237735aa4fcfSShri Abhyankar 237835aa4fcfSShri Abhyankar for (i=1; i<n; i++) { 237935aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 238035aa4fcfSShri Abhyankar vi = aj + ai[i]; 238135aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 238235aa4fcfSShri Abhyankar idx = 7*r[i]; 238335aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 238435aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 238535aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 238635aa4fcfSShri Abhyankar idx = 7*vi[m]; 238735aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 238835aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 238935aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 239035aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 239135aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 239235aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 239335aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 239435aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 239535aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 239635aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 239735aa4fcfSShri Abhyankar v += 49; 239835aa4fcfSShri Abhyankar } 239935aa4fcfSShri Abhyankar idx = 7*i; 240035aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 240135aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 240235aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 240335aa4fcfSShri Abhyankar } 240435aa4fcfSShri Abhyankar /* backward solve the upper 
triangular */ 240535aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 240635aa4fcfSShri Abhyankar v = aa + 49*(adiag[i+1]+1); 240735aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 240835aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 240935aa4fcfSShri Abhyankar idt = 7*i; 241035aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 241135aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 241235aa4fcfSShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 241335aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 241435aa4fcfSShri Abhyankar idx = 7*vi[m]; 241535aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 241635aa4fcfSShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 241735aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 241835aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 241935aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 242035aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 242135aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 242235aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 242335aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 242435aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 242535aa4fcfSShri Abhyankar v += 49; 242635aa4fcfSShri Abhyankar } 242735aa4fcfSShri Abhyankar idc = 7*c[i]; 242835aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 242935aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 243035aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 243135aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 243235aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 243335aa4fcfSShri Abhyankar 
v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 243435aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 243535aa4fcfSShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 243635aa4fcfSShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 243735aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 243835aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 243935aa4fcfSShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 244035aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 244135aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 244235aa4fcfSShri Abhyankar } 244335aa4fcfSShri Abhyankar 244435aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 244535aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 244635aa4fcfSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 244735aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 244835aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 244935aa4fcfSShri Abhyankar PetscFunctionReturn(0); 245035aa4fcfSShri Abhyankar } 245135aa4fcfSShri Abhyankar 245235aa4fcfSShri Abhyankar #undef __FUNCT__ 2453*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 2454*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 245515091d37SBarry Smith { 245615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2457690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2458dfbe8321SBarry Smith PetscErrorCode ierr; 2459690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2460d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2461d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2462d9fead3dSBarry Smith const PetscScalar *b; 246315091d37SBarry Smith 246415091d37SBarry Smith PetscFunctionBegin; 2465d9fead3dSBarry Smith ierr = 
VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24661ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 246715091d37SBarry Smith /* forward solve the lower triangular */ 246815091d37SBarry Smith idx = 0; 246915091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 247015091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 247115091d37SBarry Smith x[6] = b[6+idx]; 247215091d37SBarry Smith for (i=1; i<n; i++) { 247315091d37SBarry Smith v = aa + 49*ai[i]; 247415091d37SBarry Smith vi = aj + ai[i]; 247515091d37SBarry Smith nz = diag[i] - ai[i]; 247615091d37SBarry Smith idx = 7*i; 2477f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2478f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2479f1af5d2fSBarry Smith s7 = b[6+idx]; 248015091d37SBarry Smith while (nz--) { 248115091d37SBarry Smith jdx = 7*(*vi++); 248215091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 248315091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 248415091d37SBarry Smith x7 = x[6+jdx]; 2485f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2486f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2487f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2488f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2489f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2490f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2491f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 249215091d37SBarry Smith v += 49; 249315091d37SBarry Smith } 2494f1af5d2fSBarry Smith x[idx] = s1; 2495f1af5d2fSBarry Smith x[1+idx] = s2; 2496f1af5d2fSBarry Smith x[2+idx] = s3; 2497f1af5d2fSBarry 
Smith x[3+idx] = s4; 2498f1af5d2fSBarry Smith x[4+idx] = s5; 2499f1af5d2fSBarry Smith x[5+idx] = s6; 2500f1af5d2fSBarry Smith x[6+idx] = s7; 250115091d37SBarry Smith } 250215091d37SBarry Smith /* backward solve the upper triangular */ 250315091d37SBarry Smith for (i=n-1; i>=0; i--){ 250415091d37SBarry Smith v = aa + 49*diag[i] + 49; 250515091d37SBarry Smith vi = aj + diag[i] + 1; 250615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 250715091d37SBarry Smith idt = 7*i; 2508f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2509f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 2510f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 2511f1af5d2fSBarry Smith s7 = x[6+idt]; 251215091d37SBarry Smith while (nz--) { 251315091d37SBarry Smith idx = 7*(*vi++); 251415091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 251515091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 251615091d37SBarry Smith x7 = x[6+idx]; 2517f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2518f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2519f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2520f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2521f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2522f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2523f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 252415091d37SBarry Smith v += 49; 252515091d37SBarry Smith } 252615091d37SBarry Smith v = aa + 49*diag[i]; 2527f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2528f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 2529f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 
2530f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 2531f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2532f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 2533f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2534f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 2535f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2536f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 2537f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2538f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 2539f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2540f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 254115091d37SBarry Smith } 254215091d37SBarry Smith 2543d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25441ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2545dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 254615091d37SBarry Smith PetscFunctionReturn(0); 254715091d37SBarry Smith } 254815091d37SBarry Smith 2549cee9d6f2SShri Abhyankar #undef __FUNCT__ 2550a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 2551a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 255253cca76cSShri Abhyankar { 255353cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 255453cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 255553cca76cSShri Abhyankar PetscErrorCode ierr; 255653cca76cSShri Abhyankar PetscInt idx,jdx,idt; 255753cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 255853cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 255953cca76cSShri Abhyankar PetscScalar *x; 256053cca76cSShri Abhyankar const PetscScalar *b; 256153cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 
256253cca76cSShri Abhyankar 256353cca76cSShri Abhyankar PetscFunctionBegin; 256453cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 256553cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 256653cca76cSShri Abhyankar /* forward solve the lower triangular */ 256753cca76cSShri Abhyankar idx = 0; 256853cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 256953cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 257053cca76cSShri Abhyankar for (i=1; i<n; i++) { 257153cca76cSShri Abhyankar v = aa + bs2*ai[i]; 257253cca76cSShri Abhyankar vi = aj + ai[i]; 257353cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 257453cca76cSShri Abhyankar idx = bs*i; 257553cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 257653cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 257753cca76cSShri Abhyankar for(k=0;k<nz;k++) { 257853cca76cSShri Abhyankar jdx = bs*vi[k]; 257953cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 258053cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 258153cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 258253cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 258353cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 258453cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 258553cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 258653cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 258753cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 258853cca76cSShri Abhyankar v += bs2; 258953cca76cSShri Abhyankar } 259053cca76cSShri Abhyankar 
259153cca76cSShri Abhyankar x[idx] = s1; 259253cca76cSShri Abhyankar x[1+idx] = s2; 259353cca76cSShri Abhyankar x[2+idx] = s3; 259453cca76cSShri Abhyankar x[3+idx] = s4; 259553cca76cSShri Abhyankar x[4+idx] = s5; 259653cca76cSShri Abhyankar x[5+idx] = s6; 259753cca76cSShri Abhyankar x[6+idx] = s7; 259853cca76cSShri Abhyankar } 259953cca76cSShri Abhyankar 260053cca76cSShri Abhyankar /* backward solve the upper triangular */ 260153cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 260253cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 260353cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 260453cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 260553cca76cSShri Abhyankar idt = bs*i; 260653cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 260753cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 260853cca76cSShri Abhyankar for(k=0;k<nz;k++) { 260953cca76cSShri Abhyankar idx = bs*vi[k]; 261053cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 261153cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 261253cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 261353cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 261453cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 261553cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 261653cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 261753cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 261853cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 261953cca76cSShri Abhyankar v += bs2; 262053cca76cSShri Abhyankar } 262153cca76cSShri Abhyankar /* x = inv_diagonal*x */ 262253cca76cSShri Abhyankar 
x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 262353cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 262453cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 262553cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 262653cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 262753cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 262853cca76cSShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 262953cca76cSShri Abhyankar } 263053cca76cSShri Abhyankar 263153cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 263253cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 263353cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 263453cca76cSShri Abhyankar PetscFunctionReturn(0); 263553cca76cSShri Abhyankar } 263653cca76cSShri Abhyankar 263753cca76cSShri Abhyankar #undef __FUNCT__ 2638*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 2639*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 264015091d37SBarry Smith { 264115091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 264215091d37SBarry Smith IS iscol=a->col,isrow=a->row; 26436849ba73SBarry Smith PetscErrorCode ierr; 26445d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 26455d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2646d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2647d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2648d9fead3dSBarry Smith const PetscScalar *b; 264915091d37SBarry Smith PetscFunctionBegin; 2650d9fead3dSBarry Smith 
ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26511ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2652f1af5d2fSBarry Smith t = a->solve_work; 265315091d37SBarry Smith 265415091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 265515091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 265615091d37SBarry Smith 265715091d37SBarry Smith /* forward solve the lower triangular */ 265815091d37SBarry Smith idx = 6*(*r++); 2659f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2660f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 2661f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 266215091d37SBarry Smith for (i=1; i<n; i++) { 266315091d37SBarry Smith v = aa + 36*ai[i]; 266415091d37SBarry Smith vi = aj + ai[i]; 266515091d37SBarry Smith nz = diag[i] - ai[i]; 266615091d37SBarry Smith idx = 6*(*r++); 2667f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2668f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 266915091d37SBarry Smith while (nz--) { 267015091d37SBarry Smith idx = 6*(*vi++); 2671f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2672f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2673f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2674f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2675f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2676f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2677f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2678f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 267915091d37SBarry Smith v += 36; 268015091d37SBarry Smith } 268115091d37SBarry Smith idx = 6*i; 2682f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2683f1af5d2fSBarry Smith t[2+idx] = 
s3;t[3+idx] = s4; 2684f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 268515091d37SBarry Smith } 268615091d37SBarry Smith /* backward solve the upper triangular */ 268715091d37SBarry Smith for (i=n-1; i>=0; i--){ 268815091d37SBarry Smith v = aa + 36*diag[i] + 36; 268915091d37SBarry Smith vi = aj + diag[i] + 1; 269015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 269115091d37SBarry Smith idt = 6*i; 2692f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2693f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 2694f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 269515091d37SBarry Smith while (nz--) { 269615091d37SBarry Smith idx = 6*(*vi++); 2697f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2698f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2699f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 2700f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2701f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2702f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2703f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2704f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2705f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 270615091d37SBarry Smith v += 36; 270715091d37SBarry Smith } 270815091d37SBarry Smith idc = 6*(*c--); 270915091d37SBarry Smith v = aa + 36*diag[i]; 2710f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2711f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 2712f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2713f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 2714f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2715f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 2716f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2717f1af5d2fSBarry Smith 
v[21]*s4+v[27]*s5+v[33]*s6; 2718f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2719f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 2720f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2721f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 272215091d37SBarry Smith } 272315091d37SBarry Smith 272415091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 272515091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2726d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27271ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2728dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 272915091d37SBarry Smith PetscFunctionReturn(0); 273015091d37SBarry Smith } 273115091d37SBarry Smith 27326506fda5SShri Abhyankar #undef __FUNCT__ 2733a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 2734a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 27356506fda5SShri Abhyankar { 27366506fda5SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 27376506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 27386506fda5SShri Abhyankar PetscErrorCode ierr; 27396506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 27406506fda5SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 27416506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 27426506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 27436506fda5SShri Abhyankar const PetscScalar *b; 27446506fda5SShri Abhyankar PetscFunctionBegin; 27456506fda5SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27466506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 27476506fda5SShri Abhyankar t = a->solve_work; 27486506fda5SShri Abhyankar 27496506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 27506506fda5SShri 
Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 27516506fda5SShri Abhyankar 27526506fda5SShri Abhyankar /* forward solve the lower triangular */ 27536506fda5SShri Abhyankar idx = 6*r[0]; 27546506fda5SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 27556506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 27566506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 27576506fda5SShri Abhyankar for (i=1; i<n; i++) { 27586506fda5SShri Abhyankar v = aa + 36*ai[i]; 27596506fda5SShri Abhyankar vi = aj + ai[i]; 27606506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 27616506fda5SShri Abhyankar idx = 6*r[i]; 27626506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 27636506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 27646506fda5SShri Abhyankar for(m=0;m<nz;m++){ 27656506fda5SShri Abhyankar idx = 6*vi[m]; 27666506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 27676506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 27686506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 27696506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 27706506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 27716506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 27726506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 27736506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 27746506fda5SShri Abhyankar v += 36; 27756506fda5SShri Abhyankar } 27766506fda5SShri Abhyankar idx = 6*i; 27776506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 27786506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 27796506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 27806506fda5SShri Abhyankar } 27816506fda5SShri Abhyankar /* backward solve the upper triangular */ 27826506fda5SShri Abhyankar for (i=n-1; 
i>=0; i--){ 27836506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 27846506fda5SShri Abhyankar vi = aj + adiag[i+1]+1; 27856506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 27866506fda5SShri Abhyankar idt = 6*i; 27876506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 27886506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 27896506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 27906506fda5SShri Abhyankar for(m=0;m<nz;m++){ 27916506fda5SShri Abhyankar idx = 6*vi[m]; 27926506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 27936506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 27946506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 27956506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 27966506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 27976506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 27986506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 27996506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 28006506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 28016506fda5SShri Abhyankar v += 36; 28026506fda5SShri Abhyankar } 28036506fda5SShri Abhyankar idc = 6*c[i]; 28046506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 28056506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 28066506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 28076506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 28086506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 28096506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 28106506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 28116506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 28126506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 28136506fda5SShri Abhyankar 
v[22]*s4+v[28]*s5+v[34]*s6; 28146506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 28156506fda5SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 28166506fda5SShri Abhyankar } 28176506fda5SShri Abhyankar 28186506fda5SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28196506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 28206506fda5SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28216506fda5SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 28226506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 28236506fda5SShri Abhyankar PetscFunctionReturn(0); 28246506fda5SShri Abhyankar } 28258f690400SShri Abhyankar 28268f690400SShri Abhyankar #undef __FUNCT__ 2827*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 2828*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 282915091d37SBarry Smith { 283015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2831690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2832dfbe8321SBarry Smith PetscErrorCode ierr; 2833690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2834d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2835d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2836d9fead3dSBarry Smith const PetscScalar *b; 283715091d37SBarry Smith 283815091d37SBarry Smith PetscFunctionBegin; 2839d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28401ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 284115091d37SBarry Smith /* forward solve the lower triangular */ 284215091d37SBarry Smith idx = 0; 284315091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 284415091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 284515091d37SBarry Smith for (i=1; i<n; i++) { 284615091d37SBarry Smith v = aa + 36*ai[i]; 
284715091d37SBarry Smith vi = aj + ai[i]; 284815091d37SBarry Smith nz = diag[i] - ai[i]; 284915091d37SBarry Smith idx = 6*i; 2850f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2851f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 285215091d37SBarry Smith while (nz--) { 285315091d37SBarry Smith jdx = 6*(*vi++); 285415091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 285515091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2856f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2857f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2858f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2859f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2860f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2861f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 286215091d37SBarry Smith v += 36; 286315091d37SBarry Smith } 2864f1af5d2fSBarry Smith x[idx] = s1; 2865f1af5d2fSBarry Smith x[1+idx] = s2; 2866f1af5d2fSBarry Smith x[2+idx] = s3; 2867f1af5d2fSBarry Smith x[3+idx] = s4; 2868f1af5d2fSBarry Smith x[4+idx] = s5; 2869f1af5d2fSBarry Smith x[5+idx] = s6; 287015091d37SBarry Smith } 287115091d37SBarry Smith /* backward solve the upper triangular */ 287215091d37SBarry Smith for (i=n-1; i>=0; i--){ 287315091d37SBarry Smith v = aa + 36*diag[i] + 36; 287415091d37SBarry Smith vi = aj + diag[i] + 1; 287515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 287615091d37SBarry Smith idt = 6*i; 2877f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2878f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 2879f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 288015091d37SBarry Smith while (nz--) { 288115091d37SBarry Smith idx = 6*(*vi++); 288215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 
288315091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2884f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2885f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2886f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2887f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2888f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2889f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 289015091d37SBarry Smith v += 36; 289115091d37SBarry Smith } 289215091d37SBarry Smith v = aa + 36*diag[i]; 2893f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2894f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2895f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2896f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2897f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2898f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 289915091d37SBarry Smith } 290015091d37SBarry Smith 2901d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29021ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2903dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 290415091d37SBarry Smith PetscFunctionReturn(0); 290515091d37SBarry Smith } 290615091d37SBarry Smith 2907cee9d6f2SShri Abhyankar #undef __FUNCT__ 2908a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2909a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 
291053cca76cSShri Abhyankar { 291153cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 291253cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 291353cca76cSShri Abhyankar PetscErrorCode ierr; 291453cca76cSShri Abhyankar PetscInt idx,jdx,idt; 291553cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 291653cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 291753cca76cSShri Abhyankar PetscScalar *x; 291853cca76cSShri Abhyankar const PetscScalar *b; 291953cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 292053cca76cSShri Abhyankar 292153cca76cSShri Abhyankar PetscFunctionBegin; 292253cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 292353cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 292453cca76cSShri Abhyankar /* forward solve the lower triangular */ 292553cca76cSShri Abhyankar idx = 0; 292653cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 292753cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 292853cca76cSShri Abhyankar for (i=1; i<n; i++) { 292953cca76cSShri Abhyankar v = aa + bs2*ai[i]; 293053cca76cSShri Abhyankar vi = aj + ai[i]; 293153cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 293253cca76cSShri Abhyankar idx = bs*i; 293353cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 293453cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 293553cca76cSShri Abhyankar for(k=0;k<nz;k++){ 293653cca76cSShri Abhyankar jdx = bs*vi[k]; 293753cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 293853cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 293953cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 294053cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 294153cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 294253cca76cSShri Abhyankar s4 
-= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 294353cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 294453cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 294553cca76cSShri Abhyankar v += bs2; 294653cca76cSShri Abhyankar } 294753cca76cSShri Abhyankar 294853cca76cSShri Abhyankar x[idx] = s1; 294953cca76cSShri Abhyankar x[1+idx] = s2; 295053cca76cSShri Abhyankar x[2+idx] = s3; 295153cca76cSShri Abhyankar x[3+idx] = s4; 295253cca76cSShri Abhyankar x[4+idx] = s5; 295353cca76cSShri Abhyankar x[5+idx] = s6; 295453cca76cSShri Abhyankar } 295553cca76cSShri Abhyankar 295653cca76cSShri Abhyankar /* backward solve the upper triangular */ 295753cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 295853cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 295953cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 296053cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 296153cca76cSShri Abhyankar idt = bs*i; 296253cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 296353cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 296453cca76cSShri Abhyankar for(k=0;k<nz;k++){ 296553cca76cSShri Abhyankar idx = bs*vi[k]; 296653cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 296753cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 296853cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 296953cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 297053cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 297153cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 297253cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 297353cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 297453cca76cSShri Abhyankar v += bs2; 
297553cca76cSShri Abhyankar } 297653cca76cSShri Abhyankar /* x = inv_diagonal*x */ 297753cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 297853cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 297953cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 298053cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 298153cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 298253cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 298353cca76cSShri Abhyankar } 298453cca76cSShri Abhyankar 298553cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 298653cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 298753cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 298853cca76cSShri Abhyankar PetscFunctionReturn(0); 298953cca76cSShri Abhyankar } 299053cca76cSShri Abhyankar 299153cca76cSShri Abhyankar #undef __FUNCT__ 2992*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 2993*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 29944e2b4712SSatish Balay { 29954e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 29964e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 29976849ba73SBarry Smith PetscErrorCode ierr; 29985d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 29995d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3000d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3001d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3002d9fead3dSBarry Smith const PetscScalar *b; 30034e2b4712SSatish Balay 30044e2b4712SSatish Balay PetscFunctionBegin; 3005d9fead3dSBarry Smith ierr = 
VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30061ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3007f1af5d2fSBarry Smith t = a->solve_work; 30084e2b4712SSatish Balay 30094e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 30104e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 30114e2b4712SSatish Balay 30124e2b4712SSatish Balay /* forward solve the lower triangular */ 30134e2b4712SSatish Balay idx = 5*(*r++); 3014f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3015f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 30164e2b4712SSatish Balay for (i=1; i<n; i++) { 30174e2b4712SSatish Balay v = aa + 25*ai[i]; 30184e2b4712SSatish Balay vi = aj + ai[i]; 30194e2b4712SSatish Balay nz = diag[i] - ai[i]; 30204e2b4712SSatish Balay idx = 5*(*r++); 3021f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3022f1af5d2fSBarry Smith s5 = b[4+idx]; 30234e2b4712SSatish Balay while (nz--) { 30244e2b4712SSatish Balay idx = 5*(*vi++); 3025f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3026f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 3027f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3028f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3029f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3030f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3031f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 30324e2b4712SSatish Balay v += 25; 30334e2b4712SSatish Balay } 30344e2b4712SSatish Balay idx = 5*i; 3035f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3036f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 30374e2b4712SSatish Balay } 30384e2b4712SSatish Balay /* backward solve the upper triangular */ 30394e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 30404e2b4712SSatish Balay v = aa + 
25*diag[i] + 25; 30414e2b4712SSatish Balay vi = aj + diag[i] + 1; 30424e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 30434e2b4712SSatish Balay idt = 5*i; 3044f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3045f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 30464e2b4712SSatish Balay while (nz--) { 30474e2b4712SSatish Balay idx = 5*(*vi++); 3048f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3049f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3050f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3051f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3052f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3053f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3054f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 30554e2b4712SSatish Balay v += 25; 30564e2b4712SSatish Balay } 30574e2b4712SSatish Balay idc = 5*(*c--); 30584e2b4712SSatish Balay v = aa + 25*diag[i]; 3059f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3060f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 3061f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3062f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 3063f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3064f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 3065f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3066f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 3067f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3068f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 30694e2b4712SSatish Balay } 30704e2b4712SSatish Balay 30714e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 30724e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3073d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 
3075dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 30764e2b4712SSatish Balay PetscFunctionReturn(0); 30774e2b4712SSatish Balay } 30784e2b4712SSatish Balay 307978bb4007SShri Abhyankar #undef __FUNCT__ 3080a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 3081a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 308278bb4007SShri Abhyankar { 308378bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 308478bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 308578bb4007SShri Abhyankar PetscErrorCode ierr; 308678bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 308778bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 308878bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 308978bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 309078bb4007SShri Abhyankar const PetscScalar *b; 309178bb4007SShri Abhyankar 309278bb4007SShri Abhyankar PetscFunctionBegin; 309378bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 309478bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 309578bb4007SShri Abhyankar t = a->solve_work; 309678bb4007SShri Abhyankar 309778bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 309878bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 309978bb4007SShri Abhyankar 310078bb4007SShri Abhyankar /* forward solve the lower triangular */ 310178bb4007SShri Abhyankar idx = 5*r[0]; 310278bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 310378bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 310478bb4007SShri Abhyankar for (i=1; i<n; i++) { 310578bb4007SShri Abhyankar v = aa + 25*ai[i]; 310678bb4007SShri Abhyankar vi = aj + ai[i]; 310778bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 310878bb4007SShri Abhyankar idx = 5*r[i]; 310978bb4007SShri Abhyankar s1 = 
b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 311078bb4007SShri Abhyankar s5 = b[4+idx]; 311178bb4007SShri Abhyankar for(m=0;m<nz;m++){ 311278bb4007SShri Abhyankar idx = 5*vi[m]; 311378bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 311478bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 311578bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 311678bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 311778bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 311878bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 311978bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 312078bb4007SShri Abhyankar v += 25; 312178bb4007SShri Abhyankar } 312278bb4007SShri Abhyankar idx = 5*i; 312378bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 312478bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 312578bb4007SShri Abhyankar } 312678bb4007SShri Abhyankar /* backward solve the upper triangular */ 312778bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 312878bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 312978bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 313078bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 313178bb4007SShri Abhyankar idt = 5*i; 313278bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 313378bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 313478bb4007SShri Abhyankar for(m=0;m<nz;m++){ 313578bb4007SShri Abhyankar idx = 5*vi[m]; 313678bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 313778bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 313878bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 313978bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 314078bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 314178bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 
+ v[18]*x4 + v[23]*x5; 314278bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 314378bb4007SShri Abhyankar v += 25; 314478bb4007SShri Abhyankar } 314578bb4007SShri Abhyankar idc = 5*c[i]; 314678bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 314778bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 314878bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 314978bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 315078bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 315178bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 315278bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 315378bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 315478bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 315578bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 315678bb4007SShri Abhyankar } 315778bb4007SShri Abhyankar 315878bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 315978bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 316078bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 316178bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 316278bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 316378bb4007SShri Abhyankar PetscFunctionReturn(0); 316478bb4007SShri Abhyankar } 316578bb4007SShri Abhyankar 31668f690400SShri Abhyankar #undef __FUNCT__ 3167*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 3168*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 316915091d37SBarry Smith { 317015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3171690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 3172dfbe8321SBarry Smith PetscErrorCode ierr; 3173690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 3174d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3175d9fead3dSBarry 
Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3176d9fead3dSBarry Smith const PetscScalar *b; 317715091d37SBarry Smith 317815091d37SBarry Smith PetscFunctionBegin; 3179d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 31801ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 318115091d37SBarry Smith /* forward solve the lower triangular */ 318215091d37SBarry Smith idx = 0; 318315091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 318415091d37SBarry Smith for (i=1; i<n; i++) { 318515091d37SBarry Smith v = aa + 25*ai[i]; 318615091d37SBarry Smith vi = aj + ai[i]; 318715091d37SBarry Smith nz = diag[i] - ai[i]; 318815091d37SBarry Smith idx = 5*i; 3189f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 319015091d37SBarry Smith while (nz--) { 319115091d37SBarry Smith jdx = 5*(*vi++); 319215091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3193f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3194f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3195f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3196f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3197f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 319815091d37SBarry Smith v += 25; 319915091d37SBarry Smith } 3200f1af5d2fSBarry Smith x[idx] = s1; 3201f1af5d2fSBarry Smith x[1+idx] = s2; 3202f1af5d2fSBarry Smith x[2+idx] = s3; 3203f1af5d2fSBarry Smith x[3+idx] = s4; 3204f1af5d2fSBarry Smith x[4+idx] = s5; 320515091d37SBarry Smith } 320615091d37SBarry Smith /* backward solve the upper triangular */ 320715091d37SBarry Smith for (i=n-1; i>=0; i--){ 320815091d37SBarry Smith v = aa + 25*diag[i] + 25; 320915091d37SBarry Smith vi = aj + diag[i] + 1; 321015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 321115091d37SBarry 
Smith idt = 5*i; 3212f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3213f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 321415091d37SBarry Smith while (nz--) { 321515091d37SBarry Smith idx = 5*(*vi++); 321615091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3217f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3218f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3219f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3220f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3221f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 322215091d37SBarry Smith v += 25; 322315091d37SBarry Smith } 322415091d37SBarry Smith v = aa + 25*diag[i]; 3225f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3226f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3227f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3228f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3229f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 323015091d37SBarry Smith } 323115091d37SBarry Smith 3232d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 32331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3234dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 323515091d37SBarry Smith PetscFunctionReturn(0); 323615091d37SBarry Smith } 323715091d37SBarry Smith 3238cee9d6f2SShri Abhyankar #undef __FUNCT__ 3239a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 3240a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 324153cca76cSShri Abhyankar { 324253cca76cSShri Abhyankar 
Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 324353cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 324453cca76cSShri Abhyankar PetscErrorCode ierr; 324553cca76cSShri Abhyankar PetscInt jdx; 324653cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 324753cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 324853cca76cSShri Abhyankar const PetscScalar *b; 324953cca76cSShri Abhyankar 325053cca76cSShri Abhyankar PetscFunctionBegin; 325153cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 325253cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 325353cca76cSShri Abhyankar /* forward solve the lower triangular */ 325453cca76cSShri Abhyankar idx = 0; 325553cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 325653cca76cSShri Abhyankar for (i=1; i<n; i++) { 325753cca76cSShri Abhyankar v = aa + 25*ai[i]; 325853cca76cSShri Abhyankar vi = aj + ai[i]; 325953cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 326053cca76cSShri Abhyankar idx = 5*i; 326153cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 326253cca76cSShri Abhyankar for(k=0;k<nz;k++) { 326353cca76cSShri Abhyankar jdx = 5*vi[k]; 326453cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 326553cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 326653cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 326753cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 326853cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 326953cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 327053cca76cSShri Abhyankar v += 25; 327153cca76cSShri Abhyankar } 327253cca76cSShri Abhyankar x[idx] = s1; 327353cca76cSShri Abhyankar x[1+idx] = s2; 327453cca76cSShri Abhyankar x[2+idx] = s3; 
327553cca76cSShri Abhyankar x[3+idx] = s4; 327653cca76cSShri Abhyankar x[4+idx] = s5; 327753cca76cSShri Abhyankar } 327853cca76cSShri Abhyankar 327953cca76cSShri Abhyankar /* backward solve the upper triangular */ 328053cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 328153cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 328253cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 328353cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 328453cca76cSShri Abhyankar idt = 5*i; 328553cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 328653cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 328753cca76cSShri Abhyankar for(k=0;k<nz;k++){ 328853cca76cSShri Abhyankar idx = 5*vi[k]; 328953cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 329053cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 329153cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 329253cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 329353cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 329453cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 329553cca76cSShri Abhyankar v += 25; 329653cca76cSShri Abhyankar } 329753cca76cSShri Abhyankar /* x = inv_diagonal*x */ 329853cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 329953cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 330053cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 330153cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 330253cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 330353cca76cSShri Abhyankar } 330453cca76cSShri Abhyankar 330553cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 330653cca76cSShri Abhyankar ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 330753cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 330853cca76cSShri Abhyankar PetscFunctionReturn(0); 330953cca76cSShri Abhyankar } 331053cca76cSShri Abhyankar 331153cca76cSShri Abhyankar #undef __FUNCT__ 3312*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 3313*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 33144e2b4712SSatish Balay { 33154e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33164e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 33176849ba73SBarry Smith PetscErrorCode ierr; 33185d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 33195d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3320d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3321d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3322d9fead3dSBarry Smith const PetscScalar *b; 33234e2b4712SSatish Balay 33244e2b4712SSatish Balay PetscFunctionBegin; 3325d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 33261ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3327f1af5d2fSBarry Smith t = a->solve_work; 33284e2b4712SSatish Balay 33294e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 33304e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 33314e2b4712SSatish Balay 33324e2b4712SSatish Balay /* forward solve the lower triangular */ 33334e2b4712SSatish Balay idx = 4*(*r++); 3334f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3335f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 33364e2b4712SSatish Balay for (i=1; i<n; i++) { 33374e2b4712SSatish Balay v = aa + 16*ai[i]; 33384e2b4712SSatish Balay vi = aj + ai[i]; 33394e2b4712SSatish Balay nz = diag[i] - ai[i]; 33404e2b4712SSatish Balay idx = 4*(*r++); 3341f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 
33424e2b4712SSatish Balay while (nz--) { 33434e2b4712SSatish Balay idx = 4*(*vi++); 3344f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3345f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3346f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3347f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3348f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 33494e2b4712SSatish Balay v += 16; 33504e2b4712SSatish Balay } 33514e2b4712SSatish Balay idx = 4*i; 3352f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3353f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 33544e2b4712SSatish Balay } 33554e2b4712SSatish Balay /* backward solve the upper triangular */ 33564e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 33574e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 33584e2b4712SSatish Balay vi = aj + diag[i] + 1; 33594e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 33604e2b4712SSatish Balay idt = 4*i; 3361f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3362f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 33634e2b4712SSatish Balay while (nz--) { 33644e2b4712SSatish Balay idx = 4*(*vi++); 3365f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3366f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 3367f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3368f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3369f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3370f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 33714e2b4712SSatish Balay v += 16; 33724e2b4712SSatish Balay } 33734e2b4712SSatish Balay idc = 4*(*c--); 33744e2b4712SSatish Balay v = aa + 16*diag[i]; 3375f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3376f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3377f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3378f1af5d2fSBarry Smith 
x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 33794e2b4712SSatish Balay } 33804e2b4712SSatish Balay 33814e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 33824e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3383d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 33841ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3385dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 33864e2b4712SSatish Balay PetscFunctionReturn(0); 33874e2b4712SSatish Balay } 3388f26ec98cSKris Buschelman 33898f690400SShri Abhyankar #undef __FUNCT__ 3390a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 3391a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 339278bb4007SShri Abhyankar { 339378bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 339478bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 339578bb4007SShri Abhyankar PetscErrorCode ierr; 339678bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 339778bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 339878bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 339978bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 340078bb4007SShri Abhyankar const PetscScalar *b; 340178bb4007SShri Abhyankar 340278bb4007SShri Abhyankar PetscFunctionBegin; 340378bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 340478bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 340578bb4007SShri Abhyankar t = a->solve_work; 340678bb4007SShri Abhyankar 340778bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 340878bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 340978bb4007SShri Abhyankar 341078bb4007SShri Abhyankar /* forward solve the lower triangular */ 341178bb4007SShri Abhyankar idx 
= 4*r[0]; 341278bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 341378bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 341478bb4007SShri Abhyankar for (i=1; i<n; i++) { 341578bb4007SShri Abhyankar v = aa + 16*ai[i]; 341678bb4007SShri Abhyankar vi = aj + ai[i]; 341778bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 341878bb4007SShri Abhyankar idx = 4*r[i]; 341978bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 342078bb4007SShri Abhyankar for(m=0;m<nz;m++){ 342178bb4007SShri Abhyankar idx = 4*vi[m]; 342278bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 342378bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 342478bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 342578bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 342678bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 342778bb4007SShri Abhyankar v += 16; 342878bb4007SShri Abhyankar } 342978bb4007SShri Abhyankar idx = 4*i; 343078bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 343178bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 343278bb4007SShri Abhyankar } 343378bb4007SShri Abhyankar /* backward solve the upper triangular */ 343478bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 343578bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 343678bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 343778bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 343878bb4007SShri Abhyankar idt = 4*i; 343978bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 344078bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 344178bb4007SShri Abhyankar for(m=0;m<nz;m++){ 344278bb4007SShri Abhyankar idx = 4*vi[m]; 344378bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 344478bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 344578bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 344678bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 344778bb4007SShri Abhyankar s3 -= 
v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 344878bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 344978bb4007SShri Abhyankar v += 16; 345078bb4007SShri Abhyankar } 345178bb4007SShri Abhyankar idc = 4*c[i]; 345278bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 345378bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 345478bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 345578bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 345678bb4007SShri Abhyankar } 345778bb4007SShri Abhyankar 345878bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 345978bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 346078bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 346178bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 346278bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 346378bb4007SShri Abhyankar PetscFunctionReturn(0); 346478bb4007SShri Abhyankar } 346578bb4007SShri Abhyankar 346678bb4007SShri Abhyankar #undef __FUNCT__ 3467f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3468dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3469f26ec98cSKris Buschelman { 3470f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3471f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 34726849ba73SBarry Smith PetscErrorCode ierr; 34735d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 34745d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3475d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3476d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3477d9fead3dSBarry Smith PetscScalar *x; 3478d9fead3dSBarry Smith const PetscScalar *b; 3479f26ec98cSKris Buschelman 3480f26ec98cSKris Buschelman 
PetscFunctionBegin; 3481d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 34821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3483f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 3484f26ec98cSKris Buschelman 3485f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3486f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3487f26ec98cSKris Buschelman 3488f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3489f26ec98cSKris Buschelman idx = 4*(*r++); 3490f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 3491f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 3492f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 3493f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 3494f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3495f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3496f26ec98cSKris Buschelman vi = aj + ai[i]; 3497f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3498f26ec98cSKris Buschelman idx = 4*(*r++); 3499f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3500f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3501f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3502f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3503f26ec98cSKris Buschelman while (nz--) { 3504f26ec98cSKris Buschelman idx = 4*(*vi++); 3505f26ec98cSKris Buschelman x1 = t[idx]; 3506f26ec98cSKris Buschelman x2 = t[1+idx]; 3507f26ec98cSKris Buschelman x3 = t[2+idx]; 3508f26ec98cSKris Buschelman x4 = t[3+idx]; 3509f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3510f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3511f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3512f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3513f26ec98cSKris Buschelman v += 16; 3514f26ec98cSKris Buschelman } 3515f26ec98cSKris Buschelman idx = 4*i; 3516f26ec98cSKris Buschelman t[idx] = s1; 
3517f26ec98cSKris Buschelman t[1+idx] = s2; 3518f26ec98cSKris Buschelman t[2+idx] = s3; 3519f26ec98cSKris Buschelman t[3+idx] = s4; 3520f26ec98cSKris Buschelman } 3521f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3522f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3523f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 3524f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3525f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3526f26ec98cSKris Buschelman idt = 4*i; 3527f26ec98cSKris Buschelman s1 = t[idt]; 3528f26ec98cSKris Buschelman s2 = t[1+idt]; 3529f26ec98cSKris Buschelman s3 = t[2+idt]; 3530f26ec98cSKris Buschelman s4 = t[3+idt]; 3531f26ec98cSKris Buschelman while (nz--) { 3532f26ec98cSKris Buschelman idx = 4*(*vi++); 3533f26ec98cSKris Buschelman x1 = t[idx]; 3534f26ec98cSKris Buschelman x2 = t[1+idx]; 3535f26ec98cSKris Buschelman x3 = t[2+idx]; 3536f26ec98cSKris Buschelman x4 = t[3+idx]; 3537f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3538f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3539f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3540f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3541f26ec98cSKris Buschelman v += 16; 3542f26ec98cSKris Buschelman } 3543f26ec98cSKris Buschelman idc = 4*(*c--); 3544f26ec98cSKris Buschelman v = aa + 16*diag[i]; 3545f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3546f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3547f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3548f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3549f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 3550f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 3551f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 3552f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 3553f26ec98cSKris Buschelman } 3554f26ec98cSKris Buschelman 
3555f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3556f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3557d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 35581ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3559dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3560f26ec98cSKris Buschelman PetscFunctionReturn(0); 3561f26ec98cSKris Buschelman } 3562f26ec98cSKris Buschelman 356324c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 356424c233c2SKris Buschelman 356524c233c2SKris Buschelman #include PETSC_HAVE_SSE 356624c233c2SKris Buschelman 356724c233c2SKris Buschelman #undef __FUNCT__ 356824c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3569dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 357024c233c2SKris Buschelman { 357124c233c2SKris Buschelman /* 357224c233c2SKris Buschelman Note: This code uses demotion of double 357324c233c2SKris Buschelman to float when performing the mixed-mode computation. 357424c233c2SKris Buschelman This may not be numerically reasonable for all applications. 
357524c233c2SKris Buschelman */ 357624c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 357724c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 35786849ba73SBarry Smith PetscErrorCode ierr; 35795d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 35805d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 358124c233c2SKris Buschelman MatScalar *aa=a->a,*v; 358287828ca2SBarry Smith PetscScalar *x,*b,*t; 358324c233c2SKris Buschelman 358424c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 358524c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 358624c233c2SKris Buschelman unsigned long offset; 358724c233c2SKris Buschelman 358824c233c2SKris Buschelman PetscFunctionBegin; 358924c233c2SKris Buschelman SSE_SCOPE_BEGIN; 359024c233c2SKris Buschelman 359124c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 359224c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 359324c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 359424c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 359524c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 359624c233c2SKris Buschelman 35971ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 35981ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 359924c233c2SKris Buschelman t = a->solve_work; 360024c233c2SKris Buschelman 360124c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 360224c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 360324c233c2SKris Buschelman 360424c233c2SKris Buschelman /* forward solve the lower triangular */ 360524c233c2SKris Buschelman idx = 4*(*r++); 360624c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 360724c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 360824c233c2SKris Buschelman v = aa + 16*ai[1]; 360924c233c2SKris Buschelman 361024c233c2SKris Buschelman for (i=1; i<n;) { 
361124c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 361224c233c2SKris Buschelman vi = aj + ai[i]; 361324c233c2SKris Buschelman nz = diag[i] - ai[i]; 361424c233c2SKris Buschelman idx = 4*(*r++); 361524c233c2SKris Buschelman 361624c233c2SKris Buschelman /* Demote sum from double to float */ 361724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 361824c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 361924c233c2SKris Buschelman 362024c233c2SKris Buschelman while (nz--) { 362124c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 362224c233c2SKris Buschelman idx = 4*(*vi++); 362324c233c2SKris Buschelman 362424c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 362524c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 362624c233c2SKris Buschelman 362724c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 362824c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 362924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 363024c233c2SKris Buschelman 363124c233c2SKris Buschelman /* First Column */ 363224c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 363324c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 363424c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 363524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 363624c233c2SKris Buschelman 363724c233c2SKris Buschelman /* Second Column */ 363824c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 363924c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 364024c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 364124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 364224c233c2SKris Buschelman 364324c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 364424c233c2SKris Buschelman 364524c233c2SKris Buschelman /* Third Column */ 364624c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 364724c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 364824c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 364924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 
365024c233c2SKris Buschelman 365124c233c2SKris Buschelman /* Fourth Column */ 365224c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 365324c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 365424c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 365524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 365624c233c2SKris Buschelman SSE_INLINE_END_2 365724c233c2SKris Buschelman 365824c233c2SKris Buschelman v += 16; 365924c233c2SKris Buschelman } 366024c233c2SKris Buschelman idx = 4*i; 366124c233c2SKris Buschelman v = aa + 16*ai[++i]; 366224c233c2SKris Buschelman PREFETCH_NTA(v); 366324c233c2SKris Buschelman STORE_PS(tmps,XMM7); 366424c233c2SKris Buschelman 366524c233c2SKris Buschelman /* Promote result from float to double */ 366624c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 366724c233c2SKris Buschelman } 366824c233c2SKris Buschelman /* backward solve the upper triangular */ 366924c233c2SKris Buschelman idt = 4*(n-1); 367024c233c2SKris Buschelman ai16 = 16*diag[n-1]; 367124c233c2SKris Buschelman v = aa + ai16 + 16; 367224c233c2SKris Buschelman for (i=n-1; i>=0;){ 367324c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 367424c233c2SKris Buschelman vi = aj + diag[i] + 1; 367524c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 367624c233c2SKris Buschelman 367724c233c2SKris Buschelman /* Demote accumulator from double to float */ 367824c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 367924c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 368024c233c2SKris Buschelman 368124c233c2SKris Buschelman while (nz--) { 368224c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 368324c233c2SKris Buschelman idx = 4*(*vi++); 368424c233c2SKris Buschelman 368524c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 368624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 368724c233c2SKris Buschelman 368824c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 368924c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 
369024c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 369124c233c2SKris Buschelman 369224c233c2SKris Buschelman /* First Column */ 369324c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 369424c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 369524c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 369624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 369724c233c2SKris Buschelman 369824c233c2SKris Buschelman /* Second Column */ 369924c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 370024c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 370124c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 370224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 370324c233c2SKris Buschelman 370424c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 370524c233c2SKris Buschelman 370624c233c2SKris Buschelman /* Third Column */ 370724c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 370824c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 370924c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 371024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 371124c233c2SKris Buschelman 371224c233c2SKris Buschelman /* Fourth Column */ 371324c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 371424c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 371524c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 371624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 371724c233c2SKris Buschelman SSE_INLINE_END_2 371824c233c2SKris Buschelman v += 16; 371924c233c2SKris Buschelman } 372024c233c2SKris Buschelman v = aa + ai16; 372124c233c2SKris Buschelman ai16 = 16*diag[--i]; 372224c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 372324c233c2SKris Buschelman /* 372424c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 372524c233c2SKris Buschelman which was inverted as part of the factorization 372624c233c2SKris Buschelman */ 372724c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 372824c233c2SKris Buschelman /* First Column */ 372924c233c2SKris Buschelman 
SSE_COPY_PS(XMM0,XMM7) 373024c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 373124c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 373224c233c2SKris Buschelman 373324c233c2SKris Buschelman /* Second Column */ 373424c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 373524c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 373624c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 373724c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 373824c233c2SKris Buschelman 373924c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 374024c233c2SKris Buschelman 374124c233c2SKris Buschelman /* Third Column */ 374224c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 374324c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 374424c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 374524c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 374624c233c2SKris Buschelman 374724c233c2SKris Buschelman /* Fourth Column */ 374824c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 374924c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 375024c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 375124c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 375224c233c2SKris Buschelman 375324c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 375424c233c2SKris Buschelman SSE_INLINE_END_3 375524c233c2SKris Buschelman 375624c233c2SKris Buschelman /* Promote solution from float to double */ 375724c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 375824c233c2SKris Buschelman 375924c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 376024c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 376124c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! 
*/ 376224c233c2SKris Buschelman idc = 4*(*c--); 376324c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 376424c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 376524c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 376624c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 376724c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 376824c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 376924c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 377024c233c2SKris Buschelman SSE_INLINE_END_2 377124c233c2SKris Buschelman v = aa + ai16 + 16; 377224c233c2SKris Buschelman idt -= 4; 377324c233c2SKris Buschelman } 377424c233c2SKris Buschelman 377524c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 377624c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 37771ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 37781ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3779dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 378024c233c2SKris Buschelman SSE_SCOPE_END; 378124c233c2SKris Buschelman PetscFunctionReturn(0); 378224c233c2SKris Buschelman } 378324c233c2SKris Buschelman 378424c233c2SKris Buschelman #endif 37850ef38995SBarry Smith 37860ef38995SBarry Smith 37874e2b4712SSatish Balay /* 37884e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 37894e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 
37904e2b4712SSatish Balay */ 37914a2ae208SSatish Balay #undef __FUNCT__ 3792*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 3793*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 37944e2b4712SSatish Balay { 37954e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3796356650c2SBarry Smith PetscInt n=a->mbs; 3797356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 3798dfbe8321SBarry Smith PetscErrorCode ierr; 3799356650c2SBarry Smith const PetscInt *diag = a->diag; 3800d9fead3dSBarry Smith const MatScalar *aa=a->a; 3801d9fead3dSBarry Smith PetscScalar *x; 3802d9fead3dSBarry Smith const PetscScalar *b; 38034e2b4712SSatish Balay 38044e2b4712SSatish Balay PetscFunctionBegin; 3805d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 38061ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 38074e2b4712SSatish Balay 3808aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 38092853dc0eSBarry Smith { 381087828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 38112853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 38122853dc0eSBarry Smith } 3813aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 38142853dc0eSBarry Smith { 381587828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 38162853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 38172853dc0eSBarry Smith } 3818aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 38192853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3820e1293385SBarry Smith #else 382130d4dcafSBarry Smith { 382287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3823d9fead3dSBarry Smith const MatScalar *v; 3824356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 3825356650c2SBarry Smith const PetscInt *vi; 3826e1293385SBarry Smith 38274e2b4712SSatish Balay /* forward solve 
the lower triangular */ 38284e2b4712SSatish Balay idx = 0; 3829e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 38304e2b4712SSatish Balay for (i=1; i<n; i++) { 38314e2b4712SSatish Balay v = aa + 16*ai[i]; 38324e2b4712SSatish Balay vi = aj + ai[i]; 38334e2b4712SSatish Balay nz = diag[i] - ai[i]; 3834e1293385SBarry Smith idx += 4; 3835f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 38364e2b4712SSatish Balay while (nz--) { 38374e2b4712SSatish Balay jdx = 4*(*vi++); 38384e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3839f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3840f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3841f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3842f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 38434e2b4712SSatish Balay v += 16; 38444e2b4712SSatish Balay } 3845f1af5d2fSBarry Smith x[idx] = s1; 3846f1af5d2fSBarry Smith x[1+idx] = s2; 3847f1af5d2fSBarry Smith x[2+idx] = s3; 3848f1af5d2fSBarry Smith x[3+idx] = s4; 38494e2b4712SSatish Balay } 38504e2b4712SSatish Balay /* backward solve the upper triangular */ 38514e555682SBarry Smith idt = 4*(n-1); 38524e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 38534e555682SBarry Smith ai16 = 16*diag[i]; 38544e555682SBarry Smith v = aa + ai16 + 16; 38554e2b4712SSatish Balay vi = aj + diag[i] + 1; 38564e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3857f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3858f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 38594e2b4712SSatish Balay while (nz--) { 38604e2b4712SSatish Balay idx = 4*(*vi++); 38614e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3862f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3863f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3864f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3865f1af5d2fSBarry Smith 
s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 38664e2b4712SSatish Balay v += 16; 38674e2b4712SSatish Balay } 38684e555682SBarry Smith v = aa + ai16; 3869f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3870f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3871f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3872f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3873329f5518SBarry Smith idt -= 4; 38744e2b4712SSatish Balay } 387530d4dcafSBarry Smith } 3876e1293385SBarry Smith #endif 38774e2b4712SSatish Balay 3878d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 38791ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3880dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 38814e2b4712SSatish Balay PetscFunctionReturn(0); 38824e2b4712SSatish Balay } 38834e2b4712SSatish Balay 3884b2b2dd24SShri Abhyankar #undef __FUNCT__ 3885a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3886a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3887b2b2dd24SShri Abhyankar { 3888b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3889b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3890b2b2dd24SShri Abhyankar PetscErrorCode ierr; 3891b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 3892b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3893b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 3894b2b2dd24SShri Abhyankar PetscScalar *x; 3895b2b2dd24SShri Abhyankar const PetscScalar *b; 3896b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3897cee9d6f2SShri Abhyankar 3898b2b2dd24SShri Abhyankar PetscFunctionBegin; 3899b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3900b2b2dd24SShri Abhyankar ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 3901b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 3902b2b2dd24SShri Abhyankar idx = 0; 3903b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3904b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 3905b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 3906b2b2dd24SShri Abhyankar vi = aj + ai[i]; 3907b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 3908b2b2dd24SShri Abhyankar idx = bs*i; 3909b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3910b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 3911b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 3912b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3913b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3914b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3915b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3916b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3917b2b2dd24SShri Abhyankar 3918b2b2dd24SShri Abhyankar v += bs2; 3919b2b2dd24SShri Abhyankar } 3920b2b2dd24SShri Abhyankar 3921b2b2dd24SShri Abhyankar x[idx] = s1; 3922b2b2dd24SShri Abhyankar x[1+idx] = s2; 3923b2b2dd24SShri Abhyankar x[2+idx] = s3; 3924b2b2dd24SShri Abhyankar x[3+idx] = s4; 3925b2b2dd24SShri Abhyankar } 3926b2b2dd24SShri Abhyankar 3927b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 3928b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 3929b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 3930b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 3931b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 3932b2b2dd24SShri Abhyankar idt = bs*i; 3933b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3934b2b2dd24SShri Abhyankar 3935b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 3936b2b2dd24SShri Abhyankar idx = bs*vi[k]; 3937b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 
3938b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3939b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3940b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3941b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3942b2b2dd24SShri Abhyankar 3943b2b2dd24SShri Abhyankar v += bs2; 3944b2b2dd24SShri Abhyankar } 3945b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 3946b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3947b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3948b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3949b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3950b2b2dd24SShri Abhyankar 3951b2b2dd24SShri Abhyankar } 3952b2b2dd24SShri Abhyankar 3953b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3954b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3955b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3956b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 3957b2b2dd24SShri Abhyankar } 3958cee9d6f2SShri Abhyankar
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - Forward and backward solves
   with 4x4 blocks and no permutation, accumulating in MatScalar.

   Input Parameters:
+  A  - matrix whose data pointer is a Mat_SeqBAIJ holding the arrays i, j, diag, a
-  bb - right-hand side vector

   Output Parameter:
.  xx - result vector

   Notes:
   The forward-solve values are stored as MatScalar into the storage of x
   itself (t below is x reinterpreted as MatScalar*).  The backward solve walks
   the rows from last to first: it reads the four MatScalar values of the row
   before it overwrites that row of x with PetscScalar values, and it reads
   rows visited earlier through x.  Keep this read-before-write order when
   touching the code.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode ierr;
  PetscInt       n = a->mbs;
  PetscInt       *ai = a->i,*aj = a->j,*diag = a->diag;
  MatScalar      *aa = a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  {
    MatScalar   *t = (MatScalar *)x;   /* single view of the storage of x */
    MatScalar   *blk,*ti,*tj;
    MatScalar   s1,s2,s3,s4,x1,x2,x3,x4;
    PetscScalar *rhs,*xi,*xj;
    PetscInt    *col,row,left;

    /* forward solve the lower triangular: block row 0 is the converted rhs */
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (row=1; row<n; row++) {
      blk = aa + 16*ai[row];
      col = aj + ai[row];
      rhs = b + 4*row;
      s1  = (MatScalar)rhs[0];
      s2  = (MatScalar)rhs[1];
      s3  = (MatScalar)rhs[2];
      s4  = (MatScalar)rhs[3];
      /* one pass per stored block left of the diagonal */
      for (left=diag[row]-ai[row]; left; left--) {
        tj  = t + 4*(*col++);
        x1  = tj[0];
        x2  = tj[1];
        x3  = tj[2];
        x4  = tj[3];
        s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3 + blk[12]*x4;
        s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3 + blk[13]*x4;
        s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
        s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
        blk += 16;
      }
      ti    = t + 4*row;
      ti[0] = s1;
      ti[1] = s2;
      ti[2] = s3;
      ti[3] = s4;
    }

    /* backward solve the upper triangular */
    for (row=n-1; row>=0; row--) {
      blk = aa + 16*diag[row] + 16;
      col = aj + diag[row] + 1;
      /* read this row in MatScalar form before x is overwritten below */
      ti  = t + 4*row;
      s1  = ti[0];
      s2  = ti[1];
      s3  = ti[2];
      s4  = ti[3];
      /* one pass per stored block right of the diagonal; those rows of x
         were written as PetscScalar on earlier passes of this loop */
      for (left=ai[row+1]-diag[row]-1; left; left--) {
        xj  = x + 4*(*col++);
        x1  = (MatScalar)xj[0];
        x2  = (MatScalar)xj[1];
        x3  = (MatScalar)xj[2];
        x4  = (MatScalar)xj[3];
        s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3 + blk[12]*x4;
        s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3 + blk[13]*x4;
        s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
        s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
        blk += 16;
      }
      /* finish the row with the block stored at the diagonal position */
      blk   = aa + 16*diag[row];
      xi    = x + 4*row;
      xi[0] = (PetscScalar)(blk[0]*s1 + blk[4]*s2 + blk[8]*s3 + blk[12]*s4);
      xi[1] = (PetscScalar)(blk[1]*s1 + blk[5]*s2 + blk[9]*s3 + blk[13]*s4);
      xi[2] = (PetscScalar)(blk[2]*s1 + blk[6]*s2 + blk[10]*s3 + blk[14]*s4);
      xi[3] = (PetscScalar)(blk[3]*s1 + blk[7]*s2 + blk[11]*s3 + blk[15]*s4);
    }
  }

  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4048f26ec98cSKris Buschelman 40493660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 40503660e330SKris Buschelman 40513660e330SKris Buschelman #include PETSC_HAVE_SSE 40523660e330SKris Buschelman #undef __FUNCT__ 40537cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4054dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 40553660e330SKris Buschelman { 40563660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 40572aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 4058dfbe8321SBarry Smith PetscErrorCode ierr; 4059dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 40603660e330SKris Buschelman MatScalar *aa=a->a; 406187828ca2SBarry Smith PetscScalar *x,*b; 40623660e330SKris Buschelman 40633660e330SKris Buschelman PetscFunctionBegin; 40643660e330SKris Buschelman SSE_SCOPE_BEGIN; 40653660e330SKris Buschelman /* 40663660e330SKris Buschelman Note: This code currently uses demotion of double 40673660e330SKris Buschelman to float when performing the mixed-mode computation. 40683660e330SKris Buschelman This may not be numerically reasonable for all applications. 40693660e330SKris Buschelman */ 40703660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 40713660e330SKris Buschelman 40721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 40731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 40743660e330SKris Buschelman { 4075eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 4076eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 40772aa5897fSKris Buschelman int nz,i,idt,ai16; 40782aa5897fSKris Buschelman unsigned int jdx,idx; 40792aa5897fSKris Buschelman unsigned short *vi; 4080eb05f457SKris Buschelman /* Forward solve the lower triangular factor.
*/ 40813660e330SKris Buschelman 4082eb05f457SKris Buschelman /* First block is the identity. */ 40833660e330SKris Buschelman idx = 0; 4084eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 40852aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 40863660e330SKris Buschelman 40873660e330SKris Buschelman for (i=1; i<n;) { 40883660e330SKris Buschelman PREFETCH_NTA(&v[8]); 40893660e330SKris Buschelman vi = aj + ai[i]; 40903660e330SKris Buschelman nz = diag[i] - ai[i]; 40913660e330SKris Buschelman idx += 4; 40923660e330SKris Buschelman 4093eb05f457SKris Buschelman /* Demote RHS from double to float. */ 4094eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4095eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 40963660e330SKris Buschelman 40973660e330SKris Buschelman while (nz--) { 40983660e330SKris Buschelman PREFETCH_NTA(&v[16]); 40992aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 41003660e330SKris Buschelman 41013660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 4102eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 41033660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 41043660e330SKris Buschelman 41053660e330SKris Buschelman /* First Column */ 41063660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 41073660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 41083660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 41093660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 41103660e330SKris Buschelman 41113660e330SKris Buschelman /* Second Column */ 41123660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 41133660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 41143660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 41153660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 41163660e330SKris Buschelman 41173660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 41183660e330SKris Buschelman 41193660e330SKris Buschelman /* Third Column */ 41203660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 
41213660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 41223660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 41233660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 41243660e330SKris Buschelman 41253660e330SKris Buschelman /* Fourth Column */ 41263660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 41273660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 41283660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 41293660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 41303660e330SKris Buschelman SSE_INLINE_END_2 41313660e330SKris Buschelman 41323660e330SKris Buschelman v += 16; 41333660e330SKris Buschelman } 41343660e330SKris Buschelman v = aa + 16*ai[++i]; 41353660e330SKris Buschelman PREFETCH_NTA(v); 4136eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 41373660e330SKris Buschelman } 4138eb05f457SKris Buschelman 4139eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 4140eb05f457SKris Buschelman 41413660e330SKris Buschelman idt = 4*(n-1); 41423660e330SKris Buschelman ai16 = 16*diag[n-1]; 41433660e330SKris Buschelman v = aa + ai16 + 16; 41443660e330SKris Buschelman for (i=n-1; i>=0;){ 41453660e330SKris Buschelman PREFETCH_NTA(&v[8]); 41463660e330SKris Buschelman vi = aj + diag[i] + 1; 41473660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 41483660e330SKris Buschelman 4149eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 41503660e330SKris Buschelman 41513660e330SKris Buschelman while (nz--) { 41523660e330SKris Buschelman PREFETCH_NTA(&v[16]); 41532aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 41543660e330SKris Buschelman 41553660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 4156eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 41573660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 41583660e330SKris Buschelman 41593660e330SKris Buschelman /* First Column */ 41603660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 41613660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 
41623660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 41633660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 41643660e330SKris Buschelman 41653660e330SKris Buschelman /* Second Column */ 41663660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 41673660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 41683660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 41693660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 41703660e330SKris Buschelman 41713660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 41723660e330SKris Buschelman 41733660e330SKris Buschelman /* Third Column */ 41743660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 41753660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 41763660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 41773660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 41783660e330SKris Buschelman 41793660e330SKris Buschelman /* Fourth Column */ 41803660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 41813660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 41823660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 41833660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 41843660e330SKris Buschelman SSE_INLINE_END_2 41853660e330SKris Buschelman v += 16; 41863660e330SKris Buschelman } 41873660e330SKris Buschelman v = aa + ai16; 41883660e330SKris Buschelman ai16 = 16*diag[--i]; 41893660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 41903660e330SKris Buschelman /* 41913660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 41923660e330SKris Buschelman which was inverted as part of the factorization 41933660e330SKris Buschelman */ 4194eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 41953660e330SKris Buschelman /* First Column */ 41963660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 41973660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 41983660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 41993660e330SKris Buschelman 42003660e330SKris Buschelman /* Second Column */ 42013660e330SKris 
Buschelman SSE_COPY_PS(XMM1,XMM7) 42023660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 42033660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 42043660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 42053660e330SKris Buschelman 42063660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 42073660e330SKris Buschelman 42083660e330SKris Buschelman /* Third Column */ 42093660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 42103660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 42113660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 42123660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 42133660e330SKris Buschelman 42143660e330SKris Buschelman /* Fourth Column */ 42153660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 42163660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 42173660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 42183660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 42193660e330SKris Buschelman 42203660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 42213660e330SKris Buschelman SSE_INLINE_END_3 42223660e330SKris Buschelman 42233660e330SKris Buschelman v = aa + ai16 + 16; 42243660e330SKris Buschelman idt -= 4; 42253660e330SKris Buschelman } 4226eb05f457SKris Buschelman 4227eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 4228eb05f457SKris Buschelman idt = 4*(n-1); 4229eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 4230eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4231eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ 4232eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 4233eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 4234eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 4235eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 4236eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 4237eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 423854693613SKris Buschelman idt -= 4; 42393660e330SKris Buschelman } 4240eb05f457SKris Buschelman 4241eb05f457SKris Buschelman } /* End of artificial scope. */ 42421ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 42431ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4244dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 42453660e330SKris Buschelman SSE_SCOPE_END; 42463660e330SKris Buschelman PetscFunctionReturn(0); 42473660e330SKris Buschelman } 42483660e330SKris Buschelman 42497cf1b8d3SKris Buschelman #undef __FUNCT__ 42507cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4251dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 42527cf1b8d3SKris Buschelman { 42537cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 42547cf1b8d3SKris Buschelman int *aj=a->j; 4255dfbe8321SBarry Smith PetscErrorCode ierr; 4256dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 42577cf1b8d3SKris Buschelman MatScalar *aa=a->a; 42587cf1b8d3SKris Buschelman PetscScalar *x,*b; 42597cf1b8d3SKris Buschelman 42607cf1b8d3SKris Buschelman PetscFunctionBegin; 42617cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 42627cf1b8d3SKris Buschelman /* 42637cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 42647cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 42657cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
42667cf1b8d3SKris Buschelman */ 42677cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 42687cf1b8d3SKris Buschelman 42691ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 42701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 42717cf1b8d3SKris Buschelman { 42727cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 42737cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 42747cf1b8d3SKris Buschelman int nz,i,idt,ai16; 42757cf1b8d3SKris Buschelman int jdx,idx; 42767cf1b8d3SKris Buschelman int *vi; 42777cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 42787cf1b8d3SKris Buschelman 42797cf1b8d3SKris Buschelman /* First block is the identity. */ 42807cf1b8d3SKris Buschelman idx = 0; 42817cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 42827cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 42837cf1b8d3SKris Buschelman 42847cf1b8d3SKris Buschelman for (i=1; i<n;) { 42857cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 42867cf1b8d3SKris Buschelman vi = aj + ai[i]; 42877cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 42887cf1b8d3SKris Buschelman idx += 4; 42897cf1b8d3SKris Buschelman 42907cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 42917cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 42927cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 42937cf1b8d3SKris Buschelman 42947cf1b8d3SKris Buschelman while (nz--) { 42957cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 42967cf1b8d3SKris Buschelman jdx = 4*(*vi++); 42977cf1b8d3SKris Buschelman /* jdx = *vi++; */ 42987cf1b8d3SKris Buschelman 42997cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 43007cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 43017cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 43027cf1b8d3SKris Buschelman 43037cf1b8d3SKris Buschelman /* First Column */ 43047cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 43057cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 43067cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 43077cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 43087cf1b8d3SKris Buschelman 43097cf1b8d3SKris Buschelman /* Second Column */ 43107cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 43117cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 43127cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 43137cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 43147cf1b8d3SKris Buschelman 43157cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 43167cf1b8d3SKris Buschelman 43177cf1b8d3SKris Buschelman /* Third Column */ 43187cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 43197cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 43207cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 43217cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 43227cf1b8d3SKris Buschelman 43237cf1b8d3SKris Buschelman /* Fourth Column */ 43247cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 43257cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 43267cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 43277cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 43287cf1b8d3SKris Buschelman SSE_INLINE_END_2 43297cf1b8d3SKris Buschelman 43307cf1b8d3SKris 
Buschelman v += 16; 43317cf1b8d3SKris Buschelman } 43327cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 43337cf1b8d3SKris Buschelman PREFETCH_NTA(v); 43347cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 43357cf1b8d3SKris Buschelman } 43367cf1b8d3SKris Buschelman 43377cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 43387cf1b8d3SKris Buschelman 43397cf1b8d3SKris Buschelman idt = 4*(n-1); 43407cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 43417cf1b8d3SKris Buschelman v = aa + ai16 + 16; 43427cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 43437cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 43447cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 43457cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 43467cf1b8d3SKris Buschelman 43477cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 43487cf1b8d3SKris Buschelman 43497cf1b8d3SKris Buschelman while (nz--) { 43507cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 43517cf1b8d3SKris Buschelman idx = 4*(*vi++); 43527cf1b8d3SKris Buschelman /* idx = *vi++; */ 43537cf1b8d3SKris Buschelman 43547cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 43557cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 43567cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 43577cf1b8d3SKris Buschelman 43587cf1b8d3SKris Buschelman /* First Column */ 43597cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 43607cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 43617cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 43627cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 43637cf1b8d3SKris Buschelman 43647cf1b8d3SKris Buschelman /* Second Column */ 43657cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 43667cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 43677cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 43687cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 43697cf1b8d3SKris Buschelman 43707cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 43717cf1b8d3SKris Buschelman 
43727cf1b8d3SKris Buschelman /* Third Column */ 43737cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 43747cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 43757cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 43767cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 43777cf1b8d3SKris Buschelman 43787cf1b8d3SKris Buschelman /* Fourth Column */ 43797cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 43807cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 43817cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 43827cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 43837cf1b8d3SKris Buschelman SSE_INLINE_END_2 43847cf1b8d3SKris Buschelman v += 16; 43857cf1b8d3SKris Buschelman } 43867cf1b8d3SKris Buschelman v = aa + ai16; 43877cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 43887cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 43897cf1b8d3SKris Buschelman /* 43907cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 43917cf1b8d3SKris Buschelman which was inverted as part of the factorization 43927cf1b8d3SKris Buschelman */ 43937cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 43947cf1b8d3SKris Buschelman /* First Column */ 43957cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 43967cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 43977cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 43987cf1b8d3SKris Buschelman 43997cf1b8d3SKris Buschelman /* Second Column */ 44007cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 44017cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44027cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 44037cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 44047cf1b8d3SKris Buschelman 44057cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 44067cf1b8d3SKris Buschelman 44077cf1b8d3SKris Buschelman /* Third Column */ 44087cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 44097cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 44107cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 44117cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 44127cf1b8d3SKris Buschelman 44137cf1b8d3SKris Buschelman /* Fourth Column */ 44147cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 44157cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 44167cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 44177cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 44187cf1b8d3SKris Buschelman 44197cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 44207cf1b8d3SKris Buschelman SSE_INLINE_END_3 44217cf1b8d3SKris Buschelman 44227cf1b8d3SKris Buschelman v = aa + ai16 + 16; 44237cf1b8d3SKris Buschelman idt -= 4; 44247cf1b8d3SKris Buschelman } 44257cf1b8d3SKris Buschelman 44267cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 44277cf1b8d3SKris Buschelman idt = 4*(n-1); 44287cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 44297cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 44307cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 44317cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 44327cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 44337cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 44347cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 44357cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 44367cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 44377cf1b8d3SKris Buschelman idt -= 4; 44387cf1b8d3SKris Buschelman } 44397cf1b8d3SKris Buschelman 44407cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 44411ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 44421ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4443dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 44447cf1b8d3SKris Buschelman SSE_SCOPE_END; 44457cf1b8d3SKris Buschelman PetscFunctionReturn(0); 44467cf1b8d3SKris Buschelman } 44477cf1b8d3SKris Buschelman 44483660e330SKris Buschelman #endif 44498f690400SShri Abhyankar 44504a2ae208SSatish Balay #undef __FUNCT__ 4451*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 4452*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 44534e2b4712SSatish Balay { 44544e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 44554e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 44566849ba73SBarry Smith PetscErrorCode ierr; 44575d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 44585d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4459d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4460d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4461d9fead3dSBarry Smith const PetscScalar *b; 44624e2b4712SSatish Balay 44634e2b4712SSatish Balay PetscFunctionBegin; 4464d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 44651ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4466f1af5d2fSBarry Smith t = a->solve_work; 44674e2b4712SSatish Balay 44684e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 44694e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 44704e2b4712SSatish Balay 44714e2b4712SSatish Balay /* forward solve the lower triangular */ 44724e2b4712SSatish Balay idx = 3*(*r++); 4473f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 44744e2b4712SSatish Balay for (i=1; i<n; i++) { 44754e2b4712SSatish Balay v = aa + 9*ai[i]; 44764e2b4712SSatish Balay vi = aj + ai[i]; 
44774e2b4712SSatish Balay nz = diag[i] - ai[i]; 44784e2b4712SSatish Balay idx = 3*(*r++); 4479f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 44804e2b4712SSatish Balay while (nz--) { 44814e2b4712SSatish Balay idx = 3*(*vi++); 4482f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4483f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4484f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4485f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 44864e2b4712SSatish Balay v += 9; 44874e2b4712SSatish Balay } 44884e2b4712SSatish Balay idx = 3*i; 4489f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 44904e2b4712SSatish Balay } 44914e2b4712SSatish Balay /* backward solve the upper triangular */ 44924e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 44934e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 44944e2b4712SSatish Balay vi = aj + diag[i] + 1; 44954e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 44964e2b4712SSatish Balay idt = 3*i; 4497f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 44984e2b4712SSatish Balay while (nz--) { 44994e2b4712SSatish Balay idx = 3*(*vi++); 4500f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4501f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4502f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4503f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 45044e2b4712SSatish Balay v += 9; 45054e2b4712SSatish Balay } 45064e2b4712SSatish Balay idc = 3*(*c--); 45074e2b4712SSatish Balay v = aa + 9*diag[i]; 4508f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4509f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4510f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 45114e2b4712SSatish Balay } 45124e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 45134e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4514d9fead3dSBarry Smith ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45151ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4516dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 45174e2b4712SSatish Balay PetscFunctionReturn(0); 45184e2b4712SSatish Balay } 45194e2b4712SSatish Balay 45200c4413a7SShri Abhyankar #undef __FUNCT__ 4521a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 4522a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 45230c4413a7SShri Abhyankar { 45240c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 45250c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 45260c4413a7SShri Abhyankar PetscErrorCode ierr; 45270c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 45280c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 45290c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 45300c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 45310c4413a7SShri Abhyankar const PetscScalar *b; 45320c4413a7SShri Abhyankar 45330c4413a7SShri Abhyankar PetscFunctionBegin; 45340c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45350c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 45360c4413a7SShri Abhyankar t = a->solve_work; 45370c4413a7SShri Abhyankar 45380c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 45390c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 45400c4413a7SShri Abhyankar 45410c4413a7SShri Abhyankar /* forward solve the lower triangular */ 45420c4413a7SShri Abhyankar idx = 3*r[0]; 45430c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 45440c4413a7SShri Abhyankar for (i=1; i<n; i++) { 45450c4413a7SShri Abhyankar v = aa + 9*ai[i]; 45460c4413a7SShri Abhyankar vi = aj + ai[i]; 45470c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 45480c4413a7SShri Abhyankar idx 
= 3*r[i]; 45490c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 45500c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 45510c4413a7SShri Abhyankar idx = 3*vi[m]; 45520c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 45530c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 45540c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 45550c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 45560c4413a7SShri Abhyankar v += 9; 45570c4413a7SShri Abhyankar } 45580c4413a7SShri Abhyankar idx = 3*i; 45590c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 45600c4413a7SShri Abhyankar } 45610c4413a7SShri Abhyankar /* backward solve the upper triangular */ 45620c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 45630c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 45640c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 45650c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 45660c4413a7SShri Abhyankar idt = 3*i; 45670c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 45680c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 45690c4413a7SShri Abhyankar idx = 3*vi[m]; 45700c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 45710c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 45720c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 45730c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 45740c4413a7SShri Abhyankar v += 9; 45750c4413a7SShri Abhyankar } 45760c4413a7SShri Abhyankar idc = 3*c[i]; 45770c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 45780c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 45790c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 45800c4413a7SShri Abhyankar } 45810c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 45820c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 45830c4413a7SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45840c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 45850c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 45860c4413a7SShri Abhyankar PetscFunctionReturn(0); 45870c4413a7SShri Abhyankar } 45880c4413a7SShri Abhyankar 458915091d37SBarry Smith /* 459015091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 459115091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 459215091d37SBarry Smith */ 45934a2ae208SSatish Balay #undef __FUNCT__ 4594*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 4595*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 459615091d37SBarry Smith { 459715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4598690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4599dfbe8321SBarry Smith PetscErrorCode ierr; 4600690b6cddSBarry Smith PetscInt *diag = a->diag; 4601d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4602d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4603d9fead3dSBarry Smith const PetscScalar *b; 4604690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 460515091d37SBarry Smith 460615091d37SBarry Smith PetscFunctionBegin; 4607d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46081ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 460915091d37SBarry Smith 461015091d37SBarry Smith /* forward solve the lower triangular */ 461115091d37SBarry Smith idx = 0; 461215091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 461315091d37SBarry Smith for (i=1; i<n; i++) { 461415091d37SBarry Smith v = aa + 9*ai[i]; 461515091d37SBarry Smith vi = aj + ai[i]; 461615091d37SBarry Smith nz = diag[i] - ai[i]; 461715091d37SBarry Smith idx += 3; 4618f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 461915091d37SBarry Smith while (nz--) 
{ 462015091d37SBarry Smith jdx = 3*(*vi++); 462115091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4622f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4623f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4624f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 462515091d37SBarry Smith v += 9; 462615091d37SBarry Smith } 4627f1af5d2fSBarry Smith x[idx] = s1; 4628f1af5d2fSBarry Smith x[1+idx] = s2; 4629f1af5d2fSBarry Smith x[2+idx] = s3; 463015091d37SBarry Smith } 463115091d37SBarry Smith /* backward solve the upper triangular */ 463215091d37SBarry Smith for (i=n-1; i>=0; i--){ 463315091d37SBarry Smith v = aa + 9*diag[i] + 9; 463415091d37SBarry Smith vi = aj + diag[i] + 1; 463515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 463615091d37SBarry Smith idt = 3*i; 4637f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4638f1af5d2fSBarry Smith s3 = x[2+idt]; 463915091d37SBarry Smith while (nz--) { 464015091d37SBarry Smith idx = 3*(*vi++); 464115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4642f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4643f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4644f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 464515091d37SBarry Smith v += 9; 464615091d37SBarry Smith } 464715091d37SBarry Smith v = aa + 9*diag[i]; 4648f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4649f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4650f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 465115091d37SBarry Smith } 465215091d37SBarry Smith 4653d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46541ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4655dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 465615091d37SBarry Smith PetscFunctionReturn(0); 465715091d37SBarry Smith } 465815091d37SBarry Smith 4659cee9d6f2SShri Abhyankar #undef __FUNCT__ 4660a2d6a19aSShri Abhyankar #define 
__FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4661a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4662b2b2dd24SShri Abhyankar { 4663b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4664b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4665b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4666b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 4667b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4668b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4669b2b2dd24SShri Abhyankar PetscScalar *x; 4670b2b2dd24SShri Abhyankar const PetscScalar *b; 4671b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4672b2b2dd24SShri Abhyankar 4673b2b2dd24SShri Abhyankar PetscFunctionBegin; 4674b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4675b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4676b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4677b2b2dd24SShri Abhyankar idx = 0; 4678b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4679b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4680b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4681b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4682b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4683b2b2dd24SShri Abhyankar idx = bs*i; 4684b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4685b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4686b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4687b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4688b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4689b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4690b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4691b2b2dd24SShri Abhyankar 4692b2b2dd24SShri Abhyankar v += bs2; 4693b2b2dd24SShri Abhyankar } 4694b2b2dd24SShri Abhyankar 4695b2b2dd24SShri Abhyankar x[idx] = s1; 4696b2b2dd24SShri Abhyankar 
x[1+idx] = s2; 4697b2b2dd24SShri Abhyankar x[2+idx] = s3; 4698b2b2dd24SShri Abhyankar } 4699b2b2dd24SShri Abhyankar 4700b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4701b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4702b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4703b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4704b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4705b2b2dd24SShri Abhyankar idt = bs*i; 4706b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4707b2b2dd24SShri Abhyankar 4708b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4709b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4710b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4711b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4712b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4713b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4714b2b2dd24SShri Abhyankar 4715b2b2dd24SShri Abhyankar v += bs2; 4716b2b2dd24SShri Abhyankar } 4717b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4718b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4719b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4720b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4721b2b2dd24SShri Abhyankar 4722b2b2dd24SShri Abhyankar } 4723b2b2dd24SShri Abhyankar 4724b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4725b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4726b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4727b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4728b2b2dd24SShri Abhyankar } 4729b2b2dd24SShri Abhyankar 4730b2b2dd24SShri Abhyankar #undef __FUNCT__ 4731*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 4732*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 47334e2b4712SSatish Balay { 47344e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 
47354e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 47366849ba73SBarry Smith PetscErrorCode ierr; 47375d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 47385d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4739d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4740d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 4741d9fead3dSBarry Smith const PetscScalar *b; 47424e2b4712SSatish Balay 47434e2b4712SSatish Balay PetscFunctionBegin; 4744d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47451ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4746f1af5d2fSBarry Smith t = a->solve_work; 47474e2b4712SSatish Balay 47484e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 47494e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 47504e2b4712SSatish Balay 47514e2b4712SSatish Balay /* forward solve the lower triangular */ 47524e2b4712SSatish Balay idx = 2*(*r++); 4753f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 47544e2b4712SSatish Balay for (i=1; i<n; i++) { 47554e2b4712SSatish Balay v = aa + 4*ai[i]; 47564e2b4712SSatish Balay vi = aj + ai[i]; 47574e2b4712SSatish Balay nz = diag[i] - ai[i]; 47584e2b4712SSatish Balay idx = 2*(*r++); 4759f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 47604e2b4712SSatish Balay while (nz--) { 47614e2b4712SSatish Balay idx = 2*(*vi++); 4762f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4763f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4764f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 47654e2b4712SSatish Balay v += 4; 47664e2b4712SSatish Balay } 47674e2b4712SSatish Balay idx = 2*i; 4768f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 47694e2b4712SSatish Balay } 47704e2b4712SSatish Balay /* backward solve the upper triangular */ 47714e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 47724e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 47734e2b4712SSatish Balay vi = aj + diag[i] + 1; 
47744e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 47754e2b4712SSatish Balay idt = 2*i; 4776f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 47774e2b4712SSatish Balay while (nz--) { 47784e2b4712SSatish Balay idx = 2*(*vi++); 4779f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4780f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4781f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 47824e2b4712SSatish Balay v += 4; 47834e2b4712SSatish Balay } 47844e2b4712SSatish Balay idc = 2*(*c--); 47854e2b4712SSatish Balay v = aa + 4*diag[i]; 4786f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4787f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 47884e2b4712SSatish Balay } 47894e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 47904e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4791d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4793dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 47944e2b4712SSatish Balay PetscFunctionReturn(0); 47954e2b4712SSatish Balay } 47964e2b4712SSatish Balay 47970c4413a7SShri Abhyankar #undef __FUNCT__ 4798a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 4799a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 48000c4413a7SShri Abhyankar { 48010c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 48020c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 48030c4413a7SShri Abhyankar PetscErrorCode ierr; 48040c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 48050c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 48060c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 48070c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 48080c4413a7SShri Abhyankar const PetscScalar *b; 48090c4413a7SShri Abhyankar 
48100c4413a7SShri Abhyankar PetscFunctionBegin; 48110c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 48120c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 48130c4413a7SShri Abhyankar t = a->solve_work; 48140c4413a7SShri Abhyankar 48150c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 48160c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 48170c4413a7SShri Abhyankar 48180c4413a7SShri Abhyankar /* forward solve the lower triangular */ 48190c4413a7SShri Abhyankar idx = 2*r[0]; 48200c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 48210c4413a7SShri Abhyankar for (i=1; i<n; i++) { 48220c4413a7SShri Abhyankar v = aa + 4*ai[i]; 48230c4413a7SShri Abhyankar vi = aj + ai[i]; 48240c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 48250c4413a7SShri Abhyankar idx = 2*r[i]; 48260c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 48270c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 48280c4413a7SShri Abhyankar jdx = 2*vi[m]; 48290c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 48300c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 48310c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 48320c4413a7SShri Abhyankar v += 4; 48330c4413a7SShri Abhyankar } 48340c4413a7SShri Abhyankar idx = 2*i; 48350c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 48360c4413a7SShri Abhyankar } 48370c4413a7SShri Abhyankar /* backward solve the upper triangular */ 48380c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 48390c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 48400c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 48410c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 48420c4413a7SShri Abhyankar idt = 2*i; 48430c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 48440c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 48450c4413a7SShri Abhyankar idx = 2*vi[m]; 48460c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 48470c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 48480c4413a7SShri Abhyankar s2 -= 
v[1]*x1 + v[3]*x2; 48490c4413a7SShri Abhyankar v += 4; 48500c4413a7SShri Abhyankar } 48510c4413a7SShri Abhyankar idc = 2*c[i]; 48520c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 48530c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 48540c4413a7SShri Abhyankar } 48550c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 48560c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 48570c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 48580c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 48590c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 48600c4413a7SShri Abhyankar PetscFunctionReturn(0); 48610c4413a7SShri Abhyankar } 48628f690400SShri Abhyankar 486315091d37SBarry Smith /* 486415091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 486515091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
486615091d37SBarry Smith */ 48674a2ae208SSatish Balay #undef __FUNCT__ 4868*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 4869*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 487015091d37SBarry Smith { 487115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4872690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4873dfbe8321SBarry Smith PetscErrorCode ierr; 4874690b6cddSBarry Smith PetscInt *diag = a->diag; 4875d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4876d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 4877d9fead3dSBarry Smith const PetscScalar *b; 4878690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 487915091d37SBarry Smith 488015091d37SBarry Smith PetscFunctionBegin; 4881d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 48821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 488315091d37SBarry Smith 488415091d37SBarry Smith /* forward solve the lower triangular */ 488515091d37SBarry Smith idx = 0; 488615091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 488715091d37SBarry Smith for (i=1; i<n; i++) { 488815091d37SBarry Smith v = aa + 4*ai[i]; 488915091d37SBarry Smith vi = aj + ai[i]; 489015091d37SBarry Smith nz = diag[i] - ai[i]; 489115091d37SBarry Smith idx += 2; 4892f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 489315091d37SBarry Smith while (nz--) { 489415091d37SBarry Smith jdx = 2*(*vi++); 489515091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 4896f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4897f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 489815091d37SBarry Smith v += 4; 489915091d37SBarry Smith } 4900f1af5d2fSBarry Smith x[idx] = s1; 4901f1af5d2fSBarry Smith x[1+idx] = s2; 490215091d37SBarry Smith } 490315091d37SBarry Smith /* backward solve the upper triangular */ 490415091d37SBarry Smith for (i=n-1; i>=0; i--){ 490515091d37SBarry Smith v = aa + 4*diag[i] + 4; 490615091d37SBarry Smith vi = aj + diag[i] + 1; 
490715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 490815091d37SBarry Smith idt = 2*i; 4909f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 491015091d37SBarry Smith while (nz--) { 491115091d37SBarry Smith idx = 2*(*vi++); 491215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 4913f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4914f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 491515091d37SBarry Smith v += 4; 491615091d37SBarry Smith } 491715091d37SBarry Smith v = aa + 4*diag[i]; 4918f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 4919f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 492015091d37SBarry Smith } 492115091d37SBarry Smith 4922d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 49231ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4924dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 492515091d37SBarry Smith PetscFunctionReturn(0); 492615091d37SBarry Smith } 492715091d37SBarry Smith 4928cee9d6f2SShri Abhyankar #undef __FUNCT__ 4929a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4930a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4931b2b2dd24SShri Abhyankar { 4932b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4933b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4934b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4935b2b2dd24SShri Abhyankar PetscInt jdx; 4936b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4937b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 4938b2b2dd24SShri Abhyankar const PetscScalar *b; 4939b2b2dd24SShri Abhyankar 4940b2b2dd24SShri Abhyankar PetscFunctionBegin; 4941b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4942b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4943b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 
4944b2b2dd24SShri Abhyankar idx = 0; 4945b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4946b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4947b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 4948b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4949b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4950b2b2dd24SShri Abhyankar idx = 2*i; 4951b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4952b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4953b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 4954b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4955b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4956b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4957b2b2dd24SShri Abhyankar v += 4; 4958b2b2dd24SShri Abhyankar } 4959b2b2dd24SShri Abhyankar x[idx] = s1; 4960b2b2dd24SShri Abhyankar x[1+idx] = s2; 4961b2b2dd24SShri Abhyankar } 4962b2b2dd24SShri Abhyankar 4963b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4964b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4965b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 4966b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4967b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4968b2b2dd24SShri Abhyankar idt = 2*i; 4969b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4970b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4971b2b2dd24SShri Abhyankar idx = 2*vi[k]; 4972b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4973b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4974b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4975b2b2dd24SShri Abhyankar v += 4; 4976b2b2dd24SShri Abhyankar } 4977b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4978b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4979b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4980b2b2dd24SShri Abhyankar } 4981b2b2dd24SShri Abhyankar 4982b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4983b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4984b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 
2.0*A->cmap->n);CHKERRQ(ierr); 4985b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4986b2b2dd24SShri Abhyankar } 4987b2b2dd24SShri Abhyankar 4988b2b2dd24SShri Abhyankar #undef __FUNCT__ 4989*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 4990*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 49914e2b4712SSatish Balay { 49924e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 49934e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 49946849ba73SBarry Smith PetscErrorCode ierr; 49955d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 49965d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 49973f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 499887828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 49994e2b4712SSatish Balay 50004e2b4712SSatish Balay PetscFunctionBegin; 50014e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 50024e2b4712SSatish Balay 50031ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 50041ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5005f1af5d2fSBarry Smith t = a->solve_work; 50064e2b4712SSatish Balay 50074e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 50084e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 50094e2b4712SSatish Balay 50104e2b4712SSatish Balay /* forward solve the lower triangular */ 5011f1af5d2fSBarry Smith t[0] = b[*r++]; 50124e2b4712SSatish Balay for (i=1; i<n; i++) { 50134e2b4712SSatish Balay v = aa + ai[i]; 50144e2b4712SSatish Balay vi = aj + ai[i]; 50154e2b4712SSatish Balay nz = diag[i] - ai[i]; 5016f1af5d2fSBarry Smith s1 = b[*r++]; 50174e2b4712SSatish Balay while (nz--) { 5018f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 50194e2b4712SSatish Balay } 5020f1af5d2fSBarry Smith t[i] = s1; 50214e2b4712SSatish Balay } 50224e2b4712SSatish Balay /* backward solve the upper triangular */ 50234e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 50244e2b4712SSatish Balay v = 
aa + diag[i] + 1; 50254e2b4712SSatish Balay vi = aj + diag[i] + 1; 50264e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 5027f1af5d2fSBarry Smith s1 = t[i]; 50284e2b4712SSatish Balay while (nz--) { 5029f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 50304e2b4712SSatish Balay } 5031f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 50324e2b4712SSatish Balay } 50334e2b4712SSatish Balay 50344e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 50354e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 50361ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50371ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5038dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 50394e2b4712SSatish Balay PetscFunctionReturn(0); 50404e2b4712SSatish Balay } 504115091d37SBarry Smith /* 504215091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 504315091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
504415091d37SBarry Smith */ 50454a2ae208SSatish Balay #undef __FUNCT__ 5046*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 5047*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 504815091d37SBarry Smith { 504915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5050690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 5051dfbe8321SBarry Smith PetscErrorCode ierr; 5052690b6cddSBarry Smith PetscInt *diag = a->diag; 505315091d37SBarry Smith MatScalar *aa=a->a; 505487828ca2SBarry Smith PetscScalar *x,*b; 505587828ca2SBarry Smith PetscScalar s1,x1; 505615091d37SBarry Smith MatScalar *v; 5057690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 505815091d37SBarry Smith 505915091d37SBarry Smith PetscFunctionBegin; 50601ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 50611ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 506215091d37SBarry Smith 506315091d37SBarry Smith /* forward solve the lower triangular */ 506415091d37SBarry Smith idx = 0; 506515091d37SBarry Smith x[0] = b[0]; 506615091d37SBarry Smith for (i=1; i<n; i++) { 506715091d37SBarry Smith v = aa + ai[i]; 506815091d37SBarry Smith vi = aj + ai[i]; 506915091d37SBarry Smith nz = diag[i] - ai[i]; 507015091d37SBarry Smith idx += 1; 5071f1af5d2fSBarry Smith s1 = b[idx]; 507215091d37SBarry Smith while (nz--) { 507315091d37SBarry Smith jdx = *vi++; 507415091d37SBarry Smith x1 = x[jdx]; 5075f1af5d2fSBarry Smith s1 -= v[0]*x1; 507615091d37SBarry Smith v += 1; 507715091d37SBarry Smith } 5078f1af5d2fSBarry Smith x[idx] = s1; 507915091d37SBarry Smith } 508015091d37SBarry Smith /* backward solve the upper triangular */ 508115091d37SBarry Smith for (i=n-1; i>=0; i--){ 508215091d37SBarry Smith v = aa + diag[i] + 1; 508315091d37SBarry Smith vi = aj + diag[i] + 1; 508415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 508515091d37SBarry Smith idt = i; 5086f1af5d2fSBarry Smith s1 = x[idt]; 508715091d37SBarry Smith 
while (nz--) { 508815091d37SBarry Smith idx = *vi++; 508915091d37SBarry Smith x1 = x[idx]; 5090f1af5d2fSBarry Smith s1 -= v[0]*x1; 509115091d37SBarry Smith v += 1; 509215091d37SBarry Smith } 509315091d37SBarry Smith v = aa + diag[i]; 5094f1af5d2fSBarry Smith x[idt] = v[0]*s1; 509515091d37SBarry Smith } 50961ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5098dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 509915091d37SBarry Smith PetscFunctionReturn(0); 510015091d37SBarry Smith } 51014e2b4712SSatish Balay 51024e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 510316a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 51046bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 5105ae3d28f0SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth); 51066bce7ff8SHong Zhang 51076bce7ff8SHong Zhang #undef __FUNCT__ 51086bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 51096bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 51106bce7ff8SHong Zhang { 51116bce7ff8SHong Zhang Mat C=B; 51126bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 51136bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 51146bce7ff8SHong Zhang PetscErrorCode ierr; 51156bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 51166bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 51176bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5118b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5119914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5120914a18a2SHong Zhang MatScalar *v_work; 5121ae3d28f0SHong 
Zhang PetscTruth col_identity,row_identity,both_identity; 51226bce7ff8SHong Zhang 51236bce7ff8SHong Zhang PetscFunctionBegin; 51246bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 51256bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5126ae3d28f0SHong Zhang 5127fca92195SBarry Smith ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5128fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 51296bce7ff8SHong Zhang ics = ic; 51306bce7ff8SHong Zhang 5131914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 5132fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5133914a18a2SHong Zhang 51346bce7ff8SHong Zhang for (i=0; i<n; i++){ 51356bce7ff8SHong Zhang /* zero rtmp */ 51366bce7ff8SHong Zhang /* L part */ 51376bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 51386bce7ff8SHong Zhang bjtmp = bj + bi[i]; 5139914a18a2SHong Zhang for (j=0; j<nz; j++){ 5140914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5141914a18a2SHong Zhang } 51426bce7ff8SHong Zhang 51436bce7ff8SHong Zhang /* U part */ 51441a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 51451a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 51461a83e813SShri Abhyankar for (j=0; j<nz; j++){ 51471a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 51481a83e813SShri Abhyankar } 51491a83e813SShri Abhyankar 51501a83e813SShri Abhyankar /* load in initial (unfactored row) */ 51511a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 51521a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 51531a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 51541a83e813SShri Abhyankar for (j=0; j<nz; j++) { 51551a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 51561a83e813SShri Abhyankar } 51571a83e813SShri Abhyankar 51581a83e813SShri 
Abhyankar /* elimination */ 51591a83e813SShri Abhyankar bjtmp = bj + bi[i]; 51601a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 51611a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 51621a83e813SShri Abhyankar row = bjtmp[k]; 51631a83e813SShri Abhyankar pc = rtmp + bs2*row; 51641a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 51651a83e813SShri Abhyankar if (flg) { 51661a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 51671a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 51681a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 51691a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 51701a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 51711a83e813SShri Abhyankar for (j=0; j<nz; j++) { 51721a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 51731a83e813SShri Abhyankar } 51741a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 51751a83e813SShri Abhyankar } 51761a83e813SShri Abhyankar } 51771a83e813SShri Abhyankar 51781a83e813SShri Abhyankar /* finished row so stick it into b->a */ 51791a83e813SShri Abhyankar /* L part */ 51801a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 51811a83e813SShri Abhyankar pj = b->j + bi[i] ; 51821a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 51831a83e813SShri Abhyankar for (j=0; j<nz; j++) { 51841a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 51851a83e813SShri Abhyankar } 51861a83e813SShri Abhyankar 51871a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 51881a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 51891a83e813SShri Abhyankar pj = b->j + bdiag[i]; 51901a83e813SShri Abhyankar /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 51911a83e813SShri 
Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 51921a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 51931a83e813SShri Abhyankar 51941a83e813SShri Abhyankar /* U part */ 51951a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 51961a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 51971a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 51981a83e813SShri Abhyankar for (j=0; j<nz; j++){ 51991a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 52001a83e813SShri Abhyankar } 52011a83e813SShri Abhyankar } 52021a83e813SShri Abhyankar 52031a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 5204fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 52051a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 52061a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 52071a83e813SShri Abhyankar 5208ae3d28f0SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5209ae3d28f0SHong Zhang ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5210ae3d28f0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 5211ae3d28f0SHong Zhang if (both_identity){ 5212a2d6a19aSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 5213ae3d28f0SHong Zhang } else { 5214a2d6a19aSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 5215ae3d28f0SHong Zhang } 52168499736aSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N_newdatastruct; 5217ae3d28f0SHong Zhang 52181a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 52191a83e813SShri Abhyankar ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 52201a83e813SShri Abhyankar PetscFunctionReturn(0); 52211a83e813SShri Abhyankar } 52221a83e813SShri Abhyankar 52236bce7ff8SHong Zhang /* 52246bce7ff8SHong Zhang ilu(0) with natural 
ordering under new data structure. 522516a2bf60SHong Zhang See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 522616a2bf60SHong Zhang because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 52276bce7ff8SHong Zhang */ 5228c0c7eb62SShri Abhyankar 52296bce7ff8SHong Zhang #undef __FUNCT__ 52306bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 52316bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 52326bce7ff8SHong Zhang { 52336bce7ff8SHong Zhang 52346bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 52356bce7ff8SHong Zhang PetscErrorCode ierr; 523616a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 523735aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 523835aa4fcfSShri Abhyankar 523935aa4fcfSShri Abhyankar PetscFunctionBegin; 524035aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 524135aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 524235aa4fcfSShri Abhyankar 524335aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 524435aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 524535aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 524635aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 524735aa4fcfSShri Abhyankar if (!b->diag){ 524835aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 524935aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 525035aa4fcfSShri Abhyankar } 525135aa4fcfSShri Abhyankar bdiag = b->diag; 525235aa4fcfSShri Abhyankar 525335aa4fcfSShri Abhyankar if (n > 0) { 
525435aa4fcfSShri Abhyankar ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 525535aa4fcfSShri Abhyankar } 525635aa4fcfSShri Abhyankar 525735aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 525835aa4fcfSShri Abhyankar bi = b->i; 525935aa4fcfSShri Abhyankar bj = b->j; 526035aa4fcfSShri Abhyankar 526135aa4fcfSShri Abhyankar /* L part */ 526235aa4fcfSShri Abhyankar bi[0] = 0; 526335aa4fcfSShri Abhyankar for (i=0; i<n; i++){ 526435aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 526535aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 526635aa4fcfSShri Abhyankar aj = a->j + ai[i]; 526735aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 526835aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 526935aa4fcfSShri Abhyankar } 527035aa4fcfSShri Abhyankar } 527135aa4fcfSShri Abhyankar 527235aa4fcfSShri Abhyankar /* U part */ 527335aa4fcfSShri Abhyankar bi_temp = bi[n]; 527435aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 527535aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 527635aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 527735aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 527835aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 527935aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 528035aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 528135aa4fcfSShri Abhyankar } 528235aa4fcfSShri Abhyankar /* diag[i] */ 528335aa4fcfSShri Abhyankar *bj = i; bj++; 528435aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 528535aa4fcfSShri Abhyankar } 528635aa4fcfSShri Abhyankar PetscFunctionReturn(0); 528735aa4fcfSShri Abhyankar } 528835aa4fcfSShri Abhyankar 528935aa4fcfSShri Abhyankar #undef __FUNCT__ 529016a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 529116a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 529216a2bf60SHong Zhang { 529316a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 529416a2bf60SHong Zhang IS isicol; 529516a2bf60SHong Zhang PetscErrorCode ierr; 
529616a2bf60SHong Zhang const PetscInt *r,*ic; 52977fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 529816a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 529916a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 530016a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 53017fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 530216a2bf60SHong Zhang PetscReal f; 530316a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 530416a2bf60SHong Zhang PetscBT lnkbt; 530516a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 530616a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 530716a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 530816a2bf60SHong Zhang PetscTruth missing; 53097fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 5310*06e38f1dSHong Zhang PetscTruth newdatastruct = PETSC_FALSE; 531116a2bf60SHong Zhang 531216a2bf60SHong Zhang PetscFunctionBegin; 5313*06e38f1dSHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_old",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 5314*06e38f1dSHong Zhang if (newdatastruct){ 5315*06e38f1dSHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_inplace(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5316*06e38f1dSHong Zhang PetscFunctionReturn(0); 5317*06e38f1dSHong Zhang } 531816a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 531916a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 532016a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 532116a2bf60SHong Zhang 532216a2bf60SHong Zhang f = info->fill; 532316a2bf60SHong Zhang levels = (PetscInt)info->levels; 532416a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 532516a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 532616a2bf60SHong 
Zhang 532716a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 532816a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 53297fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 533016a2bf60SHong Zhang 53317fa3a6a0SHong Zhang if (!levels && both_identity) { 533216a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 533316a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5334ae3d28f0SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 533535aa4fcfSShri Abhyankar 533635aa4fcfSShri Abhyankar fact->factor = MAT_FACTOR_ILU; 533735aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 533835aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 533935aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 534035aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 534135aa4fcfSShri Abhyankar b->row = isrow; 534235aa4fcfSShri Abhyankar b->col = iscol; 534335aa4fcfSShri Abhyankar b->icol = isicol; 534435aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 534535aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 534635aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 534735aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 534835aa4fcfSShri Abhyankar PetscFunctionReturn(0); 534935aa4fcfSShri Abhyankar } 535035aa4fcfSShri Abhyankar 535135aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 535235aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 535335aa4fcfSShri Abhyankar 535435aa4fcfSShri Abhyankar /* get new row pointers */ 535535aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 535635aa4fcfSShri Abhyankar bi[0] = 0; 535735aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 535835aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 535935aa4fcfSShri Abhyankar bdiag[0] = 0; 536035aa4fcfSShri Abhyankar 5361fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 536235aa4fcfSShri Abhyankar 536335aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active row */ 536435aa4fcfSShri Abhyankar nlnk = n + 1; 536535aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 536635aa4fcfSShri Abhyankar 536735aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 536835aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 536935aa4fcfSShri Abhyankar current_space = free_space; 537035aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 537135aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 537235aa4fcfSShri Abhyankar 537335aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 537435aa4fcfSShri Abhyankar nzi = 0; 537535aa4fcfSShri Abhyankar /* copy current row into linked list */ 537635aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 537735aa4fcfSShri Abhyankar if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original 
ordering %D in permuted ordering %D",r[i],i); 537835aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 537935aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 538035aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 538135aa4fcfSShri Abhyankar nzi += nlnk; 538235aa4fcfSShri Abhyankar 538335aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 538435aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 538535aa4fcfSShri Abhyankar fm = n; 538635aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 538735aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 538835aa4fcfSShri Abhyankar lnk[fm] = i; 538935aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 539035aa4fcfSShri Abhyankar nzi++; dcount++; 539135aa4fcfSShri Abhyankar } 539235aa4fcfSShri Abhyankar 539335aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 539435aa4fcfSShri Abhyankar nzbd = 0; 539535aa4fcfSShri Abhyankar prow = lnk[n]; 539635aa4fcfSShri Abhyankar while (prow < i) { 539735aa4fcfSShri Abhyankar nnz = bdiag[prow]; 539835aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 539935aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 540035aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 540135aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 540235aa4fcfSShri Abhyankar nzi += nlnk; 540335aa4fcfSShri Abhyankar prow = lnk[prow]; 540435aa4fcfSShri Abhyankar nzbd++; 540535aa4fcfSShri Abhyankar } 540635aa4fcfSShri Abhyankar bdiag[i] = nzbd; 540735aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 540835aa4fcfSShri Abhyankar 540935aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 541035aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 541135aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 541235aa4fcfSShri Abhyankar ierr = 
PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 541335aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 541435aa4fcfSShri Abhyankar reallocs++; 541535aa4fcfSShri Abhyankar } 541635aa4fcfSShri Abhyankar 541735aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 541835aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 541935aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 542035aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 542135aa4fcfSShri Abhyankar 542235aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 542335aa4fcfSShri Abhyankar if (*(bj_ptr[i]+bdiag[i]) != i) { 542435aa4fcfSShri Abhyankar SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 542535aa4fcfSShri Abhyankar try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 542635aa4fcfSShri Abhyankar } 542735aa4fcfSShri Abhyankar 542835aa4fcfSShri Abhyankar current_space->array += nzi; 542935aa4fcfSShri Abhyankar current_space->local_used += nzi; 543035aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 543135aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 543235aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 543335aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 543435aa4fcfSShri Abhyankar } 543535aa4fcfSShri Abhyankar 543635aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 543735aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 543835aa4fcfSShri Abhyankar 543935aa4fcfSShri Abhyankar /* destroy list of free space and other temporary arrays */ 544035aa4fcfSShri Abhyankar ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 544135aa4fcfSShri Abhyankar 544235aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new 
datastructure; */ 544335aa4fcfSShri Abhyankar ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 544435aa4fcfSShri Abhyankar 544535aa4fcfSShri Abhyankar ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 544635aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5447fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 544835aa4fcfSShri Abhyankar 544935aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 545035aa4fcfSShri Abhyankar { 545135aa4fcfSShri Abhyankar PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 545235aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 545335aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 545435aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 545535aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 545635aa4fcfSShri Abhyankar if (diagonal_fill) { 545735aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 545835aa4fcfSShri Abhyankar } 545935aa4fcfSShri Abhyankar } 546035aa4fcfSShri Abhyankar #endif 546135aa4fcfSShri Abhyankar 546235aa4fcfSShri Abhyankar /* put together the new matrix */ 546335aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 546435aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 546535aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 546635aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 546735aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 546835aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 546935aa4fcfSShri Abhyankar ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 547035aa4fcfSShri Abhyankar b->j = bj; 547135aa4fcfSShri Abhyankar b->i = bi; 
547235aa4fcfSShri Abhyankar b->diag = bdiag; 547335aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 547435aa4fcfSShri Abhyankar b->ilen = 0; 547535aa4fcfSShri Abhyankar b->imax = 0; 547635aa4fcfSShri Abhyankar b->row = isrow; 547735aa4fcfSShri Abhyankar b->col = iscol; 547835aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 547935aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 548035aa4fcfSShri Abhyankar b->icol = isicol; 548135aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 548235aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 548335aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 548435aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 548535aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 5486ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 5487ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 5488ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5489ae3d28f0SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 549035aa4fcfSShri Abhyankar PetscFunctionReturn(0); 549135aa4fcfSShri Abhyankar } 549235aa4fcfSShri Abhyankar 549335aa4fcfSShri Abhyankar 54944e2b4712SSatish Balay /* 54954e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 54964e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 54974e2b4712SSatish Balay Not a good example of code reuse. 
54984e2b4712SSatish Balay */ 54994a2ae208SSatish Balay #undef __FUNCT__ 5500*06e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 5501*06e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 55024e2b4712SSatish Balay { 55034e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 55044e2b4712SSatish Balay IS isicol; 55056849ba73SBarry Smith PetscErrorCode ierr; 55065d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 55075d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5508a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5509d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 551041df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 5511329f5518SBarry Smith PetscReal f; 55124e2b4712SSatish Balay 55134e2b4712SSatish Balay PetscFunctionBegin; 55146bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 55156bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 55166bce7ff8SHong Zhang 5517435faa5fSBarry Smith f = info->fill; 5518690b6cddSBarry Smith levels = (PetscInt)info->levels; 5519690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 55204c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 552116a2bf60SHong Zhang 5522667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5523667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 55247d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 5525309c388cSBarry Smith 552641df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 552716a2bf60SHong Zhang ierr = 
MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 55286bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 55296bce7ff8SHong Zhang 5530719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 5531ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5532bb3d539aSBarry Smith b->row = isrow; 5533bb3d539aSBarry Smith b->col = iscol; 5534bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5535bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5536bb3d539aSBarry Smith b->icol = isicol; 5537bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5538b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 55396bce7ff8SHong Zhang PetscFunctionReturn(0); 55406bce7ff8SHong Zhang } 55416bce7ff8SHong Zhang 55426bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 55434e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 55444e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 55454e2b4712SSatish Balay 55464e2b4712SSatish Balay /* get new row pointers */ 5547690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 55484e2b4712SSatish Balay ainew[0] = 0; 55494e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 5550690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 5551690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 55524e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 5553690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 55544e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 5555690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 55564e2b4712SSatish Balay /* im is level for each filled value */ 
5557690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 55584e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 5559690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 55604e2b4712SSatish Balay dloc[0] = 0; 55614e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 5562435faa5fSBarry Smith 5563435faa5fSBarry Smith /* copy prow into linked list */ 55644e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 55653b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 55664e2b4712SSatish Balay xi = aj + ai[r[prow]]; 55674e2b4712SSatish Balay fill[n] = n; 5568435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 55694e2b4712SSatish Balay while (nz--) { 55704e2b4712SSatish Balay fm = n; 55714e2b4712SSatish Balay idx = ic[*xi++]; 55724e2b4712SSatish Balay do { 55734e2b4712SSatish Balay m = fm; 55744e2b4712SSatish Balay fm = fill[m]; 55754e2b4712SSatish Balay } while (fm < idx); 55764e2b4712SSatish Balay fill[m] = idx; 55774e2b4712SSatish Balay fill[idx] = fm; 55784e2b4712SSatish Balay im[idx] = 0; 55794e2b4712SSatish Balay } 5580435faa5fSBarry Smith 5581435faa5fSBarry Smith /* make sure diagonal entry is included */ 5582435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 5583435faa5fSBarry Smith fm = n; 5584435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 5585435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5586435faa5fSBarry Smith fill[fm] = prow; 5587435faa5fSBarry Smith im[prow] = 0; 5588435faa5fSBarry Smith nzf++; 5589335d9088SBarry Smith dcount++; 5590435faa5fSBarry Smith } 5591435faa5fSBarry Smith 55924e2b4712SSatish Balay nzi = 0; 55934e2b4712SSatish Balay row = fill[n]; 55944e2b4712SSatish Balay while (row < prow) { 55954e2b4712SSatish Balay incrlev = im[row] + 1; 55964e2b4712SSatish Balay nz = dloc[row]; 
5597435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 55984e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 55994e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 56004e2b4712SSatish Balay fm = row; 56014e2b4712SSatish Balay while (nnz-- > 0) { 56024e2b4712SSatish Balay idx = *xi++; 56034e2b4712SSatish Balay if (*flev + incrlev > levels) { 56044e2b4712SSatish Balay flev++; 56054e2b4712SSatish Balay continue; 56064e2b4712SSatish Balay } 56074e2b4712SSatish Balay do { 56084e2b4712SSatish Balay m = fm; 56094e2b4712SSatish Balay fm = fill[m]; 56104e2b4712SSatish Balay } while (fm < idx); 56114e2b4712SSatish Balay if (fm != idx) { 56124e2b4712SSatish Balay im[idx] = *flev + incrlev; 56134e2b4712SSatish Balay fill[m] = idx; 56144e2b4712SSatish Balay fill[idx] = fm; 56154e2b4712SSatish Balay fm = idx; 56164e2b4712SSatish Balay nzf++; 5617ecf371e4SBarry Smith } else { 56184e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 56194e2b4712SSatish Balay } 56204e2b4712SSatish Balay flev++; 56214e2b4712SSatish Balay } 56224e2b4712SSatish Balay row = fill[row]; 56234e2b4712SSatish Balay nzi++; 56244e2b4712SSatish Balay } 56254e2b4712SSatish Balay /* copy new filled row into permanent storage */ 56264e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 56274e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 5628ecf371e4SBarry Smith 5629ecf371e4SBarry Smith /* estimate how much additional space we will need */ 5630ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5631ecf371e4SBarry Smith /* just double the memory each time */ 5632690b6cddSBarry Smith PetscInt maxadd = jmax; 5633ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 56344e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 56354e2b4712SSatish Balay jmax += maxadd; 5636ecf371e4SBarry Smith 5637ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 56385d0c19d7SBarry Smith ierr = 
PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 56395d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5640606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 56415d0c19d7SBarry Smith ajnew = xitmp; 56425d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 56435d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5644606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 56455d0c19d7SBarry Smith ajfill = xitmp; 5646eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 56474e2b4712SSatish Balay } 56485d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 56494e2b4712SSatish Balay flev = ajfill + ainew[prow]; 56504e2b4712SSatish Balay dloc[prow] = nzi; 56514e2b4712SSatish Balay fm = fill[n]; 56524e2b4712SSatish Balay while (nzf--) { 56535d0c19d7SBarry Smith *xitmp++ = fm; 56544e2b4712SSatish Balay *flev++ = im[fm]; 56554e2b4712SSatish Balay fm = fill[fm]; 56564e2b4712SSatish Balay } 5657435faa5fSBarry Smith /* make sure row has diagonal entry */ 5658435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 565977431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 56602401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5661435faa5fSBarry Smith } 56624e2b4712SSatish Balay } 5663606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 56644e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 56654e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5666606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 5667606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 56684e2b4712SSatish Balay 56696cf91177SBarry Smith #if defined(PETSC_USE_INFO) 56704e2b4712SSatish Balay { 5671329f5518SBarry Smith PetscReal af = 
((PetscReal)ainew[n])/((PetscReal)ai[n]); 5672ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5673ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5674ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5675ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5676335d9088SBarry Smith if (diagonal_fill) { 5677ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5678335d9088SBarry Smith } 56794e2b4712SSatish Balay } 568063ba0a88SBarry Smith #endif 56814e2b4712SSatish Balay 56824e2b4712SSatish Balay /* put together the new matrix */ 5683719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5684719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5685ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5686e6b907acSBarry Smith b->free_a = PETSC_TRUE; 5687e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 56887c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 5689a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 56904e2b4712SSatish Balay b->j = ajnew; 56914e2b4712SSatish Balay b->i = ainew; 56924e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 56934e2b4712SSatish Balay b->diag = dloc; 56947f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 56954e2b4712SSatish Balay b->ilen = 0; 56964e2b4712SSatish Balay b->imax = 0; 56974e2b4712SSatish Balay b->row = isrow; 56984e2b4712SSatish Balay b->col = iscol; 5699bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5700c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5701c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5702e51c0b9cSSatish Balay b->icol = isicol; 570387828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 57044e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 57054e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 5706719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 57074e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 57084e2b4712SSatish Balay 5709ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 5710ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 5711ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 57126bce7ff8SHong Zhang 571341df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 57148661488fSKris Buschelman PetscFunctionReturn(0); 57158661488fSKris Buschelman } 57168661488fSKris Buschelman 5717732ee342SKris Buschelman #undef __FUNCT__ 57187e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5719dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 57207e7071cdSKris Buschelman { 572112272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 572212272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 57235a9542e3SKris Buschelman PetscFunctionBegin; 57247cf1b8d3SKris Buschelman /* Undo Column scaling */ 57257cf1b8d3SKris Buschelman /* while (nz--) { */ 57267cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 57277cf1b8d3SKris Buschelman /* } */ 5728c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 5729c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 57307cf1b8d3SKris Buschelman PetscFunctionReturn(0); 57317cf1b8d3SKris Buschelman } 57327cf1b8d3SKris Buschelman 57337cf1b8d3SKris Buschelman #undef __FUNCT__ 57347cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5735dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 57367cf1b8d3SKris Buschelman { 57377cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5738b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 57392aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 57405a9542e3SKris Buschelman PetscFunctionBegin; 57410b9da03eSKris Buschelman /* Is this really necessary? */ 574220235379SKris Buschelman while (nz--) { 57430b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 57447e7071cdSKris Buschelman } 5745c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 57467e7071cdSKris Buschelman PetscFunctionReturn(0); 57477e7071cdSKris Buschelman } 57487e7071cdSKris Buschelman 5749732ee342SKris Buschelman 5750