1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 34e2b4712SSatish Balay /* 44e2b4712SSatish Balay Factorization code for BAIJ format. 54e2b4712SSatish Balay */ 64e2b4712SSatish Balay 77c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 8c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 916a2bf60SHong Zhang #include "petscbt.h" 1016a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 114e2b4712SSatish Balay 124a2ae208SSatish Balay #undef __FUNCT__ 1306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 1406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 15f1af5d2fSBarry Smith { 16f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 17dfbe8321SBarry Smith PetscErrorCode ierr; 180b68f018SBarry Smith PetscInt i,nz; 190b68f018SBarry Smith const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 200b68f018SBarry Smith const MatScalar *aa=a->a,*v; 210b68f018SBarry Smith PetscScalar s1,*x; 220b68f018SBarry Smith const PetscScalar *b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 260b68f018SBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry 
Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry Smith v = aa + diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 520b68f018SBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 5906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 6006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 65b3260449SShri Abhyankar const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 66b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 67b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x; 68b3260449SShri Abhyankar const PetscScalar *b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + 
v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 111b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1184dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 1194dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 1206929473cSShri Abhyankar { 1216929473cSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1226929473cSShri Abhyankar PetscErrorCode ierr; 123b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1246929473cSShri 
Abhyankar PetscInt nz,idx,idt,j,i,oidx; 125b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 126b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 127b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x; 128b3260449SShri Abhyankar const PetscScalar *b; 1296929473cSShri Abhyankar 1306929473cSShri Abhyankar PetscFunctionBegin; 1316929473cSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 132b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1336929473cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1346929473cSShri Abhyankar 1356929473cSShri Abhyankar /* forward solve the U^T */ 1366929473cSShri Abhyankar idx = 0; 1376929473cSShri Abhyankar for (i=0; i<n; i++) { 1386929473cSShri Abhyankar v = aa + bs2*diag[i]; 1396929473cSShri Abhyankar /* multiply by the inverse of the block diagonal */ 1406929473cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 1416929473cSShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 1426929473cSShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 1436929473cSShri Abhyankar v -= bs2; 1446929473cSShri Abhyankar 1456929473cSShri Abhyankar vi = aj + diag[i] - 1; 1466929473cSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 1476929473cSShri Abhyankar for(j=0;j>-nz;j--){ 1486929473cSShri Abhyankar oidx = bs*vi[j]; 1496929473cSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2; 1506929473cSShri Abhyankar x[oidx+1] -= v[2]*s1 + v[3]*s2; 1516929473cSShri Abhyankar v -= bs2; 1526929473cSShri Abhyankar } 1536929473cSShri Abhyankar x[idx] = s1;x[1+idx] = s2; 1546929473cSShri Abhyankar idx += bs; 1556929473cSShri Abhyankar } 1566929473cSShri Abhyankar /* backward solve the L^T */ 1576929473cSShri Abhyankar for (i=n-1; i>=0; i--){ 1586929473cSShri Abhyankar v = aa + bs2*ai[i]; 1596929473cSShri Abhyankar vi = aj + ai[i]; 1606929473cSShri Abhyankar nz = ai[i+1] - ai[i]; 1616929473cSShri Abhyankar idt = bs*i; 1626929473cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 1636929473cSShri Abhyankar for(j=0;j<nz;j++){ 1646929473cSShri Abhyankar idx = 
bs*vi[j]; 1656929473cSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2; 1666929473cSShri Abhyankar x[idx+1] -= v[2]*s1 + v[3]*s2; 1676929473cSShri Abhyankar v += bs2; 1686929473cSShri Abhyankar } 1696929473cSShri Abhyankar } 170b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1716929473cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1726929473cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1736929473cSShri Abhyankar PetscFunctionReturn(0); 1746929473cSShri Abhyankar } 1756929473cSShri Abhyankar 1766929473cSShri Abhyankar #undef __FUNCT__ 17706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 17806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 179f1af5d2fSBarry Smith { 180f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181dfbe8321SBarry Smith PetscErrorCode ierr; 182b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 183b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 184b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 185b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x; 186b3260449SShri Abhyankar const PetscScalar *b; 187f1af5d2fSBarry Smith 188f1af5d2fSBarry Smith PetscFunctionBegin; 189ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 190b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192f1af5d2fSBarry Smith 193f1af5d2fSBarry Smith /* forward solve the U^T */ 194f1af5d2fSBarry Smith idx = 0; 195f1af5d2fSBarry Smith for (i=0; i<n; i++) { 196f1af5d2fSBarry Smith 197f1af5d2fSBarry Smith v = aa + 9*diag[i]; 198f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 199ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 200f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 201f1af5d2fSBarry Smith 
s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 202f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 203f1af5d2fSBarry Smith v += 9; 204f1af5d2fSBarry Smith 205f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 206f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 207f1af5d2fSBarry Smith while (nz--) { 208f1af5d2fSBarry Smith oidx = 3*(*vi++); 209f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 210f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 211f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 212f1af5d2fSBarry Smith v += 9; 213f1af5d2fSBarry Smith } 214f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 215f1af5d2fSBarry Smith idx += 3; 216f1af5d2fSBarry Smith } 217f1af5d2fSBarry Smith /* backward solve the L^T */ 218f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 219f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 220f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 221f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 222f1af5d2fSBarry Smith idt = 3*i; 223f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 224f1af5d2fSBarry Smith while (nz--) { 225f1af5d2fSBarry Smith idx = 3*(*vi--); 226f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 227f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 228f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 229f1af5d2fSBarry Smith v -= 9; 230f1af5d2fSBarry Smith } 231f1af5d2fSBarry Smith } 232b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 235f1af5d2fSBarry Smith PetscFunctionReturn(0); 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith 2384a2ae208SSatish Balay #undef __FUNCT__ 2394dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 2404dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 
2418499736aSShri Abhyankar { 2428499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2438499736aSShri Abhyankar PetscErrorCode ierr; 244b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2458499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 246b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 247b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 248b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x; 249b3260449SShri Abhyankar const PetscScalar *b; 2508499736aSShri Abhyankar 2518499736aSShri Abhyankar PetscFunctionBegin; 2528499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 253b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2548499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2558499736aSShri Abhyankar 2568499736aSShri Abhyankar /* forward solve the U^T */ 2578499736aSShri Abhyankar idx = 0; 2588499736aSShri Abhyankar for (i=0; i<n; i++) { 2598499736aSShri Abhyankar v = aa + bs2*diag[i]; 2608499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 2618499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2628499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 2638499736aSShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 2648499736aSShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 2658499736aSShri Abhyankar v -= bs2; 2668499736aSShri Abhyankar 2678499736aSShri Abhyankar vi = aj + diag[i] - 1; 2688499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 2698499736aSShri Abhyankar for(j=0;j>-nz;j--){ 2708499736aSShri Abhyankar oidx = bs*vi[j]; 2718499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 2728499736aSShri Abhyankar x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 2738499736aSShri Abhyankar x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 2748499736aSShri Abhyankar v -= bs2; 2758499736aSShri Abhyankar } 2768499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 2778499736aSShri Abhyankar idx += bs; 
2788499736aSShri Abhyankar } 2798499736aSShri Abhyankar /* backward solve the L^T */ 2808499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 2818499736aSShri Abhyankar v = aa + bs2*ai[i]; 2828499736aSShri Abhyankar vi = aj + ai[i]; 2838499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 2848499736aSShri Abhyankar idt = bs*i; 2858499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 2868499736aSShri Abhyankar for(j=0;j<nz;j++){ 2878499736aSShri Abhyankar idx = bs*vi[j]; 2888499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 2898499736aSShri Abhyankar x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 2908499736aSShri Abhyankar x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 2918499736aSShri Abhyankar v += bs2; 2928499736aSShri Abhyankar } 2938499736aSShri Abhyankar } 294b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2958499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2968499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2978499736aSShri Abhyankar PetscFunctionReturn(0); 2988499736aSShri Abhyankar } 2998499736aSShri Abhyankar 3008499736aSShri Abhyankar #undef __FUNCT__ 30106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 30206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 303f1af5d2fSBarry Smith { 304f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305dfbe8321SBarry Smith PetscErrorCode ierr; 306b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 307b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 308b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 309b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 310b3260449SShri Abhyankar const PetscScalar *b; 311f1af5d2fSBarry Smith 312f1af5d2fSBarry Smith PetscFunctionBegin; 313ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 314b3260449SShri Abhyankar ierr = 
VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3151ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316f1af5d2fSBarry Smith 317f1af5d2fSBarry Smith /* forward solve the U^T */ 318f1af5d2fSBarry Smith idx = 0; 319f1af5d2fSBarry Smith for (i=0; i<n; i++) { 320f1af5d2fSBarry Smith 321f1af5d2fSBarry Smith v = aa + 16*diag[i]; 322f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 323ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 324f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 325f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 326f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 327f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 328f1af5d2fSBarry Smith v += 16; 329f1af5d2fSBarry Smith 330f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 331f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 332f1af5d2fSBarry Smith while (nz--) { 333f1af5d2fSBarry Smith oidx = 4*(*vi++); 334f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 335f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 336f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 337f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 338f1af5d2fSBarry Smith v += 16; 339f1af5d2fSBarry Smith } 340f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 341f1af5d2fSBarry Smith idx += 4; 342f1af5d2fSBarry Smith } 343f1af5d2fSBarry Smith /* backward solve the L^T */ 344f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 345f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 346f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 347f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 348f1af5d2fSBarry Smith idt = 4*i; 349f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 350f1af5d2fSBarry Smith while (nz--) { 351f1af5d2fSBarry Smith idx = 4*(*vi--); 352f1af5d2fSBarry Smith x[idx] -= v[0]*s1 
+ v[1]*s2 + v[2]*s3 + v[3]*s4; 353f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 354f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 355f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 356f1af5d2fSBarry Smith v -= 16; 357f1af5d2fSBarry Smith } 358f1af5d2fSBarry Smith } 359b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3601ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 361dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 362f1af5d2fSBarry Smith PetscFunctionReturn(0); 363f1af5d2fSBarry Smith } 364f1af5d2fSBarry Smith 3654a2ae208SSatish Balay #undef __FUNCT__ 3664dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 3674dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 3688499736aSShri Abhyankar { 3698499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3708499736aSShri Abhyankar PetscErrorCode ierr; 371b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 3728499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 373b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 374b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 375b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 376b3260449SShri Abhyankar const PetscScalar *b; 3778499736aSShri Abhyankar 3788499736aSShri Abhyankar PetscFunctionBegin; 3798499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 380b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3818499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3828499736aSShri Abhyankar 3838499736aSShri Abhyankar /* forward solve the U^T */ 3848499736aSShri Abhyankar idx = 0; 3858499736aSShri Abhyankar for (i=0; i<n; i++) { 3868499736aSShri Abhyankar v = aa + bs2*diag[i]; 3878499736aSShri Abhyankar /* 
multiply by the inverse of the block diagonal */ 3888499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 3898499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 3908499736aSShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 3918499736aSShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 3928499736aSShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 3938499736aSShri Abhyankar v -= bs2; 3948499736aSShri Abhyankar 3958499736aSShri Abhyankar vi = aj + diag[i] - 1; 3968499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 3978499736aSShri Abhyankar for(j=0;j>-nz;j--){ 3988499736aSShri Abhyankar oidx = bs*vi[j]; 3998499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4008499736aSShri Abhyankar x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4018499736aSShri Abhyankar x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4028499736aSShri Abhyankar x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4038499736aSShri Abhyankar v -= bs2; 4048499736aSShri Abhyankar } 4058499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 4068499736aSShri Abhyankar idx += bs; 4078499736aSShri Abhyankar } 4088499736aSShri Abhyankar /* backward solve the L^T */ 4098499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 4108499736aSShri Abhyankar v = aa + bs2*ai[i]; 4118499736aSShri Abhyankar vi = aj + ai[i]; 4128499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 4138499736aSShri Abhyankar idt = bs*i; 4148499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 4158499736aSShri Abhyankar for(j=0;j<nz;j++){ 4168499736aSShri Abhyankar idx = bs*vi[j]; 4178499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4188499736aSShri Abhyankar x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4198499736aSShri Abhyankar x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4208499736aSShri Abhyankar x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 
4218499736aSShri Abhyankar v += bs2; 4228499736aSShri Abhyankar } 4238499736aSShri Abhyankar } 424b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4258499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4268499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4278499736aSShri Abhyankar PetscFunctionReturn(0); 4288499736aSShri Abhyankar } 4298499736aSShri Abhyankar 4308499736aSShri Abhyankar #undef __FUNCT__ 43106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 43206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 433f1af5d2fSBarry Smith { 434f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 435dfbe8321SBarry Smith PetscErrorCode ierr; 436b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 437b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 438b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 439b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 440b3260449SShri Abhyankar const PetscScalar *b; 441f1af5d2fSBarry Smith 442f1af5d2fSBarry Smith PetscFunctionBegin; 443ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 444b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4451ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 446f1af5d2fSBarry Smith 447f1af5d2fSBarry Smith /* forward solve the U^T */ 448f1af5d2fSBarry Smith idx = 0; 449f1af5d2fSBarry Smith for (i=0; i<n; i++) { 450f1af5d2fSBarry Smith 451f1af5d2fSBarry Smith v = aa + 25*diag[i]; 452f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 453ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 454f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 455f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 
456f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 457f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 458f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 459f1af5d2fSBarry Smith v += 25; 460f1af5d2fSBarry Smith 461f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 462f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 463f1af5d2fSBarry Smith while (nz--) { 464f1af5d2fSBarry Smith oidx = 5*(*vi++); 465f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 466f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 467f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 468f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 469f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 470f1af5d2fSBarry Smith v += 25; 471f1af5d2fSBarry Smith } 472f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 473f1af5d2fSBarry Smith idx += 5; 474f1af5d2fSBarry Smith } 475f1af5d2fSBarry Smith /* backward solve the L^T */ 476f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 477f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 478f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 479f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 480f1af5d2fSBarry Smith idt = 5*i; 481f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 482f1af5d2fSBarry Smith while (nz--) { 483f1af5d2fSBarry Smith idx = 5*(*vi--); 484f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 485f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 486f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 487f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 488f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + 
v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 489f1af5d2fSBarry Smith v -= 25; 490f1af5d2fSBarry Smith } 491f1af5d2fSBarry Smith } 492b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 494dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 495f1af5d2fSBarry Smith PetscFunctionReturn(0); 496f1af5d2fSBarry Smith } 497f1af5d2fSBarry Smith 4984a2ae208SSatish Balay #undef __FUNCT__ 4994dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 5004dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 5018499736aSShri Abhyankar { 5028499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5038499736aSShri Abhyankar PetscErrorCode ierr; 504b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5058499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 506b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 507b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 508b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 509b3260449SShri Abhyankar const PetscScalar *b; 5108499736aSShri Abhyankar 5118499736aSShri Abhyankar PetscFunctionBegin; 5128499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 513b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5148499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5158499736aSShri Abhyankar 5168499736aSShri Abhyankar /* forward solve the U^T */ 5178499736aSShri Abhyankar idx = 0; 5188499736aSShri Abhyankar for (i=0; i<n; i++) { 5198499736aSShri Abhyankar v = aa + bs2*diag[i]; 5208499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 5218499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 5228499736aSShri Abhyankar x5 = x[4+idx]; 5238499736aSShri Abhyankar s1 = 
v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 5248499736aSShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 5258499736aSShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 5268499736aSShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 5278499736aSShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 5288499736aSShri Abhyankar v -= bs2; 5298499736aSShri Abhyankar 5308499736aSShri Abhyankar vi = aj + diag[i] - 1; 5318499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 5328499736aSShri Abhyankar for(j=0;j>-nz;j--){ 5338499736aSShri Abhyankar oidx = bs*vi[j]; 5348499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5358499736aSShri Abhyankar x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5368499736aSShri Abhyankar x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5378499736aSShri Abhyankar x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5388499736aSShri Abhyankar x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5398499736aSShri Abhyankar v -= bs2; 5408499736aSShri Abhyankar } 5418499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 5428499736aSShri Abhyankar idx += bs; 5438499736aSShri Abhyankar } 5448499736aSShri Abhyankar /* backward solve the L^T */ 5458499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 5468499736aSShri Abhyankar v = aa + bs2*ai[i]; 5478499736aSShri Abhyankar vi = aj + ai[i]; 5488499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 5498499736aSShri Abhyankar idt = bs*i; 5508499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 5518499736aSShri Abhyankar for(j=0;j<nz;j++){ 5528499736aSShri Abhyankar idx = bs*vi[j]; 5538499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5548499736aSShri Abhyankar x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 
5558499736aSShri Abhyankar x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5568499736aSShri Abhyankar x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5578499736aSShri Abhyankar x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5588499736aSShri Abhyankar v += bs2; 5598499736aSShri Abhyankar } 5608499736aSShri Abhyankar } 561b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5628499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5638499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5648499736aSShri Abhyankar PetscFunctionReturn(0); 5658499736aSShri Abhyankar } 5668499736aSShri Abhyankar 5678499736aSShri Abhyankar #undef __FUNCT__ 56806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 56906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 570f1af5d2fSBarry Smith { 571f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 572dfbe8321SBarry Smith PetscErrorCode ierr; 573b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 574b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 575b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 576b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 577b3260449SShri Abhyankar const PetscScalar *b; 578f1af5d2fSBarry Smith 579f1af5d2fSBarry Smith PetscFunctionBegin; 580ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 581b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 583f1af5d2fSBarry Smith 584f1af5d2fSBarry Smith /* forward solve the U^T */ 585f1af5d2fSBarry Smith idx = 0; 586f1af5d2fSBarry Smith for (i=0; i<n; i++) { 587f1af5d2fSBarry Smith 588f1af5d2fSBarry Smith v = aa + 36*diag[i]; 589f1af5d2fSBarry Smith /* multiply by the inverse 
of the block diagonal */ 590ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 591ef66eb69SBarry Smith x6 = x[5+idx]; 592f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 593f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 594f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 595f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 596f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 597f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 598f1af5d2fSBarry Smith v += 36; 599f1af5d2fSBarry Smith 600f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 601f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 602f1af5d2fSBarry Smith while (nz--) { 603f1af5d2fSBarry Smith oidx = 6*(*vi++); 604f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 605f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 606f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 607f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 608f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 609f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 610f1af5d2fSBarry Smith v += 36; 611f1af5d2fSBarry Smith } 612f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 613f1af5d2fSBarry Smith x[5+idx] = s6; 614f1af5d2fSBarry Smith idx += 6; 615f1af5d2fSBarry Smith } 616f1af5d2fSBarry Smith /* backward solve the L^T */ 617f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 618f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 619f1af5d2fSBarry Smith vi = aj + diag[i] - 
1; 620f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 621f1af5d2fSBarry Smith idt = 6*i; 622f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 623f1af5d2fSBarry Smith s6 = x[5+idt]; 624f1af5d2fSBarry Smith while (nz--) { 625f1af5d2fSBarry Smith idx = 6*(*vi--); 626f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632f1af5d2fSBarry Smith v -= 36; 633f1af5d2fSBarry Smith } 634f1af5d2fSBarry Smith } 635b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 6361ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 637dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 638f1af5d2fSBarry Smith PetscFunctionReturn(0); 639f1af5d2fSBarry Smith } 640f1af5d2fSBarry Smith 6414a2ae208SSatish Balay #undef __FUNCT__ 6424dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 6434dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 6448499736aSShri Abhyankar { 6458499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 6468499736aSShri Abhyankar PetscErrorCode ierr; 647b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 6488499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 649b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 650b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 651b3260449SShri 
Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 652b3260449SShri Abhyankar const PetscScalar *b; 6538499736aSShri Abhyankar 6548499736aSShri Abhyankar PetscFunctionBegin; 6558499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 656b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 6578499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 6588499736aSShri Abhyankar 6598499736aSShri Abhyankar /* forward solve the U^T */ 6608499736aSShri Abhyankar idx = 0; 6618499736aSShri Abhyankar for (i=0; i<n; i++) { 6628499736aSShri Abhyankar v = aa + bs2*diag[i]; 6638499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 6648499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 6658499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; 6668499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 6678499736aSShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 6688499736aSShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 6698499736aSShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 6708499736aSShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 6718499736aSShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 6728499736aSShri Abhyankar v -= bs2; 6738499736aSShri Abhyankar 6748499736aSShri Abhyankar vi = aj + diag[i] - 1; 6758499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 6768499736aSShri Abhyankar for(j=0;j>-nz;j--){ 6778499736aSShri Abhyankar oidx = bs*vi[j]; 6788499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 6798499736aSShri Abhyankar x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 6808499736aSShri Abhyankar x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 6818499736aSShri 
Abhyankar x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 6828499736aSShri Abhyankar x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 6838499736aSShri Abhyankar x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 6848499736aSShri Abhyankar v -= bs2; 6858499736aSShri Abhyankar } 6868499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 6878499736aSShri Abhyankar x[5+idx] = s6; 6888499736aSShri Abhyankar idx += bs; 6898499736aSShri Abhyankar } 6908499736aSShri Abhyankar /* backward solve the L^T */ 6918499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 6928499736aSShri Abhyankar v = aa + bs2*ai[i]; 6938499736aSShri Abhyankar vi = aj + ai[i]; 6948499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 6958499736aSShri Abhyankar idt = bs*i; 6968499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 6978499736aSShri Abhyankar s6 = x[5+idt]; 6988499736aSShri Abhyankar for(j=0;j<nz;j++){ 6998499736aSShri Abhyankar idx = bs*vi[j]; 7008499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 7018499736aSShri Abhyankar x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 7028499736aSShri Abhyankar x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7038499736aSShri Abhyankar x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7048499736aSShri Abhyankar x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 7058499736aSShri Abhyankar x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7068499736aSShri Abhyankar v += bs2; 7078499736aSShri Abhyankar } 7088499736aSShri Abhyankar } 709b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 7108499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 7118499736aSShri Abhyankar ierr = 
PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 7128499736aSShri Abhyankar PetscFunctionReturn(0); 7138499736aSShri Abhyankar } 7148499736aSShri Abhyankar 7158499736aSShri Abhyankar #undef __FUNCT__ 71606e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 71706e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 718f1af5d2fSBarry Smith { 719f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 720dfbe8321SBarry Smith PetscErrorCode ierr; 721b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 722b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 723b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 724b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 725b3260449SShri Abhyankar const PetscScalar *b; 726f1af5d2fSBarry Smith 727f1af5d2fSBarry Smith PetscFunctionBegin; 728ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 729b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 7301ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 731f1af5d2fSBarry Smith 732f1af5d2fSBarry Smith /* forward solve the U^T */ 733f1af5d2fSBarry Smith idx = 0; 734f1af5d2fSBarry Smith for (i=0; i<n; i++) { 735f1af5d2fSBarry Smith 736f1af5d2fSBarry Smith v = aa + 49*diag[i]; 737f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 738ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 739ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 740f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 741f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 742f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 743f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 
+ v[26]*x6 + v[27]*x7; 744f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 745f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 746f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 747f1af5d2fSBarry Smith v += 49; 748f1af5d2fSBarry Smith 749f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 750f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 751f1af5d2fSBarry Smith while (nz--) { 752f1af5d2fSBarry Smith oidx = 7*(*vi++); 753f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 754f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 755f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 756f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 757f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 758f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 759f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 760f1af5d2fSBarry Smith v += 49; 761f1af5d2fSBarry Smith } 762f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 763f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 764f1af5d2fSBarry Smith idx += 7; 765f1af5d2fSBarry Smith } 766f1af5d2fSBarry Smith /* backward solve the L^T */ 767f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 768f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 769f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 770f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 771f1af5d2fSBarry Smith idt = 7*i; 772f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 
773f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 774f1af5d2fSBarry Smith while (nz--) { 775f1af5d2fSBarry Smith idx = 7*(*vi--); 776f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 777f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 778f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 779f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 780f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 781f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 782f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 783f1af5d2fSBarry Smith v -= 49; 784f1af5d2fSBarry Smith } 785f1af5d2fSBarry Smith } 786b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 7871ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 788dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 789f1af5d2fSBarry Smith PetscFunctionReturn(0); 790f1af5d2fSBarry Smith } 7918499736aSShri Abhyankar #undef __FUNCT__ 7924dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 7934dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 7948499736aSShri Abhyankar { 7958499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 7968499736aSShri Abhyankar PetscErrorCode ierr; 797b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 7988499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 799b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 800b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 801b3260449SShri Abhyankar 
PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 802b3260449SShri Abhyankar const PetscScalar *b; 8038499736aSShri Abhyankar 8048499736aSShri Abhyankar PetscFunctionBegin; 8058499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 806b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 8078499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 8088499736aSShri Abhyankar 8098499736aSShri Abhyankar /* forward solve the U^T */ 8108499736aSShri Abhyankar idx = 0; 8118499736aSShri Abhyankar for (i=0; i<n; i++) { 8128499736aSShri Abhyankar v = aa + bs2*diag[i]; 8138499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 8148499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 8158499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 8168499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 8178499736aSShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 8188499736aSShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 8198499736aSShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 8208499736aSShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 8218499736aSShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 8228499736aSShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 8238499736aSShri Abhyankar v -= bs2; 8248499736aSShri Abhyankar vi = aj + diag[i] - 1; 8258499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 8268499736aSShri Abhyankar for(j=0;j>-nz;j--){ 8278499736aSShri Abhyankar oidx = bs*vi[j]; 8288499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8298499736aSShri Abhyankar x[oidx+1] -= v[7]*s1 + v[8]*s2 + 
v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8308499736aSShri Abhyankar x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8318499736aSShri Abhyankar x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8328499736aSShri Abhyankar x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8338499736aSShri Abhyankar x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8348499736aSShri Abhyankar x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8358499736aSShri Abhyankar v -= bs2; 8368499736aSShri Abhyankar } 8378499736aSShri Abhyankar x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 8388499736aSShri Abhyankar x[5+idx] = s6; x[6+idx] = s7; 8398499736aSShri Abhyankar idx += bs; 8408499736aSShri Abhyankar } 8418499736aSShri Abhyankar /* backward solve the L^T */ 8428499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 8438499736aSShri Abhyankar v = aa + bs2*ai[i]; 8448499736aSShri Abhyankar vi = aj + ai[i]; 8458499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 8468499736aSShri Abhyankar idt = bs*i; 8478499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 8488499736aSShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; 8498499736aSShri Abhyankar for(j=0;j<nz;j++){ 8508499736aSShri Abhyankar idx = bs*vi[j]; 8518499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8528499736aSShri Abhyankar x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8538499736aSShri Abhyankar x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8548499736aSShri Abhyankar x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8558499736aSShri Abhyankar x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + 
v[33]*s6 + v[34]*s7; 8568499736aSShri Abhyankar x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8578499736aSShri Abhyankar x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8588499736aSShri Abhyankar v += bs2; 8598499736aSShri Abhyankar } 8608499736aSShri Abhyankar } 861b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 8628499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 8638499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 8648499736aSShri Abhyankar PetscFunctionReturn(0); 8658499736aSShri Abhyankar } 866f1af5d2fSBarry Smith 867f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 8684a2ae208SSatish Balay #undef __FUNCT__ 86906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 87006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 871f1af5d2fSBarry Smith { 872f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 873f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 8746849ba73SBarry Smith PetscErrorCode ierr; 8755d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 876b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 877b3260449SShri Abhyankar PetscInt i,nz; 878b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 879b3260449SShri Abhyankar PetscScalar s1,*x,*t; 880b3260449SShri Abhyankar const PetscScalar *b; 881f1af5d2fSBarry Smith 882f1af5d2fSBarry Smith PetscFunctionBegin; 883b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 8841ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 885f1af5d2fSBarry Smith t = a->solve_work; 886f1af5d2fSBarry Smith 887f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 888f1af5d2fSBarry Smith ierr = 
ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 889f1af5d2fSBarry Smith 890f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 891f1af5d2fSBarry Smith for (i=0; i<n; i++) { 892f1af5d2fSBarry Smith t[i] = b[c[i]]; 893f1af5d2fSBarry Smith } 894f1af5d2fSBarry Smith 895f1af5d2fSBarry Smith /* forward solve the U^T */ 896f1af5d2fSBarry Smith for (i=0; i<n; i++) { 897f1af5d2fSBarry Smith 898f1af5d2fSBarry Smith v = aa + diag[i]; 899f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 900f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 901f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 902f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 903f1af5d2fSBarry Smith while (nz--) { 904f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 905f1af5d2fSBarry Smith } 906f1af5d2fSBarry Smith t[i] = s1; 907f1af5d2fSBarry Smith } 908f1af5d2fSBarry Smith /* backward solve the L^T */ 909f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 910f1af5d2fSBarry Smith v = aa + diag[i] - 1; 911f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 912f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 913f1af5d2fSBarry Smith s1 = t[i]; 914f1af5d2fSBarry Smith while (nz--) { 915f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 916f1af5d2fSBarry Smith } 917f1af5d2fSBarry Smith } 918f1af5d2fSBarry Smith 919f1af5d2fSBarry Smith /* copy t into x according to permutation */ 920f1af5d2fSBarry Smith for (i=0; i<n; i++) { 921f1af5d2fSBarry Smith x[r[i]] = t[i]; 922f1af5d2fSBarry Smith } 923f1af5d2fSBarry Smith 924f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 925f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 926b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 9271ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 928dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 929f1af5d2fSBarry Smith PetscFunctionReturn(0); 930f1af5d2fSBarry Smith } 931f1af5d2fSBarry Smith 
9324a2ae208SSatish Balay #undef __FUNCT__ 93306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 93406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 935f1af5d2fSBarry Smith { 936f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 937f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9386849ba73SBarry Smith PetscErrorCode ierr; 9395d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 940b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 941b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 942b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 943b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x,*t; 944b3260449SShri Abhyankar const PetscScalar *b; 945f1af5d2fSBarry Smith 946f1af5d2fSBarry Smith PetscFunctionBegin; 947b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 9481ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 949f1af5d2fSBarry Smith t = a->solve_work; 950f1af5d2fSBarry Smith 951f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 952f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 953f1af5d2fSBarry Smith 954f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 955f1af5d2fSBarry Smith ii = 0; 956f1af5d2fSBarry Smith for (i=0; i<n; i++) { 957f1af5d2fSBarry Smith ic = 2*c[i]; 958f1af5d2fSBarry Smith t[ii] = b[ic]; 959f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 960f1af5d2fSBarry Smith ii += 2; 961f1af5d2fSBarry Smith } 962f1af5d2fSBarry Smith 963f1af5d2fSBarry Smith /* forward solve the U^T */ 964f1af5d2fSBarry Smith idx = 0; 965f1af5d2fSBarry Smith for (i=0; i<n; i++) { 966f1af5d2fSBarry Smith 967f1af5d2fSBarry Smith v = aa + 4*diag[i]; 968f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 969f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 970f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 
971f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 972f1af5d2fSBarry Smith v += 4; 973f1af5d2fSBarry Smith 974f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 975f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 976f1af5d2fSBarry Smith while (nz--) { 977f1af5d2fSBarry Smith oidx = 2*(*vi++); 978f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 979f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 980f1af5d2fSBarry Smith v += 4; 981f1af5d2fSBarry Smith } 982f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 983f1af5d2fSBarry Smith idx += 2; 984f1af5d2fSBarry Smith } 985f1af5d2fSBarry Smith /* backward solve the L^T */ 986f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 987f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 988f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 989f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 990f1af5d2fSBarry Smith idt = 2*i; 991f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 992f1af5d2fSBarry Smith while (nz--) { 993f1af5d2fSBarry Smith idx = 2*(*vi--); 994f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 995f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 996f1af5d2fSBarry Smith v -= 4; 997f1af5d2fSBarry Smith } 998f1af5d2fSBarry Smith } 999f1af5d2fSBarry Smith 1000f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1001f1af5d2fSBarry Smith ii = 0; 1002f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1003f1af5d2fSBarry Smith ir = 2*r[i]; 1004f1af5d2fSBarry Smith x[ir] = t[ii]; 1005f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1006f1af5d2fSBarry Smith ii += 2; 1007f1af5d2fSBarry Smith } 1008f1af5d2fSBarry Smith 1009f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1010f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1011b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 10121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1013dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1014f1af5d2fSBarry Smith PetscFunctionReturn(0); 
1015f1af5d2fSBarry Smith } 1016f1af5d2fSBarry Smith 10174a2ae208SSatish Balay #undef __FUNCT__ 10184dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 10194dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 102032121132SShri Abhyankar { 102132121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 102232121132SShri Abhyankar PetscErrorCode ierr; 102332121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1024b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 102532121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 102632121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1027b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1028b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1029b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x,*t; 1030b3260449SShri Abhyankar const PetscScalar *b; 103132121132SShri Abhyankar 103232121132SShri Abhyankar PetscFunctionBegin; 1033b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 103432121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 103532121132SShri Abhyankar t = a->solve_work; 103632121132SShri Abhyankar 103732121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 103832121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 103932121132SShri Abhyankar 104032121132SShri Abhyankar /* copy b into temp work space according to permutation */ 104132121132SShri Abhyankar for(i=0;i<n;i++){ 104232121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 104332121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; 104432121132SShri Abhyankar } 104532121132SShri Abhyankar 104632121132SShri Abhyankar /* forward solve the U^T */ 104732121132SShri Abhyankar idx = 0; 104832121132SShri Abhyankar for (i=0; i<n; i++) { 104932121132SShri Abhyankar v = aa + bs2*diag[i]; 105032121132SShri Abhyankar /* multiply by the inverse of the block 
diagonal */ 105132121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 105232121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 105332121132SShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 105432121132SShri Abhyankar v -= bs2; 105532121132SShri Abhyankar 105632121132SShri Abhyankar vi = aj + diag[i] - 1; 105732121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 105832121132SShri Abhyankar for(j=0;j>-nz;j--){ 105932121132SShri Abhyankar oidx = bs*vi[j]; 106032121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2; 106132121132SShri Abhyankar t[oidx+1] -= v[2]*s1 + v[3]*s2; 106232121132SShri Abhyankar v -= bs2; 106332121132SShri Abhyankar } 106432121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 106532121132SShri Abhyankar idx += bs; 106632121132SShri Abhyankar } 106732121132SShri Abhyankar /* backward solve the L^T */ 106832121132SShri Abhyankar for (i=n-1; i>=0; i--){ 106932121132SShri Abhyankar v = aa + bs2*ai[i]; 107032121132SShri Abhyankar vi = aj + ai[i]; 107132121132SShri Abhyankar nz = ai[i+1] - ai[i]; 107232121132SShri Abhyankar idt = bs*i; 107332121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 107432121132SShri Abhyankar for(j=0;j<nz;j++){ 107532121132SShri Abhyankar idx = bs*vi[j]; 107632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2; 107732121132SShri Abhyankar t[idx+1] -= v[2]*s1 + v[3]*s2; 107832121132SShri Abhyankar v += bs2; 107932121132SShri Abhyankar } 108032121132SShri Abhyankar } 108132121132SShri Abhyankar 108232121132SShri Abhyankar /* copy t into x according to permutation */ 108332121132SShri Abhyankar for(i=0;i<n;i++){ 108432121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 108532121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; 108632121132SShri Abhyankar } 108732121132SShri Abhyankar 108832121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 108932121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1090b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 109132121132SShri Abhyankar 
ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109232121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 109332121132SShri Abhyankar PetscFunctionReturn(0); 109432121132SShri Abhyankar } 109532121132SShri Abhyankar 109632121132SShri Abhyankar #undef __FUNCT__ 109706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 109806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1099f1af5d2fSBarry Smith { 1100f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1101f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 11026849ba73SBarry Smith PetscErrorCode ierr; 11035d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1104b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1105b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1106b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1107b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1108b3260449SShri Abhyankar const PetscScalar *b; 1109f1af5d2fSBarry Smith 1110f1af5d2fSBarry Smith PetscFunctionBegin; 1111b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 11121ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1113f1af5d2fSBarry Smith t = a->solve_work; 1114f1af5d2fSBarry Smith 1115f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1116f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1117f1af5d2fSBarry Smith 1118f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1119f1af5d2fSBarry Smith ii = 0; 1120f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1121f1af5d2fSBarry Smith ic = 3*c[i]; 1122f1af5d2fSBarry Smith t[ii] = b[ic]; 1123f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1124f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1125f1af5d2fSBarry Smith ii += 3; 1126f1af5d2fSBarry Smith } 1127f1af5d2fSBarry Smith 1128f1af5d2fSBarry Smith /* forward solve the U^T 
*/ 1129f1af5d2fSBarry Smith idx = 0; 1130f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1131f1af5d2fSBarry Smith 1132f1af5d2fSBarry Smith v = aa + 9*diag[i]; 1133f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1134f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1135f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1136f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1137f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1138f1af5d2fSBarry Smith v += 9; 1139f1af5d2fSBarry Smith 1140f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1141f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1142f1af5d2fSBarry Smith while (nz--) { 1143f1af5d2fSBarry Smith oidx = 3*(*vi++); 1144f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1145f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1146f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1147f1af5d2fSBarry Smith v += 9; 1148f1af5d2fSBarry Smith } 1149f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1150f1af5d2fSBarry Smith idx += 3; 1151f1af5d2fSBarry Smith } 1152f1af5d2fSBarry Smith /* backward solve the L^T */ 1153f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1154f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 1155f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1156f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1157f1af5d2fSBarry Smith idt = 3*i; 1158f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1159f1af5d2fSBarry Smith while (nz--) { 1160f1af5d2fSBarry Smith idx = 3*(*vi--); 1161f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1162f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1163f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1164f1af5d2fSBarry Smith v -= 9; 1165f1af5d2fSBarry Smith } 1166f1af5d2fSBarry Smith } 1167f1af5d2fSBarry Smith 1168f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1169f1af5d2fSBarry Smith ii = 0; 1170f1af5d2fSBarry Smith for (i=0; i<n; i++) { 
1171f1af5d2fSBarry Smith ir = 3*r[i]; 1172f1af5d2fSBarry Smith x[ir] = t[ii]; 1173f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1174f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1175f1af5d2fSBarry Smith ii += 3; 1176f1af5d2fSBarry Smith } 1177f1af5d2fSBarry Smith 1178f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1179f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1180b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 11811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1182dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1183f1af5d2fSBarry Smith PetscFunctionReturn(0); 1184f1af5d2fSBarry Smith } 1185f1af5d2fSBarry Smith

/*
   MatSolveTranspose_SeqBAIJ_3 - transpose solve with the stored factors, 3 scalars per block row.

   Input:   A  - factored matrix (factor arrays a->i, a->j, a->diag, a->a; permutations a->row, a->col)
            bb - right-hand side
   Output:  xx - solution

   The right-hand side is gathered into a->solve_work through the column
   permutation, swept forward with U^T and backward with L^T, and the result is
   scattered into xx through the row permutation.  The block size and block
   stride are read from the matrix (A->rmap->bs, a->bs2).
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ*)A->data;
  IS                colis = fact->col,rowis = fact->row;
  const PetscInt    nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j,*dptr = fact->diag;
  const PetscInt    bs = A->rmap->bs,bs2 = fact->bs2;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  const PetscScalar *rhs,*src;
  PetscScalar       *sol,*work,*cur,*upd;
  PetscScalar       y1,y2,y3,z1,z2,z3;
  PetscInt          i,k;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* gather the right-hand side into the work array through the column permutation */
  for (i=0; i<nblk; i++) {
    src = rhs + bs*cperm[i];
    cur = work + bs*i;
    cur[0] = src[0]; cur[1] = src[1]; cur[2] = src[2];
  }

  /* forward sweep with U^T */
  for (i=0; i<nblk; i++) {
    cur = work + bs*i;
    blk = vals + bs2*dptr[i];
    /* apply the block stored at the diagonal position (the inverse of the diagonal block) */
    y1 = cur[0]; y2 = cur[1]; y3 = cur[2];
    z1 = blk[0]*y1 + blk[1]*y2 + blk[2]*y3;
    z2 = blk[3]*y1 + blk[4]*y2 + blk[5]*y3;
    z3 = blk[6]*y1 + blk[7]*y2 + blk[8]*y3;

    /* entries of this row of U sit below diag[i], down to (but excluding) diag[i+1] */
    for (k=dptr[i]-1; k>dptr[i+1]; k--) {
      blk = vals + bs2*k;
      upd = work + bs*colidx[k];
      upd[0] -= blk[0]*z1 + blk[1]*z2 + blk[2]*z3;
      upd[1] -= blk[3]*z1 + blk[4]*z2 + blk[5]*z3;
      upd[2] -= blk[6]*z1 + blk[7]*z2 + blk[8]*z3;
    }
    cur[0] = z1; cur[1] = z2; cur[2] = z3;
  }

  /* backward sweep with L^T */
  for (i=nblk-1; i>=0; i--) {
    cur = work + bs*i;
    z1  = cur[0]; z2 = cur[1]; z3 = cur[2];
    for (k=rowptr[i]; k<rowptr[i+1]; k++) {
      blk = vals + bs2*k;
      upd = work + bs*colidx[k];
      upd[0] -= blk[0]*z1 + blk[1]*z2 + blk[2]*z3;
      upd[1] -= blk[3]*z1 + blk[4]*z2 + blk[5]*z3;
      upd[2] -= blk[6]*z1 + blk[7]*z2 + blk[8]*z3;
    }
  }

  /* scatter the work array into the solution through the row permutation */
  for (i=0; i<nblk; i++) {
    src = work + bs*i;
    cur = sol + bs*rperm[i];
    cur[0] = src[0]; cur[1] = src[1]; cur[2] = src[2];
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(fact->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_4_inplace - transpose solve with the stored factors, 4x4 blocks,
   for the layout in which the entries of row i before diag[i] are used for L and those
   after diag[i] (up to ai[i+1]) are used for U.

   Input:   A  - factored matrix,  bb - right-hand side
   Output:  xx - solution

   Gathers bb through the column permutation into a->solve_work, sweeps forward
   with U^T and backward with L^T, then scatters through the row permutation.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ*)A->data;
  IS                colis = fact->col,rowis = fact->row;
  const PetscInt    nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j,*dptr = fact->diag;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  const PetscScalar *rhs,*src;
  PetscScalar       *sol,*work,*cur,*upd;
  PetscScalar       y1,y2,y3,y4,z1,z2,z3,z4;
  PetscInt          i,k;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* gather the right-hand side into the work array through the column permutation */
  for (i=0; i<nblk; i++) {
    src = rhs + 4*cperm[i];
    cur = work + 4*i;
    cur[0] = src[0];
    cur[1] = src[1];
    cur[2] = src[2];
    cur[3] = src[3];
  }

  /* forward sweep with U^T */
  for (i=0; i<nblk; i++) {
    cur = work + 4*i;
    blk = vals + 16*dptr[i];
    /* apply the block stored at the diagonal position (the inverse of the diagonal block) */
    y1 = cur[0]; y2 = cur[1]; y3 = cur[2]; y4 = cur[3];
    z1 = blk[0]*y1  + blk[1]*y2  + blk[2]*y3  + blk[3]*y4;
    z2 = blk[4]*y1  + blk[5]*y2  + blk[6]*y3  + blk[7]*y4;
    z3 = blk[8]*y1  + blk[9]*y2  + blk[10]*y3 + blk[11]*y4;
    z4 = blk[12]*y1 + blk[13]*y2 + blk[14]*y3 + blk[15]*y4;

    /* entries after the diagonal in row i */
    for (k=dptr[i]+1; k<rowptr[i+1]; k++) {
      blk = vals + 16*k;
      upd = work + 4*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4;
      upd[1] -= blk[4]*z1  + blk[5]*z2  + blk[6]*z3  + blk[7]*z4;
      upd[2] -= blk[8]*z1  + blk[9]*z2  + blk[10]*z3 + blk[11]*z4;
      upd[3] -= blk[12]*z1 + blk[13]*z2 + blk[14]*z3 + blk[15]*z4;
    }
    cur[0] = z1; cur[1] = z2; cur[2] = z3; cur[3] = z4;
  }

  /* backward sweep with L^T: entries before the diagonal, visited from diag[i]-1 down to ai[i] */
  for (i=nblk-1; i>=0; i--) {
    cur = work + 4*i;
    z1  = cur[0]; z2 = cur[1]; z3 = cur[2]; z4 = cur[3];
    for (k=dptr[i]-1; k>=rowptr[i]; k--) {
      blk = vals + 16*k;
      upd = work + 4*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4;
      upd[1] -= blk[4]*z1  + blk[5]*z2  + blk[6]*z3  + blk[7]*z4;
      upd[2] -= blk[8]*z1  + blk[9]*z2  + blk[10]*z3 + blk[11]*z4;
      upd[3] -= blk[12]*z1 + blk[13]*z2 + blk[14]*z3 + blk[15]*z4;
    }
  }

  /* scatter the work array into the solution through the row permutation */
  for (i=0; i<nblk; i++) {
    src = work + 4*i;
    cur = sol + 4*rperm[i];
    cur[0] = src[0];
    cur[1] = src[1];
    cur[2] = src[2];
    cur[3] = src[3];
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(fact->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_4 - transpose solve with the stored factors, 4 scalars per block row.

   Input:   A  - factored matrix,  bb - right-hand side
   Output:  xx - solution

   Same sequence as the other non-inplace variants: gather through the column
   permutation, forward sweep with U^T (entries between diag[i+1] and diag[i]),
   backward sweep with L^T (entries ai[i] .. ai[i+1]-1), scatter through the
   row permutation.  Block size and stride come from A->rmap->bs and a->bs2.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ*)A->data;
  IS                colis = fact->col,rowis = fact->row;
  const PetscInt    nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j,*dptr = fact->diag;
  const PetscInt    bs = A->rmap->bs,bs2 = fact->bs2;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  const PetscScalar *rhs,*src;
  PetscScalar       *sol,*work,*cur,*upd;
  PetscScalar       y1,y2,y3,y4,z1,z2,z3,z4;
  PetscInt          i,k;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* gather the right-hand side into the work array through the column permutation */
  for (i=0; i<nblk; i++) {
    src = rhs + bs*cperm[i];
    cur = work + bs*i;
    cur[0] = src[0]; cur[1] = src[1]; cur[2] = src[2]; cur[3] = src[3];
  }

  /* forward sweep with U^T */
  for (i=0; i<nblk; i++) {
    cur = work + bs*i;
    blk = vals + bs2*dptr[i];
    /* apply the block stored at the diagonal position (the inverse of the diagonal block) */
    y1 = cur[0]; y2 = cur[1]; y3 = cur[2]; y4 = cur[3];
    z1 = blk[0]*y1  + blk[1]*y2  + blk[2]*y3  + blk[3]*y4;
    z2 = blk[4]*y1  + blk[5]*y2  + blk[6]*y3  + blk[7]*y4;
    z3 = blk[8]*y1  + blk[9]*y2  + blk[10]*y3 + blk[11]*y4;
    z4 = blk[12]*y1 + blk[13]*y2 + blk[14]*y3 + blk[15]*y4;

    for (k=dptr[i]-1; k>dptr[i+1]; k--) {
      blk = vals + bs2*k;
      upd = work + bs*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4;
      upd[1] -= blk[4]*z1  + blk[5]*z2  + blk[6]*z3  + blk[7]*z4;
      upd[2] -= blk[8]*z1  + blk[9]*z2  + blk[10]*z3 + blk[11]*z4;
      upd[3] -= blk[12]*z1 + blk[13]*z2 + blk[14]*z3 + blk[15]*z4;
    }
    cur[0] = z1; cur[1] = z2; cur[2] = z3; cur[3] = z4;
  }

  /* backward sweep with L^T */
  for (i=nblk-1; i>=0; i--) {
    cur = work + bs*i;
    z1  = cur[0]; z2 = cur[1]; z3 = cur[2]; z4 = cur[3];
    for (k=rowptr[i]; k<rowptr[i+1]; k++) {
      blk = vals + bs2*k;
      upd = work + bs*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4;
      upd[1] -= blk[4]*z1  + blk[5]*z2  + blk[6]*z3  + blk[7]*z4;
      upd[2] -= blk[8]*z1  + blk[9]*z2  + blk[10]*z3 + blk[11]*z4;
      upd[3] -= blk[12]*z1 + blk[13]*z2 + blk[14]*z3 + blk[15]*z4;
    }
  }

  /* scatter the work array into the solution through the row permutation */
  for (i=0; i<nblk; i++) {
    src = work + bs*i;
    cur = sol + bs*rperm[i];
    cur[0] = src[0]; cur[1] = src[1]; cur[2] = src[2]; cur[3] = src[3];
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(fact->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_5_inplace - transpose solve with the stored factors, 5x5 blocks,
   for the layout in which the entries of row i before diag[i] are used for L and those
   after diag[i] (up to ai[i+1]) are used for U.

   Input:   A  - factored matrix,  bb - right-hand side
   Output:  xx - solution
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ*)A->data;
  IS                colis = fact->col,rowis = fact->row;
  const PetscInt    nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j,*dptr = fact->diag;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  const PetscScalar *rhs,*src;
  PetscScalar       *sol,*work,*cur,*upd;
  PetscScalar       y1,y2,y3,y4,y5,z1,z2,z3,z4,z5;
  PetscInt          i,k;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* gather the right-hand side into the work array through the column permutation */
  for (i=0; i<nblk; i++) {
    src = rhs + 5*cperm[i];
    cur = work + 5*i;
    cur[0] = src[0];
    cur[1] = src[1];
    cur[2] = src[2];
    cur[3] = src[3];
    cur[4] = src[4];
  }

  /* forward sweep with U^T */
  for (i=0; i<nblk; i++) {
    cur = work + 5*i;
    blk = vals + 25*dptr[i];
    /* apply the block stored at the diagonal position (the inverse of the diagonal block) */
    y1 = cur[0]; y2 = cur[1]; y3 = cur[2]; y4 = cur[3]; y5 = cur[4];
    z1 = blk[0]*y1  + blk[1]*y2  + blk[2]*y3  + blk[3]*y4  + blk[4]*y5;
    z2 = blk[5]*y1  + blk[6]*y2  + blk[7]*y3  + blk[8]*y4  + blk[9]*y5;
    z3 = blk[10]*y1 + blk[11]*y2 + blk[12]*y3 + blk[13]*y4 + blk[14]*y5;
    z4 = blk[15]*y1 + blk[16]*y2 + blk[17]*y3 + blk[18]*y4 + blk[19]*y5;
    z5 = blk[20]*y1 + blk[21]*y2 + blk[22]*y3 + blk[23]*y4 + blk[24]*y5;

    /* entries after the diagonal in row i */
    for (k=dptr[i]+1; k<rowptr[i+1]; k++) {
      blk = vals + 25*k;
      upd = work + 5*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4  + blk[4]*z5;
      upd[1] -= blk[5]*z1  + blk[6]*z2  + blk[7]*z3  + blk[8]*z4  + blk[9]*z5;
      upd[2] -= blk[10]*z1 + blk[11]*z2 + blk[12]*z3 + blk[13]*z4 + blk[14]*z5;
      upd[3] -= blk[15]*z1 + blk[16]*z2 + blk[17]*z3 + blk[18]*z4 + blk[19]*z5;
      upd[4] -= blk[20]*z1 + blk[21]*z2 + blk[22]*z3 + blk[23]*z4 + blk[24]*z5;
    }
    cur[0] = z1; cur[1] = z2; cur[2] = z3; cur[3] = z4; cur[4] = z5;
  }

  /* backward sweep with L^T: entries before the diagonal, visited from diag[i]-1 down to ai[i] */
  for (i=nblk-1; i>=0; i--) {
    cur = work + 5*i;
    z1  = cur[0]; z2 = cur[1]; z3 = cur[2]; z4 = cur[3]; z5 = cur[4];
    for (k=dptr[i]-1; k>=rowptr[i]; k--) {
      blk = vals + 25*k;
      upd = work + 5*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4  + blk[4]*z5;
      upd[1] -= blk[5]*z1  + blk[6]*z2  + blk[7]*z3  + blk[8]*z4  + blk[9]*z5;
      upd[2] -= blk[10]*z1 + blk[11]*z2 + blk[12]*z3 + blk[13]*z4 + blk[14]*z5;
      upd[3] -= blk[15]*z1 + blk[16]*z2 + blk[17]*z3 + blk[18]*z4 + blk[19]*z5;
      upd[4] -= blk[20]*z1 + blk[21]*z2 + blk[22]*z3 + blk[23]*z4 + blk[24]*z5;
    }
  }

  /* scatter the work array into the solution through the row permutation */
  for (i=0; i<nblk; i++) {
    src = work + 5*i;
    cur = sol + 5*rperm[i];
    cur[0] = src[0];
    cur[1] = src[1];
    cur[2] = src[2];
    cur[3] = src[3];
    cur[4] = src[4];
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(fact->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_5 - transpose solve with the stored factors, 5 scalars per block row.

   Input:   A  - factored matrix,  bb - right-hand side
   Output:  xx - solution

   Gather through the column permutation, forward sweep with U^T (entries
   between diag[i+1] and diag[i]), backward sweep with L^T (entries
   ai[i] .. ai[i+1]-1), scatter through the row permutation.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ*)A->data;
  IS                colis = fact->col,rowis = fact->row;
  const PetscInt    nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j,*dptr = fact->diag;
  const PetscInt    bs = A->rmap->bs,bs2 = fact->bs2;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  const PetscScalar *rhs,*src;
  PetscScalar       *sol,*work,*cur,*upd;
  PetscScalar       y1,y2,y3,y4,y5,z1,z2,z3,z4,z5;
  PetscInt          i,k;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* gather the right-hand side into the work array through the column permutation */
  for (i=0; i<nblk; i++) {
    src = rhs + bs*cperm[i];
    cur = work + bs*i;
    cur[0] = src[0]; cur[1] = src[1]; cur[2] = src[2]; cur[3] = src[3];
    cur[4] = src[4];
  }

  /* forward sweep with U^T */
  for (i=0; i<nblk; i++) {
    cur = work + bs*i;
    blk = vals + bs2*dptr[i];
    /* apply the block stored at the diagonal position (the inverse of the diagonal block) */
    y1 = cur[0]; y2 = cur[1]; y3 = cur[2]; y4 = cur[3]; y5 = cur[4];
    z1 = blk[0]*y1  + blk[1]*y2  + blk[2]*y3  + blk[3]*y4  + blk[4]*y5;
    z2 = blk[5]*y1  + blk[6]*y2  + blk[7]*y3  + blk[8]*y4  + blk[9]*y5;
    z3 = blk[10]*y1 + blk[11]*y2 + blk[12]*y3 + blk[13]*y4 + blk[14]*y5;
    z4 = blk[15]*y1 + blk[16]*y2 + blk[17]*y3 + blk[18]*y4 + blk[19]*y5;
    z5 = blk[20]*y1 + blk[21]*y2 + blk[22]*y3 + blk[23]*y4 + blk[24]*y5;

    for (k=dptr[i]-1; k>dptr[i+1]; k--) {
      blk = vals + bs2*k;
      upd = work + bs*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4  + blk[4]*z5;
      upd[1] -= blk[5]*z1  + blk[6]*z2  + blk[7]*z3  + blk[8]*z4  + blk[9]*z5;
      upd[2] -= blk[10]*z1 + blk[11]*z2 + blk[12]*z3 + blk[13]*z4 + blk[14]*z5;
      upd[3] -= blk[15]*z1 + blk[16]*z2 + blk[17]*z3 + blk[18]*z4 + blk[19]*z5;
      upd[4] -= blk[20]*z1 + blk[21]*z2 + blk[22]*z3 + blk[23]*z4 + blk[24]*z5;
    }
    cur[0] = z1; cur[1] = z2; cur[2] = z3; cur[3] = z4; cur[4] = z5;
  }

  /* backward sweep with L^T */
  for (i=nblk-1; i>=0; i--) {
    cur = work + bs*i;
    z1  = cur[0]; z2 = cur[1]; z3 = cur[2]; z4 = cur[3]; z5 = cur[4];
    for (k=rowptr[i]; k<rowptr[i+1]; k++) {
      blk = vals + bs2*k;
      upd = work + bs*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4  + blk[4]*z5;
      upd[1] -= blk[5]*z1  + blk[6]*z2  + blk[7]*z3  + blk[8]*z4  + blk[9]*z5;
      upd[2] -= blk[10]*z1 + blk[11]*z2 + blk[12]*z3 + blk[13]*z4 + blk[14]*z5;
      upd[3] -= blk[15]*z1 + blk[16]*z2 + blk[17]*z3 + blk[18]*z4 + blk[19]*z5;
      upd[4] -= blk[20]*z1 + blk[21]*z2 + blk[22]*z3 + blk[23]*z4 + blk[24]*z5;
    }
  }

  /* scatter the work array into the solution through the row permutation */
  for (i=0; i<nblk; i++) {
    src = work + bs*i;
    cur = sol + bs*rperm[i];
    cur[0] = src[0]; cur[1] = src[1]; cur[2] = src[2]; cur[3] = src[3];
    cur[4] = src[4];
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(fact->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_6_inplace - transpose solve with the stored factors, 6x6 blocks,
   for the layout in which the entries of row i before diag[i] are used for L and those
   after diag[i] (up to ai[i+1]) are used for U.

   Input:   A  - factored matrix,  bb - right-hand side
   Output:  xx - solution
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ*)A->data;
  IS                colis = fact->col,rowis = fact->row;
  const PetscInt    nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j,*dptr = fact->diag;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  const PetscScalar *rhs,*src;
  PetscScalar       *sol,*work,*cur,*upd;
  PetscScalar       y1,y2,y3,y4,y5,y6,z1,z2,z3,z4,z5,z6;
  PetscInt          i,k;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* gather the right-hand side into the work array through the column permutation */
  for (i=0; i<nblk; i++) {
    src = rhs + 6*cperm[i];
    cur = work + 6*i;
    cur[0] = src[0];
    cur[1] = src[1];
    cur[2] = src[2];
    cur[3] = src[3];
    cur[4] = src[4];
    cur[5] = src[5];
  }

  /* forward sweep with U^T */
  for (i=0; i<nblk; i++) {
    cur = work + 6*i;
    blk = vals + 36*dptr[i];
    /* apply the block stored at the diagonal position (the inverse of the diagonal block) */
    y1 = cur[0]; y2 = cur[1]; y3 = cur[2]; y4 = cur[3]; y5 = cur[4];
    y6 = cur[5];
    z1 = blk[0]*y1  + blk[1]*y2  + blk[2]*y3  + blk[3]*y4  + blk[4]*y5  + blk[5]*y6;
    z2 = blk[6]*y1  + blk[7]*y2  + blk[8]*y3  + blk[9]*y4  + blk[10]*y5 + blk[11]*y6;
    z3 = blk[12]*y1 + blk[13]*y2 + blk[14]*y3 + blk[15]*y4 + blk[16]*y5 + blk[17]*y6;
    z4 = blk[18]*y1 + blk[19]*y2 + blk[20]*y3 + blk[21]*y4 + blk[22]*y5 + blk[23]*y6;
    z5 = blk[24]*y1 + blk[25]*y2 + blk[26]*y3 + blk[27]*y4 + blk[28]*y5 + blk[29]*y6;
    z6 = blk[30]*y1 + blk[31]*y2 + blk[32]*y3 + blk[33]*y4 + blk[34]*y5 + blk[35]*y6;

    /* entries after the diagonal in row i */
    for (k=dptr[i]+1; k<rowptr[i+1]; k++) {
      blk = vals + 36*k;
      upd = work + 6*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4  + blk[4]*z5  + blk[5]*z6;
      upd[1] -= blk[6]*z1  + blk[7]*z2  + blk[8]*z3  + blk[9]*z4  + blk[10]*z5 + blk[11]*z6;
      upd[2] -= blk[12]*z1 + blk[13]*z2 + blk[14]*z3 + blk[15]*z4 + blk[16]*z5 + blk[17]*z6;
      upd[3] -= blk[18]*z1 + blk[19]*z2 + blk[20]*z3 + blk[21]*z4 + blk[22]*z5 + blk[23]*z6;
      upd[4] -= blk[24]*z1 + blk[25]*z2 + blk[26]*z3 + blk[27]*z4 + blk[28]*z5 + blk[29]*z6;
      upd[5] -= blk[30]*z1 + blk[31]*z2 + blk[32]*z3 + blk[33]*z4 + blk[34]*z5 + blk[35]*z6;
    }
    cur[0] = z1; cur[1] = z2; cur[2] = z3; cur[3] = z4; cur[4] = z5;
    cur[5] = z6;
  }

  /* backward sweep with L^T: entries before the diagonal, visited from diag[i]-1 down to ai[i] */
  for (i=nblk-1; i>=0; i--) {
    cur = work + 6*i;
    z1  = cur[0]; z2 = cur[1]; z3 = cur[2]; z4 = cur[3]; z5 = cur[4];
    z6  = cur[5];
    for (k=dptr[i]-1; k>=rowptr[i]; k--) {
      blk = vals + 36*k;
      upd = work + 6*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4  + blk[4]*z5  + blk[5]*z6;
      upd[1] -= blk[6]*z1  + blk[7]*z2  + blk[8]*z3  + blk[9]*z4  + blk[10]*z5 + blk[11]*z6;
      upd[2] -= blk[12]*z1 + blk[13]*z2 + blk[14]*z3 + blk[15]*z4 + blk[16]*z5 + blk[17]*z6;
      upd[3] -= blk[18]*z1 + blk[19]*z2 + blk[20]*z3 + blk[21]*z4 + blk[22]*z5 + blk[23]*z6;
      upd[4] -= blk[24]*z1 + blk[25]*z2 + blk[26]*z3 + blk[27]*z4 + blk[28]*z5 + blk[29]*z6;
      upd[5] -= blk[30]*z1 + blk[31]*z2 + blk[32]*z3 + blk[33]*z4 + blk[34]*z5 + blk[35]*z6;
    }
  }

  /* scatter the work array into the solution through the row permutation */
  for (i=0; i<nblk; i++) {
    src = work + 6*i;
    cur = sol + 6*rperm[i];
    cur[0] = src[0];
    cur[1] = src[1];
    cur[2] = src[2];
    cur[3] = src[3];
    cur[4] = src[4];
    cur[5] = src[5];
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(fact->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqBAIJ_6 - transpose solve with the stored factors, 6 scalars per block row.

   Input:   A  - factored matrix,  bb - right-hand side
   Output:  xx - solution

   Gather through the column permutation, forward sweep with U^T (entries
   between diag[i+1] and diag[i]), backward sweep with L^T (entries
   ai[i] .. ai[i+1]-1), scatter through the row permutation.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ*)A->data;
  IS                colis = fact->col,rowis = fact->row;
  const PetscInt    nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j,*dptr = fact->diag;
  const PetscInt    bs = A->rmap->bs,bs2 = fact->bs2;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  const PetscScalar *rhs,*src;
  PetscScalar       *sol,*work,*cur,*upd;
  PetscScalar       y1,y2,y3,y4,y5,y6,z1,z2,z3,z4,z5,z6;
  PetscInt          i,k;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* gather the right-hand side into the work array through the column permutation */
  for (i=0; i<nblk; i++) {
    src = rhs + bs*cperm[i];
    cur = work + bs*i;
    cur[0] = src[0]; cur[1] = src[1]; cur[2] = src[2]; cur[3] = src[3];
    cur[4] = src[4]; cur[5] = src[5];
  }

  /* forward sweep with U^T */
  for (i=0; i<nblk; i++) {
    cur = work + bs*i;
    blk = vals + bs2*dptr[i];
    /* apply the block stored at the diagonal position (the inverse of the diagonal block) */
    y1 = cur[0]; y2 = cur[1]; y3 = cur[2]; y4 = cur[3]; y5 = cur[4];
    y6 = cur[5];
    z1 = blk[0]*y1  + blk[1]*y2  + blk[2]*y3  + blk[3]*y4  + blk[4]*y5  + blk[5]*y6;
    z2 = blk[6]*y1  + blk[7]*y2  + blk[8]*y3  + blk[9]*y4  + blk[10]*y5 + blk[11]*y6;
    z3 = blk[12]*y1 + blk[13]*y2 + blk[14]*y3 + blk[15]*y4 + blk[16]*y5 + blk[17]*y6;
    z4 = blk[18]*y1 + blk[19]*y2 + blk[20]*y3 + blk[21]*y4 + blk[22]*y5 + blk[23]*y6;
    z5 = blk[24]*y1 + blk[25]*y2 + blk[26]*y3 + blk[27]*y4 + blk[28]*y5 + blk[29]*y6;
    z6 = blk[30]*y1 + blk[31]*y2 + blk[32]*y3 + blk[33]*y4 + blk[34]*y5 + blk[35]*y6;

    for (k=dptr[i]-1; k>dptr[i+1]; k--) {
      blk = vals + bs2*k;
      upd = work + bs*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4  + blk[4]*z5  + blk[5]*z6;
      upd[1] -= blk[6]*z1  + blk[7]*z2  + blk[8]*z3  + blk[9]*z4  + blk[10]*z5 + blk[11]*z6;
      upd[2] -= blk[12]*z1 + blk[13]*z2 + blk[14]*z3 + blk[15]*z4 + blk[16]*z5 + blk[17]*z6;
      upd[3] -= blk[18]*z1 + blk[19]*z2 + blk[20]*z3 + blk[21]*z4 + blk[22]*z5 + blk[23]*z6;
      upd[4] -= blk[24]*z1 + blk[25]*z2 + blk[26]*z3 + blk[27]*z4 + blk[28]*z5 + blk[29]*z6;
      upd[5] -= blk[30]*z1 + blk[31]*z2 + blk[32]*z3 + blk[33]*z4 + blk[34]*z5 + blk[35]*z6;
    }
    cur[0] = z1; cur[1] = z2; cur[2] = z3; cur[3] = z4; cur[4] = z5;
    cur[5] = z6;
  }

  /* backward sweep with L^T */
  for (i=nblk-1; i>=0; i--) {
    cur = work + bs*i;
    z1  = cur[0]; z2 = cur[1]; z3 = cur[2]; z4 = cur[3]; z5 = cur[4];
    z6  = cur[5];
    for (k=rowptr[i]; k<rowptr[i+1]; k++) {
      blk = vals + bs2*k;
      upd = work + bs*colidx[k];
      upd[0] -= blk[0]*z1  + blk[1]*z2  + blk[2]*z3  + blk[3]*z4  + blk[4]*z5  + blk[5]*z6;
      upd[1] -= blk[6]*z1  + blk[7]*z2  + blk[8]*z3  + blk[9]*z4  + blk[10]*z5 + blk[11]*z6;
      upd[2] -= blk[12]*z1 + blk[13]*z2 + blk[14]*z3 + blk[15]*z4 + blk[16]*z5 + blk[17]*z6;
      upd[3] -= blk[18]*z1 + blk[19]*z2 + blk[20]*z3 + blk[21]*z4 + blk[22]*z5 + blk[23]*z6;
      upd[4] -= blk[24]*z1 + blk[25]*z2 + blk[26]*z3 + blk[27]*z4 + blk[28]*z5 + blk[29]*z6;
      upd[5] -= blk[30]*z1 + blk[31]*z2 + blk[32]*z3 + blk[33]*z4 + blk[34]*z5 + blk[35]*z6;
    }
  }

  /* scatter the work array into the solution through the row permutation */
  for (i=0; i<nblk; i++) {
    src = work + bs*i;
    cur = sol + bs*rperm[i];
    cur[0] = src[0]; cur[1] = src[1]; cur[2] = src[2]; cur[3] = src[3];
    cur[4] = src[4]; cur[5] = src[5];
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(fact->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

184132121132SShri Abhyankar 184232121132SShri Abhyankar #undef __FUNCT__ 184306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 184406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1845f1af5d2fSBarry Smith { 1846f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1847f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 18486849ba73SBarry Smith PetscErrorCode ierr; 18495d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1850b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1851b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1852b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1853b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1854b3260449SShri Abhyankar const PetscScalar *b; 1855f1af5d2fSBarry Smith 1856f1af5d2fSBarry Smith PetscFunctionBegin; 1857b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18581ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1859f1af5d2fSBarry Smith t = a->solve_work; 1860f1af5d2fSBarry Smith 1861f1af5d2fSBarry Smith ierr = 
ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1862f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1863f1af5d2fSBarry Smith 1864f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1865f1af5d2fSBarry Smith ii = 0; 1866f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1867f1af5d2fSBarry Smith ic = 7*c[i]; 1868f1af5d2fSBarry Smith t[ii] = b[ic]; 1869f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1870f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1871f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1872f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1873f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1874f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1875f1af5d2fSBarry Smith ii += 7; 1876f1af5d2fSBarry Smith } 1877f1af5d2fSBarry Smith 1878f1af5d2fSBarry Smith /* forward solve the U^T */ 1879f1af5d2fSBarry Smith idx = 0; 1880f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1881f1af5d2fSBarry Smith 1882f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1883f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1884f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1885f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1886f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1887f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1888f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1889f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1890f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1891f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1892f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1893f1af5d2fSBarry Smith v += 49; 1894f1af5d2fSBarry Smith 1895f1af5d2fSBarry 
Smith vi = aj + diag[i] + 1; 1896f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1897f1af5d2fSBarry Smith while (nz--) { 1898f1af5d2fSBarry Smith oidx = 7*(*vi++); 1899f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1900f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1901f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1902f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1903f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1904f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1905f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1906f1af5d2fSBarry Smith v += 49; 1907f1af5d2fSBarry Smith } 1908f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1909f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1910f1af5d2fSBarry Smith idx += 7; 1911f1af5d2fSBarry Smith } 1912f1af5d2fSBarry Smith /* backward solve the L^T */ 1913f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1914f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1915f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1916f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1917f1af5d2fSBarry Smith idt = 7*i; 1918f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1919f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1920f1af5d2fSBarry Smith while (nz--) { 1921f1af5d2fSBarry Smith idx = 7*(*vi--); 1922f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1923f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1924f1af5d2fSBarry Smith t[idx+2] -= 
v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1925f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1926f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1927f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1928f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1929f1af5d2fSBarry Smith v -= 49; 1930f1af5d2fSBarry Smith } 1931f1af5d2fSBarry Smith } 1932f1af5d2fSBarry Smith 1933f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1934f1af5d2fSBarry Smith ii = 0; 1935f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1936f1af5d2fSBarry Smith ir = 7*r[i]; 1937f1af5d2fSBarry Smith x[ir] = t[ii]; 1938f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1939f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1940f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1941f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1942f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1943f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1944f1af5d2fSBarry Smith ii += 7; 1945f1af5d2fSBarry Smith } 1946f1af5d2fSBarry Smith 1947f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1948f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1949b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19501ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1951dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1952f1af5d2fSBarry Smith PetscFunctionReturn(0); 1953f1af5d2fSBarry Smith } 195432121132SShri Abhyankar #undef __FUNCT__ 19554dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 19564dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 195732121132SShri Abhyankar { 195832121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ 
*)A->data; 195932121132SShri Abhyankar PetscErrorCode ierr; 196032121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1961b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 196232121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 196332121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1964b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1965b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1966b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1967b3260449SShri Abhyankar const PetscScalar *b; 196832121132SShri Abhyankar 196932121132SShri Abhyankar PetscFunctionBegin; 1970b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 197132121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 197232121132SShri Abhyankar t = a->solve_work; 197332121132SShri Abhyankar 197432121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 197532121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 197632121132SShri Abhyankar 197732121132SShri Abhyankar /* copy b into temp work space according to permutation */ 197832121132SShri Abhyankar for(i=0;i<n;i++){ 197932121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 198032121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 198132121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 198232121132SShri Abhyankar } 198332121132SShri Abhyankar 198432121132SShri Abhyankar /* forward solve the U^T */ 198532121132SShri Abhyankar idx = 0; 198632121132SShri Abhyankar for (i=0; i<n; i++) { 198732121132SShri Abhyankar v = aa + bs2*diag[i]; 198832121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 198932121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 199032121132SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 199132121132SShri Abhyankar s1 = 
v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 199232121132SShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 199332121132SShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 199432121132SShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 199532121132SShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 199632121132SShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 199732121132SShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 199832121132SShri Abhyankar v -= bs2; 199932121132SShri Abhyankar 200032121132SShri Abhyankar vi = aj + diag[i] - 1; 200132121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 200232121132SShri Abhyankar for(j=0;j>-nz;j--){ 200332121132SShri Abhyankar oidx = bs*vi[j]; 200432121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 200532121132SShri Abhyankar t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 200632121132SShri Abhyankar t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 200732121132SShri Abhyankar t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 200832121132SShri Abhyankar t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 200932121132SShri Abhyankar t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 201032121132SShri Abhyankar t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 201132121132SShri Abhyankar v -= bs2; 201232121132SShri Abhyankar } 201332121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 201432121132SShri Abhyankar t[5+idx] 
= s6; t[6+idx] = s7; 201532121132SShri Abhyankar idx += bs; 201632121132SShri Abhyankar } 201732121132SShri Abhyankar /* backward solve the L^T */ 201832121132SShri Abhyankar for (i=n-1; i>=0; i--){ 201932121132SShri Abhyankar v = aa + bs2*ai[i]; 202032121132SShri Abhyankar vi = aj + ai[i]; 202132121132SShri Abhyankar nz = ai[i+1] - ai[i]; 202232121132SShri Abhyankar idt = bs*i; 202332121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 202432121132SShri Abhyankar s6 = t[5+idt]; s7 = t[6+idt]; 202532121132SShri Abhyankar for(j=0;j<nz;j++){ 202632121132SShri Abhyankar idx = bs*vi[j]; 202732121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 202832121132SShri Abhyankar t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 202932121132SShri Abhyankar t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 203032121132SShri Abhyankar t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 203132121132SShri Abhyankar t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 203232121132SShri Abhyankar t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 203332121132SShri Abhyankar t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 203432121132SShri Abhyankar v += bs2; 203532121132SShri Abhyankar } 203632121132SShri Abhyankar } 203732121132SShri Abhyankar 203832121132SShri Abhyankar /* copy t into x according to permutation */ 203932121132SShri Abhyankar for(i=0;i<n;i++){ 204032121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 204132121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 204232121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 204332121132SShri Abhyankar } 204432121132SShri Abhyankar 204532121132SShri 
Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 204632121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2047b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 204832121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 204932121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 205032121132SShri Abhyankar PetscFunctionReturn(0); 205132121132SShri Abhyankar } 2052f1af5d2fSBarry Smith 20534e2b4712SSatish Balay /* ----------------------------------------------------------- */ 20544a2ae208SSatish Balay #undef __FUNCT__ 205506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 205606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 20574e2b4712SSatish Balay { 20584e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 20594e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 20606849ba73SBarry Smith PetscErrorCode ierr; 2061b3260449SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2062b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 2063b3260449SShri Abhyankar PetscInt i,nz; 2064b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2065b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2066b3260449SShri Abhyankar PetscScalar *x,*s,*t,*ls; 2067b3260449SShri Abhyankar const PetscScalar *b; 20684e2b4712SSatish Balay 20694e2b4712SSatish Balay PetscFunctionBegin; 2070b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20711ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2072f1af5d2fSBarry Smith t = a->solve_work; 20734e2b4712SSatish Balay 20744e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 20754e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 20764e2b4712SSatish Balay 20774e2b4712SSatish Balay /* forward solve the lower triangular */ 207887828ca2SBarry Smith ierr = 
PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 20794e2b4712SSatish Balay for (i=1; i<n; i++) { 20804e2b4712SSatish Balay v = aa + bs2*ai[i]; 20814e2b4712SSatish Balay vi = aj + ai[i]; 20824e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 2083f1af5d2fSBarry Smith s = t + bs*i; 208487828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 20854e2b4712SSatish Balay while (nz--) { 2086f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 20874e2b4712SSatish Balay v += bs2; 20884e2b4712SSatish Balay } 20894e2b4712SSatish Balay } 20904e2b4712SSatish Balay /* backward solve the upper triangular */ 2091d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 20924e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 20934e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 20944e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 20954e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 209687828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 20974e2b4712SSatish Balay while (nz--) { 2098f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 20994e2b4712SSatish Balay v += bs2; 21004e2b4712SSatish Balay } 2101f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 210287828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 21034e2b4712SSatish Balay } 21044e2b4712SSatish Balay 21054e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21064e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2107b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21081ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2109dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 21104e2b4712SSatish Balay PetscFunctionReturn(0); 21114e2b4712SSatish Balay } 21124e2b4712SSatish Balay 
21135c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 21145c42ef9dSBarry Smith #undef __FUNCT__ 211506e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 211606e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 21175c42ef9dSBarry Smith { 21185c42ef9dSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21195c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 21205c42ef9dSBarry Smith PetscErrorCode ierr; 21215c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2122b3260449SShri Abhyankar PetscInt i,nz,j; 2123b3260449SShri Abhyankar const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2; 21245c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 21255c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 21265c42ef9dSBarry Smith const PetscScalar *b; 21275c42ef9dSBarry Smith PetscFunctionBegin; 21285c42ef9dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21295c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21305c42ef9dSBarry Smith t = a->solve_work; 21315c42ef9dSBarry Smith 21325c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21335c42ef9dSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 21345c42ef9dSBarry Smith 21355c42ef9dSBarry Smith /* copy the b into temp work space according to permutation */ 21365c42ef9dSBarry Smith for (i=0; i<n; i++) { 21375c42ef9dSBarry Smith for (j=0; j<bs; j++) { 21385c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 21395c42ef9dSBarry Smith } 21405c42ef9dSBarry Smith } 21415c42ef9dSBarry Smith 21425c42ef9dSBarry Smith 21435c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 21445c42ef9dSBarry Smith ls = a->solve_work + A->cmap->n; 21455c42ef9dSBarry Smith for (i=0; i<n; i++){ 21465c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 21475c42ef9dSBarry Smith 
Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 21485c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 21495c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 21505c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 21515c42ef9dSBarry Smith while (nz--) { 21525c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 21535c42ef9dSBarry Smith v += bs2; 21545c42ef9dSBarry Smith } 21555c42ef9dSBarry Smith } 21565c42ef9dSBarry Smith 21575c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 21585c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 21595c42ef9dSBarry Smith v = aa + bs2*ai[i]; 21605c42ef9dSBarry Smith vi = aj + ai[i]; 21615c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 21625c42ef9dSBarry Smith while (nz--) { 21635c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 21645c42ef9dSBarry Smith v += bs2; 21655c42ef9dSBarry Smith } 21665c42ef9dSBarry Smith } 21675c42ef9dSBarry Smith 21685c42ef9dSBarry Smith /* copy t into x according to permutation */ 21695c42ef9dSBarry Smith for (i=0; i<n; i++) { 21705c42ef9dSBarry Smith for (j=0; j<bs; j++) { 21715c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 21725c42ef9dSBarry Smith } 21735c42ef9dSBarry Smith } 21745c42ef9dSBarry Smith 21755c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21765c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21775c42ef9dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21785c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 21795c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 21805c42ef9dSBarry Smith PetscFunctionReturn(0); 21815c42ef9dSBarry Smith } 21825c42ef9dSBarry Smith 21834a2ae208SSatish Balay #undef __FUNCT__ 21844dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 21854dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 
21868499736aSShri Abhyankar { 21878499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21888499736aSShri Abhyankar IS iscol=a->col,isrow=a->row; 21898499736aSShri Abhyankar PetscErrorCode ierr; 2190b3260449SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2191b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2192b3260449SShri Abhyankar PetscInt i,j,nz; 2193b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 21948499736aSShri Abhyankar const MatScalar *aa=a->a,*v; 21958499736aSShri Abhyankar PetscScalar *x,*t,*ls; 21968499736aSShri Abhyankar const PetscScalar *b; 2197b3260449SShri Abhyankar 21988499736aSShri Abhyankar PetscFunctionBegin; 21998499736aSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22008499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22018499736aSShri Abhyankar t = a->solve_work; 22028499736aSShri Abhyankar 22038499736aSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 22048499736aSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22058499736aSShri Abhyankar 22068499736aSShri Abhyankar /* copy the b into temp work space according to permutation */ 22078499736aSShri Abhyankar for (i=0; i<n; i++) { 22088499736aSShri Abhyankar for (j=0; j<bs; j++) { 22098499736aSShri Abhyankar t[i*bs+j] = b[c[i]*bs+j]; 22108499736aSShri Abhyankar } 22118499736aSShri Abhyankar } 22128499736aSShri Abhyankar 22138499736aSShri Abhyankar 22148499736aSShri Abhyankar /* forward solve the upper triangular transpose */ 22158499736aSShri Abhyankar ls = a->solve_work + A->cmap->n; 22168499736aSShri Abhyankar for (i=0; i<n; i++){ 22178499736aSShri Abhyankar ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 22188499736aSShri Abhyankar Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 22198499736aSShri Abhyankar v = aa + bs2*(diag[i] - 1); 22208499736aSShri Abhyankar vi = aj + diag[i] - 1; 22218499736aSShri 
Abhyankar nz = diag[i] - diag[i+1] - 1; 22228499736aSShri Abhyankar for(j=0;j>-nz;j--){ 22238499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 22248499736aSShri Abhyankar v -= bs2; 22258499736aSShri Abhyankar } 22268499736aSShri Abhyankar } 22278499736aSShri Abhyankar 22288499736aSShri Abhyankar /* backward solve the lower triangular transpose */ 22298499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 22308499736aSShri Abhyankar v = aa + bs2*ai[i]; 22318499736aSShri Abhyankar vi = aj + ai[i]; 22328499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 22338499736aSShri Abhyankar for(j=0;j<nz;j++){ 22348499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 22358499736aSShri Abhyankar v += bs2; 22368499736aSShri Abhyankar } 22378499736aSShri Abhyankar } 22388499736aSShri Abhyankar 22398499736aSShri Abhyankar /* copy t into x according to permutation */ 22408499736aSShri Abhyankar for (i=0; i<n; i++) { 22418499736aSShri Abhyankar for (j=0; j<bs; j++) { 22428499736aSShri Abhyankar x[bs*r[i]+j] = t[bs*i+j]; 22438499736aSShri Abhyankar } 22448499736aSShri Abhyankar } 22458499736aSShri Abhyankar 22468499736aSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22478499736aSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 22488499736aSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22498499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 22508499736aSShri Abhyankar ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 22518499736aSShri Abhyankar PetscFunctionReturn(0); 22528499736aSShri Abhyankar } 22538499736aSShri Abhyankar 2254832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. 
Block operations are done by accessing all the columns of the block at once */ 225529a97285SShri Abhyankar 22562b0b2ea7SShri Abhyankar #undef __FUNCT__ 2257832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2258832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 22592b0b2ea7SShri Abhyankar { 22602b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 22612b0b2ea7SShri Abhyankar PetscErrorCode ierr; 2262b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 22630fa040f9SShri Abhyankar PetscInt i,nz,idx,idt,m; 22640b68f018SBarry Smith const MatScalar *aa=a->a,*v; 22652b0b2ea7SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 22662b0b2ea7SShri Abhyankar PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 22670fa040f9SShri Abhyankar PetscScalar *x; 22680b68f018SBarry Smith const PetscScalar *b; 22692b0b2ea7SShri Abhyankar 22702b0b2ea7SShri Abhyankar PetscFunctionBegin; 22710b68f018SBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22722b0b2ea7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22732b0b2ea7SShri Abhyankar 22742b0b2ea7SShri Abhyankar /* forward solve the lower triangular */ 227529a97285SShri Abhyankar idx = 0; 22760fa040f9SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx]; 22770fa040f9SShri Abhyankar x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx]; 22780fa040f9SShri Abhyankar x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx]; 22792b0b2ea7SShri Abhyankar 22802b0b2ea7SShri Abhyankar for (i=1; i<n; i++) { 22812b0b2ea7SShri Abhyankar v = aa + bs2*ai[i]; 22822b0b2ea7SShri Abhyankar vi = aj + ai[i]; 22832b0b2ea7SShri Abhyankar nz = ai[i+1] - ai[i]; 22840fa040f9SShri Abhyankar idt = bs*i; 22850fa040f9SShri Abhyankar s1 = b[idt]; 
s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt]; 22860fa040f9SShri Abhyankar s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt]; 22870fa040f9SShri Abhyankar s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt]; 22882b0b2ea7SShri Abhyankar for(m=0;m<nz;m++){ 22892b0b2ea7SShri Abhyankar idx = bs*vi[m]; 22900fa040f9SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 22910fa040f9SShri Abhyankar x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 22920fa040f9SShri Abhyankar x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 22932b0b2ea7SShri Abhyankar 22940b8f6341SShri Abhyankar 22952b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 22962b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 22972b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 22982b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 22992b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 23002b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + 
v[215]*x15; 23012b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 23022b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 23032b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 23042b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 23052b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 23062b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 23072b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 23082b0b2ea7SShri Abhyankar s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 23092b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 23102b0b2ea7SShri Abhyankar 23112b0b2ea7SShri Abhyankar v += bs2; 
23122b0b2ea7SShri Abhyankar } 23130fa040f9SShri Abhyankar x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5; 23140fa040f9SShri Abhyankar x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10; 23150fa040f9SShri Abhyankar x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15; 23162b0b2ea7SShri Abhyankar 23172b0b2ea7SShri Abhyankar } 23182b0b2ea7SShri Abhyankar /* backward solve the upper triangular */ 23192b0b2ea7SShri Abhyankar for (i=n-1; i>=0; i--){ 23202b0b2ea7SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 23212b0b2ea7SShri Abhyankar vi = aj + adiag[i+1]+1; 23222b0b2ea7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 23232b0b2ea7SShri Abhyankar idt = bs*i; 23240fa040f9SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 23250fa040f9SShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt]; 23260fa040f9SShri Abhyankar s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt]; 23272b0b2ea7SShri Abhyankar 23282b0b2ea7SShri Abhyankar for(m=0;m<nz;m++){ 23292b0b2ea7SShri Abhyankar idx = bs*vi[m]; 23300fa040f9SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 23310fa040f9SShri Abhyankar x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 23320fa040f9SShri Abhyankar x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 23332b0b2ea7SShri Abhyankar 23342b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 23352b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 23362b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + 
v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 23372b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 23382b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 23392b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 23402b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 23412b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 23422b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 23432b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 23442b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 23452b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + 
v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 23462b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 23472b0b2ea7SShri Abhyankar s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 23482b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 23492b0b2ea7SShri Abhyankar 23502b0b2ea7SShri Abhyankar v += bs2; 23512b0b2ea7SShri Abhyankar } 23522b0b2ea7SShri Abhyankar 23530fa040f9SShri Abhyankar x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 23540fa040f9SShri Abhyankar x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 23550fa040f9SShri Abhyankar x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 23560fa040f9SShri Abhyankar x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 23570fa040f9SShri Abhyankar x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 23580fa040f9SShri Abhyankar x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 
+ v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 23590fa040f9SShri Abhyankar x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 23600fa040f9SShri Abhyankar x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 23610fa040f9SShri Abhyankar x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 23620fa040f9SShri Abhyankar x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 23630fa040f9SShri Abhyankar x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 23640fa040f9SShri Abhyankar x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 23650fa040f9SShri Abhyankar x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 23660fa040f9SShri Abhyankar x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 23670fa040f9SShri Abhyankar x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + 
v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 23682b0b2ea7SShri Abhyankar 23692b0b2ea7SShri Abhyankar } 23702b0b2ea7SShri Abhyankar 23710b68f018SBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23722b0b2ea7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 23732b0b2ea7SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 23742b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 23752b0b2ea7SShri Abhyankar } 23762b0b2ea7SShri Abhyankar 2377832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */ 2378832cc040SShri Abhyankar /* Default MatSolve for block size 15 */ 2379832cc040SShri Abhyankar 23808499736aSShri Abhyankar #undef __FUNCT__ 2381832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1" 2382832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx) 23830b8f6341SShri Abhyankar { 23840b8f6341SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 23850b8f6341SShri Abhyankar PetscErrorCode ierr; 23860b8f6341SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 23870fa040f9SShri Abhyankar PetscInt i,k,nz,kdx,idx,idt,m; 23880b8f6341SShri Abhyankar const MatScalar *aa=a->a,*v; 23890b8f6341SShri Abhyankar PetscScalar s[15]; 23900fa040f9SShri Abhyankar PetscScalar *x; 23910b8f6341SShri Abhyankar const PetscScalar *b; 23920b8f6341SShri Abhyankar 23930b8f6341SShri Abhyankar PetscFunctionBegin; 23940b8f6341SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23950b8f6341SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23960b8f6341SShri Abhyankar 23970b8f6341SShri Abhyankar /* forward solve the lower triangular */ 2398832cc040SShri Abhyankar for (i=0; i<n; i++) { 23990b8f6341SShri Abhyankar v = aa + bs2*ai[i]; 24000b8f6341SShri Abhyankar vi = aj + ai[i]; 24010b8f6341SShri 
Abhyankar nz = ai[i+1] - ai[i]; 24020fa040f9SShri Abhyankar idt = bs*i; 2403832cc040SShri Abhyankar x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 2404832cc040SShri Abhyankar x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 2405832cc040SShri Abhyankar x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt]; 24060b8f6341SShri Abhyankar for(m=0;m<nz;m++){ 24070b8f6341SShri Abhyankar idx = bs*vi[m]; 24080b8f6341SShri Abhyankar for(k=0;k<15;k++){ 24090fa040f9SShri Abhyankar kdx = k + idx; 2410832cc040SShri Abhyankar x[idt] -= v[0]*x[kdx]; 2411832cc040SShri Abhyankar x[1+idt] -= v[1]*x[kdx]; 2412832cc040SShri Abhyankar x[2+idt] -= v[2]*x[kdx]; 2413832cc040SShri Abhyankar x[3+idt] -= v[3]*x[kdx]; 2414832cc040SShri Abhyankar x[4+idt] -= v[4]*x[kdx]; 2415832cc040SShri Abhyankar x[5+idt] -= v[5]*x[kdx]; 2416832cc040SShri Abhyankar x[6+idt] -= v[6]*x[kdx]; 2417832cc040SShri Abhyankar x[7+idt] -= v[7]*x[kdx]; 2418832cc040SShri Abhyankar x[8+idt] -= v[8]*x[kdx]; 2419832cc040SShri Abhyankar x[9+idt] -= v[9]*x[kdx]; 2420832cc040SShri Abhyankar x[10+idt] -= v[10]*x[kdx]; 2421832cc040SShri Abhyankar x[11+idt] -= v[11]*x[kdx]; 2422832cc040SShri Abhyankar x[12+idt] -= v[12]*x[kdx]; 2423832cc040SShri Abhyankar x[13+idt] -= v[13]*x[kdx]; 2424832cc040SShri Abhyankar x[14+idt] -= v[14]*x[kdx]; 24250b8f6341SShri Abhyankar v += 15; 24260b8f6341SShri Abhyankar } 24270b8f6341SShri Abhyankar } 24280b8f6341SShri Abhyankar } 24290b8f6341SShri Abhyankar /* backward solve the upper triangular */ 24300b8f6341SShri Abhyankar for (i=n-1; i>=0; i--){ 24310b8f6341SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 24320b8f6341SShri Abhyankar vi = aj + adiag[i+1]+1; 24330b8f6341SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 24340b8f6341SShri Abhyankar idt = bs*i; 24350fa040f9SShri Abhyankar s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; 
s[3] = x[3+idt]; s[4] = x[4+idt]; 24360fa040f9SShri Abhyankar s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 24370fa040f9SShri Abhyankar s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt]; 24380b8f6341SShri Abhyankar 24390b8f6341SShri Abhyankar for(m=0;m<nz;m++){ 24400b8f6341SShri Abhyankar idx = bs*vi[m]; 24410b8f6341SShri Abhyankar for(k=0;k<15;k++){ 24420fa040f9SShri Abhyankar kdx = k + idx; 24430fa040f9SShri Abhyankar s[0] -= v[0]*x[kdx]; 24440fa040f9SShri Abhyankar s[1] -= v[1]*x[kdx]; 24450fa040f9SShri Abhyankar s[2] -= v[2]*x[kdx]; 24460fa040f9SShri Abhyankar s[3] -= v[3]*x[kdx]; 24470fa040f9SShri Abhyankar s[4] -= v[4]*x[kdx]; 24480fa040f9SShri Abhyankar s[5] -= v[5]*x[kdx]; 24490fa040f9SShri Abhyankar s[6] -= v[6]*x[kdx]; 24500fa040f9SShri Abhyankar s[7] -= v[7]*x[kdx]; 24510fa040f9SShri Abhyankar s[8] -= v[8]*x[kdx]; 24520fa040f9SShri Abhyankar s[9] -= v[9]*x[kdx]; 24530fa040f9SShri Abhyankar s[10] -= v[10]*x[kdx]; 24540fa040f9SShri Abhyankar s[11] -= v[11]*x[kdx]; 24550fa040f9SShri Abhyankar s[12] -= v[12]*x[kdx]; 24560fa040f9SShri Abhyankar s[13] -= v[13]*x[kdx]; 24570fa040f9SShri Abhyankar s[14] -= v[14]*x[kdx]; 24580b8f6341SShri Abhyankar v += 15; 24590b8f6341SShri Abhyankar } 24600b8f6341SShri Abhyankar } 24610fa040f9SShri Abhyankar ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr); 24620b8f6341SShri Abhyankar for(k=0;k<15;k++){ 24630fa040f9SShri Abhyankar x[idt] += v[0]*s[k]; 24640fa040f9SShri Abhyankar x[1+idt] += v[1]*s[k]; 24650fa040f9SShri Abhyankar x[2+idt] += v[2]*s[k]; 24660fa040f9SShri Abhyankar x[3+idt] += v[3]*s[k]; 24670fa040f9SShri Abhyankar x[4+idt] += v[4]*s[k]; 24680fa040f9SShri Abhyankar x[5+idt] += v[5]*s[k]; 24690fa040f9SShri Abhyankar x[6+idt] += v[6]*s[k]; 24700fa040f9SShri Abhyankar x[7+idt] += v[7]*s[k]; 24710fa040f9SShri Abhyankar x[8+idt] += v[8]*s[k]; 24720fa040f9SShri Abhyankar x[9+idt] += v[9]*s[k]; 24730fa040f9SShri Abhyankar 
x[10+idt] += v[10]*s[k]; 24740fa040f9SShri Abhyankar x[11+idt] += v[11]*s[k]; 24750fa040f9SShri Abhyankar x[12+idt] += v[12]*s[k]; 24760fa040f9SShri Abhyankar x[13+idt] += v[13]*s[k]; 24770fa040f9SShri Abhyankar x[14+idt] += v[14]*s[k]; 24780b8f6341SShri Abhyankar v += 15; 24790b8f6341SShri Abhyankar } 24800b8f6341SShri Abhyankar } 24810b8f6341SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24820b8f6341SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 24830b8f6341SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 24840b8f6341SShri Abhyankar PetscFunctionReturn(0); 24850b8f6341SShri Abhyankar } 24860b8f6341SShri Abhyankar 24870b8f6341SShri Abhyankar 24880b8f6341SShri Abhyankar #undef __FUNCT__ 248906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 249006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 24914e2b4712SSatish Balay { 24924e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 24934e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 24946849ba73SBarry Smith PetscErrorCode ierr; 2495b3260449SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2496b3260449SShri Abhyankar const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2497b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 2498b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2499b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2500b3260449SShri Abhyankar const PetscScalar *b; 25014e2b4712SSatish Balay 25024e2b4712SSatish Balay PetscFunctionBegin; 2503b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25041ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2505f1af5d2fSBarry Smith t = a->solve_work; 25064e2b4712SSatish Balay 25074e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 25084e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 
25094e2b4712SSatish Balay 25104e2b4712SSatish Balay /* forward solve the lower triangular */ 25114e2b4712SSatish Balay idx = 7*(*r++); 2512f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2513f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2514f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 25154e2b4712SSatish Balay 25164e2b4712SSatish Balay for (i=1; i<n; i++) { 25174e2b4712SSatish Balay v = aa + 49*ai[i]; 25184e2b4712SSatish Balay vi = aj + ai[i]; 25194e2b4712SSatish Balay nz = diag[i] - ai[i]; 25204e2b4712SSatish Balay idx = 7*(*r++); 2521f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2522f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 25234e2b4712SSatish Balay while (nz--) { 25244e2b4712SSatish Balay idx = 7*(*vi++); 2525f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2526f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2527f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 2528f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2529f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2530f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2531f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2532f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2533f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2534f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 25354e2b4712SSatish Balay v += 49; 25364e2b4712SSatish Balay } 25374e2b4712SSatish Balay idx = 7*i; 2538f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2539f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2540f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 
25414e2b4712SSatish Balay } 25424e2b4712SSatish Balay /* backward solve the upper triangular */ 25434e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 25444e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 25454e2b4712SSatish Balay vi = aj + diag[i] + 1; 25464e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 25474e2b4712SSatish Balay idt = 7*i; 2548f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2549f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2550f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 25514e2b4712SSatish Balay while (nz--) { 25524e2b4712SSatish Balay idx = 7*(*vi++); 2553f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2554f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2555f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 2556f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2557f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2558f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2559f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2560f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2561f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2562f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 25634e2b4712SSatish Balay v += 49; 25644e2b4712SSatish Balay } 25654e2b4712SSatish Balay idc = 7*(*c--); 25664e2b4712SSatish Balay v = aa + 49*diag[i]; 2567f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2568f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2569f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2570f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2571f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = 
v[2]*s1+v[9]*s2+v[16]*s3+ 2572f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2573f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2574f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2575f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2576f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2577f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2578f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2579f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2580f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 25814e2b4712SSatish Balay } 25824e2b4712SSatish Balay 25834e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 25844e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2585b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25861ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2587dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 25884e2b4712SSatish Balay PetscFunctionReturn(0); 25894e2b4712SSatish Balay } 25904e2b4712SSatish Balay 25918f690400SShri Abhyankar #undef __FUNCT__ 25924dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7" 25934dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 259435aa4fcfSShri Abhyankar { 259535aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 259635aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 259735aa4fcfSShri Abhyankar PetscErrorCode ierr; 2598b3260449SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2599b3260449SShri Abhyankar const PetscInt n=a->mbs,*rout,*cout,*vi; 2600b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 2601b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2602b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2603b3260449SShri Abhyankar const PetscScalar 
*b; 260435aa4fcfSShri Abhyankar 260535aa4fcfSShri Abhyankar PetscFunctionBegin; 2606b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 260735aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260835aa4fcfSShri Abhyankar t = a->solve_work; 260935aa4fcfSShri Abhyankar 261035aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 261135aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 261235aa4fcfSShri Abhyankar 261335aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 261435aa4fcfSShri Abhyankar idx = 7*r[0]; 261535aa4fcfSShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 261635aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 261735aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 261835aa4fcfSShri Abhyankar 261935aa4fcfSShri Abhyankar for (i=1; i<n; i++) { 262035aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 262135aa4fcfSShri Abhyankar vi = aj + ai[i]; 262235aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 262335aa4fcfSShri Abhyankar idx = 7*r[i]; 262435aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 262535aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 262635aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 262735aa4fcfSShri Abhyankar idx = 7*vi[m]; 262835aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 262935aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 263035aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 263135aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 263235aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 263335aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 263435aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 263535aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + 
v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 263635aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 263735aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 263835aa4fcfSShri Abhyankar v += 49; 263935aa4fcfSShri Abhyankar } 264035aa4fcfSShri Abhyankar idx = 7*i; 264135aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 264235aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 264335aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 264435aa4fcfSShri Abhyankar } 264535aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 264635aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 264735aa4fcfSShri Abhyankar v = aa + 49*(adiag[i+1]+1); 264835aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 264935aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 265035aa4fcfSShri Abhyankar idt = 7*i; 265135aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 265235aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 265335aa4fcfSShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 265435aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 265535aa4fcfSShri Abhyankar idx = 7*vi[m]; 265635aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 265735aa4fcfSShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 265835aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 265935aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 266035aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 266135aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 266235aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 266335aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 266435aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 
+ v[33]*x5 + v[40]*x6 + v[47]*x7; 266535aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 266635aa4fcfSShri Abhyankar v += 49; 266735aa4fcfSShri Abhyankar } 266835aa4fcfSShri Abhyankar idc = 7*c[i]; 266935aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 267035aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 267135aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 267235aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 267335aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 267435aa4fcfSShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 267535aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 267635aa4fcfSShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 267735aa4fcfSShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 267835aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 267935aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 268035aa4fcfSShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 268135aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 268235aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 268335aa4fcfSShri Abhyankar } 268435aa4fcfSShri Abhyankar 268535aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 268635aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2687b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 268835aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 268935aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 269035aa4fcfSShri Abhyankar PetscFunctionReturn(0); 269135aa4fcfSShri Abhyankar } 269235aa4fcfSShri Abhyankar

/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_inplace - Triangular solves with a factored
   SeqBAIJ matrix of block size 7, natural ordering (no row/column index sets
   are consulted), for the layout in which a->diag[i] indexes the diagonal
   block inside block row i.

   Input Parameters:
+  A  - the factored matrix (data read through Mat_SeqBAIJ)
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector

   Notes:
   In block row i the blocks ai[i] .. diag[i]-1 are used by the forward (lower
   triangular) sweep and the blocks diag[i]+1 .. ai[i+1]-1 by the backward
   (upper triangular) sweep.  The block at diag[i] is multiplied into the row
   result, so it is presumably the already-inverted diagonal block -- TODO
   confirm against the factorization.

   Blocks are 7x7 with entry (r,c) at v[r + 7*c].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
  PetscErrorCode    ierr;
  PetscInt          i,nz,idx,idt,jdx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  /* block row 0 is copied straight from b; the sweep below starts at row 1 */
  idx  = 0;
  x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
  x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
  x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 7*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
    s4  = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    s7  = b[6+idx];
    while (nz--) {
      jdx = 7*(*vi++);
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
      x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
      x7  = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 49*diag[i] + 49;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 7*i;
    s1  = x[idt];   s2 = x[1+idt];
    s3  = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    s7  = x[6+idt];
    while (nz--) {
      idx = 7*(*vi++);
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
      x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
      x7  = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    /* finish the row by multiplying with the block stored at diag[i] */
    v        = aa + 49*diag[i];
    x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
                       + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
                       + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
                       + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
                       + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
                       + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
                       + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
                       + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* 7x7 blocks: 49 entries per block and 7 rows per block row (the previous
     36/6 constants were left over from the block-size-6 kernel) */
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

278915091d37SBarry Smith 2790cee9d6f2SShri Abhyankar #undef __FUNCT__ 27914dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 27924dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec
xx) 279353cca76cSShri Abhyankar { 279453cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2795b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 279653cca76cSShri Abhyankar PetscErrorCode ierr; 2797b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 2798b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 279953cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 280053cca76cSShri Abhyankar PetscScalar *x; 280153cca76cSShri Abhyankar const PetscScalar *b; 280253cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 280353cca76cSShri Abhyankar 280453cca76cSShri Abhyankar PetscFunctionBegin; 280553cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 280653cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 280753cca76cSShri Abhyankar /* forward solve the lower triangular */ 280853cca76cSShri Abhyankar idx = 0; 280953cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 281053cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 281153cca76cSShri Abhyankar for (i=1; i<n; i++) { 281253cca76cSShri Abhyankar v = aa + bs2*ai[i]; 281353cca76cSShri Abhyankar vi = aj + ai[i]; 281453cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 281553cca76cSShri Abhyankar idx = bs*i; 281653cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 281753cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 281853cca76cSShri Abhyankar for(k=0;k<nz;k++) { 281953cca76cSShri Abhyankar jdx = bs*vi[k]; 282053cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 282153cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 282253cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 282353cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 282453cca76cSShri Abhyankar s3 -= 
v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 282553cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 282653cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 282753cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 282853cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 282953cca76cSShri Abhyankar v += bs2; 283053cca76cSShri Abhyankar } 283153cca76cSShri Abhyankar 283253cca76cSShri Abhyankar x[idx] = s1; 283353cca76cSShri Abhyankar x[1+idx] = s2; 283453cca76cSShri Abhyankar x[2+idx] = s3; 283553cca76cSShri Abhyankar x[3+idx] = s4; 283653cca76cSShri Abhyankar x[4+idx] = s5; 283753cca76cSShri Abhyankar x[5+idx] = s6; 283853cca76cSShri Abhyankar x[6+idx] = s7; 283953cca76cSShri Abhyankar } 284053cca76cSShri Abhyankar 284153cca76cSShri Abhyankar /* backward solve the upper triangular */ 284253cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 284353cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 284453cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 284553cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 284653cca76cSShri Abhyankar idt = bs*i; 284753cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 284853cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 284953cca76cSShri Abhyankar for(k=0;k<nz;k++) { 285053cca76cSShri Abhyankar idx = bs*vi[k]; 285153cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 285253cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 285353cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 285453cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 285553cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + 
v[37]*x6 + v[44]*x7; 285653cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 285753cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 285853cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 285953cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 286053cca76cSShri Abhyankar v += bs2; 286153cca76cSShri Abhyankar } 286253cca76cSShri Abhyankar /* x = inv_diagonal*x */ 286353cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 286453cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 286553cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 286653cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 286753cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 286853cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 286953cca76cSShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 287053cca76cSShri Abhyankar } 287153cca76cSShri Abhyankar 287253cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 287353cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 287453cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 287553cca76cSShri Abhyankar PetscFunctionReturn(0); 287653cca76cSShri Abhyankar } 287753cca76cSShri Abhyankar 287853cca76cSShri Abhyankar #undef __FUNCT__ 287906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 288006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 
288115091d37SBarry Smith { 288215091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 288315091d37SBarry Smith IS iscol=a->col,isrow=a->row; 28846849ba73SBarry Smith PetscErrorCode ierr; 28855d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 2886b3260449SShri Abhyankar const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2887b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 2888d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2889d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2890d9fead3dSBarry Smith const PetscScalar *b; 2891b3260449SShri Abhyankar 289215091d37SBarry Smith PetscFunctionBegin; 2893d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2895f1af5d2fSBarry Smith t = a->solve_work; 289615091d37SBarry Smith 289715091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 289815091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 289915091d37SBarry Smith 290015091d37SBarry Smith /* forward solve the lower triangular */ 290115091d37SBarry Smith idx = 6*(*r++); 2902f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2903f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 2904f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 290515091d37SBarry Smith for (i=1; i<n; i++) { 290615091d37SBarry Smith v = aa + 36*ai[i]; 290715091d37SBarry Smith vi = aj + ai[i]; 290815091d37SBarry Smith nz = diag[i] - ai[i]; 290915091d37SBarry Smith idx = 6*(*r++); 2910f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2911f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 291215091d37SBarry Smith while (nz--) { 291315091d37SBarry Smith idx = 6*(*vi++); 2914f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2915f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2916f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + 
v[24]*x5 + v[30]*x6; 2917f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2918f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2919f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2920f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2921f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 292215091d37SBarry Smith v += 36; 292315091d37SBarry Smith } 292415091d37SBarry Smith idx = 6*i; 2925f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2926f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 2927f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 292815091d37SBarry Smith } 292915091d37SBarry Smith /* backward solve the upper triangular */ 293015091d37SBarry Smith for (i=n-1; i>=0; i--){ 293115091d37SBarry Smith v = aa + 36*diag[i] + 36; 293215091d37SBarry Smith vi = aj + diag[i] + 1; 293315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 293415091d37SBarry Smith idt = 6*i; 2935f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2936f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 2937f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 293815091d37SBarry Smith while (nz--) { 293915091d37SBarry Smith idx = 6*(*vi++); 2940f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2941f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2942f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 2943f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2944f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2945f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2946f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2947f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2948f1af5d2fSBarry Smith s6 -= v[5]*x1 + 
v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 294915091d37SBarry Smith v += 36; 295015091d37SBarry Smith } 295115091d37SBarry Smith idc = 6*(*c--); 295215091d37SBarry Smith v = aa + 36*diag[i]; 2953f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2954f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 2955f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2956f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 2957f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2958f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 2959f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2960f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 2961f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2962f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 2963f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2964f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 296515091d37SBarry Smith } 296615091d37SBarry Smith 296715091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 296815091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2969d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29701ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2971dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 297215091d37SBarry Smith PetscFunctionReturn(0); 297315091d37SBarry Smith } 297415091d37SBarry Smith 29756506fda5SShri Abhyankar #undef __FUNCT__ 29764dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6" 29774dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 29786506fda5SShri Abhyankar { 29796506fda5SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 29806506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 29816506fda5SShri Abhyankar PetscErrorCode ierr; 29826506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2983b3260449SShri 
Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 2984b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 29856506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 29866506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 29876506fda5SShri Abhyankar const PetscScalar *b; 2988b3260449SShri Abhyankar 29896506fda5SShri Abhyankar PetscFunctionBegin; 29906506fda5SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29916506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 29926506fda5SShri Abhyankar t = a->solve_work; 29936506fda5SShri Abhyankar 29946506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 29956506fda5SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 29966506fda5SShri Abhyankar 29976506fda5SShri Abhyankar /* forward solve the lower triangular */ 29986506fda5SShri Abhyankar idx = 6*r[0]; 29996506fda5SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 30006506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 30016506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 30026506fda5SShri Abhyankar for (i=1; i<n; i++) { 30036506fda5SShri Abhyankar v = aa + 36*ai[i]; 30046506fda5SShri Abhyankar vi = aj + ai[i]; 30056506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 30066506fda5SShri Abhyankar idx = 6*r[i]; 30076506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 30086506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 30096506fda5SShri Abhyankar for(m=0;m<nz;m++){ 30106506fda5SShri Abhyankar idx = 6*vi[m]; 30116506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 30126506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 30136506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 30146506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 30156506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + 
v[26]*x5 + v[32]*x6; 30166506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 30176506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 30186506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 30196506fda5SShri Abhyankar v += 36; 30206506fda5SShri Abhyankar } 30216506fda5SShri Abhyankar idx = 6*i; 30226506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 30236506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 30246506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 30256506fda5SShri Abhyankar } 30266506fda5SShri Abhyankar /* backward solve the upper triangular */ 30276506fda5SShri Abhyankar for (i=n-1; i>=0; i--){ 30286506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 30296506fda5SShri Abhyankar vi = aj + adiag[i+1]+1; 30306506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 30316506fda5SShri Abhyankar idt = 6*i; 30326506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 30336506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 30346506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 30356506fda5SShri Abhyankar for(m=0;m<nz;m++){ 30366506fda5SShri Abhyankar idx = 6*vi[m]; 30376506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 30386506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 30396506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 30406506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 30416506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 30426506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 30436506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 30446506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 30456506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 30466506fda5SShri Abhyankar v += 
36; 30476506fda5SShri Abhyankar } 30486506fda5SShri Abhyankar idc = 6*c[i]; 30496506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 30506506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 30516506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 30526506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 30536506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 30546506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 30556506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 30566506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 30576506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 30586506fda5SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 30596506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 30606506fda5SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 30616506fda5SShri Abhyankar } 30626506fda5SShri Abhyankar 30636506fda5SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 30646506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 30656506fda5SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30666506fda5SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 30676506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 30686506fda5SShri Abhyankar PetscFunctionReturn(0); 30696506fda5SShri Abhyankar } 30708f690400SShri Abhyankar 30718f690400SShri Abhyankar #undef __FUNCT__ 307206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 307306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 307415091d37SBarry Smith { 307515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3076b3260449SShri Abhyankar PetscInt i,nz,idx,idt,jdx; 3077dfbe8321SBarry Smith PetscErrorCode ierr; 3078b3260449SShri Abhyankar const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 3079d9fead3dSBarry Smith 
const MatScalar *aa=a->a,*v; 3080d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3081d9fead3dSBarry Smith const PetscScalar *b; 308215091d37SBarry Smith 308315091d37SBarry Smith PetscFunctionBegin; 3084d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30851ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 308615091d37SBarry Smith /* forward solve the lower triangular */ 308715091d37SBarry Smith idx = 0; 308815091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 308915091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 309015091d37SBarry Smith for (i=1; i<n; i++) { 309115091d37SBarry Smith v = aa + 36*ai[i]; 309215091d37SBarry Smith vi = aj + ai[i]; 309315091d37SBarry Smith nz = diag[i] - ai[i]; 309415091d37SBarry Smith idx = 6*i; 3095f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3096f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 309715091d37SBarry Smith while (nz--) { 309815091d37SBarry Smith jdx = 6*(*vi++); 309915091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 310015091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 3101f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3102f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3103f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3104f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3105f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3106f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 310715091d37SBarry Smith v += 36; 310815091d37SBarry Smith } 3109f1af5d2fSBarry Smith x[idx] = s1; 3110f1af5d2fSBarry Smith x[1+idx] = s2; 3111f1af5d2fSBarry Smith x[2+idx] = s3; 3112f1af5d2fSBarry Smith x[3+idx] = s4; 3113f1af5d2fSBarry 
Smith x[4+idx] = s5; 3114f1af5d2fSBarry Smith x[5+idx] = s6; 311515091d37SBarry Smith } 311615091d37SBarry Smith /* backward solve the upper triangular */ 311715091d37SBarry Smith for (i=n-1; i>=0; i--){ 311815091d37SBarry Smith v = aa + 36*diag[i] + 36; 311915091d37SBarry Smith vi = aj + diag[i] + 1; 312015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 312115091d37SBarry Smith idt = 6*i; 3122f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3123f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 3124f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 312515091d37SBarry Smith while (nz--) { 312615091d37SBarry Smith idx = 6*(*vi++); 312715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 312815091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3129f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3130f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3131f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3132f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3133f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3134f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 313515091d37SBarry Smith v += 36; 313615091d37SBarry Smith } 313715091d37SBarry Smith v = aa + 36*diag[i]; 3138f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3139f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3140f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3141f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3142f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3143f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + 
v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 314415091d37SBarry Smith } 314515091d37SBarry Smith 3146d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 31471ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3148dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 314915091d37SBarry Smith PetscFunctionReturn(0); 315015091d37SBarry Smith } 315115091d37SBarry Smith 3152cee9d6f2SShri Abhyankar #undef __FUNCT__ 31534dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 31544dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315553cca76cSShri Abhyankar { 315653cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3157b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 315853cca76cSShri Abhyankar PetscErrorCode ierr; 3159b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 3160b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 316153cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 316253cca76cSShri Abhyankar PetscScalar *x; 316353cca76cSShri Abhyankar const PetscScalar *b; 316453cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 316553cca76cSShri Abhyankar 316653cca76cSShri Abhyankar PetscFunctionBegin; 316753cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 316853cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316953cca76cSShri Abhyankar /* forward solve the lower triangular */ 317053cca76cSShri Abhyankar idx = 0; 317153cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 317253cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 317353cca76cSShri Abhyankar for (i=1; i<n; i++) { 317453cca76cSShri Abhyankar v = aa + bs2*ai[i]; 317553cca76cSShri Abhyankar vi = aj + ai[i]; 317653cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 317753cca76cSShri Abhyankar idx = bs*i; 
317853cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 317953cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 318053cca76cSShri Abhyankar for(k=0;k<nz;k++){ 318153cca76cSShri Abhyankar jdx = bs*vi[k]; 318253cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 318353cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 318453cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 318553cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 318653cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 318753cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 318853cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 318953cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 319053cca76cSShri Abhyankar v += bs2; 319153cca76cSShri Abhyankar } 319253cca76cSShri Abhyankar 319353cca76cSShri Abhyankar x[idx] = s1; 319453cca76cSShri Abhyankar x[1+idx] = s2; 319553cca76cSShri Abhyankar x[2+idx] = s3; 319653cca76cSShri Abhyankar x[3+idx] = s4; 319753cca76cSShri Abhyankar x[4+idx] = s5; 319853cca76cSShri Abhyankar x[5+idx] = s6; 319953cca76cSShri Abhyankar } 320053cca76cSShri Abhyankar 320153cca76cSShri Abhyankar /* backward solve the upper triangular */ 320253cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 320353cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 320453cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 320553cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 320653cca76cSShri Abhyankar idt = bs*i; 320753cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 320853cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 320953cca76cSShri Abhyankar for(k=0;k<nz;k++){ 321053cca76cSShri Abhyankar idx = bs*vi[k]; 321153cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 
= x[3+idx]; 321253cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 321353cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 321453cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 321553cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 321653cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 321753cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 321853cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 321953cca76cSShri Abhyankar v += bs2; 322053cca76cSShri Abhyankar } 322153cca76cSShri Abhyankar /* x = inv_diagonal*x */ 322253cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 322353cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 322453cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 322553cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 322653cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 322753cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 322853cca76cSShri Abhyankar } 322953cca76cSShri Abhyankar 323053cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 323153cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 323253cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 323353cca76cSShri Abhyankar PetscFunctionReturn(0); 323453cca76cSShri Abhyankar } 323553cca76cSShri Abhyankar 323653cca76cSShri Abhyankar #undef __FUNCT__ 323706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 323806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat 
A,Vec bb,Vec xx) 32394e2b4712SSatish Balay { 32404e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 32414e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 32426849ba73SBarry Smith PetscErrorCode ierr; 32435d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3244b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3245b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 3246d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3247d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3248d9fead3dSBarry Smith const PetscScalar *b; 32494e2b4712SSatish Balay 32504e2b4712SSatish Balay PetscFunctionBegin; 3251d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 32521ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3253f1af5d2fSBarry Smith t = a->solve_work; 32544e2b4712SSatish Balay 32554e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 32564e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 32574e2b4712SSatish Balay 32584e2b4712SSatish Balay /* forward solve the lower triangular */ 32594e2b4712SSatish Balay idx = 5*(*r++); 3260f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3261f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 32624e2b4712SSatish Balay for (i=1; i<n; i++) { 32634e2b4712SSatish Balay v = aa + 25*ai[i]; 32644e2b4712SSatish Balay vi = aj + ai[i]; 32654e2b4712SSatish Balay nz = diag[i] - ai[i]; 32664e2b4712SSatish Balay idx = 5*(*r++); 3267f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3268f1af5d2fSBarry Smith s5 = b[4+idx]; 32694e2b4712SSatish Balay while (nz--) { 32704e2b4712SSatish Balay idx = 5*(*vi++); 3271f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3272f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 3273f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3274f1af5d2fSBarry Smith s2 -= v[1]*x1 + 
v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3275f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3276f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3277f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 32784e2b4712SSatish Balay v += 25; 32794e2b4712SSatish Balay } 32804e2b4712SSatish Balay idx = 5*i; 3281f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3282f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 32834e2b4712SSatish Balay } 32844e2b4712SSatish Balay /* backward solve the upper triangular */ 32854e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 32864e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 32874e2b4712SSatish Balay vi = aj + diag[i] + 1; 32884e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 32894e2b4712SSatish Balay idt = 5*i; 3290f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3291f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 32924e2b4712SSatish Balay while (nz--) { 32934e2b4712SSatish Balay idx = 5*(*vi++); 3294f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3295f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3296f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3297f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3298f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3299f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3300f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 33014e2b4712SSatish Balay v += 25; 33024e2b4712SSatish Balay } 33034e2b4712SSatish Balay idc = 5*(*c--); 33044e2b4712SSatish Balay v = aa + 25*diag[i]; 3305f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3306f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 3307f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3308f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 3309f1af5d2fSBarry 
Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3310f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 3311f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3312f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 3313f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3314f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 33154e2b4712SSatish Balay } 33164e2b4712SSatish Balay 33174e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 33184e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3319d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 33201ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3321dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 33224e2b4712SSatish Balay PetscFunctionReturn(0); 33234e2b4712SSatish Balay } 33244e2b4712SSatish Balay 332578bb4007SShri Abhyankar #undef __FUNCT__ 33264dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5" 33274dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 332878bb4007SShri Abhyankar { 332978bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 333078bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 333178bb4007SShri Abhyankar PetscErrorCode ierr; 333278bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 3333b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3334b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 333578bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 333678bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 333778bb4007SShri Abhyankar const PetscScalar *b; 333878bb4007SShri Abhyankar 333978bb4007SShri Abhyankar PetscFunctionBegin; 334078bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 334178bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 334278bb4007SShri Abhyankar t = a->solve_work; 334378bb4007SShri Abhyankar 
334478bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 334578bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 334678bb4007SShri Abhyankar 334778bb4007SShri Abhyankar /* forward solve the lower triangular */ 334878bb4007SShri Abhyankar idx = 5*r[0]; 334978bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 335078bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 335178bb4007SShri Abhyankar for (i=1; i<n; i++) { 335278bb4007SShri Abhyankar v = aa + 25*ai[i]; 335378bb4007SShri Abhyankar vi = aj + ai[i]; 335478bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 335578bb4007SShri Abhyankar idx = 5*r[i]; 335678bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 335778bb4007SShri Abhyankar s5 = b[4+idx]; 335878bb4007SShri Abhyankar for(m=0;m<nz;m++){ 335978bb4007SShri Abhyankar idx = 5*vi[m]; 336078bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 336178bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 336278bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 336378bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 336478bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 336578bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 336678bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 336778bb4007SShri Abhyankar v += 25; 336878bb4007SShri Abhyankar } 336978bb4007SShri Abhyankar idx = 5*i; 337078bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 337178bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 337278bb4007SShri Abhyankar } 337378bb4007SShri Abhyankar /* backward solve the upper triangular */ 337478bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 337578bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 337678bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 337778bb4007SShri Abhyankar nz = adiag[i] - 
adiag[i+1] - 1; 337878bb4007SShri Abhyankar idt = 5*i; 337978bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 338078bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 338178bb4007SShri Abhyankar for(m=0;m<nz;m++){ 338278bb4007SShri Abhyankar idx = 5*vi[m]; 338378bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 338478bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 338578bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 338678bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 338778bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 338878bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 338978bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 339078bb4007SShri Abhyankar v += 25; 339178bb4007SShri Abhyankar } 339278bb4007SShri Abhyankar idc = 5*c[i]; 339378bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 339478bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 339578bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 339678bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 339778bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 339878bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 339978bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 340078bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 340178bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 340278bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 340378bb4007SShri Abhyankar } 340478bb4007SShri Abhyankar 340578bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 340678bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 340778bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 340878bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 340978bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 
5.0*A->cmap->n);CHKERRQ(ierr); 341078bb4007SShri Abhyankar PetscFunctionReturn(0); 341178bb4007SShri Abhyankar } 341278bb4007SShri Abhyankar 34138f690400SShri Abhyankar #undef __FUNCT__ 341406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 341506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 341615091d37SBarry Smith { 341715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3418b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3419b3260449SShri Abhyankar PetscInt i,nz,idx,idt,jdx; 3420dfbe8321SBarry Smith PetscErrorCode ierr; 3421d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3422d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3423d9fead3dSBarry Smith const PetscScalar *b; 342415091d37SBarry Smith 342515091d37SBarry Smith PetscFunctionBegin; 3426d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 34271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 342815091d37SBarry Smith /* forward solve the lower triangular */ 342915091d37SBarry Smith idx = 0; 343015091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 343115091d37SBarry Smith for (i=1; i<n; i++) { 343215091d37SBarry Smith v = aa + 25*ai[i]; 343315091d37SBarry Smith vi = aj + ai[i]; 343415091d37SBarry Smith nz = diag[i] - ai[i]; 343515091d37SBarry Smith idx = 5*i; 3436f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 343715091d37SBarry Smith while (nz--) { 343815091d37SBarry Smith jdx = 5*(*vi++); 343915091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3440f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3441f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3442f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 
3443f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3444f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 344515091d37SBarry Smith v += 25; 344615091d37SBarry Smith } 3447f1af5d2fSBarry Smith x[idx] = s1; 3448f1af5d2fSBarry Smith x[1+idx] = s2; 3449f1af5d2fSBarry Smith x[2+idx] = s3; 3450f1af5d2fSBarry Smith x[3+idx] = s4; 3451f1af5d2fSBarry Smith x[4+idx] = s5; 345215091d37SBarry Smith } 345315091d37SBarry Smith /* backward solve the upper triangular */ 345415091d37SBarry Smith for (i=n-1; i>=0; i--){ 345515091d37SBarry Smith v = aa + 25*diag[i] + 25; 345615091d37SBarry Smith vi = aj + diag[i] + 1; 345715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 345815091d37SBarry Smith idt = 5*i; 3459f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3460f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 346115091d37SBarry Smith while (nz--) { 346215091d37SBarry Smith idx = 5*(*vi++); 346315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3464f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3465f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3466f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3467f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3468f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 346915091d37SBarry Smith v += 25; 347015091d37SBarry Smith } 347115091d37SBarry Smith v = aa + 25*diag[i]; 3472f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3473f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3474f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3475f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3476f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + 
v[19]*s4 + v[24]*s5; 347715091d37SBarry Smith } 347815091d37SBarry Smith 3479d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 34801ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3481dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 348215091d37SBarry Smith PetscFunctionReturn(0); 348315091d37SBarry Smith } 348415091d37SBarry Smith 3485cee9d6f2SShri Abhyankar #undef __FUNCT__ 34864dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 34874dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 348853cca76cSShri Abhyankar { 348953cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3490b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3491b3260449SShri Abhyankar PetscInt i,k,nz,idx,idt,jdx; 349253cca76cSShri Abhyankar PetscErrorCode ierr; 349353cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 349453cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 349553cca76cSShri Abhyankar const PetscScalar *b; 349653cca76cSShri Abhyankar 349753cca76cSShri Abhyankar PetscFunctionBegin; 349853cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 349953cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 350053cca76cSShri Abhyankar /* forward solve the lower triangular */ 350153cca76cSShri Abhyankar idx = 0; 350253cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 350353cca76cSShri Abhyankar for (i=1; i<n; i++) { 350453cca76cSShri Abhyankar v = aa + 25*ai[i]; 350553cca76cSShri Abhyankar vi = aj + ai[i]; 350653cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 350753cca76cSShri Abhyankar idx = 5*i; 350853cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 350953cca76cSShri Abhyankar for(k=0;k<nz;k++) { 351053cca76cSShri Abhyankar jdx = 5*vi[k]; 
351153cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 351253cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 351353cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 351453cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 351553cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 351653cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 351753cca76cSShri Abhyankar v += 25; 351853cca76cSShri Abhyankar } 351953cca76cSShri Abhyankar x[idx] = s1; 352053cca76cSShri Abhyankar x[1+idx] = s2; 352153cca76cSShri Abhyankar x[2+idx] = s3; 352253cca76cSShri Abhyankar x[3+idx] = s4; 352353cca76cSShri Abhyankar x[4+idx] = s5; 352453cca76cSShri Abhyankar } 352553cca76cSShri Abhyankar 352653cca76cSShri Abhyankar /* backward solve the upper triangular */ 352753cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 352853cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 352953cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 353053cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 353153cca76cSShri Abhyankar idt = 5*i; 353253cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 353353cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 353453cca76cSShri Abhyankar for(k=0;k<nz;k++){ 353553cca76cSShri Abhyankar idx = 5*vi[k]; 353653cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 353753cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 353853cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 353953cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 354053cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 354153cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 354253cca76cSShri Abhyankar v += 25; 354353cca76cSShri 
Abhyankar } 354453cca76cSShri Abhyankar /* x = inv_diagonal*x */ 354553cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 354653cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 354753cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 354853cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 354953cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 355053cca76cSShri Abhyankar } 355153cca76cSShri Abhyankar 355253cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 355353cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 355453cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 355553cca76cSShri Abhyankar PetscFunctionReturn(0); 355653cca76cSShri Abhyankar } 355753cca76cSShri Abhyankar 355853cca76cSShri Abhyankar #undef __FUNCT__ 355906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 356006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 35614e2b4712SSatish Balay { 35624e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 35634e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 35646849ba73SBarry Smith PetscErrorCode ierr; 3565b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3566b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 35675d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3568d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3569d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3570d9fead3dSBarry Smith const PetscScalar *b; 35714e2b4712SSatish Balay 35724e2b4712SSatish Balay PetscFunctionBegin; 3573d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 35741ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3575f1af5d2fSBarry Smith t = a->solve_work; 35764e2b4712SSatish 
Balay 35774e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 35784e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 35794e2b4712SSatish Balay 35804e2b4712SSatish Balay /* forward solve the lower triangular */ 35814e2b4712SSatish Balay idx = 4*(*r++); 3582f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3583f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 35844e2b4712SSatish Balay for (i=1; i<n; i++) { 35854e2b4712SSatish Balay v = aa + 16*ai[i]; 35864e2b4712SSatish Balay vi = aj + ai[i]; 35874e2b4712SSatish Balay nz = diag[i] - ai[i]; 35884e2b4712SSatish Balay idx = 4*(*r++); 3589f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 35904e2b4712SSatish Balay while (nz--) { 35914e2b4712SSatish Balay idx = 4*(*vi++); 3592f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3593f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3594f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3595f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3596f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 35974e2b4712SSatish Balay v += 16; 35984e2b4712SSatish Balay } 35994e2b4712SSatish Balay idx = 4*i; 3600f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3601f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 36024e2b4712SSatish Balay } 36034e2b4712SSatish Balay /* backward solve the upper triangular */ 36044e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 36054e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 36064e2b4712SSatish Balay vi = aj + diag[i] + 1; 36074e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 36084e2b4712SSatish Balay idt = 4*i; 3609f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3610f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 36114e2b4712SSatish Balay while (nz--) { 36124e2b4712SSatish Balay idx = 4*(*vi++); 3613f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3614f1af5d2fSBarry 
Smith x3 = t[2+idx]; x4 = t[3+idx]; 3615f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3616f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3617f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3618f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 36194e2b4712SSatish Balay v += 16; 36204e2b4712SSatish Balay } 36214e2b4712SSatish Balay idc = 4*(*c--); 36224e2b4712SSatish Balay v = aa + 16*diag[i]; 3623f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3624f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3625f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3626f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 36274e2b4712SSatish Balay } 36284e2b4712SSatish Balay 36294e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 36304e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3631d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 36321ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3633dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 36344e2b4712SSatish Balay PetscFunctionReturn(0); 36354e2b4712SSatish Balay } 3636f26ec98cSKris Buschelman 36378f690400SShri Abhyankar #undef __FUNCT__ 36384dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4" 36394dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 364078bb4007SShri Abhyankar { 364178bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 364278bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 364378bb4007SShri Abhyankar PetscErrorCode ierr; 3644b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3645b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 364678bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 364778bb4007SShri Abhyankar 
const MatScalar *aa=a->a,*v; 364878bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 364978bb4007SShri Abhyankar const PetscScalar *b; 365078bb4007SShri Abhyankar 365178bb4007SShri Abhyankar PetscFunctionBegin; 365278bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 365378bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 365478bb4007SShri Abhyankar t = a->solve_work; 365578bb4007SShri Abhyankar 365678bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 365778bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 365878bb4007SShri Abhyankar 365978bb4007SShri Abhyankar /* forward solve the lower triangular */ 366078bb4007SShri Abhyankar idx = 4*r[0]; 366178bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 366278bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 366378bb4007SShri Abhyankar for (i=1; i<n; i++) { 366478bb4007SShri Abhyankar v = aa + 16*ai[i]; 366578bb4007SShri Abhyankar vi = aj + ai[i]; 366678bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 366778bb4007SShri Abhyankar idx = 4*r[i]; 366878bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 366978bb4007SShri Abhyankar for(m=0;m<nz;m++){ 367078bb4007SShri Abhyankar idx = 4*vi[m]; 367178bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 367278bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 367378bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 367478bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 367578bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 367678bb4007SShri Abhyankar v += 16; 367778bb4007SShri Abhyankar } 367878bb4007SShri Abhyankar idx = 4*i; 367978bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 368078bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 368178bb4007SShri Abhyankar } 368278bb4007SShri Abhyankar /* backward solve the upper triangular */ 
368378bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 368478bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 368578bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 368678bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 368778bb4007SShri Abhyankar idt = 4*i; 368878bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 368978bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 369078bb4007SShri Abhyankar for(m=0;m<nz;m++){ 369178bb4007SShri Abhyankar idx = 4*vi[m]; 369278bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 369378bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 369478bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 369578bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 369678bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 369778bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 369878bb4007SShri Abhyankar v += 16; 369978bb4007SShri Abhyankar } 370078bb4007SShri Abhyankar idc = 4*c[i]; 370178bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 370278bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 370378bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 370478bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 370578bb4007SShri Abhyankar } 370678bb4007SShri Abhyankar 370778bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 370878bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 370978bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 371078bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 371178bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 371278bb4007SShri Abhyankar PetscFunctionReturn(0); 371378bb4007SShri Abhyankar } 371478bb4007SShri Abhyankar 371578bb4007SShri Abhyankar #undef __FUNCT__ 3716f26ec98cSKris Buschelman #define __FUNCT__ 
"MatSolve_SeqBAIJ_4_Demotion" 3717dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3718f26ec98cSKris Buschelman { 3719f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3720f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 37216849ba73SBarry Smith PetscErrorCode ierr; 3722b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3723b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 37245d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3725d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3726d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3727d9fead3dSBarry Smith PetscScalar *x; 3728d9fead3dSBarry Smith const PetscScalar *b; 3729f26ec98cSKris Buschelman 3730f26ec98cSKris Buschelman PetscFunctionBegin; 3731d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 37321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3733f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 3734f26ec98cSKris Buschelman 3735f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3736f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3737f26ec98cSKris Buschelman 3738f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3739f26ec98cSKris Buschelman idx = 4*(*r++); 3740f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 3741f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 3742f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 3743f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 3744f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3745f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3746f26ec98cSKris Buschelman vi = aj + ai[i]; 3747f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3748f26ec98cSKris Buschelman idx = 4*(*r++); 3749f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3750f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3751f26ec98cSKris Buschelman s3 = 
(MatScalar)b[2+idx]; 3752f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3753f26ec98cSKris Buschelman while (nz--) { 3754f26ec98cSKris Buschelman idx = 4*(*vi++); 3755f26ec98cSKris Buschelman x1 = t[idx]; 3756f26ec98cSKris Buschelman x2 = t[1+idx]; 3757f26ec98cSKris Buschelman x3 = t[2+idx]; 3758f26ec98cSKris Buschelman x4 = t[3+idx]; 3759f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3760f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3761f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3762f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3763f26ec98cSKris Buschelman v += 16; 3764f26ec98cSKris Buschelman } 3765f26ec98cSKris Buschelman idx = 4*i; 3766f26ec98cSKris Buschelman t[idx] = s1; 3767f26ec98cSKris Buschelman t[1+idx] = s2; 3768f26ec98cSKris Buschelman t[2+idx] = s3; 3769f26ec98cSKris Buschelman t[3+idx] = s4; 3770f26ec98cSKris Buschelman } 3771f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3772f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3773f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 3774f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3775f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3776f26ec98cSKris Buschelman idt = 4*i; 3777f26ec98cSKris Buschelman s1 = t[idt]; 3778f26ec98cSKris Buschelman s2 = t[1+idt]; 3779f26ec98cSKris Buschelman s3 = t[2+idt]; 3780f26ec98cSKris Buschelman s4 = t[3+idt]; 3781f26ec98cSKris Buschelman while (nz--) { 3782f26ec98cSKris Buschelman idx = 4*(*vi++); 3783f26ec98cSKris Buschelman x1 = t[idx]; 3784f26ec98cSKris Buschelman x2 = t[1+idx]; 3785f26ec98cSKris Buschelman x3 = t[2+idx]; 3786f26ec98cSKris Buschelman x4 = t[3+idx]; 3787f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3788f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3789f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3790f26ec98cSKris Buschelman s4 -= v[3]*x1 + 
v[7]*x2 + v[11]*x3 + v[15]*x4; 3791f26ec98cSKris Buschelman v += 16; 3792f26ec98cSKris Buschelman } 3793f26ec98cSKris Buschelman idc = 4*(*c--); 3794f26ec98cSKris Buschelman v = aa + 16*diag[i]; 3795f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3796f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3797f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3798f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3799f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 3800f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 3801f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 3802f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 3803f26ec98cSKris Buschelman } 3804f26ec98cSKris Buschelman 3805f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3806f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3807d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 38081ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3809dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3810f26ec98cSKris Buschelman PetscFunctionReturn(0); 3811f26ec98cSKris Buschelman } 3812f26ec98cSKris Buschelman 381324c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 381424c233c2SKris Buschelman 381524c233c2SKris Buschelman #include PETSC_HAVE_SSE 381624c233c2SKris Buschelman 381724c233c2SKris Buschelman #undef __FUNCT__ 381824c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3819dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 382024c233c2SKris Buschelman { 382124c233c2SKris Buschelman /* 382224c233c2SKris Buschelman Note: This code uses demotion of double 382324c233c2SKris Buschelman to float when performing the mixed-mode computation. 
382424c233c2SKris Buschelman This may not be numerically reasonable for all applications. 382524c233c2SKris Buschelman */ 382624c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 382724c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 38286849ba73SBarry Smith PetscErrorCode ierr; 38295d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 38305d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 383124c233c2SKris Buschelman MatScalar *aa=a->a,*v; 383287828ca2SBarry Smith PetscScalar *x,*b,*t; 383324c233c2SKris Buschelman 383424c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 383524c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 383624c233c2SKris Buschelman unsigned long offset; 383724c233c2SKris Buschelman 383824c233c2SKris Buschelman PetscFunctionBegin; 383924c233c2SKris Buschelman SSE_SCOPE_BEGIN; 384024c233c2SKris Buschelman 384124c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 384224c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 384324c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 384424c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 384524c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 384624c233c2SKris Buschelman 38471ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 38481ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 384924c233c2SKris Buschelman t = a->solve_work; 385024c233c2SKris Buschelman 385124c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 385224c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 385324c233c2SKris Buschelman 385424c233c2SKris Buschelman /* forward solve the lower triangular */ 385524c233c2SKris Buschelman idx = 4*(*r++); 385624c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 385724c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 385824c233c2SKris Buschelman v = aa + 
16*ai[1]; 385924c233c2SKris Buschelman 386024c233c2SKris Buschelman for (i=1; i<n;) { 386124c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 386224c233c2SKris Buschelman vi = aj + ai[i]; 386324c233c2SKris Buschelman nz = diag[i] - ai[i]; 386424c233c2SKris Buschelman idx = 4*(*r++); 386524c233c2SKris Buschelman 386624c233c2SKris Buschelman /* Demote sum from double to float */ 386724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 386824c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 386924c233c2SKris Buschelman 387024c233c2SKris Buschelman while (nz--) { 387124c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 387224c233c2SKris Buschelman idx = 4*(*vi++); 387324c233c2SKris Buschelman 387424c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 387524c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 387624c233c2SKris Buschelman 387724c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 387824c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 387924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 388024c233c2SKris Buschelman 388124c233c2SKris Buschelman /* First Column */ 388224c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 388324c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 388424c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 388524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 388624c233c2SKris Buschelman 388724c233c2SKris Buschelman /* Second Column */ 388824c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 388924c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 389024c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 389124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 389224c233c2SKris Buschelman 389324c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 389424c233c2SKris Buschelman 389524c233c2SKris Buschelman /* Third Column */ 389624c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 389724c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 389824c233c2SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 389924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 390024c233c2SKris Buschelman 390124c233c2SKris Buschelman /* Fourth Column */ 390224c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 390324c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 390424c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 390524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 390624c233c2SKris Buschelman SSE_INLINE_END_2 390724c233c2SKris Buschelman 390824c233c2SKris Buschelman v += 16; 390924c233c2SKris Buschelman } 391024c233c2SKris Buschelman idx = 4*i; 391124c233c2SKris Buschelman v = aa + 16*ai[++i]; 391224c233c2SKris Buschelman PREFETCH_NTA(v); 391324c233c2SKris Buschelman STORE_PS(tmps,XMM7); 391424c233c2SKris Buschelman 391524c233c2SKris Buschelman /* Promote result from float to double */ 391624c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 391724c233c2SKris Buschelman } 391824c233c2SKris Buschelman /* backward solve the upper triangular */ 391924c233c2SKris Buschelman idt = 4*(n-1); 392024c233c2SKris Buschelman ai16 = 16*diag[n-1]; 392124c233c2SKris Buschelman v = aa + ai16 + 16; 392224c233c2SKris Buschelman for (i=n-1; i>=0;){ 392324c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 392424c233c2SKris Buschelman vi = aj + diag[i] + 1; 392524c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 392624c233c2SKris Buschelman 392724c233c2SKris Buschelman /* Demote accumulator from double to float */ 392824c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 392924c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 393024c233c2SKris Buschelman 393124c233c2SKris Buschelman while (nz--) { 393224c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 393324c233c2SKris Buschelman idx = 4*(*vi++); 393424c233c2SKris Buschelman 393524c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 393624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 393724c233c2SKris Buschelman 393824c233c2SKris Buschelman /* 4x4 Matrix-Vector Product 
with negative accumulation: */ 393924c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 394024c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 394124c233c2SKris Buschelman 394224c233c2SKris Buschelman /* First Column */ 394324c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 394424c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 394524c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 394624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 394724c233c2SKris Buschelman 394824c233c2SKris Buschelman /* Second Column */ 394924c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 395024c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 395124c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 395224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 395324c233c2SKris Buschelman 395424c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 395524c233c2SKris Buschelman 395624c233c2SKris Buschelman /* Third Column */ 395724c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 395824c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 395924c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 396024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 396124c233c2SKris Buschelman 396224c233c2SKris Buschelman /* Fourth Column */ 396324c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 396424c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 396524c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 396624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 396724c233c2SKris Buschelman SSE_INLINE_END_2 396824c233c2SKris Buschelman v += 16; 396924c233c2SKris Buschelman } 397024c233c2SKris Buschelman v = aa + ai16; 397124c233c2SKris Buschelman ai16 = 16*diag[--i]; 397224c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 397324c233c2SKris Buschelman /* 397424c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 397524c233c2SKris Buschelman which was inverted as part of the factorization 397624c233c2SKris Buschelman */ 397724c233c2SKris Buschelman 
SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 397824c233c2SKris Buschelman /* First Column */ 397924c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 398024c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 398124c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 398224c233c2SKris Buschelman 398324c233c2SKris Buschelman /* Second Column */ 398424c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 398524c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 398624c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 398724c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 398824c233c2SKris Buschelman 398924c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 399024c233c2SKris Buschelman 399124c233c2SKris Buschelman /* Third Column */ 399224c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 399324c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 399424c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 399524c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 399624c233c2SKris Buschelman 399724c233c2SKris Buschelman /* Fourth Column */ 399824c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 399924c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 400024c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 400124c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 400224c233c2SKris Buschelman 400324c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 400424c233c2SKris Buschelman SSE_INLINE_END_3 400524c233c2SKris Buschelman 400624c233c2SKris Buschelman /* Promote solution from float to double */ 400724c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 400824c233c2SKris Buschelman 400924c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 401024c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 401124c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! 
*/ 401224c233c2SKris Buschelman idc = 4*(*c--); 401324c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 401424c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 401524c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 401624c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 401724c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 401824c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 401924c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 402024c233c2SKris Buschelman SSE_INLINE_END_2 402124c233c2SKris Buschelman v = aa + ai16 + 16; 402224c233c2SKris Buschelman idt -= 4; 402324c233c2SKris Buschelman } 402424c233c2SKris Buschelman 402524c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 402624c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 40271ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 40281ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4029dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 403024c233c2SKris Buschelman SSE_SCOPE_END; 403124c233c2SKris Buschelman PetscFunctionReturn(0); 403224c233c2SKris Buschelman } 403324c233c2SKris Buschelman 403424c233c2SKris Buschelman #endif 40350ef38995SBarry Smith 40360ef38995SBarry Smith 40374e2b4712SSatish Balay /* 40384e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 40394e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 
40404e2b4712SSatish Balay */ 40414a2ae208SSatish Balay #undef __FUNCT__ 404206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 404306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 40444e2b4712SSatish Balay { 40454e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4046356650c2SBarry Smith PetscInt n=a->mbs; 4047356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 4048dfbe8321SBarry Smith PetscErrorCode ierr; 4049356650c2SBarry Smith const PetscInt *diag = a->diag; 4050d9fead3dSBarry Smith const MatScalar *aa=a->a; 4051d9fead3dSBarry Smith PetscScalar *x; 4052d9fead3dSBarry Smith const PetscScalar *b; 40534e2b4712SSatish Balay 40544e2b4712SSatish Balay PetscFunctionBegin; 4055d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 40561ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 40574e2b4712SSatish Balay 4058aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 40592853dc0eSBarry Smith { 406087828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 40612853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 40622853dc0eSBarry Smith } 4063aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 40642853dc0eSBarry Smith { 406587828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 40662853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 40672853dc0eSBarry Smith } 4068aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 40692853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 4070e1293385SBarry Smith #else 407130d4dcafSBarry Smith { 407287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4073d9fead3dSBarry Smith const MatScalar *v; 4074356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 4075356650c2SBarry Smith const PetscInt *vi; 4076e1293385SBarry Smith 40774e2b4712SSatish Balay /* forward solve 
the lower triangular */ 40784e2b4712SSatish Balay idx = 0; 4079e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 40804e2b4712SSatish Balay for (i=1; i<n; i++) { 40814e2b4712SSatish Balay v = aa + 16*ai[i]; 40824e2b4712SSatish Balay vi = aj + ai[i]; 40834e2b4712SSatish Balay nz = diag[i] - ai[i]; 4084e1293385SBarry Smith idx += 4; 4085f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 40864e2b4712SSatish Balay while (nz--) { 40874e2b4712SSatish Balay jdx = 4*(*vi++); 40884e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4089f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4090f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4091f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4092f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 40934e2b4712SSatish Balay v += 16; 40944e2b4712SSatish Balay } 4095f1af5d2fSBarry Smith x[idx] = s1; 4096f1af5d2fSBarry Smith x[1+idx] = s2; 4097f1af5d2fSBarry Smith x[2+idx] = s3; 4098f1af5d2fSBarry Smith x[3+idx] = s4; 40994e2b4712SSatish Balay } 41004e2b4712SSatish Balay /* backward solve the upper triangular */ 41014e555682SBarry Smith idt = 4*(n-1); 41024e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 41034e555682SBarry Smith ai16 = 16*diag[i]; 41044e555682SBarry Smith v = aa + ai16 + 16; 41054e2b4712SSatish Balay vi = aj + diag[i] + 1; 41064e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4107f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4108f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 41094e2b4712SSatish Balay while (nz--) { 41104e2b4712SSatish Balay idx = 4*(*vi++); 41114e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 4112f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4113f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4114f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4115f1af5d2fSBarry Smith 
s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 41164e2b4712SSatish Balay v += 16; 41174e2b4712SSatish Balay } 41184e555682SBarry Smith v = aa + ai16; 4119f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4120f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4121f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4122f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4123329f5518SBarry Smith idt -= 4; 41244e2b4712SSatish Balay } 412530d4dcafSBarry Smith } 4126e1293385SBarry Smith #endif 41274e2b4712SSatish Balay 4128d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41291ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4130dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 41314e2b4712SSatish Balay PetscFunctionReturn(0); 41324e2b4712SSatish Balay } 41334e2b4712SSatish Balay 4134b2b2dd24SShri Abhyankar #undef __FUNCT__ 41354dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 41364dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4137b2b2dd24SShri Abhyankar { 4138b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4139b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4140b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 4141b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4142b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4143b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4144b2b2dd24SShri Abhyankar PetscScalar *x; 4145b2b2dd24SShri Abhyankar const PetscScalar *b; 4146b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4147cee9d6f2SShri Abhyankar 4148b2b2dd24SShri Abhyankar PetscFunctionBegin; 4149b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4150b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
4151b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4152b2b2dd24SShri Abhyankar idx = 0; 4153b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4154b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4155b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4156b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4157b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4158b2b2dd24SShri Abhyankar idx = bs*i; 4159b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4160b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 4161b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4162b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4163b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4164b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4165b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4166b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4167b2b2dd24SShri Abhyankar 4168b2b2dd24SShri Abhyankar v += bs2; 4169b2b2dd24SShri Abhyankar } 4170b2b2dd24SShri Abhyankar 4171b2b2dd24SShri Abhyankar x[idx] = s1; 4172b2b2dd24SShri Abhyankar x[1+idx] = s2; 4173b2b2dd24SShri Abhyankar x[2+idx] = s3; 4174b2b2dd24SShri Abhyankar x[3+idx] = s4; 4175b2b2dd24SShri Abhyankar } 4176b2b2dd24SShri Abhyankar 4177b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4178b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4179b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4180b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4181b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4182b2b2dd24SShri Abhyankar idt = bs*i; 4183b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4184b2b2dd24SShri Abhyankar 4185b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4186b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4187b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4188b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + 
v[4]*x2 + v[8]*x3 + v[12]*x4; 4189b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4190b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4191b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4192b2b2dd24SShri Abhyankar 4193b2b2dd24SShri Abhyankar v += bs2; 4194b2b2dd24SShri Abhyankar } 4195b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4196b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4197b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4198b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4199b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4200b2b2dd24SShri Abhyankar 4201b2b2dd24SShri Abhyankar } 4202b2b2dd24SShri Abhyankar 4203b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4204b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4205b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4206b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4207b2b2dd24SShri Abhyankar } 4208cee9d6f2SShri Abhyankar 4209cee9d6f2SShri Abhyankar #undef __FUNCT__ 4210f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4211dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4212f26ec98cSKris Buschelman { 4213f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4214b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4215dfbe8321SBarry Smith PetscErrorCode ierr; 4216b3260449SShri Abhyankar const MatScalar *aa=a->a; 4217b3260449SShri Abhyankar const PetscScalar *b; 4218b3260449SShri Abhyankar PetscScalar *x; 4219f26ec98cSKris Buschelman 4220f26ec98cSKris Buschelman PetscFunctionBegin; 4221b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42221ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 4223f26ec98cSKris Buschelman 4224f26ec98cSKris Buschelman { 4225f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4226b3260449SShri Abhyankar const MatScalar *v; 4227b3260449SShri Abhyankar MatScalar *t=(MatScalar *)x; 4228b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i,ai16; 4229b3260449SShri Abhyankar const PetscInt *vi; 4230f26ec98cSKris Buschelman 4231f26ec98cSKris Buschelman /* forward solve the lower triangular */ 4232f26ec98cSKris Buschelman idx = 0; 4233f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 4234f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 4235f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 4236f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 4237f26ec98cSKris Buschelman for (i=1; i<n; i++) { 4238f26ec98cSKris Buschelman v = aa + 16*ai[i]; 4239f26ec98cSKris Buschelman vi = aj + ai[i]; 4240f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 4241f26ec98cSKris Buschelman idx += 4; 4242f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 4243f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 4244f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 4245f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 4246f26ec98cSKris Buschelman while (nz--) { 4247f26ec98cSKris Buschelman jdx = 4*(*vi++); 4248f26ec98cSKris Buschelman x1 = t[jdx]; 4249f26ec98cSKris Buschelman x2 = t[1+jdx]; 4250f26ec98cSKris Buschelman x3 = t[2+jdx]; 4251f26ec98cSKris Buschelman x4 = t[3+jdx]; 4252f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4253f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4254f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4255f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4256f26ec98cSKris Buschelman v += 16; 4257f26ec98cSKris Buschelman } 4258f26ec98cSKris Buschelman t[idx] = s1; 4259f26ec98cSKris Buschelman t[1+idx] = s2; 4260f26ec98cSKris Buschelman t[2+idx] = s3; 4261f26ec98cSKris Buschelman t[3+idx] = s4; 4262f26ec98cSKris 
Buschelman } 4263f26ec98cSKris Buschelman /* backward solve the upper triangular */ 4264f26ec98cSKris Buschelman idt = 4*(n-1); 4265f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 4266f26ec98cSKris Buschelman ai16 = 16*diag[i]; 4267f26ec98cSKris Buschelman v = aa + ai16 + 16; 4268f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 4269f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 4270f26ec98cSKris Buschelman s1 = t[idt]; 4271f26ec98cSKris Buschelman s2 = t[1+idt]; 4272f26ec98cSKris Buschelman s3 = t[2+idt]; 4273f26ec98cSKris Buschelman s4 = t[3+idt]; 4274f26ec98cSKris Buschelman while (nz--) { 4275f26ec98cSKris Buschelman idx = 4*(*vi++); 4276f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 4277f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 4278f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 4279f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 4280f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4281f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4282f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4283f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4284f26ec98cSKris Buschelman v += 16; 4285f26ec98cSKris Buschelman } 4286f26ec98cSKris Buschelman v = aa + ai16; 4287f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4288f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4289f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4290f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4291f26ec98cSKris Buschelman idt -= 4; 4292f26ec98cSKris Buschelman } 4293f26ec98cSKris Buschelman } 4294f26ec98cSKris Buschelman 4295b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42961ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4297dc0b31edSSatish Balay ierr = 
PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4298f26ec98cSKris Buschelman PetscFunctionReturn(0); 4299f26ec98cSKris Buschelman } 4300f26ec98cSKris Buschelman 43013660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 43023660e330SKris Buschelman 43033660e330SKris Buschelman #include PETSC_HAVE_SSE 43043660e330SKris Buschelman #undef __FUNCT__ 43057cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4306dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 43073660e330SKris Buschelman { 43083660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 43092aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 4310dfbe8321SBarry Smith PetscErrorCode ierr; 4311dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 43123660e330SKris Buschelman MatScalar *aa=a->a; 431387828ca2SBarry Smith PetscScalar *x,*b; 43143660e330SKris Buschelman 43153660e330SKris Buschelman PetscFunctionBegin; 43163660e330SKris Buschelman SSE_SCOPE_BEGIN; 43173660e330SKris Buschelman /* 43183660e330SKris Buschelman Note: This code currently uses demotion of double 43193660e330SKris Buschelman to float when performing the mixed-mode computation. 43203660e330SKris Buschelman This may not be numerically reasonable for all applications. 
43213660e330SKris Buschelman */ 43223660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 43233660e330SKris Buschelman 43241ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 43251ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 43263660e330SKris Buschelman { 4327eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 4328eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 43292aa5897fSKris Buschelman int nz,i,idt,ai16; 43302aa5897fSKris Buschelman unsigned int jdx,idx; 43312aa5897fSKris Buschelman unsigned short *vi; 4332eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 43333660e330SKris Buschelman 4334eb05f457SKris Buschelman /* First block is the identity. */ 43353660e330SKris Buschelman idx = 0; 4336eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 43372aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 43383660e330SKris Buschelman 43393660e330SKris Buschelman for (i=1; i<n;) { 43403660e330SKris Buschelman PREFETCH_NTA(&v[8]); 43413660e330SKris Buschelman vi = aj + ai[i]; 43423660e330SKris Buschelman nz = diag[i] - ai[i]; 43433660e330SKris Buschelman idx += 4; 43443660e330SKris Buschelman 4345eb05f457SKris Buschelman /* Demote RHS from double to float. 
*/ 4346eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4347eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 43483660e330SKris Buschelman 43493660e330SKris Buschelman while (nz--) { 43503660e330SKris Buschelman PREFETCH_NTA(&v[16]); 43512aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 43523660e330SKris Buschelman 43533660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 4354eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 43553660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 43563660e330SKris Buschelman 43573660e330SKris Buschelman /* First Column */ 43583660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 43593660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 43603660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 43613660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 43623660e330SKris Buschelman 43633660e330SKris Buschelman /* Second Column */ 43643660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 43653660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 43663660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 43673660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 43683660e330SKris Buschelman 43693660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 43703660e330SKris Buschelman 43713660e330SKris Buschelman /* Third Column */ 43723660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 43733660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 43743660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 43753660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 43763660e330SKris Buschelman 43773660e330SKris Buschelman /* Fourth Column */ 43783660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 43793660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 43803660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 43813660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 43823660e330SKris Buschelman SSE_INLINE_END_2 43833660e330SKris Buschelman 43843660e330SKris Buschelman v += 16; 43853660e330SKris 
Buschelman } 43863660e330SKris Buschelman v = aa + 16*ai[++i]; 43873660e330SKris Buschelman PREFETCH_NTA(v); 4388eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 43893660e330SKris Buschelman } 4390eb05f457SKris Buschelman 4391eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 4392eb05f457SKris Buschelman 43933660e330SKris Buschelman idt = 4*(n-1); 43943660e330SKris Buschelman ai16 = 16*diag[n-1]; 43953660e330SKris Buschelman v = aa + ai16 + 16; 43963660e330SKris Buschelman for (i=n-1; i>=0;){ 43973660e330SKris Buschelman PREFETCH_NTA(&v[8]); 43983660e330SKris Buschelman vi = aj + diag[i] + 1; 43993660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 44003660e330SKris Buschelman 4401eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 44023660e330SKris Buschelman 44033660e330SKris Buschelman while (nz--) { 44043660e330SKris Buschelman PREFETCH_NTA(&v[16]); 44052aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 44063660e330SKris Buschelman 44073660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 4408eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 44093660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 44103660e330SKris Buschelman 44113660e330SKris Buschelman /* First Column */ 44123660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 44133660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 44143660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 44153660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 44163660e330SKris Buschelman 44173660e330SKris Buschelman /* Second Column */ 44183660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 44193660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44203660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 44213660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 44223660e330SKris Buschelman 44233660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 44243660e330SKris Buschelman 44253660e330SKris Buschelman /* Third Column */ 44263660e330SKris Buschelman 
SSE_COPY_PS(XMM2,XMM6) 44273660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 44283660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 44293660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 44303660e330SKris Buschelman 44313660e330SKris Buschelman /* Fourth Column */ 44323660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 44333660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 44343660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 44353660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 44363660e330SKris Buschelman SSE_INLINE_END_2 44373660e330SKris Buschelman v += 16; 44383660e330SKris Buschelman } 44393660e330SKris Buschelman v = aa + ai16; 44403660e330SKris Buschelman ai16 = 16*diag[--i]; 44413660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 44423660e330SKris Buschelman /* 44433660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 44443660e330SKris Buschelman which was inverted as part of the factorization 44453660e330SKris Buschelman */ 4446eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 44473660e330SKris Buschelman /* First Column */ 44483660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 44493660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 44503660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 44513660e330SKris Buschelman 44523660e330SKris Buschelman /* Second Column */ 44533660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 44543660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44553660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 44563660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 44573660e330SKris Buschelman 44583660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 44593660e330SKris Buschelman 44603660e330SKris Buschelman /* Third Column */ 44613660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 44623660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 44633660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 44643660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 44653660e330SKris 
Buschelman 44663660e330SKris Buschelman /* Fourth Column */ 44673660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 44683660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 44693660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 44703660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 44713660e330SKris Buschelman 44723660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 44733660e330SKris Buschelman SSE_INLINE_END_3 44743660e330SKris Buschelman 44753660e330SKris Buschelman v = aa + ai16 + 16; 44763660e330SKris Buschelman idt -= 4; 44773660e330SKris Buschelman } 4478eb05f457SKris Buschelman 4479eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 4480eb05f457SKris Buschelman idt = 4*(n-1); 4481eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 4482eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4483eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4484eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 4485eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 4486eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 4487eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 4488eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 4489eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 449054693613SKris Buschelman idt -= 4; 44913660e330SKris Buschelman } 4492eb05f457SKris Buschelman 4493eb05f457SKris Buschelman } /* End of artificial scope. 
*/ 44941ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 44951ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4496dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 44973660e330SKris Buschelman SSE_SCOPE_END; 44983660e330SKris Buschelman PetscFunctionReturn(0); 44993660e330SKris Buschelman } 45003660e330SKris Buschelman 45017cf1b8d3SKris Buschelman #undef __FUNCT__ 45027cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4503dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 45047cf1b8d3SKris Buschelman { 45057cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 45067cf1b8d3SKris Buschelman int *aj=a->j; 4507dfbe8321SBarry Smith PetscErrorCode ierr; 4508dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 45097cf1b8d3SKris Buschelman MatScalar *aa=a->a; 45107cf1b8d3SKris Buschelman PetscScalar *x,*b; 45117cf1b8d3SKris Buschelman 45127cf1b8d3SKris Buschelman PetscFunctionBegin; 45137cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 45147cf1b8d3SKris Buschelman /* 45157cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 45167cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 45177cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
45187cf1b8d3SKris Buschelman */ 45197cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 45207cf1b8d3SKris Buschelman 45211ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 45221ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 45237cf1b8d3SKris Buschelman { 45247cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 45257cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 45267cf1b8d3SKris Buschelman int nz,i,idt,ai16; 45277cf1b8d3SKris Buschelman int jdx,idx; 45287cf1b8d3SKris Buschelman int *vi; 45297cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 45307cf1b8d3SKris Buschelman 45317cf1b8d3SKris Buschelman /* First block is the identity. */ 45327cf1b8d3SKris Buschelman idx = 0; 45337cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 45347cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 45357cf1b8d3SKris Buschelman 45367cf1b8d3SKris Buschelman for (i=1; i<n;) { 45377cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 45387cf1b8d3SKris Buschelman vi = aj + ai[i]; 45397cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 45407cf1b8d3SKris Buschelman idx += 4; 45417cf1b8d3SKris Buschelman 45427cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 45437cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 45447cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 45457cf1b8d3SKris Buschelman 45467cf1b8d3SKris Buschelman while (nz--) { 45477cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 45487cf1b8d3SKris Buschelman jdx = 4*(*vi++); 45497cf1b8d3SKris Buschelman /* jdx = *vi++; */ 45507cf1b8d3SKris Buschelman 45517cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 45527cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 45537cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 45547cf1b8d3SKris Buschelman 45557cf1b8d3SKris Buschelman /* First Column */ 45567cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 45577cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 45587cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 45597cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 45607cf1b8d3SKris Buschelman 45617cf1b8d3SKris Buschelman /* Second Column */ 45627cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 45637cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 45647cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 45657cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 45667cf1b8d3SKris Buschelman 45677cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 45687cf1b8d3SKris Buschelman 45697cf1b8d3SKris Buschelman /* Third Column */ 45707cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 45717cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 45727cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 45737cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 45747cf1b8d3SKris Buschelman 45757cf1b8d3SKris Buschelman /* Fourth Column */ 45767cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 45777cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45787cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 45797cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 45807cf1b8d3SKris Buschelman SSE_INLINE_END_2 45817cf1b8d3SKris Buschelman 45827cf1b8d3SKris 
Buschelman v += 16; 45837cf1b8d3SKris Buschelman } 45847cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 45857cf1b8d3SKris Buschelman PREFETCH_NTA(v); 45867cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 45877cf1b8d3SKris Buschelman } 45887cf1b8d3SKris Buschelman 45897cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 45907cf1b8d3SKris Buschelman 45917cf1b8d3SKris Buschelman idt = 4*(n-1); 45927cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 45937cf1b8d3SKris Buschelman v = aa + ai16 + 16; 45947cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 45957cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 45967cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 45977cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 45987cf1b8d3SKris Buschelman 45997cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 46007cf1b8d3SKris Buschelman 46017cf1b8d3SKris Buschelman while (nz--) { 46027cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 46037cf1b8d3SKris Buschelman idx = 4*(*vi++); 46047cf1b8d3SKris Buschelman /* idx = *vi++; */ 46057cf1b8d3SKris Buschelman 46067cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 46077cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 46087cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 46097cf1b8d3SKris Buschelman 46107cf1b8d3SKris Buschelman /* First Column */ 46117cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 46127cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 46137cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 46147cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 46157cf1b8d3SKris Buschelman 46167cf1b8d3SKris Buschelman /* Second Column */ 46177cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 46187cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 46197cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 46207cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 46217cf1b8d3SKris Buschelman 46227cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 46237cf1b8d3SKris Buschelman 
46247cf1b8d3SKris Buschelman /* Third Column */ 46257cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 46267cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 46277cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 46287cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 46297cf1b8d3SKris Buschelman 46307cf1b8d3SKris Buschelman /* Fourth Column */ 46317cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 46327cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 46337cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 46347cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 46357cf1b8d3SKris Buschelman SSE_INLINE_END_2 46367cf1b8d3SKris Buschelman v += 16; 46377cf1b8d3SKris Buschelman } 46387cf1b8d3SKris Buschelman v = aa + ai16; 46397cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 46407cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 46417cf1b8d3SKris Buschelman /* 46427cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 46437cf1b8d3SKris Buschelman which was inverted as part of the factorization 46447cf1b8d3SKris Buschelman */ 46457cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 46467cf1b8d3SKris Buschelman /* First Column */ 46477cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 46487cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 46497cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 46507cf1b8d3SKris Buschelman 46517cf1b8d3SKris Buschelman /* Second Column */ 46527cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 46537cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 46547cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 46557cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 46567cf1b8d3SKris Buschelman 46577cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 46587cf1b8d3SKris Buschelman 46597cf1b8d3SKris Buschelman /* Third Column */ 46607cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 46617cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 46627cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 46637cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 46647cf1b8d3SKris Buschelman 46657cf1b8d3SKris Buschelman /* Fourth Column */ 46667cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 46677cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 46687cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 46697cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 46707cf1b8d3SKris Buschelman 46717cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 46727cf1b8d3SKris Buschelman SSE_INLINE_END_3 46737cf1b8d3SKris Buschelman 46747cf1b8d3SKris Buschelman v = aa + ai16 + 16; 46757cf1b8d3SKris Buschelman idt -= 4; 46767cf1b8d3SKris Buschelman } 46777cf1b8d3SKris Buschelman 46787cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 46797cf1b8d3SKris Buschelman idt = 4*(n-1); 46807cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 46817cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 46827cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 46837cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 46847cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 46857cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 46867cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 46877cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 46887cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 46897cf1b8d3SKris Buschelman idt -= 4; 46907cf1b8d3SKris Buschelman } 46917cf1b8d3SKris Buschelman 46927cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 46931ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 46941ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4695dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 46967cf1b8d3SKris Buschelman SSE_SCOPE_END; 46977cf1b8d3SKris Buschelman PetscFunctionReturn(0); 46987cf1b8d3SKris Buschelman } 46997cf1b8d3SKris Buschelman 47003660e330SKris Buschelman #endif 47018f690400SShri Abhyankar 47024a2ae208SSatish Balay #undef __FUNCT__ 470306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 470406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 47054e2b4712SSatish Balay { 47064e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 47074e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 47086849ba73SBarry Smith PetscErrorCode ierr; 4709b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4710b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 47115d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4712d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4713d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4714d9fead3dSBarry Smith const PetscScalar *b; 47154e2b4712SSatish Balay 47164e2b4712SSatish Balay PetscFunctionBegin; 4717d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4719f1af5d2fSBarry Smith t = a->solve_work; 47204e2b4712SSatish Balay 47214e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 47224e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 47234e2b4712SSatish Balay 47244e2b4712SSatish Balay /* forward solve the lower triangular */ 47254e2b4712SSatish Balay idx = 3*(*r++); 4726f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 47274e2b4712SSatish Balay for (i=1; i<n; i++) { 47284e2b4712SSatish Balay v = aa + 9*ai[i]; 
47294e2b4712SSatish Balay vi = aj + ai[i]; 47304e2b4712SSatish Balay nz = diag[i] - ai[i]; 47314e2b4712SSatish Balay idx = 3*(*r++); 4732f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 47334e2b4712SSatish Balay while (nz--) { 47344e2b4712SSatish Balay idx = 3*(*vi++); 4735f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4736f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4737f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4738f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 47394e2b4712SSatish Balay v += 9; 47404e2b4712SSatish Balay } 47414e2b4712SSatish Balay idx = 3*i; 4742f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 47434e2b4712SSatish Balay } 47444e2b4712SSatish Balay /* backward solve the upper triangular */ 47454e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 47464e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 47474e2b4712SSatish Balay vi = aj + diag[i] + 1; 47484e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 47494e2b4712SSatish Balay idt = 3*i; 4750f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 47514e2b4712SSatish Balay while (nz--) { 47524e2b4712SSatish Balay idx = 3*(*vi++); 4753f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4754f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4755f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4756f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 47574e2b4712SSatish Balay v += 9; 47584e2b4712SSatish Balay } 47594e2b4712SSatish Balay idc = 3*(*c--); 47604e2b4712SSatish Balay v = aa + 9*diag[i]; 4761f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4762f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4763f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 47644e2b4712SSatish Balay } 47654e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 47664e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 
4767d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47681ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4769dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 47704e2b4712SSatish Balay PetscFunctionReturn(0); 47714e2b4712SSatish Balay } 47724e2b4712SSatish Balay 47730c4413a7SShri Abhyankar #undef __FUNCT__ 47744dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3" 47754dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 47760c4413a7SShri Abhyankar { 47770c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 47780c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 47790c4413a7SShri Abhyankar PetscErrorCode ierr; 4780b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4781b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 47820c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 47830c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 47840c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 47850c4413a7SShri Abhyankar const PetscScalar *b; 47860c4413a7SShri Abhyankar 47870c4413a7SShri Abhyankar PetscFunctionBegin; 47880c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47890c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 47900c4413a7SShri Abhyankar t = a->solve_work; 47910c4413a7SShri Abhyankar 47920c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 47930c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 47940c4413a7SShri Abhyankar 47950c4413a7SShri Abhyankar /* forward solve the lower triangular */ 47960c4413a7SShri Abhyankar idx = 3*r[0]; 47970c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 47980c4413a7SShri Abhyankar for (i=1; i<n; i++) { 47990c4413a7SShri Abhyankar v = aa + 9*ai[i]; 48000c4413a7SShri Abhyankar vi = aj + ai[i]; 48010c4413a7SShri Abhyankar nz = 
ai[i+1] - ai[i]; 48020c4413a7SShri Abhyankar idx = 3*r[i]; 48030c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 48040c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 48050c4413a7SShri Abhyankar idx = 3*vi[m]; 48060c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 48070c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 48080c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 48090c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 48100c4413a7SShri Abhyankar v += 9; 48110c4413a7SShri Abhyankar } 48120c4413a7SShri Abhyankar idx = 3*i; 48130c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 48140c4413a7SShri Abhyankar } 48150c4413a7SShri Abhyankar /* backward solve the upper triangular */ 48160c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 48170c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 48180c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 48190c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 48200c4413a7SShri Abhyankar idt = 3*i; 48210c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 48220c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 48230c4413a7SShri Abhyankar idx = 3*vi[m]; 48240c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 48250c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 48260c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 48270c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 48280c4413a7SShri Abhyankar v += 9; 48290c4413a7SShri Abhyankar } 48300c4413a7SShri Abhyankar idc = 3*c[i]; 48310c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 48320c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 48330c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 48340c4413a7SShri Abhyankar } 48350c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 48360c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 48370c4413a7SShri Abhyankar ierr 
= VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 48380c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 48390c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 48400c4413a7SShri Abhyankar PetscFunctionReturn(0); 48410c4413a7SShri Abhyankar } 48420c4413a7SShri Abhyankar 484315091d37SBarry Smith /* 484415091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 484515091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 484615091d37SBarry Smith */ 48474a2ae208SSatish Balay #undef __FUNCT__ 484806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 484906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 485015091d37SBarry Smith { 485115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 48520b68f018SBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4853dfbe8321SBarry Smith PetscErrorCode ierr; 48540b68f018SBarry Smith const PetscInt *diag = a->diag,*vi; 4855d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4856d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4857d9fead3dSBarry Smith const PetscScalar *b; 48580b68f018SBarry Smith PetscInt jdx,idt,idx,nz,i; 485915091d37SBarry Smith 486015091d37SBarry Smith PetscFunctionBegin; 4861d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 48621ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 486315091d37SBarry Smith 486415091d37SBarry Smith /* forward solve the lower triangular */ 486515091d37SBarry Smith idx = 0; 486615091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 486715091d37SBarry Smith for (i=1; i<n; i++) { 486815091d37SBarry Smith v = aa + 9*ai[i]; 486915091d37SBarry Smith vi = aj + ai[i]; 487015091d37SBarry Smith nz = diag[i] - ai[i]; 487115091d37SBarry Smith idx += 3; 4872f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 487315091d37SBarry Smith 
while (nz--) { 487415091d37SBarry Smith jdx = 3*(*vi++); 487515091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4876f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4877f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4878f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 487915091d37SBarry Smith v += 9; 488015091d37SBarry Smith } 4881f1af5d2fSBarry Smith x[idx] = s1; 4882f1af5d2fSBarry Smith x[1+idx] = s2; 4883f1af5d2fSBarry Smith x[2+idx] = s3; 488415091d37SBarry Smith } 488515091d37SBarry Smith /* backward solve the upper triangular */ 488615091d37SBarry Smith for (i=n-1; i>=0; i--){ 488715091d37SBarry Smith v = aa + 9*diag[i] + 9; 488815091d37SBarry Smith vi = aj + diag[i] + 1; 488915091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 489015091d37SBarry Smith idt = 3*i; 4891f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4892f1af5d2fSBarry Smith s3 = x[2+idt]; 489315091d37SBarry Smith while (nz--) { 489415091d37SBarry Smith idx = 3*(*vi++); 489515091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4896f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4897f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4898f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 489915091d37SBarry Smith v += 9; 490015091d37SBarry Smith } 490115091d37SBarry Smith v = aa + 9*diag[i]; 4902f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4903f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4904f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 490515091d37SBarry Smith } 490615091d37SBarry Smith 4907d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 49081ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4909dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 491015091d37SBarry Smith PetscFunctionReturn(0); 491115091d37SBarry Smith } 491215091d37SBarry Smith 4913cee9d6f2SShri Abhyankar #undef __FUNCT__ 49144dd39f65SShri 
Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 49154dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4916b2b2dd24SShri Abhyankar { 4917b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4918b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4919b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4920b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 4921b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4922b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4923b2b2dd24SShri Abhyankar PetscScalar *x; 4924b2b2dd24SShri Abhyankar const PetscScalar *b; 4925b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4926b2b2dd24SShri Abhyankar 4927b2b2dd24SShri Abhyankar PetscFunctionBegin; 4928b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4929b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4930b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4931b2b2dd24SShri Abhyankar idx = 0; 4932b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4933b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4934b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4935b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4936b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4937b2b2dd24SShri Abhyankar idx = bs*i; 4938b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4939b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4940b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4941b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4942b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4943b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4944b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4945b2b2dd24SShri Abhyankar 4946b2b2dd24SShri Abhyankar v += bs2; 4947b2b2dd24SShri Abhyankar } 4948b2b2dd24SShri Abhyankar 4949b2b2dd24SShri Abhyankar x[idx] = s1; 4950b2b2dd24SShri Abhyankar 
x[1+idx] = s2; 4951b2b2dd24SShri Abhyankar x[2+idx] = s3; 4952b2b2dd24SShri Abhyankar } 4953b2b2dd24SShri Abhyankar 4954b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4955b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4956b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4957b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4958b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4959b2b2dd24SShri Abhyankar idt = bs*i; 4960b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4961b2b2dd24SShri Abhyankar 4962b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4963b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4964b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4965b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4966b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4967b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4968b2b2dd24SShri Abhyankar 4969b2b2dd24SShri Abhyankar v += bs2; 4970b2b2dd24SShri Abhyankar } 4971b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4972b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4973b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4974b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4975b2b2dd24SShri Abhyankar 4976b2b2dd24SShri Abhyankar } 4977b2b2dd24SShri Abhyankar 4978b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4979b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4980b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4981b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4982b2b2dd24SShri Abhyankar } 4983b2b2dd24SShri Abhyankar 4984b2b2dd24SShri Abhyankar #undef __FUNCT__ 498506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 498606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 49874e2b4712SSatish Balay { 49884e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 
49894e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 49906849ba73SBarry Smith PetscErrorCode ierr; 4991b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4992b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 49935d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4994d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4995d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 4996d9fead3dSBarry Smith const PetscScalar *b; 49974e2b4712SSatish Balay 49984e2b4712SSatish Balay PetscFunctionBegin; 4999d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 50001ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5001f1af5d2fSBarry Smith t = a->solve_work; 50024e2b4712SSatish Balay 50034e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 50044e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 50054e2b4712SSatish Balay 50064e2b4712SSatish Balay /* forward solve the lower triangular */ 50074e2b4712SSatish Balay idx = 2*(*r++); 5008f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 50094e2b4712SSatish Balay for (i=1; i<n; i++) { 50104e2b4712SSatish Balay v = aa + 4*ai[i]; 50114e2b4712SSatish Balay vi = aj + ai[i]; 50124e2b4712SSatish Balay nz = diag[i] - ai[i]; 50134e2b4712SSatish Balay idx = 2*(*r++); 5014f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 50154e2b4712SSatish Balay while (nz--) { 50164e2b4712SSatish Balay idx = 2*(*vi++); 5017f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 5018f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5019f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 50204e2b4712SSatish Balay v += 4; 50214e2b4712SSatish Balay } 50224e2b4712SSatish Balay idx = 2*i; 5023f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 50244e2b4712SSatish Balay } 50254e2b4712SSatish Balay /* backward solve the upper triangular */ 50264e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 50274e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 
50284e2b4712SSatish Balay vi = aj + diag[i] + 1; 50294e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 50304e2b4712SSatish Balay idt = 2*i; 5031f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 50324e2b4712SSatish Balay while (nz--) { 50334e2b4712SSatish Balay idx = 2*(*vi++); 5034f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 5035f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5036f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 50374e2b4712SSatish Balay v += 4; 50384e2b4712SSatish Balay } 50394e2b4712SSatish Balay idc = 2*(*c--); 50404e2b4712SSatish Balay v = aa + 4*diag[i]; 5041f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5042f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 50434e2b4712SSatish Balay } 50444e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 50454e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5046d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 50471ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5048dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 50494e2b4712SSatish Balay PetscFunctionReturn(0); 50504e2b4712SSatish Balay } 50514e2b4712SSatish Balay 50520c4413a7SShri Abhyankar #undef __FUNCT__ 50534dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2" 50544dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 50550c4413a7SShri Abhyankar { 50560c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 50570c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 50580c4413a7SShri Abhyankar PetscErrorCode ierr; 5059b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5060b3260449SShri Abhyankar PetscInt i,nz,idx,jdx,idt,idc,m; 50610c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 50620c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 50630c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 50640c4413a7SShri Abhyankar 
const PetscScalar *b; 50650c4413a7SShri Abhyankar 50660c4413a7SShri Abhyankar PetscFunctionBegin; 50670c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 50680c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 50690c4413a7SShri Abhyankar t = a->solve_work; 50700c4413a7SShri Abhyankar 50710c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 50720c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 50730c4413a7SShri Abhyankar 50740c4413a7SShri Abhyankar /* forward solve the lower triangular */ 50750c4413a7SShri Abhyankar idx = 2*r[0]; 50760c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 50770c4413a7SShri Abhyankar for (i=1; i<n; i++) { 50780c4413a7SShri Abhyankar v = aa + 4*ai[i]; 50790c4413a7SShri Abhyankar vi = aj + ai[i]; 50800c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 50810c4413a7SShri Abhyankar idx = 2*r[i]; 50820c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 50830c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 50840c4413a7SShri Abhyankar jdx = 2*vi[m]; 50850c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 50860c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 50870c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 50880c4413a7SShri Abhyankar v += 4; 50890c4413a7SShri Abhyankar } 50900c4413a7SShri Abhyankar idx = 2*i; 50910c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 50920c4413a7SShri Abhyankar } 50930c4413a7SShri Abhyankar /* backward solve the upper triangular */ 50940c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 50950c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 50960c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 50970c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 50980c4413a7SShri Abhyankar idt = 2*i; 50990c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 51000c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 51010c4413a7SShri Abhyankar idx = 2*vi[m]; 51020c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 51030c4413a7SShri Abhyankar s1 -= 
v[0]*x1 + v[2]*x2; 51040c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 51050c4413a7SShri Abhyankar v += 4; 51060c4413a7SShri Abhyankar } 51070c4413a7SShri Abhyankar idc = 2*c[i]; 51080c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 51090c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 51100c4413a7SShri Abhyankar } 51110c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 51120c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 51130c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 51140c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51150c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 51160c4413a7SShri Abhyankar PetscFunctionReturn(0); 51170c4413a7SShri Abhyankar } 51188f690400SShri Abhyankar 511915091d37SBarry Smith /* 512015091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 512115091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
512215091d37SBarry Smith */ 51234a2ae208SSatish Balay #undef __FUNCT__ 512406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 512506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 512615091d37SBarry Smith { 512715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5128b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5129dfbe8321SBarry Smith PetscErrorCode ierr; 5130d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 5131d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 5132d9fead3dSBarry Smith const PetscScalar *b; 5133b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i; 513415091d37SBarry Smith 513515091d37SBarry Smith PetscFunctionBegin; 5136d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 51371ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 513815091d37SBarry Smith 513915091d37SBarry Smith /* forward solve the lower triangular */ 514015091d37SBarry Smith idx = 0; 514115091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 514215091d37SBarry Smith for (i=1; i<n; i++) { 514315091d37SBarry Smith v = aa + 4*ai[i]; 514415091d37SBarry Smith vi = aj + ai[i]; 514515091d37SBarry Smith nz = diag[i] - ai[i]; 514615091d37SBarry Smith idx += 2; 5147f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 514815091d37SBarry Smith while (nz--) { 514915091d37SBarry Smith jdx = 2*(*vi++); 515015091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 5151f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5152f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 515315091d37SBarry Smith v += 4; 515415091d37SBarry Smith } 5155f1af5d2fSBarry Smith x[idx] = s1; 5156f1af5d2fSBarry Smith x[1+idx] = s2; 515715091d37SBarry Smith } 515815091d37SBarry Smith /* backward solve the upper triangular */ 515915091d37SBarry Smith for (i=n-1; i>=0; i--){ 516015091d37SBarry Smith v = aa + 4*diag[i] + 4; 516115091d37SBarry Smith vi = aj + diag[i] + 1; 516215091d37SBarry Smith nz = 
ai[i+1] - diag[i] - 1; 516315091d37SBarry Smith idt = 2*i; 5164f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 516515091d37SBarry Smith while (nz--) { 516615091d37SBarry Smith idx = 2*(*vi++); 516715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 5168f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5169f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 517015091d37SBarry Smith v += 4; 517115091d37SBarry Smith } 517215091d37SBarry Smith v = aa + 4*diag[i]; 5173f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 5174f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 517515091d37SBarry Smith } 517615091d37SBarry Smith 5177d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 51781ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5179dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 518015091d37SBarry Smith PetscFunctionReturn(0); 518115091d37SBarry Smith } 518215091d37SBarry Smith 5183cee9d6f2SShri Abhyankar #undef __FUNCT__ 51844dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 51854dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5186b2b2dd24SShri Abhyankar { 5187b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5188b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5189b3260449SShri Abhyankar PetscInt i,k,nz,idx,idt,jdx; 5190b2b2dd24SShri Abhyankar PetscErrorCode ierr; 5191b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 5192b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 5193b2b2dd24SShri Abhyankar const PetscScalar *b; 5194b2b2dd24SShri Abhyankar 5195b2b2dd24SShri Abhyankar PetscFunctionBegin; 5196b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5197b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5198b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 5199b2b2dd24SShri Abhyankar idx = 0; 5200b2b2dd24SShri 
Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 5201b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 5202b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 5203b2b2dd24SShri Abhyankar vi = aj + ai[i]; 5204b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 5205b2b2dd24SShri Abhyankar idx = 2*i; 5206b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 5207b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5208b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 5209b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 5210b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5211b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5212b2b2dd24SShri Abhyankar v += 4; 5213b2b2dd24SShri Abhyankar } 5214b2b2dd24SShri Abhyankar x[idx] = s1; 5215b2b2dd24SShri Abhyankar x[1+idx] = s2; 5216b2b2dd24SShri Abhyankar } 5217b2b2dd24SShri Abhyankar 5218b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 5219b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 5220b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 5221b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 5222b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5223b2b2dd24SShri Abhyankar idt = 2*i; 5224b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 5225b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5226b2b2dd24SShri Abhyankar idx = 2*vi[k]; 5227b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 5228b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5229b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5230b2b2dd24SShri Abhyankar v += 4; 5231b2b2dd24SShri Abhyankar } 5232b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 5233b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 5234b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 5235b2b2dd24SShri Abhyankar } 5236b2b2dd24SShri Abhyankar 5237b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5238b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5239b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5240b2b2dd24SShri Abhyankar 
PetscFunctionReturn(0); 5241b2b2dd24SShri Abhyankar } 5242b2b2dd24SShri Abhyankar 5243b2b2dd24SShri Abhyankar #undef __FUNCT__ 524406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 524506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 52464e2b4712SSatish Balay { 52474e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 52484e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 52496849ba73SBarry Smith PetscErrorCode ierr; 5250b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5251b3260449SShri Abhyankar PetscInt i,nz; 52525d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5253b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 5254b3260449SShri Abhyankar PetscScalar *x,s1,*t; 5255b3260449SShri Abhyankar const PetscScalar *b; 52564e2b4712SSatish Balay 52574e2b4712SSatish Balay PetscFunctionBegin; 52584e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 52594e2b4712SSatish Balay 5260b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 52611ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5262f1af5d2fSBarry Smith t = a->solve_work; 52634e2b4712SSatish Balay 52644e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 52654e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 52664e2b4712SSatish Balay 52674e2b4712SSatish Balay /* forward solve the lower triangular */ 5268f1af5d2fSBarry Smith t[0] = b[*r++]; 52694e2b4712SSatish Balay for (i=1; i<n; i++) { 52704e2b4712SSatish Balay v = aa + ai[i]; 52714e2b4712SSatish Balay vi = aj + ai[i]; 52724e2b4712SSatish Balay nz = diag[i] - ai[i]; 5273f1af5d2fSBarry Smith s1 = b[*r++]; 52744e2b4712SSatish Balay while (nz--) { 5275f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 52764e2b4712SSatish Balay } 5277f1af5d2fSBarry Smith t[i] = s1; 52784e2b4712SSatish Balay } 52794e2b4712SSatish Balay /* backward solve the upper triangular */ 
52804e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 52814e2b4712SSatish Balay v = aa + diag[i] + 1; 52824e2b4712SSatish Balay vi = aj + diag[i] + 1; 52834e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 5284f1af5d2fSBarry Smith s1 = t[i]; 52854e2b4712SSatish Balay while (nz--) { 5286f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 52874e2b4712SSatish Balay } 5288f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 52894e2b4712SSatish Balay } 52904e2b4712SSatish Balay 52914e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 52924e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5293b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 52941ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5295dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 52964e2b4712SSatish Balay PetscFunctionReturn(0); 52974e2b4712SSatish Balay } 5298048b5e81SShri Abhyankar 5299048b5e81SShri Abhyankar #undef __FUNCT__ 5300048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1" 5301048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 5302048b5e81SShri Abhyankar { 5303048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5304048b5e81SShri Abhyankar IS iscol = a->col,isrow = a->row; 5305048b5e81SShri Abhyankar PetscErrorCode ierr; 5306048b5e81SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 5307048b5e81SShri Abhyankar const PetscInt *rout,*cout,*r,*c; 5308048b5e81SShri Abhyankar PetscScalar *x,*tmp,sum; 5309048b5e81SShri Abhyankar const PetscScalar *b; 5310048b5e81SShri Abhyankar const MatScalar *aa = a->a,*v; 5311048b5e81SShri Abhyankar 5312048b5e81SShri Abhyankar PetscFunctionBegin; 5313048b5e81SShri Abhyankar if (!n) PetscFunctionReturn(0); 5314048b5e81SShri Abhyankar 5315048b5e81SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5316048b5e81SShri Abhyankar ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 5317048b5e81SShri Abhyankar tmp = a->solve_work; 5318048b5e81SShri Abhyankar 5319048b5e81SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5320048b5e81SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5321048b5e81SShri Abhyankar 5322048b5e81SShri Abhyankar /* forward solve the lower triangular */ 5323048b5e81SShri Abhyankar tmp[0] = b[r[0]]; 5324048b5e81SShri Abhyankar v = aa; 5325048b5e81SShri Abhyankar vi = aj; 5326048b5e81SShri Abhyankar for (i=1; i<n; i++) { 5327048b5e81SShri Abhyankar nz = ai[i+1] - ai[i]; 5328048b5e81SShri Abhyankar sum = b[r[i]]; 5329048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5330048b5e81SShri Abhyankar tmp[i] = sum; 5331048b5e81SShri Abhyankar v += nz; vi += nz; 5332048b5e81SShri Abhyankar } 5333048b5e81SShri Abhyankar 5334048b5e81SShri Abhyankar /* backward solve the upper triangular */ 5335048b5e81SShri Abhyankar for (i=n-1; i>=0; i--){ 5336048b5e81SShri Abhyankar v = aa + adiag[i+1]+1; 5337048b5e81SShri Abhyankar vi = aj + adiag[i+1]+1; 5338048b5e81SShri Abhyankar nz = adiag[i]-adiag[i+1]-1; 5339048b5e81SShri Abhyankar sum = tmp[i]; 5340048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5341048b5e81SShri Abhyankar x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 5342048b5e81SShri Abhyankar } 5343048b5e81SShri Abhyankar 5344048b5e81SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5345048b5e81SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5346048b5e81SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5347048b5e81SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5348048b5e81SShri Abhyankar ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 5349048b5e81SShri Abhyankar PetscFunctionReturn(0); 5350048b5e81SShri Abhyankar } 5351048b5e81SShri Abhyankar 535215091d37SBarry Smith /* 535315091d37SBarry Smith Special case where the matrix 
was ILU(0) factored in the natural 535415091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 535515091d37SBarry Smith */ 53564a2ae208SSatish Balay #undef __FUNCT__ 535706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 535806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 535915091d37SBarry Smith { 536015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5361b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5362dfbe8321SBarry Smith PetscErrorCode ierr; 5363b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 5364b3260449SShri Abhyankar PetscScalar *x; 5365b3260449SShri Abhyankar const PetscScalar *b; 536687828ca2SBarry Smith PetscScalar s1,x1; 5367b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i; 536815091d37SBarry Smith 536915091d37SBarry Smith PetscFunctionBegin; 5370b3260449SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 53711ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 537215091d37SBarry Smith 537315091d37SBarry Smith /* forward solve the lower triangular */ 537415091d37SBarry Smith idx = 0; 537515091d37SBarry Smith x[0] = b[0]; 537615091d37SBarry Smith for (i=1; i<n; i++) { 537715091d37SBarry Smith v = aa + ai[i]; 537815091d37SBarry Smith vi = aj + ai[i]; 537915091d37SBarry Smith nz = diag[i] - ai[i]; 538015091d37SBarry Smith idx += 1; 5381f1af5d2fSBarry Smith s1 = b[idx]; 538215091d37SBarry Smith while (nz--) { 538315091d37SBarry Smith jdx = *vi++; 538415091d37SBarry Smith x1 = x[jdx]; 5385f1af5d2fSBarry Smith s1 -= v[0]*x1; 538615091d37SBarry Smith v += 1; 538715091d37SBarry Smith } 5388f1af5d2fSBarry Smith x[idx] = s1; 538915091d37SBarry Smith } 539015091d37SBarry Smith /* backward solve the upper triangular */ 539115091d37SBarry Smith for (i=n-1; i>=0; i--){ 539215091d37SBarry Smith v = aa + diag[i] + 1; 539315091d37SBarry Smith vi = aj + diag[i] + 1; 
539415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 539515091d37SBarry Smith idt = i; 5396f1af5d2fSBarry Smith s1 = x[idt]; 539715091d37SBarry Smith while (nz--) { 539815091d37SBarry Smith idx = *vi++; 539915091d37SBarry Smith x1 = x[idx]; 5400f1af5d2fSBarry Smith s1 -= v[0]*x1; 540115091d37SBarry Smith v += 1; 540215091d37SBarry Smith } 540315091d37SBarry Smith v = aa + diag[i]; 5404f1af5d2fSBarry Smith x[idt] = v[0]*s1; 540515091d37SBarry Smith } 5406b3260449SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 54071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5408dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 540915091d37SBarry Smith PetscFunctionReturn(0); 541015091d37SBarry Smith } 54114e2b4712SSatish Balay 5412048b5e81SShri Abhyankar 5413048b5e81SShri Abhyankar #undef __FUNCT__ 5414048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5415048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5416048b5e81SShri Abhyankar { 5417048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5418048b5e81SShri Abhyankar PetscErrorCode ierr; 5419048b5e81SShri Abhyankar const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi; 5420048b5e81SShri Abhyankar PetscScalar *x,sum; 5421048b5e81SShri Abhyankar const PetscScalar *b; 5422048b5e81SShri Abhyankar const MatScalar *aa = a->a,*v; 5423048b5e81SShri Abhyankar PetscInt i,nz; 5424048b5e81SShri Abhyankar 5425048b5e81SShri Abhyankar PetscFunctionBegin; 5426048b5e81SShri Abhyankar if (!n) PetscFunctionReturn(0); 5427048b5e81SShri Abhyankar 5428048b5e81SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5429048b5e81SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5430048b5e81SShri Abhyankar 5431048b5e81SShri Abhyankar /* forward solve the lower triangular */ 5432048b5e81SShri Abhyankar x[0] = b[0]; 5433048b5e81SShri Abhyankar v = aa; 
5434048b5e81SShri Abhyankar vi = aj; 5435048b5e81SShri Abhyankar for (i=1; i<n; i++) { 5436048b5e81SShri Abhyankar nz = ai[i+1] - ai[i]; 5437048b5e81SShri Abhyankar sum = b[i]; 5438048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5439048b5e81SShri Abhyankar v += nz; 5440048b5e81SShri Abhyankar vi += nz; 5441048b5e81SShri Abhyankar x[i] = sum; 5442048b5e81SShri Abhyankar } 5443048b5e81SShri Abhyankar 5444048b5e81SShri Abhyankar /* backward solve the upper triangular */ 5445048b5e81SShri Abhyankar for (i=n-1; i>=0; i--){ 5446048b5e81SShri Abhyankar v = aa + adiag[i+1] + 1; 5447048b5e81SShri Abhyankar vi = aj + adiag[i+1] + 1; 5448048b5e81SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5449048b5e81SShri Abhyankar sum = x[i]; 5450048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5451048b5e81SShri Abhyankar x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */ 5452048b5e81SShri Abhyankar } 5453048b5e81SShri Abhyankar 5454048b5e81SShri Abhyankar ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 5455048b5e81SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5456048b5e81SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5457048b5e81SShri Abhyankar PetscFunctionReturn(0); 5458048b5e81SShri Abhyankar } 5459048b5e81SShri Abhyankar 54604e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 546116a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 54626bce7ff8SHong Zhang 54632b0b2ea7SShri Abhyankar #undef __FUNCT__ 546429a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5465766f9fbaSBarry Smith /* 5466766f9fbaSBarry Smith This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5467766f9fbaSBarry Smith */ 546829a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 
54692b0b2ea7SShri Abhyankar { 54702b0b2ea7SShri Abhyankar Mat C=B; 54712b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 54722b0b2ea7SShri Abhyankar PetscErrorCode ierr; 5473766f9fbaSBarry Smith PetscInt i,j,k,ipvt[15]; 5474766f9fbaSBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5475766f9fbaSBarry Smith PetscInt nz,nzL,row; 5476766f9fbaSBarry Smith MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5477766f9fbaSBarry Smith const MatScalar *v,*aa=a->a; 54782b0b2ea7SShri Abhyankar PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 54790fa040f9SShri Abhyankar PetscInt sol_ver; 54802b0b2ea7SShri Abhyankar 54812b0b2ea7SShri Abhyankar PetscFunctionBegin; 54822b0b2ea7SShri Abhyankar 54830fa040f9SShri Abhyankar ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr); 54840fa040f9SShri Abhyankar 54852b0b2ea7SShri Abhyankar /* generate work space needed by the factorization */ 54862b0b2ea7SShri Abhyankar ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 54872b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 54882b0b2ea7SShri Abhyankar 54892b0b2ea7SShri Abhyankar for (i=0; i<n; i++){ 54902b0b2ea7SShri Abhyankar /* zero rtmp */ 54912b0b2ea7SShri Abhyankar /* L part */ 54922b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 54932b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 54942b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 54952b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 54962b0b2ea7SShri Abhyankar } 54972b0b2ea7SShri Abhyankar 54982b0b2ea7SShri Abhyankar /* U part */ 54992b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 55002b0b2ea7SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 55012b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 55022b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 55032b0b2ea7SShri Abhyankar } 
55042b0b2ea7SShri Abhyankar 55052b0b2ea7SShri Abhyankar /* load in initial (unfactored row) */ 550629a97285SShri Abhyankar nz = ai[i+1] - ai[i]; 550729a97285SShri Abhyankar ajtmp = aj + ai[i]; 550829a97285SShri Abhyankar v = aa + bs2*ai[i]; 55092b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 551029a97285SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 55112b0b2ea7SShri Abhyankar } 55122b0b2ea7SShri Abhyankar 55132b0b2ea7SShri Abhyankar /* elimination */ 55142b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 55152b0b2ea7SShri Abhyankar nzL = bi[i+1] - bi[i]; 55162b0b2ea7SShri Abhyankar for(k=0;k < nzL;k++) { 55172b0b2ea7SShri Abhyankar row = bjtmp[k]; 55182b0b2ea7SShri Abhyankar pc = rtmp + bs2*row; 55192b0b2ea7SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 55202b0b2ea7SShri Abhyankar if (flg) { 55212b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[row]; 5522766f9fbaSBarry Smith Kernel_A_gets_A_times_B(bs,pc,pv,mwork); 5523766f9fbaSBarry Smith /*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 55242b0b2ea7SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 55252b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 55262b0b2ea7SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 55272b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 5528766f9fbaSBarry Smith vv = rtmp + bs2*pj[j]; 5529766f9fbaSBarry Smith Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 5530766f9fbaSBarry Smith /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 55312b0b2ea7SShri Abhyankar pv += bs2; 55322b0b2ea7SShri Abhyankar } 5533766f9fbaSBarry Smith ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 55342b0b2ea7SShri Abhyankar } 55352b0b2ea7SShri Abhyankar } 55362b0b2ea7SShri Abhyankar 55372b0b2ea7SShri Abhyankar /* finished row so stick it into b->a */ 55382b0b2ea7SShri Abhyankar 
/* L part */ 55392b0b2ea7SShri Abhyankar pv = b->a + bs2*bi[i] ; 55402b0b2ea7SShri Abhyankar pj = b->j + bi[i] ; 55412b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 55422b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 55432b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 55442b0b2ea7SShri Abhyankar } 55452b0b2ea7SShri Abhyankar 55462b0b2ea7SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 55472b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[i]; 55482b0b2ea7SShri Abhyankar pj = b->j + bdiag[i]; 55492b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5550766f9fbaSBarry Smith /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */ 5551182b8fbaSHong Zhang ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr); 55522b0b2ea7SShri Abhyankar 55532b0b2ea7SShri Abhyankar /* U part */ 55542b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 55552b0b2ea7SShri Abhyankar pj = b->j + bdiag[i+1]+1; 55562b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 55572b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 55582b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 55592b0b2ea7SShri Abhyankar } 55602b0b2ea7SShri Abhyankar } 55612b0b2ea7SShri Abhyankar 55622b0b2ea7SShri Abhyankar ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5563832cc040SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 5564766f9fbaSBarry Smith C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 55652b0b2ea7SShri Abhyankar C->assembled = PETSC_TRUE; 5566766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 55672b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 55682b0b2ea7SShri Abhyankar } 55692b0b2ea7SShri Abhyankar 55706bce7ff8SHong Zhang #undef __FUNCT__ 55714dd39f65SShri Abhyankar #define __FUNCT__ 
"MatLUFactorNumeric_SeqBAIJ_N" 55724dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 55736bce7ff8SHong Zhang { 55746bce7ff8SHong Zhang Mat C=B; 55756bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 55766bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 55776bce7ff8SHong Zhang PetscErrorCode ierr; 55786bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 55796bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 55806bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5581b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5582914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5583914a18a2SHong Zhang MatScalar *v_work; 5584ae3d28f0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 55856bce7ff8SHong Zhang 55866bce7ff8SHong Zhang PetscFunctionBegin; 55876bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 55886bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5589ae3d28f0SHong Zhang 5590fca92195SBarry Smith ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5591fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 55926bce7ff8SHong Zhang ics = ic; 55936bce7ff8SHong Zhang 5594914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 5595fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5596914a18a2SHong Zhang 55976bce7ff8SHong Zhang for (i=0; i<n; i++){ 55986bce7ff8SHong Zhang /* zero rtmp */ 55996bce7ff8SHong Zhang /* L part */ 56006bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 56016bce7ff8SHong Zhang bjtmp = bj + bi[i]; 5602914a18a2SHong Zhang for (j=0; j<nz; j++){ 5603914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5604914a18a2SHong Zhang } 56056bce7ff8SHong Zhang 56066bce7ff8SHong 
Zhang /* U part */ 56071a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 56081a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 56091a83e813SShri Abhyankar for (j=0; j<nz; j++){ 56101a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56111a83e813SShri Abhyankar } 56121a83e813SShri Abhyankar 56131a83e813SShri Abhyankar /* load in initial (unfactored row) */ 56141a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 56151a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 56161a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 56171a83e813SShri Abhyankar for (j=0; j<nz; j++) { 56181a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 56191a83e813SShri Abhyankar } 56201a83e813SShri Abhyankar 56211a83e813SShri Abhyankar /* elimination */ 56221a83e813SShri Abhyankar bjtmp = bj + bi[i]; 56231a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 56241a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 56251a83e813SShri Abhyankar row = bjtmp[k]; 56261a83e813SShri Abhyankar pc = rtmp + bs2*row; 56271a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 56281a83e813SShri Abhyankar if (flg) { 56291a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 56301a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 56311a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 56321a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 56331a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 56341a83e813SShri Abhyankar for (j=0; j<nz; j++) { 56351a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 56361a83e813SShri Abhyankar } 56371a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 56381a83e813SShri Abhyankar } 56391a83e813SShri Abhyankar } 56401a83e813SShri 
Abhyankar 56411a83e813SShri Abhyankar /* finished row so stick it into b->a */ 56421a83e813SShri Abhyankar /* L part */ 56431a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 56441a83e813SShri Abhyankar pj = b->j + bi[i] ; 56451a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 56461a83e813SShri Abhyankar for (j=0; j<nz; j++) { 56471a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56481a83e813SShri Abhyankar } 56491a83e813SShri Abhyankar 56501a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 56511a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 56521a83e813SShri Abhyankar pj = b->j + bdiag[i]; 5653e32f2f54SBarry Smith /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 56541a83e813SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56551a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 56561a83e813SShri Abhyankar 56571a83e813SShri Abhyankar /* U part */ 56581a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 56591a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 56601a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 56611a83e813SShri Abhyankar for (j=0; j<nz; j++){ 56621a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56631a83e813SShri Abhyankar } 56641a83e813SShri Abhyankar } 56651a83e813SShri Abhyankar 56661a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 5667fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 56681a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 56691a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 56701a83e813SShri Abhyankar 5671ae3d28f0SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5672ae3d28f0SHong Zhang ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5673ae3d28f0SHong 
Zhang both_identity = (PetscTruth) (row_identity && col_identity); 5674ae3d28f0SHong Zhang if (both_identity){ 56754dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5676ae3d28f0SHong Zhang } else { 56774dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N; 5678ae3d28f0SHong Zhang } 56794dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5680ae3d28f0SHong Zhang 56811a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 5682766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 56831a83e813SShri Abhyankar PetscFunctionReturn(0); 56841a83e813SShri Abhyankar } 56851a83e813SShri Abhyankar 56866bce7ff8SHong Zhang /* 56876bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 56884dd39f65SShri Abhyankar See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 56894dd39f65SShri Abhyankar because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 
56906bce7ff8SHong Zhang */ 5691c0c7eb62SShri Abhyankar 56926bce7ff8SHong Zhang #undef __FUNCT__ 56934dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 56944dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 56956bce7ff8SHong Zhang { 56966bce7ff8SHong Zhang 56976bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 56986bce7ff8SHong Zhang PetscErrorCode ierr; 569916a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 570035aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 570135aa4fcfSShri Abhyankar 570235aa4fcfSShri Abhyankar PetscFunctionBegin; 570335aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 570435aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 570535aa4fcfSShri Abhyankar 570635aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 570735aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 570835aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 570935aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 571035aa4fcfSShri Abhyankar if (!b->diag){ 571135aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 571235aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 571335aa4fcfSShri Abhyankar } 571435aa4fcfSShri Abhyankar bdiag = b->diag; 571535aa4fcfSShri Abhyankar 571635aa4fcfSShri Abhyankar if (n > 0) { 571735aa4fcfSShri Abhyankar ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 571835aa4fcfSShri Abhyankar } 571935aa4fcfSShri Abhyankar 572035aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 572135aa4fcfSShri Abhyankar bi = b->i; 
572235aa4fcfSShri Abhyankar bj = b->j; 572335aa4fcfSShri Abhyankar 572435aa4fcfSShri Abhyankar /* L part */ 572535aa4fcfSShri Abhyankar bi[0] = 0; 572635aa4fcfSShri Abhyankar for (i=0; i<n; i++){ 572735aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 572835aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 572935aa4fcfSShri Abhyankar aj = a->j + ai[i]; 573035aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 573135aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 573235aa4fcfSShri Abhyankar } 573335aa4fcfSShri Abhyankar } 573435aa4fcfSShri Abhyankar 573535aa4fcfSShri Abhyankar /* U part */ 573635aa4fcfSShri Abhyankar bi_temp = bi[n]; 573735aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 573835aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 573935aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 574035aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 574135aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 574235aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 574335aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 574435aa4fcfSShri Abhyankar } 574535aa4fcfSShri Abhyankar /* diag[i] */ 574635aa4fcfSShri Abhyankar *bj = i; bj++; 574735aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 574835aa4fcfSShri Abhyankar } 574935aa4fcfSShri Abhyankar PetscFunctionReturn(0); 575035aa4fcfSShri Abhyankar } 575135aa4fcfSShri Abhyankar 575235aa4fcfSShri Abhyankar #undef __FUNCT__ 57534dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 57544dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 575516a2bf60SHong Zhang { 575616a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 575716a2bf60SHong Zhang IS isicol; 575816a2bf60SHong Zhang PetscErrorCode ierr; 575916a2bf60SHong Zhang const PetscInt *r,*ic; 57607fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 576116a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 576216a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 576316a2bf60SHong Zhang PetscInt 
i,levels,diagonal_fill; 57647fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 576516a2bf60SHong Zhang PetscReal f; 576616a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 576716a2bf60SHong Zhang PetscBT lnkbt; 576816a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 576916a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 577016a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 577116a2bf60SHong Zhang PetscTruth missing; 57727fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 577316a2bf60SHong Zhang 577416a2bf60SHong Zhang PetscFunctionBegin; 5775e32f2f54SBarry Smith if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5776*6ba06ab7SHong Zhang if (bs>1){ /* check shifttype */ 5777*6ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) 5778*6ba06ab7SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 5779*6ba06ab7SHong Zhang } 5780*6ba06ab7SHong Zhang 578116a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5782e32f2f54SBarry Smith if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 578316a2bf60SHong Zhang 578416a2bf60SHong Zhang f = info->fill; 578516a2bf60SHong Zhang levels = (PetscInt)info->levels; 578616a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 578716a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 578816a2bf60SHong Zhang 578916a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 579016a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 57917fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 579216a2bf60SHong Zhang 
57937fa3a6a0SHong Zhang if (!levels && both_identity) { 579416a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 57954dd39f65SShri Abhyankar ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 57964dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 579735aa4fcfSShri Abhyankar 5798d5f3da31SBarry Smith fact->factortype = MAT_FACTOR_ILU; 579935aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 580035aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 580135aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 580235aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 580335aa4fcfSShri Abhyankar b->row = isrow; 580435aa4fcfSShri Abhyankar b->col = iscol; 580535aa4fcfSShri Abhyankar b->icol = isicol; 580635aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 580735aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 580835aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 580935aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 581035aa4fcfSShri Abhyankar PetscFunctionReturn(0); 581135aa4fcfSShri Abhyankar } 581235aa4fcfSShri Abhyankar 581335aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 581435aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 581535aa4fcfSShri Abhyankar 581635aa4fcfSShri Abhyankar /* get new row pointers */ 581735aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 581835aa4fcfSShri Abhyankar bi[0] = 0; 581935aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 582035aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 582135aa4fcfSShri Abhyankar bdiag[0] = 0; 582235aa4fcfSShri Abhyankar 5823fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 582435aa4fcfSShri Abhyankar 582535aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active row */ 582635aa4fcfSShri Abhyankar nlnk = n + 1; 582735aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 582835aa4fcfSShri Abhyankar 582935aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 583035aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 583135aa4fcfSShri Abhyankar current_space = free_space; 583235aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 583335aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 583435aa4fcfSShri Abhyankar 583535aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 583635aa4fcfSShri Abhyankar nzi = 0; 583735aa4fcfSShri Abhyankar /* copy current row into linked list */ 583835aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 5839e32f2f54SBarry Smith if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row 
in original ordering %D in permuted ordering %D",r[i],i); 584035aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 584135aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 584235aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 584335aa4fcfSShri Abhyankar nzi += nlnk; 584435aa4fcfSShri Abhyankar 584535aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 584635aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 584735aa4fcfSShri Abhyankar fm = n; 584835aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 584935aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 585035aa4fcfSShri Abhyankar lnk[fm] = i; 585135aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 585235aa4fcfSShri Abhyankar nzi++; dcount++; 585335aa4fcfSShri Abhyankar } 585435aa4fcfSShri Abhyankar 585535aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 585635aa4fcfSShri Abhyankar nzbd = 0; 585735aa4fcfSShri Abhyankar prow = lnk[n]; 585835aa4fcfSShri Abhyankar while (prow < i) { 585935aa4fcfSShri Abhyankar nnz = bdiag[prow]; 586035aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 586135aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 586235aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 586335aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 586435aa4fcfSShri Abhyankar nzi += nlnk; 586535aa4fcfSShri Abhyankar prow = lnk[prow]; 586635aa4fcfSShri Abhyankar nzbd++; 586735aa4fcfSShri Abhyankar } 586835aa4fcfSShri Abhyankar bdiag[i] = nzbd; 586935aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 587035aa4fcfSShri Abhyankar 587135aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 587235aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 587335aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 587435aa4fcfSShri 
Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 587535aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 587635aa4fcfSShri Abhyankar reallocs++; 587735aa4fcfSShri Abhyankar } 587835aa4fcfSShri Abhyankar 587935aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 588035aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 588135aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 588235aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 588335aa4fcfSShri Abhyankar 588435aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 588565e19b50SBarry Smith if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 588635aa4fcfSShri Abhyankar 588735aa4fcfSShri Abhyankar current_space->array += nzi; 588835aa4fcfSShri Abhyankar current_space->local_used += nzi; 588935aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 589035aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 589135aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 589235aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 589335aa4fcfSShri Abhyankar } 589435aa4fcfSShri Abhyankar 589535aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 589635aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 589735aa4fcfSShri Abhyankar 589835aa4fcfSShri Abhyankar /* destroy list of free space and other temporary arrays */ 589935aa4fcfSShri Abhyankar ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 590035aa4fcfSShri Abhyankar 590135aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 59022ce24eb6SHong Zhang ierr = 
PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 590335aa4fcfSShri Abhyankar 590435aa4fcfSShri Abhyankar ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 590535aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5906fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 590735aa4fcfSShri Abhyankar 590835aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 590935aa4fcfSShri Abhyankar { 591035aa4fcfSShri Abhyankar PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 591135aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 591235aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 591335aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 591435aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 591535aa4fcfSShri Abhyankar if (diagonal_fill) { 591635aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 591735aa4fcfSShri Abhyankar } 591835aa4fcfSShri Abhyankar } 591935aa4fcfSShri Abhyankar #endif 592035aa4fcfSShri Abhyankar 592135aa4fcfSShri Abhyankar /* put together the new matrix */ 592235aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 592335aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 592435aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 592535aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 592635aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 592735aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 592835aa4fcfSShri Abhyankar ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 592935aa4fcfSShri Abhyankar b->j = bj; 593035aa4fcfSShri Abhyankar b->i = bi; 593135aa4fcfSShri Abhyankar b->diag = bdiag; 
593235aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 593335aa4fcfSShri Abhyankar b->ilen = 0; 593435aa4fcfSShri Abhyankar b->imax = 0; 593535aa4fcfSShri Abhyankar b->row = isrow; 593635aa4fcfSShri Abhyankar b->col = iscol; 593735aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 593835aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 593935aa4fcfSShri Abhyankar b->icol = isicol; 594035aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 594135aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 594235aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 594335aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 594435aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 5945ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 5946ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 5947ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 59484dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 594935aa4fcfSShri Abhyankar PetscFunctionReturn(0); 595035aa4fcfSShri Abhyankar } 595135aa4fcfSShri Abhyankar 595235aa4fcfSShri Abhyankar 59534e2b4712SSatish Balay /* 59544e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 59554e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 59564e2b4712SSatish Balay Not a good example of code reuse. 
59574e2b4712SSatish Balay */ 59584a2ae208SSatish Balay #undef __FUNCT__ 595906e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 596006e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 59614e2b4712SSatish Balay { 59624e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 59634e2b4712SSatish Balay IS isicol; 59646849ba73SBarry Smith PetscErrorCode ierr; 59655d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 59665d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5967a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5968d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 596941df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 5970329f5518SBarry Smith PetscReal f; 59714e2b4712SSatish Balay 59724e2b4712SSatish Balay PetscFunctionBegin; 59736bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 5974e32f2f54SBarry Smith if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 59756bce7ff8SHong Zhang 5976435faa5fSBarry Smith f = info->fill; 5977690b6cddSBarry Smith levels = (PetscInt)info->levels; 5978690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 59794c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 598016a2bf60SHong Zhang 5981667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5982667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 59837d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 5984309c388cSBarry Smith 598541df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 598616a2bf60SHong Zhang ierr = 
MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 59878b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 59886bce7ff8SHong Zhang 5989d5f3da31SBarry Smith fact->factortype = MAT_FACTOR_ILU; 5990ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5991bb3d539aSBarry Smith b->row = isrow; 5992bb3d539aSBarry Smith b->col = iscol; 5993bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5994bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5995bb3d539aSBarry Smith b->icol = isicol; 5996bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5997b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 59986bce7ff8SHong Zhang PetscFunctionReturn(0); 59996bce7ff8SHong Zhang } 60006bce7ff8SHong Zhang 60016bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 60024e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 60034e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 60044e2b4712SSatish Balay 60054e2b4712SSatish Balay /* get new row pointers */ 6006690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 60074e2b4712SSatish Balay ainew[0] = 0; 60084e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 6009690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 6010690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 60114e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 6012690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 60134e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 6014690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 60154e2b4712SSatish Balay /* im is level for each filled value 
*/ 6016690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 60174e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 6018690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 60194e2b4712SSatish Balay dloc[0] = 0; 60204e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 6021435faa5fSBarry Smith 6022435faa5fSBarry Smith /* copy prow into linked list */ 60234e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 6024e32f2f54SBarry Smith if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 60254e2b4712SSatish Balay xi = aj + ai[r[prow]]; 60264e2b4712SSatish Balay fill[n] = n; 6027435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 60284e2b4712SSatish Balay while (nz--) { 60294e2b4712SSatish Balay fm = n; 60304e2b4712SSatish Balay idx = ic[*xi++]; 60314e2b4712SSatish Balay do { 60324e2b4712SSatish Balay m = fm; 60334e2b4712SSatish Balay fm = fill[m]; 60344e2b4712SSatish Balay } while (fm < idx); 60354e2b4712SSatish Balay fill[m] = idx; 60364e2b4712SSatish Balay fill[idx] = fm; 60374e2b4712SSatish Balay im[idx] = 0; 60384e2b4712SSatish Balay } 6039435faa5fSBarry Smith 6040435faa5fSBarry Smith /* make sure diagonal entry is included */ 6041435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 6042435faa5fSBarry Smith fm = n; 6043435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 6044435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 6045435faa5fSBarry Smith fill[fm] = prow; 6046435faa5fSBarry Smith im[prow] = 0; 6047435faa5fSBarry Smith nzf++; 6048335d9088SBarry Smith dcount++; 6049435faa5fSBarry Smith } 6050435faa5fSBarry Smith 60514e2b4712SSatish Balay nzi = 0; 60524e2b4712SSatish Balay row = fill[n]; 60534e2b4712SSatish Balay while (row < prow) { 60544e2b4712SSatish Balay incrlev = im[row] + 1; 60554e2b4712SSatish Balay nz = 
dloc[row]; 6056435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 60574e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 60584e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 60594e2b4712SSatish Balay fm = row; 60604e2b4712SSatish Balay while (nnz-- > 0) { 60614e2b4712SSatish Balay idx = *xi++; 60624e2b4712SSatish Balay if (*flev + incrlev > levels) { 60634e2b4712SSatish Balay flev++; 60644e2b4712SSatish Balay continue; 60654e2b4712SSatish Balay } 60664e2b4712SSatish Balay do { 60674e2b4712SSatish Balay m = fm; 60684e2b4712SSatish Balay fm = fill[m]; 60694e2b4712SSatish Balay } while (fm < idx); 60704e2b4712SSatish Balay if (fm != idx) { 60714e2b4712SSatish Balay im[idx] = *flev + incrlev; 60724e2b4712SSatish Balay fill[m] = idx; 60734e2b4712SSatish Balay fill[idx] = fm; 60744e2b4712SSatish Balay fm = idx; 60754e2b4712SSatish Balay nzf++; 6076ecf371e4SBarry Smith } else { 60774e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 60784e2b4712SSatish Balay } 60794e2b4712SSatish Balay flev++; 60804e2b4712SSatish Balay } 60814e2b4712SSatish Balay row = fill[row]; 60824e2b4712SSatish Balay nzi++; 60834e2b4712SSatish Balay } 60844e2b4712SSatish Balay /* copy new filled row into permanent storage */ 60854e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 60864e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 6087ecf371e4SBarry Smith 6088ecf371e4SBarry Smith /* estimate how much additional space we will need */ 6089ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 6090ecf371e4SBarry Smith /* just double the memory each time */ 6091690b6cddSBarry Smith PetscInt maxadd = jmax; 6092ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 60934e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 60944e2b4712SSatish Balay jmax += maxadd; 6095ecf371e4SBarry Smith 6096ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 60975d0c19d7SBarry Smith 
ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 60985d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6099606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 61005d0c19d7SBarry Smith ajnew = xitmp; 61015d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 61025d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6103606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 61045d0c19d7SBarry Smith ajfill = xitmp; 6105eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 61064e2b4712SSatish Balay } 61075d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 61084e2b4712SSatish Balay flev = ajfill + ainew[prow]; 61094e2b4712SSatish Balay dloc[prow] = nzi; 61104e2b4712SSatish Balay fm = fill[n]; 61114e2b4712SSatish Balay while (nzf--) { 61125d0c19d7SBarry Smith *xitmp++ = fm; 61134e2b4712SSatish Balay *flev++ = im[fm]; 61144e2b4712SSatish Balay fm = fill[fm]; 61154e2b4712SSatish Balay } 6116435faa5fSBarry Smith /* make sure row has diagonal entry */ 6117435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 6118e32f2f54SBarry Smith SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 61192401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6120435faa5fSBarry Smith } 61214e2b4712SSatish Balay } 6122606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 61234e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 61244e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6125606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 6126606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 61274e2b4712SSatish Balay 61286cf91177SBarry Smith #if defined(PETSC_USE_INFO) 61294e2b4712SSatish Balay { 6130329f5518SBarry Smith PetscReal af = 
((PetscReal)ainew[n])/((PetscReal)ai[n]); 6131ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6132ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6133ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6134ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6135335d9088SBarry Smith if (diagonal_fill) { 6136ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6137335d9088SBarry Smith } 61384e2b4712SSatish Balay } 613963ba0a88SBarry Smith #endif 61404e2b4712SSatish Balay 61414e2b4712SSatish Balay /* put together the new matrix */ 6142719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6143719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6144ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 6145e6b907acSBarry Smith b->free_a = PETSC_TRUE; 6146e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 61477c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 6148a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 61494e2b4712SSatish Balay b->j = ajnew; 61504e2b4712SSatish Balay b->i = ainew; 61514e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 61524e2b4712SSatish Balay b->diag = dloc; 61537f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 61544e2b4712SSatish Balay b->ilen = 0; 61554e2b4712SSatish Balay b->imax = 0; 61564e2b4712SSatish Balay b->row = isrow; 61574e2b4712SSatish Balay b->col = iscol; 6158bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 6159c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6160c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6161e51c0b9cSSatish Balay b->icol = isicol; 616287828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 61634e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 61644e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 6165719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 61664e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 61674e2b4712SSatish Balay 6168ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 6169ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 6170ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 61716bce7ff8SHong Zhang 61728b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 61738661488fSKris Buschelman PetscFunctionReturn(0); 61748661488fSKris Buschelman } 61758661488fSKris Buschelman 6176732ee342SKris Buschelman #undef __FUNCT__ 61777e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6178dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 61797e7071cdSKris Buschelman { 618012272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 618112272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 61825a9542e3SKris Buschelman PetscFunctionBegin; 61837cf1b8d3SKris Buschelman /* Undo Column scaling */ 61847cf1b8d3SKris Buschelman /* while (nz--) { */ 61857cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 61867cf1b8d3SKris Buschelman /* } */ 6187c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 6188c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 61897cf1b8d3SKris Buschelman PetscFunctionReturn(0); 61907cf1b8d3SKris Buschelman } 61917cf1b8d3SKris Buschelman 61927cf1b8d3SKris Buschelman #undef __FUNCT__ 61937cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6194dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 61957cf1b8d3SKris Buschelman { 61967cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6197b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 61982aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 61995a9542e3SKris Buschelman PetscFunctionBegin; 62000b9da03eSKris Buschelman /* Is this really necessary? */ 620120235379SKris Buschelman while (nz--) { 62020b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 62037e7071cdSKris Buschelman } 6204c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 62057e7071cdSKris Buschelman PetscFunctionReturn(0); 62067e7071cdSKris Buschelman } 62077e7071cdSKris Buschelman 6208732ee342SKris Buschelman 6209