1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 1406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 1506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry 
Smith v = aa + diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 5906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 6006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1184dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 1194dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 1206929473cSShri Abhyankar { 1216929473cSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1226929473cSShri Abhyankar PetscErrorCode ierr; 1236929473cSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1246929473cSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 1256929473cSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 1266929473cSShri Abhyankar MatScalar 
*aa=a->a,*v; 1276929473cSShri Abhyankar PetscScalar s1,s2,x1,x2; 1286929473cSShri Abhyankar PetscScalar *x,*b; 1296929473cSShri Abhyankar 1306929473cSShri Abhyankar PetscFunctionBegin; 1316929473cSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1326929473cSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1336929473cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1346929473cSShri Abhyankar 1356929473cSShri Abhyankar /* forward solve the U^T */ 1366929473cSShri Abhyankar idx = 0; 1376929473cSShri Abhyankar for (i=0; i<n; i++) { 1386929473cSShri Abhyankar v = aa + bs2*diag[i]; 1396929473cSShri Abhyankar /* multiply by the inverse of the block diagonal */ 1406929473cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 1416929473cSShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 1426929473cSShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 1436929473cSShri Abhyankar v -= bs2; 1446929473cSShri Abhyankar 1456929473cSShri Abhyankar vi = aj + diag[i] - 1; 1466929473cSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 1476929473cSShri Abhyankar for(j=0;j>-nz;j--){ 1486929473cSShri Abhyankar oidx = bs*vi[j]; 1496929473cSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2; 1506929473cSShri Abhyankar x[oidx+1] -= v[2]*s1 + v[3]*s2; 1516929473cSShri Abhyankar v -= bs2; 1526929473cSShri Abhyankar } 1536929473cSShri Abhyankar x[idx] = s1;x[1+idx] = s2; 1546929473cSShri Abhyankar idx += bs; 1556929473cSShri Abhyankar } 1566929473cSShri Abhyankar /* backward solve the L^T */ 1576929473cSShri Abhyankar for (i=n-1; i>=0; i--){ 1586929473cSShri Abhyankar v = aa + bs2*ai[i]; 1596929473cSShri Abhyankar vi = aj + ai[i]; 1606929473cSShri Abhyankar nz = ai[i+1] - ai[i]; 1616929473cSShri Abhyankar idt = bs*i; 1626929473cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 1636929473cSShri Abhyankar for(j=0;j<nz;j++){ 1646929473cSShri Abhyankar idx = bs*vi[j]; 1656929473cSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2; 1666929473cSShri Abhyankar x[idx+1] -= v[2]*s1 + v[3]*s2; 1676929473cSShri Abhyankar v += bs2; 1686929473cSShri 
Abhyankar } 1696929473cSShri Abhyankar } 1706929473cSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1716929473cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1726929473cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1736929473cSShri Abhyankar PetscFunctionReturn(0); 1746929473cSShri Abhyankar } 1756929473cSShri Abhyankar 1766929473cSShri Abhyankar #undef __FUNCT__ 17706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 17806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 179f1af5d2fSBarry Smith { 180f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181dfbe8321SBarry Smith PetscErrorCode ierr; 182690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 183690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 184f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18587828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 18687828ca2SBarry Smith PetscScalar *x,*b; 187f1af5d2fSBarry Smith 188f1af5d2fSBarry Smith PetscFunctionBegin; 189ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1901ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192f1af5d2fSBarry Smith 193f1af5d2fSBarry Smith /* forward solve the U^T */ 194f1af5d2fSBarry Smith idx = 0; 195f1af5d2fSBarry Smith for (i=0; i<n; i++) { 196f1af5d2fSBarry Smith 197f1af5d2fSBarry Smith v = aa + 9*diag[i]; 198f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 199ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 200f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 201f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 202f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 203f1af5d2fSBarry Smith v += 9; 204f1af5d2fSBarry Smith 205f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 206f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 
1; 207f1af5d2fSBarry Smith while (nz--) { 208f1af5d2fSBarry Smith oidx = 3*(*vi++); 209f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 210f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 211f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 212f1af5d2fSBarry Smith v += 9; 213f1af5d2fSBarry Smith } 214f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 215f1af5d2fSBarry Smith idx += 3; 216f1af5d2fSBarry Smith } 217f1af5d2fSBarry Smith /* backward solve the L^T */ 218f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 219f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 220f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 221f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 222f1af5d2fSBarry Smith idt = 3*i; 223f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 224f1af5d2fSBarry Smith while (nz--) { 225f1af5d2fSBarry Smith idx = 3*(*vi--); 226f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 227f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 228f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 229f1af5d2fSBarry Smith v -= 9; 230f1af5d2fSBarry Smith } 231f1af5d2fSBarry Smith } 2321ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 235f1af5d2fSBarry Smith PetscFunctionReturn(0); 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith 2384a2ae208SSatish Balay #undef __FUNCT__ 2394dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 2404dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2418499736aSShri Abhyankar { 2428499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2438499736aSShri Abhyankar PetscErrorCode ierr; 2448499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2458499736aSShri Abhyankar PetscInt 
nz,idx,idt,j,i,oidx; 2468499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 2478499736aSShri Abhyankar MatScalar *aa=a->a,*v; 2488499736aSShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 2498499736aSShri Abhyankar PetscScalar *x,*b; 2508499736aSShri Abhyankar 2518499736aSShri Abhyankar PetscFunctionBegin; 2528499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2538499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2548499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2558499736aSShri Abhyankar 2568499736aSShri Abhyankar /* forward solve the U^T */ 2578499736aSShri Abhyankar idx = 0; 2588499736aSShri Abhyankar for (i=0; i<n; i++) { 2598499736aSShri Abhyankar v = aa + bs2*diag[i]; 2608499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 2618499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2628499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 2638499736aSShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 2648499736aSShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 2658499736aSShri Abhyankar v -= bs2; 2668499736aSShri Abhyankar 2678499736aSShri Abhyankar vi = aj + diag[i] - 1; 2688499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 2698499736aSShri Abhyankar for(j=0;j>-nz;j--){ 2708499736aSShri Abhyankar oidx = bs*vi[j]; 2718499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 2728499736aSShri Abhyankar x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 2738499736aSShri Abhyankar x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 2748499736aSShri Abhyankar v -= bs2; 2758499736aSShri Abhyankar } 2768499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 2778499736aSShri Abhyankar idx += bs; 2788499736aSShri Abhyankar } 2798499736aSShri Abhyankar /* backward solve the L^T */ 2808499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 2818499736aSShri Abhyankar v = aa + bs2*ai[i]; 2828499736aSShri Abhyankar vi = aj + ai[i]; 2838499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 2848499736aSShri 
Abhyankar idt = bs*i; 2858499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 2868499736aSShri Abhyankar for(j=0;j<nz;j++){ 2878499736aSShri Abhyankar idx = bs*vi[j]; 2888499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 2898499736aSShri Abhyankar x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 2908499736aSShri Abhyankar x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 2918499736aSShri Abhyankar v += bs2; 2928499736aSShri Abhyankar } 2938499736aSShri Abhyankar } 2948499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2958499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2968499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2978499736aSShri Abhyankar PetscFunctionReturn(0); 2988499736aSShri Abhyankar } 2998499736aSShri Abhyankar 3008499736aSShri Abhyankar #undef __FUNCT__ 30106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 30206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 303f1af5d2fSBarry Smith { 304f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305dfbe8321SBarry Smith PetscErrorCode ierr; 306690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 307690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 308f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 30987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 31087828ca2SBarry Smith PetscScalar *x,*b; 311f1af5d2fSBarry Smith 312f1af5d2fSBarry Smith PetscFunctionBegin; 313ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3141ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3151ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316f1af5d2fSBarry Smith 317f1af5d2fSBarry Smith /* forward solve the U^T */ 318f1af5d2fSBarry Smith idx = 0; 319f1af5d2fSBarry Smith for (i=0; i<n; i++) { 320f1af5d2fSBarry Smith 321f1af5d2fSBarry Smith v = aa + 16*diag[i]; 322f1af5d2fSBarry Smith 
/* multiply by the inverse of the block diagonal */ 323ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 324f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 325f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 326f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 327f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 328f1af5d2fSBarry Smith v += 16; 329f1af5d2fSBarry Smith 330f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 331f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 332f1af5d2fSBarry Smith while (nz--) { 333f1af5d2fSBarry Smith oidx = 4*(*vi++); 334f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 335f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 336f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 337f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 338f1af5d2fSBarry Smith v += 16; 339f1af5d2fSBarry Smith } 340f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 341f1af5d2fSBarry Smith idx += 4; 342f1af5d2fSBarry Smith } 343f1af5d2fSBarry Smith /* backward solve the L^T */ 344f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 345f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 346f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 347f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 348f1af5d2fSBarry Smith idt = 4*i; 349f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 350f1af5d2fSBarry Smith while (nz--) { 351f1af5d2fSBarry Smith idx = 4*(*vi--); 352f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 353f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 354f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 355f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 356f1af5d2fSBarry Smith v -= 16; 357f1af5d2fSBarry Smith } 358f1af5d2fSBarry Smith } 
3591ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3601ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 361dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 362f1af5d2fSBarry Smith PetscFunctionReturn(0); 363f1af5d2fSBarry Smith } 364f1af5d2fSBarry Smith 3654a2ae208SSatish Balay #undef __FUNCT__ 3664dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 3674dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 3688499736aSShri Abhyankar { 3698499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3708499736aSShri Abhyankar PetscErrorCode ierr; 3718499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 3728499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 3738499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 3748499736aSShri Abhyankar MatScalar *aa=a->a,*v; 3758499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3768499736aSShri Abhyankar PetscScalar *x,*b; 3778499736aSShri Abhyankar 3788499736aSShri Abhyankar PetscFunctionBegin; 3798499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3808499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3818499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3828499736aSShri Abhyankar 3838499736aSShri Abhyankar /* forward solve the U^T */ 3848499736aSShri Abhyankar idx = 0; 3858499736aSShri Abhyankar for (i=0; i<n; i++) { 3868499736aSShri Abhyankar v = aa + bs2*diag[i]; 3878499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 3888499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 3898499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 3908499736aSShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 3918499736aSShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 3928499736aSShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + 
v[14]*x3 + v[15]*x4; 3938499736aSShri Abhyankar v -= bs2; 3948499736aSShri Abhyankar 3958499736aSShri Abhyankar vi = aj + diag[i] - 1; 3968499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 3978499736aSShri Abhyankar for(j=0;j>-nz;j--){ 3988499736aSShri Abhyankar oidx = bs*vi[j]; 3998499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4008499736aSShri Abhyankar x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4018499736aSShri Abhyankar x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4028499736aSShri Abhyankar x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4038499736aSShri Abhyankar v -= bs2; 4048499736aSShri Abhyankar } 4058499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 4068499736aSShri Abhyankar idx += bs; 4078499736aSShri Abhyankar } 4088499736aSShri Abhyankar /* backward solve the L^T */ 4098499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 4108499736aSShri Abhyankar v = aa + bs2*ai[i]; 4118499736aSShri Abhyankar vi = aj + ai[i]; 4128499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 4138499736aSShri Abhyankar idt = bs*i; 4148499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 4158499736aSShri Abhyankar for(j=0;j<nz;j++){ 4168499736aSShri Abhyankar idx = bs*vi[j]; 4178499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4188499736aSShri Abhyankar x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4198499736aSShri Abhyankar x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4208499736aSShri Abhyankar x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4218499736aSShri Abhyankar v += bs2; 4228499736aSShri Abhyankar } 4238499736aSShri Abhyankar } 4248499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4258499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4268499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4278499736aSShri Abhyankar PetscFunctionReturn(0); 4288499736aSShri 
Abhyankar } 4298499736aSShri Abhyankar 4308499736aSShri Abhyankar #undef __FUNCT__ 43106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 43206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 433f1af5d2fSBarry Smith { 434f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 435dfbe8321SBarry Smith PetscErrorCode ierr; 436690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 437690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 438f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 43987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 44087828ca2SBarry Smith PetscScalar *x,*b; 441f1af5d2fSBarry Smith 442f1af5d2fSBarry Smith PetscFunctionBegin; 443ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4441ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4451ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 446f1af5d2fSBarry Smith 447f1af5d2fSBarry Smith /* forward solve the U^T */ 448f1af5d2fSBarry Smith idx = 0; 449f1af5d2fSBarry Smith for (i=0; i<n; i++) { 450f1af5d2fSBarry Smith 451f1af5d2fSBarry Smith v = aa + 25*diag[i]; 452f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 453ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 454f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 455f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 456f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 457f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 458f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 459f1af5d2fSBarry Smith v += 25; 460f1af5d2fSBarry Smith 461f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 462f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 463f1af5d2fSBarry Smith while (nz--) { 464f1af5d2fSBarry Smith oidx = 
5*(*vi++); 465f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 466f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 467f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 468f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 469f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 470f1af5d2fSBarry Smith v += 25; 471f1af5d2fSBarry Smith } 472f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 473f1af5d2fSBarry Smith idx += 5; 474f1af5d2fSBarry Smith } 475f1af5d2fSBarry Smith /* backward solve the L^T */ 476f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 477f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 478f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 479f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 480f1af5d2fSBarry Smith idt = 5*i; 481f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 482f1af5d2fSBarry Smith while (nz--) { 483f1af5d2fSBarry Smith idx = 5*(*vi--); 484f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 485f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 486f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 487f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 488f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 489f1af5d2fSBarry Smith v -= 25; 490f1af5d2fSBarry Smith } 491f1af5d2fSBarry Smith } 4921ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 494dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 495f1af5d2fSBarry Smith PetscFunctionReturn(0); 496f1af5d2fSBarry Smith } 497f1af5d2fSBarry Smith 4984a2ae208SSatish Balay 
#undef __FUNCT__ 4994dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 5004dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 5018499736aSShri Abhyankar { 5028499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5038499736aSShri Abhyankar PetscErrorCode ierr; 5048499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5058499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 5068499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 5078499736aSShri Abhyankar MatScalar *aa=a->a,*v; 5088499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 5098499736aSShri Abhyankar PetscScalar *x,*b; 5108499736aSShri Abhyankar 5118499736aSShri Abhyankar PetscFunctionBegin; 5128499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 5138499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5148499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5158499736aSShri Abhyankar 5168499736aSShri Abhyankar /* forward solve the U^T */ 5178499736aSShri Abhyankar idx = 0; 5188499736aSShri Abhyankar for (i=0; i<n; i++) { 5198499736aSShri Abhyankar v = aa + bs2*diag[i]; 5208499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 5218499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 5228499736aSShri Abhyankar x5 = x[4+idx]; 5238499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 5248499736aSShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 5258499736aSShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 5268499736aSShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 5278499736aSShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 5288499736aSShri Abhyankar v -= bs2; 5298499736aSShri Abhyankar 5308499736aSShri Abhyankar vi = aj + diag[i] - 1; 5318499736aSShri Abhyankar nz = diag[i] - 
diag[i+1] - 1; 5328499736aSShri Abhyankar for(j=0;j>-nz;j--){ 5338499736aSShri Abhyankar oidx = bs*vi[j]; 5348499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5358499736aSShri Abhyankar x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5368499736aSShri Abhyankar x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5378499736aSShri Abhyankar x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5388499736aSShri Abhyankar x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5398499736aSShri Abhyankar v -= bs2; 5408499736aSShri Abhyankar } 5418499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 5428499736aSShri Abhyankar idx += bs; 5438499736aSShri Abhyankar } 5448499736aSShri Abhyankar /* backward solve the L^T */ 5458499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 5468499736aSShri Abhyankar v = aa + bs2*ai[i]; 5478499736aSShri Abhyankar vi = aj + ai[i]; 5488499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 5498499736aSShri Abhyankar idt = bs*i; 5508499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 5518499736aSShri Abhyankar for(j=0;j<nz;j++){ 5528499736aSShri Abhyankar idx = bs*vi[j]; 5538499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5548499736aSShri Abhyankar x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5558499736aSShri Abhyankar x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5568499736aSShri Abhyankar x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5578499736aSShri Abhyankar x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5588499736aSShri Abhyankar v += bs2; 5598499736aSShri Abhyankar } 5608499736aSShri Abhyankar } 5618499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5628499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5638499736aSShri Abhyankar ierr = 
PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5648499736aSShri Abhyankar PetscFunctionReturn(0); 5658499736aSShri Abhyankar } 5668499736aSShri Abhyankar 5678499736aSShri Abhyankar #undef __FUNCT__ 56806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 56906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 570f1af5d2fSBarry Smith { 571f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 572dfbe8321SBarry Smith PetscErrorCode ierr; 573690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 574690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 575f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 57687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 57787828ca2SBarry Smith PetscScalar *x,*b; 578f1af5d2fSBarry Smith 579f1af5d2fSBarry Smith PetscFunctionBegin; 580ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 5811ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 583f1af5d2fSBarry Smith 584f1af5d2fSBarry Smith /* forward solve the U^T */ 585f1af5d2fSBarry Smith idx = 0; 586f1af5d2fSBarry Smith for (i=0; i<n; i++) { 587f1af5d2fSBarry Smith 588f1af5d2fSBarry Smith v = aa + 36*diag[i]; 589f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 590ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 591ef66eb69SBarry Smith x6 = x[5+idx]; 592f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 593f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 594f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 595f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 596f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + 
v[29]*x6; 597f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 598f1af5d2fSBarry Smith v += 36; 599f1af5d2fSBarry Smith 600f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 601f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 602f1af5d2fSBarry Smith while (nz--) { 603f1af5d2fSBarry Smith oidx = 6*(*vi++); 604f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 605f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 606f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 607f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 608f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 609f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 610f1af5d2fSBarry Smith v += 36; 611f1af5d2fSBarry Smith } 612f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 613f1af5d2fSBarry Smith x[5+idx] = s6; 614f1af5d2fSBarry Smith idx += 6; 615f1af5d2fSBarry Smith } 616f1af5d2fSBarry Smith /* backward solve the L^T */ 617f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 618f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 619f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 620f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 621f1af5d2fSBarry Smith idt = 6*i; 622f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 623f1af5d2fSBarry Smith s6 = x[5+idt]; 624f1af5d2fSBarry Smith while (nz--) { 625f1af5d2fSBarry Smith idx = 6*(*vi--); 626f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629f1af5d2fSBarry 
Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632f1af5d2fSBarry Smith v -= 36; 633f1af5d2fSBarry Smith } 634f1af5d2fSBarry Smith } 6351ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6361ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 637dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 638f1af5d2fSBarry Smith PetscFunctionReturn(0); 639f1af5d2fSBarry Smith } 640f1af5d2fSBarry Smith 6414a2ae208SSatish Balay #undef __FUNCT__ 6424dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 6434dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 6448499736aSShri Abhyankar { 6458499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 6468499736aSShri Abhyankar PetscErrorCode ierr; 6478499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 6488499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 6498499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 6508499736aSShri Abhyankar MatScalar *aa=a->a,*v; 6518499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 6528499736aSShri Abhyankar PetscScalar *x,*b; 6538499736aSShri Abhyankar 6548499736aSShri Abhyankar PetscFunctionBegin; 6558499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 6568499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6578499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 6588499736aSShri Abhyankar 6598499736aSShri Abhyankar /* forward solve the U^T */ 6608499736aSShri Abhyankar idx = 0; 6618499736aSShri Abhyankar for (i=0; i<n; i++) { 6628499736aSShri Abhyankar v = aa + bs2*diag[i]; 6638499736aSShri Abhyankar /* multiply by the inverse 
of the block diagonal */ 6648499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 6658499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; 6668499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 6678499736aSShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 6688499736aSShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 6698499736aSShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 6708499736aSShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 6718499736aSShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 6728499736aSShri Abhyankar v -= bs2; 6738499736aSShri Abhyankar 6748499736aSShri Abhyankar vi = aj + diag[i] - 1; 6758499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 6768499736aSShri Abhyankar for(j=0;j>-nz;j--){ 6778499736aSShri Abhyankar oidx = bs*vi[j]; 6788499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 6798499736aSShri Abhyankar x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 6808499736aSShri Abhyankar x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 6818499736aSShri Abhyankar x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 6828499736aSShri Abhyankar x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 6838499736aSShri Abhyankar x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 6848499736aSShri Abhyankar v -= bs2; 6858499736aSShri Abhyankar } 6868499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 6878499736aSShri Abhyankar x[5+idx] = s6; 6888499736aSShri Abhyankar idx += bs; 6898499736aSShri Abhyankar } 6908499736aSShri Abhyankar /* backward solve the L^T */ 6918499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 
6928499736aSShri Abhyankar v = aa + bs2*ai[i]; 6938499736aSShri Abhyankar vi = aj + ai[i]; 6948499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 6958499736aSShri Abhyankar idt = bs*i; 6968499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 6978499736aSShri Abhyankar s6 = x[5+idt]; 6988499736aSShri Abhyankar for(j=0;j<nz;j++){ 6998499736aSShri Abhyankar idx = bs*vi[j]; 7008499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 7018499736aSShri Abhyankar x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 7028499736aSShri Abhyankar x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7038499736aSShri Abhyankar x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7048499736aSShri Abhyankar x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 7058499736aSShri Abhyankar x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7068499736aSShri Abhyankar v += bs2; 7078499736aSShri Abhyankar } 7088499736aSShri Abhyankar } 7098499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7108499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 7118499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 7128499736aSShri Abhyankar PetscFunctionReturn(0); 7138499736aSShri Abhyankar } 7148499736aSShri Abhyankar 7158499736aSShri Abhyankar #undef __FUNCT__ 71606e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 71706e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 718f1af5d2fSBarry Smith { 719f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 720dfbe8321SBarry Smith PetscErrorCode ierr; 721690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 722690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 723f1af5d2fSBarry 
Smith MatScalar *aa=a->a,*v; 72487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 72587828ca2SBarry Smith PetscScalar *x,*b; 726f1af5d2fSBarry Smith 727f1af5d2fSBarry Smith PetscFunctionBegin; 728ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 7291ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7301ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 731f1af5d2fSBarry Smith 732f1af5d2fSBarry Smith /* forward solve the U^T */ 733f1af5d2fSBarry Smith idx = 0; 734f1af5d2fSBarry Smith for (i=0; i<n; i++) { 735f1af5d2fSBarry Smith 736f1af5d2fSBarry Smith v = aa + 49*diag[i]; 737f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 738ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 739ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 740f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 741f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 742f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 743f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 744f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 745f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 746f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 747f1af5d2fSBarry Smith v += 49; 748f1af5d2fSBarry Smith 749f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 750f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 751f1af5d2fSBarry Smith while (nz--) { 752f1af5d2fSBarry Smith oidx = 7*(*vi++); 753f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 754f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + 
v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 755f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 756f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 757f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 758f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 759f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 760f1af5d2fSBarry Smith v += 49; 761f1af5d2fSBarry Smith } 762f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 763f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 764f1af5d2fSBarry Smith idx += 7; 765f1af5d2fSBarry Smith } 766f1af5d2fSBarry Smith /* backward solve the L^T */ 767f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 768f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 769f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 770f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 771f1af5d2fSBarry Smith idt = 7*i; 772f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 773f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 774f1af5d2fSBarry Smith while (nz--) { 775f1af5d2fSBarry Smith idx = 7*(*vi--); 776f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 777f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 778f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 779f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 780f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 781f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + 
v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 782f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 783f1af5d2fSBarry Smith v -= 49; 784f1af5d2fSBarry Smith } 785f1af5d2fSBarry Smith } 7861ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7871ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 788dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 789f1af5d2fSBarry Smith PetscFunctionReturn(0); 790f1af5d2fSBarry Smith } 7918499736aSShri Abhyankar #undef __FUNCT__ 7924dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 7934dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 7948499736aSShri Abhyankar { 7958499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 7968499736aSShri Abhyankar PetscErrorCode ierr; 7978499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 7988499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 7998499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 8008499736aSShri Abhyankar MatScalar *aa=a->a,*v; 8018499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 8028499736aSShri Abhyankar PetscScalar *x,*b; 8038499736aSShri Abhyankar 8048499736aSShri Abhyankar PetscFunctionBegin; 8058499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 8068499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8078499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 8088499736aSShri Abhyankar 8098499736aSShri Abhyankar /* forward solve the U^T */ 8108499736aSShri Abhyankar idx = 0; 8118499736aSShri Abhyankar for (i=0; i<n; i++) { 8128499736aSShri Abhyankar v = aa + bs2*diag[i]; 8138499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 8148499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 8158499736aSShri Abhyankar x5 = 
x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 8168499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 8178499736aSShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 8188499736aSShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 8198499736aSShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 8208499736aSShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 8218499736aSShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 8228499736aSShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 8238499736aSShri Abhyankar v -= bs2; 8248499736aSShri Abhyankar vi = aj + diag[i] - 1; 8258499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 8268499736aSShri Abhyankar for(j=0;j>-nz;j--){ 8278499736aSShri Abhyankar oidx = bs*vi[j]; 8288499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8298499736aSShri Abhyankar x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8308499736aSShri Abhyankar x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8318499736aSShri Abhyankar x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8328499736aSShri Abhyankar x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8338499736aSShri Abhyankar x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8348499736aSShri Abhyankar x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8358499736aSShri Abhyankar v -= bs2; 8368499736aSShri Abhyankar } 8378499736aSShri Abhyankar x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 
8388499736aSShri Abhyankar x[5+idx] = s6; x[6+idx] = s7; 8398499736aSShri Abhyankar idx += bs; 8408499736aSShri Abhyankar } 8418499736aSShri Abhyankar /* backward solve the L^T */ 8428499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 8438499736aSShri Abhyankar v = aa + bs2*ai[i]; 8448499736aSShri Abhyankar vi = aj + ai[i]; 8458499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 8468499736aSShri Abhyankar idt = bs*i; 8478499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 8488499736aSShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; 8498499736aSShri Abhyankar for(j=0;j<nz;j++){ 8508499736aSShri Abhyankar idx = bs*vi[j]; 8518499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8528499736aSShri Abhyankar x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8538499736aSShri Abhyankar x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8548499736aSShri Abhyankar x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8558499736aSShri Abhyankar x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8568499736aSShri Abhyankar x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8578499736aSShri Abhyankar x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8588499736aSShri Abhyankar v += bs2; 8598499736aSShri Abhyankar } 8608499736aSShri Abhyankar } 8618499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8628499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 8638499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 8648499736aSShri Abhyankar PetscFunctionReturn(0); 8658499736aSShri Abhyankar } 866f1af5d2fSBarry Smith 867f1af5d2fSBarry Smith 
/*---------------------------------------------------------------------------------------------*/ 8684a2ae208SSatish Balay #undef __FUNCT__ 86906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 87006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 871f1af5d2fSBarry Smith { 872f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 873f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 8746849ba73SBarry Smith PetscErrorCode ierr; 8755d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8765d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 877690b6cddSBarry Smith PetscInt *diag = a->diag; 878f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 87987828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 880f1af5d2fSBarry Smith 881f1af5d2fSBarry Smith PetscFunctionBegin; 8821ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8831ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 884f1af5d2fSBarry Smith t = a->solve_work; 885f1af5d2fSBarry Smith 886f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 887f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 888f1af5d2fSBarry Smith 889f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 890f1af5d2fSBarry Smith for (i=0; i<n; i++) { 891f1af5d2fSBarry Smith t[i] = b[c[i]]; 892f1af5d2fSBarry Smith } 893f1af5d2fSBarry Smith 894f1af5d2fSBarry Smith /* forward solve the U^T */ 895f1af5d2fSBarry Smith for (i=0; i<n; i++) { 896f1af5d2fSBarry Smith 897f1af5d2fSBarry Smith v = aa + diag[i]; 898f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 899f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 900f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 901f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 902f1af5d2fSBarry Smith while (nz--) { 903f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 904f1af5d2fSBarry Smith } 905f1af5d2fSBarry Smith t[i] = s1; 906f1af5d2fSBarry 
Smith } 907f1af5d2fSBarry Smith /* backward solve the L^T */ 908f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 909f1af5d2fSBarry Smith v = aa + diag[i] - 1; 910f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 911f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 912f1af5d2fSBarry Smith s1 = t[i]; 913f1af5d2fSBarry Smith while (nz--) { 914f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 915f1af5d2fSBarry Smith } 916f1af5d2fSBarry Smith } 917f1af5d2fSBarry Smith 918f1af5d2fSBarry Smith /* copy t into x according to permutation */ 919f1af5d2fSBarry Smith for (i=0; i<n; i++) { 920f1af5d2fSBarry Smith x[r[i]] = t[i]; 921f1af5d2fSBarry Smith } 922f1af5d2fSBarry Smith 923f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 924f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9251ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 9261ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 927dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 928f1af5d2fSBarry Smith PetscFunctionReturn(0); 929f1af5d2fSBarry Smith } 930f1af5d2fSBarry Smith 9314a2ae208SSatish Balay #undef __FUNCT__ 93206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 93306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 934f1af5d2fSBarry Smith { 935f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 936f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9376849ba73SBarry Smith PetscErrorCode ierr; 9385d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9395d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 940690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 941f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 94287828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 94387828ca2SBarry Smith PetscScalar *x,*b,*t; 944f1af5d2fSBarry Smith 945f1af5d2fSBarry Smith PetscFunctionBegin; 9461ebc52fbSHong Zhang ierr = 
VecGetArray(bb,&b);CHKERRQ(ierr); 9471ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 948f1af5d2fSBarry Smith t = a->solve_work; 949f1af5d2fSBarry Smith 950f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 951f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 952f1af5d2fSBarry Smith 953f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 954f1af5d2fSBarry Smith ii = 0; 955f1af5d2fSBarry Smith for (i=0; i<n; i++) { 956f1af5d2fSBarry Smith ic = 2*c[i]; 957f1af5d2fSBarry Smith t[ii] = b[ic]; 958f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 959f1af5d2fSBarry Smith ii += 2; 960f1af5d2fSBarry Smith } 961f1af5d2fSBarry Smith 962f1af5d2fSBarry Smith /* forward solve the U^T */ 963f1af5d2fSBarry Smith idx = 0; 964f1af5d2fSBarry Smith for (i=0; i<n; i++) { 965f1af5d2fSBarry Smith 966f1af5d2fSBarry Smith v = aa + 4*diag[i]; 967f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 968f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 969f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 970f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 971f1af5d2fSBarry Smith v += 4; 972f1af5d2fSBarry Smith 973f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 974f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 975f1af5d2fSBarry Smith while (nz--) { 976f1af5d2fSBarry Smith oidx = 2*(*vi++); 977f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 978f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 979f1af5d2fSBarry Smith v += 4; 980f1af5d2fSBarry Smith } 981f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 982f1af5d2fSBarry Smith idx += 2; 983f1af5d2fSBarry Smith } 984f1af5d2fSBarry Smith /* backward solve the L^T */ 985f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 986f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 987f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 988f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 989f1af5d2fSBarry Smith idt = 2*i; 990f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 
991f1af5d2fSBarry Smith while (nz--) { 992f1af5d2fSBarry Smith idx = 2*(*vi--); 993f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 994f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 995f1af5d2fSBarry Smith v -= 4; 996f1af5d2fSBarry Smith } 997f1af5d2fSBarry Smith } 998f1af5d2fSBarry Smith 999f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1000f1af5d2fSBarry Smith ii = 0; 1001f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1002f1af5d2fSBarry Smith ir = 2*r[i]; 1003f1af5d2fSBarry Smith x[ir] = t[ii]; 1004f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1005f1af5d2fSBarry Smith ii += 2; 1006f1af5d2fSBarry Smith } 1007f1af5d2fSBarry Smith 1008f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1009f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 10101ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10111ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1012dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1013f1af5d2fSBarry Smith PetscFunctionReturn(0); 1014f1af5d2fSBarry Smith } 1015f1af5d2fSBarry Smith 10164a2ae208SSatish Balay #undef __FUNCT__ 10174dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 10184dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 101932121132SShri Abhyankar { 102032121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 102132121132SShri Abhyankar PetscErrorCode ierr; 102232121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 102332121132SShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 102432121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 102532121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 102632121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 102732121132SShri Abhyankar MatScalar *aa=a->a,*v; 102832121132SShri Abhyankar PetscScalar s1,s2,x1,x2; 102932121132SShri Abhyankar PetscScalar *x,*b,*t; 
103032121132SShri Abhyankar 103132121132SShri Abhyankar PetscFunctionBegin; 103232121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 103332121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 103432121132SShri Abhyankar t = a->solve_work; 103532121132SShri Abhyankar 103632121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 103732121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 103832121132SShri Abhyankar 103932121132SShri Abhyankar /* copy b into temp work space according to permutation */ 104032121132SShri Abhyankar for(i=0;i<n;i++){ 104132121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 104232121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; 104332121132SShri Abhyankar } 104432121132SShri Abhyankar 104532121132SShri Abhyankar /* forward solve the U^T */ 104632121132SShri Abhyankar idx = 0; 104732121132SShri Abhyankar for (i=0; i<n; i++) { 104832121132SShri Abhyankar v = aa + bs2*diag[i]; 104932121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 105032121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 105132121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 105232121132SShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 105332121132SShri Abhyankar v -= bs2; 105432121132SShri Abhyankar 105532121132SShri Abhyankar vi = aj + diag[i] - 1; 105632121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 105732121132SShri Abhyankar for(j=0;j>-nz;j--){ 105832121132SShri Abhyankar oidx = bs*vi[j]; 105932121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2; 106032121132SShri Abhyankar t[oidx+1] -= v[2]*s1 + v[3]*s2; 106132121132SShri Abhyankar v -= bs2; 106232121132SShri Abhyankar } 106332121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 106432121132SShri Abhyankar idx += bs; 106532121132SShri Abhyankar } 106632121132SShri Abhyankar /* backward solve the L^T */ 106732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 106832121132SShri Abhyankar v = aa + bs2*ai[i]; 106932121132SShri Abhyankar vi = aj 
+ ai[i]; 107032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 107132121132SShri Abhyankar idt = bs*i; 107232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 107332121132SShri Abhyankar for(j=0;j<nz;j++){ 107432121132SShri Abhyankar idx = bs*vi[j]; 107532121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2; 107632121132SShri Abhyankar t[idx+1] -= v[2]*s1 + v[3]*s2; 107732121132SShri Abhyankar v += bs2; 107832121132SShri Abhyankar } 107932121132SShri Abhyankar } 108032121132SShri Abhyankar 108132121132SShri Abhyankar /* copy t into x according to permutation */ 108232121132SShri Abhyankar for(i=0;i<n;i++){ 108332121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 108432121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; 108532121132SShri Abhyankar } 108632121132SShri Abhyankar 108732121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 108832121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 108932121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 109032121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109132121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 109232121132SShri Abhyankar PetscFunctionReturn(0); 109332121132SShri Abhyankar } 109432121132SShri Abhyankar 109532121132SShri Abhyankar #undef __FUNCT__ 109606e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 109706e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1098f1af5d2fSBarry Smith { 1099f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1100f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 11016849ba73SBarry Smith PetscErrorCode ierr; 11025d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 11035d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1104690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1105f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 110687828ca2SBarry Smith PetscScalar 
s1,s2,s3,x1,x2,x3; 110787828ca2SBarry Smith PetscScalar *x,*b,*t; 1108f1af5d2fSBarry Smith 1109f1af5d2fSBarry Smith PetscFunctionBegin; 11101ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11111ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1112f1af5d2fSBarry Smith t = a->solve_work; 1113f1af5d2fSBarry Smith 1114f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1115f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1116f1af5d2fSBarry Smith 1117f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1118f1af5d2fSBarry Smith ii = 0; 1119f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1120f1af5d2fSBarry Smith ic = 3*c[i]; 1121f1af5d2fSBarry Smith t[ii] = b[ic]; 1122f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1123f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1124f1af5d2fSBarry Smith ii += 3; 1125f1af5d2fSBarry Smith } 1126f1af5d2fSBarry Smith 1127f1af5d2fSBarry Smith /* forward solve the U^T */ 1128f1af5d2fSBarry Smith idx = 0; 1129f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1130f1af5d2fSBarry Smith 1131f1af5d2fSBarry Smith v = aa + 9*diag[i]; 1132f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1133f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1134f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1135f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1136f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1137f1af5d2fSBarry Smith v += 9; 1138f1af5d2fSBarry Smith 1139f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1140f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1141f1af5d2fSBarry Smith while (nz--) { 1142f1af5d2fSBarry Smith oidx = 3*(*vi++); 1143f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1144f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1145f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1146f1af5d2fSBarry Smith v += 9; 1147f1af5d2fSBarry Smith } 1148f1af5d2fSBarry Smith 
t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1149f1af5d2fSBarry Smith idx += 3; 1150f1af5d2fSBarry Smith } 1151f1af5d2fSBarry Smith /* backward solve the L^T */ 1152f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1153f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 1154f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1155f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1156f1af5d2fSBarry Smith idt = 3*i; 1157f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1158f1af5d2fSBarry Smith while (nz--) { 1159f1af5d2fSBarry Smith idx = 3*(*vi--); 1160f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1161f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1162f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1163f1af5d2fSBarry Smith v -= 9; 1164f1af5d2fSBarry Smith } 1165f1af5d2fSBarry Smith } 1166f1af5d2fSBarry Smith 1167f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1168f1af5d2fSBarry Smith ii = 0; 1169f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1170f1af5d2fSBarry Smith ir = 3*r[i]; 1171f1af5d2fSBarry Smith x[ir] = t[ii]; 1172f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1173f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1174f1af5d2fSBarry Smith ii += 3; 1175f1af5d2fSBarry Smith } 1176f1af5d2fSBarry Smith 1177f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1178f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11791ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11801ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1181dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1182f1af5d2fSBarry Smith PetscFunctionReturn(0); 1183f1af5d2fSBarry Smith } 1184f1af5d2fSBarry Smith 11854a2ae208SSatish Balay #undef __FUNCT__ 11864dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 11874dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 118832121132SShri Abhyankar { 118932121132SShri 
Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 119032121132SShri Abhyankar PetscErrorCode ierr; 119132121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 119232121132SShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 119332121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 119432121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 119532121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 119632121132SShri Abhyankar MatScalar *aa=a->a,*v; 119732121132SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 119832121132SShri Abhyankar PetscScalar *x,*b,*t; 119932121132SShri Abhyankar 120032121132SShri Abhyankar PetscFunctionBegin; 120132121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 120232121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 120332121132SShri Abhyankar t = a->solve_work; 120432121132SShri Abhyankar 120532121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 120632121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 120732121132SShri Abhyankar 120832121132SShri Abhyankar /* copy b into temp work space according to permutation */ 120932121132SShri Abhyankar for(i=0;i<n;i++){ 121032121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 121132121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 121232121132SShri Abhyankar } 121332121132SShri Abhyankar 121432121132SShri Abhyankar /* forward solve the U^T */ 121532121132SShri Abhyankar idx = 0; 121632121132SShri Abhyankar for (i=0; i<n; i++) { 121732121132SShri Abhyankar v = aa + bs2*diag[i]; 121832121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 121932121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 122032121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 122132121132SShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 122232121132SShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 122332121132SShri Abhyankar v -= bs2; 122432121132SShri 
Abhyankar 122532121132SShri Abhyankar vi = aj + diag[i] - 1; 122632121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 122732121132SShri Abhyankar for(j=0;j>-nz;j--){ 122832121132SShri Abhyankar oidx = bs*vi[j]; 122932121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 123032121132SShri Abhyankar t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 123132121132SShri Abhyankar t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 123232121132SShri Abhyankar v -= bs2; 123332121132SShri Abhyankar } 123432121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 123532121132SShri Abhyankar idx += bs; 123632121132SShri Abhyankar } 123732121132SShri Abhyankar /* backward solve the L^T */ 123832121132SShri Abhyankar for (i=n-1; i>=0; i--){ 123932121132SShri Abhyankar v = aa + bs2*ai[i]; 124032121132SShri Abhyankar vi = aj + ai[i]; 124132121132SShri Abhyankar nz = ai[i+1] - ai[i]; 124232121132SShri Abhyankar idt = bs*i; 124332121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 124432121132SShri Abhyankar for(j=0;j<nz;j++){ 124532121132SShri Abhyankar idx = bs*vi[j]; 124632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 124732121132SShri Abhyankar t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 124832121132SShri Abhyankar t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 124932121132SShri Abhyankar v += bs2; 125032121132SShri Abhyankar } 125132121132SShri Abhyankar } 125232121132SShri Abhyankar 125332121132SShri Abhyankar /* copy t into x according to permutation */ 125432121132SShri Abhyankar for(i=0;i<n;i++){ 125532121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 125632121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 125732121132SShri Abhyankar } 125832121132SShri Abhyankar 125932121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 126032121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 126132121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 126232121132SShri Abhyankar ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 126332121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 126432121132SShri Abhyankar PetscFunctionReturn(0); 126532121132SShri Abhyankar } 126632121132SShri Abhyankar 126732121132SShri Abhyankar #undef __FUNCT__ 126806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 126906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1270f1af5d2fSBarry Smith { 1271f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1272f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 12736849ba73SBarry Smith PetscErrorCode ierr; 12745d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 12755d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1276690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1277f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 127887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 127987828ca2SBarry Smith PetscScalar *x,*b,*t; 1280f1af5d2fSBarry Smith 1281f1af5d2fSBarry Smith PetscFunctionBegin; 12821ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 12831ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1284f1af5d2fSBarry Smith t = a->solve_work; 1285f1af5d2fSBarry Smith 1286f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1287f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1288f1af5d2fSBarry Smith 1289f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1290f1af5d2fSBarry Smith ii = 0; 1291f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1292f1af5d2fSBarry Smith ic = 4*c[i]; 1293f1af5d2fSBarry Smith t[ii] = b[ic]; 1294f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1295f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1296f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1297f1af5d2fSBarry Smith ii += 4; 1298f1af5d2fSBarry Smith } 1299f1af5d2fSBarry Smith 1300f1af5d2fSBarry Smith /* forward solve the U^T */ 
1301f1af5d2fSBarry Smith idx = 0; 1302f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1303f1af5d2fSBarry Smith 1304f1af5d2fSBarry Smith v = aa + 16*diag[i]; 1305f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1306f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1307f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1308f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1309f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1310f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1311f1af5d2fSBarry Smith v += 16; 1312f1af5d2fSBarry Smith 1313f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1314f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1315f1af5d2fSBarry Smith while (nz--) { 1316f1af5d2fSBarry Smith oidx = 4*(*vi++); 1317f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1318f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1319f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1320f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1321f1af5d2fSBarry Smith v += 16; 1322f1af5d2fSBarry Smith } 1323f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1324f1af5d2fSBarry Smith idx += 4; 1325f1af5d2fSBarry Smith } 1326f1af5d2fSBarry Smith /* backward solve the L^T */ 1327f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1328f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 1329f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1330f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1331f1af5d2fSBarry Smith idt = 4*i; 1332f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1333f1af5d2fSBarry Smith while (nz--) { 1334f1af5d2fSBarry Smith idx = 4*(*vi--); 1335f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1336f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1337f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + 
v[9]*s2 + v[10]*s3 + v[11]*s4; 1338f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1339f1af5d2fSBarry Smith v -= 16; 1340f1af5d2fSBarry Smith } 1341f1af5d2fSBarry Smith } 1342f1af5d2fSBarry Smith 1343f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1344f1af5d2fSBarry Smith ii = 0; 1345f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1346f1af5d2fSBarry Smith ir = 4*r[i]; 1347f1af5d2fSBarry Smith x[ir] = t[ii]; 1348f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1349f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1350f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1351f1af5d2fSBarry Smith ii += 4; 1352f1af5d2fSBarry Smith } 1353f1af5d2fSBarry Smith 1354f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1355f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 13561ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 13571ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1358dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1359f1af5d2fSBarry Smith PetscFunctionReturn(0); 1360f1af5d2fSBarry Smith } 1361f1af5d2fSBarry Smith 13624a2ae208SSatish Balay #undef __FUNCT__ 13634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 13644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 136532121132SShri Abhyankar { 136632121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 136732121132SShri Abhyankar PetscErrorCode ierr; 136832121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 136932121132SShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 137032121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 137132121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 137232121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 137332121132SShri Abhyankar MatScalar *aa=a->a,*v; 137432121132SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 137532121132SShri Abhyankar 
PetscScalar *x,*b,*t; 137632121132SShri Abhyankar 137732121132SShri Abhyankar PetscFunctionBegin; 137832121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 137932121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 138032121132SShri Abhyankar t = a->solve_work; 138132121132SShri Abhyankar 138232121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 138332121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 138432121132SShri Abhyankar 138532121132SShri Abhyankar /* copy b into temp work space according to permutation */ 138632121132SShri Abhyankar for(i=0;i<n;i++){ 138732121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 138832121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 138932121132SShri Abhyankar } 139032121132SShri Abhyankar 139132121132SShri Abhyankar /* forward solve the U^T */ 139232121132SShri Abhyankar idx = 0; 139332121132SShri Abhyankar for (i=0; i<n; i++) { 139432121132SShri Abhyankar v = aa + bs2*diag[i]; 139532121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 139632121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 139732121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 139832121132SShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 139932121132SShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 140032121132SShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 140132121132SShri Abhyankar v -= bs2; 140232121132SShri Abhyankar 140332121132SShri Abhyankar vi = aj + diag[i] - 1; 140432121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 140532121132SShri Abhyankar for(j=0;j>-nz;j--){ 140632121132SShri Abhyankar oidx = bs*vi[j]; 140732121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 140832121132SShri Abhyankar t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 140932121132SShri Abhyankar t[oidx+2] -= v[8]*s1 + v[9]*s2 + 
v[10]*s3 + v[11]*s4; 141032121132SShri Abhyankar t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 141132121132SShri Abhyankar v -= bs2; 141232121132SShri Abhyankar } 141332121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 141432121132SShri Abhyankar idx += bs; 141532121132SShri Abhyankar } 141632121132SShri Abhyankar /* backward solve the L^T */ 141732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 141832121132SShri Abhyankar v = aa + bs2*ai[i]; 141932121132SShri Abhyankar vi = aj + ai[i]; 142032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 142132121132SShri Abhyankar idt = bs*i; 142232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 142332121132SShri Abhyankar for(j=0;j<nz;j++){ 142432121132SShri Abhyankar idx = bs*vi[j]; 142532121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 142632121132SShri Abhyankar t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 142732121132SShri Abhyankar t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 142832121132SShri Abhyankar t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 142932121132SShri Abhyankar v += bs2; 143032121132SShri Abhyankar } 143132121132SShri Abhyankar } 143232121132SShri Abhyankar 143332121132SShri Abhyankar /* copy t into x according to permutation */ 143432121132SShri Abhyankar for(i=0;i<n;i++){ 143532121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 143632121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 143732121132SShri Abhyankar } 143832121132SShri Abhyankar 143932121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 144032121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 144132121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 144232121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 144332121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 144432121132SShri 
Abhyankar PetscFunctionReturn(0); 144532121132SShri Abhyankar } 144632121132SShri Abhyankar 144732121132SShri Abhyankar #undef __FUNCT__ 144806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 144906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1450f1af5d2fSBarry Smith { 1451f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1452f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 14536849ba73SBarry Smith PetscErrorCode ierr; 14545d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 14555d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1456690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1457f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 145887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 145987828ca2SBarry Smith PetscScalar *x,*b,*t; 1460f1af5d2fSBarry Smith 1461f1af5d2fSBarry Smith PetscFunctionBegin; 14621ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 14631ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1464f1af5d2fSBarry Smith t = a->solve_work; 1465f1af5d2fSBarry Smith 1466f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1467f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1468f1af5d2fSBarry Smith 1469f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1470f1af5d2fSBarry Smith ii = 0; 1471f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1472f1af5d2fSBarry Smith ic = 5*c[i]; 1473f1af5d2fSBarry Smith t[ii] = b[ic]; 1474f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1475f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1476f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1477f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1478f1af5d2fSBarry Smith ii += 5; 1479f1af5d2fSBarry Smith } 1480f1af5d2fSBarry Smith 1481f1af5d2fSBarry Smith /* forward solve the U^T */ 1482f1af5d2fSBarry Smith idx = 0; 1483f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1484f1af5d2fSBarry Smith 
1485f1af5d2fSBarry Smith v = aa + 25*diag[i]; 1486f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1487f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1488f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1489f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1490f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1491f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1492f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1493f1af5d2fSBarry Smith v += 25; 1494f1af5d2fSBarry Smith 1495f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1496f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1497f1af5d2fSBarry Smith while (nz--) { 1498f1af5d2fSBarry Smith oidx = 5*(*vi++); 1499f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1500f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1501f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1502f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1503f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1504f1af5d2fSBarry Smith v += 25; 1505f1af5d2fSBarry Smith } 1506f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1507f1af5d2fSBarry Smith idx += 5; 1508f1af5d2fSBarry Smith } 1509f1af5d2fSBarry Smith /* backward solve the L^T */ 1510f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1511f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 1512f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1513f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1514f1af5d2fSBarry Smith idt = 5*i; 1515f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1516f1af5d2fSBarry Smith while (nz--) { 1517f1af5d2fSBarry Smith idx = 
5*(*vi--); 1518f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1519f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1520f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1521f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1522f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1523f1af5d2fSBarry Smith v -= 25; 1524f1af5d2fSBarry Smith } 1525f1af5d2fSBarry Smith } 1526f1af5d2fSBarry Smith 1527f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1528f1af5d2fSBarry Smith ii = 0; 1529f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1530f1af5d2fSBarry Smith ir = 5*r[i]; 1531f1af5d2fSBarry Smith x[ir] = t[ii]; 1532f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1533f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1534f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1535f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1536f1af5d2fSBarry Smith ii += 5; 1537f1af5d2fSBarry Smith } 1538f1af5d2fSBarry Smith 1539f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1540f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 15411ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 15421ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1543dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1544f1af5d2fSBarry Smith PetscFunctionReturn(0); 1545f1af5d2fSBarry Smith } 1546f1af5d2fSBarry Smith 15474a2ae208SSatish Balay #undef __FUNCT__ 15484dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 15494dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 155032121132SShri Abhyankar { 155132121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 155232121132SShri Abhyankar PetscErrorCode ierr; 155332121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 155432121132SShri Abhyankar PetscInt 
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 155532121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 155632121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 155732121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 155832121132SShri Abhyankar MatScalar *aa=a->a,*v; 155932121132SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 156032121132SShri Abhyankar PetscScalar *x,*b,*t; 156132121132SShri Abhyankar 156232121132SShri Abhyankar PetscFunctionBegin; 156332121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 156432121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 156532121132SShri Abhyankar t = a->solve_work; 156632121132SShri Abhyankar 156732121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 156832121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 156932121132SShri Abhyankar 157032121132SShri Abhyankar /* copy b into temp work space according to permutation */ 157132121132SShri Abhyankar for(i=0;i<n;i++){ 157232121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 157332121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 157432121132SShri Abhyankar t[ii+4] = b[ic+4]; 157532121132SShri Abhyankar } 157632121132SShri Abhyankar 157732121132SShri Abhyankar /* forward solve the U^T */ 157832121132SShri Abhyankar idx = 0; 157932121132SShri Abhyankar for (i=0; i<n; i++) { 158032121132SShri Abhyankar v = aa + bs2*diag[i]; 158132121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 158232121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 158332121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 158432121132SShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 158532121132SShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 158632121132SShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + 
v[19]*x5; 158732121132SShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 158832121132SShri Abhyankar v -= bs2; 158932121132SShri Abhyankar 159032121132SShri Abhyankar vi = aj + diag[i] - 1; 159132121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 159232121132SShri Abhyankar for(j=0;j>-nz;j--){ 159332121132SShri Abhyankar oidx = bs*vi[j]; 159432121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 159532121132SShri Abhyankar t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 159632121132SShri Abhyankar t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 159732121132SShri Abhyankar t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 159832121132SShri Abhyankar t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 159932121132SShri Abhyankar v -= bs2; 160032121132SShri Abhyankar } 160132121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 160232121132SShri Abhyankar idx += bs; 160332121132SShri Abhyankar } 160432121132SShri Abhyankar /* backward solve the L^T */ 160532121132SShri Abhyankar for (i=n-1; i>=0; i--){ 160632121132SShri Abhyankar v = aa + bs2*ai[i]; 160732121132SShri Abhyankar vi = aj + ai[i]; 160832121132SShri Abhyankar nz = ai[i+1] - ai[i]; 160932121132SShri Abhyankar idt = bs*i; 161032121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 161132121132SShri Abhyankar for(j=0;j<nz;j++){ 161232121132SShri Abhyankar idx = bs*vi[j]; 161332121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 161432121132SShri Abhyankar t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 161532121132SShri Abhyankar t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 161632121132SShri Abhyankar t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 161732121132SShri Abhyankar t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + 
v[24]*s5; 161832121132SShri Abhyankar v += bs2; 161932121132SShri Abhyankar } 162032121132SShri Abhyankar } 162132121132SShri Abhyankar 162232121132SShri Abhyankar /* copy t into x according to permutation */ 162332121132SShri Abhyankar for(i=0;i<n;i++){ 162432121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 162532121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 162632121132SShri Abhyankar x[ir+4] = t[ii+4]; 162732121132SShri Abhyankar } 162832121132SShri Abhyankar 162932121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 163032121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 163132121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 163232121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 163332121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 163432121132SShri Abhyankar PetscFunctionReturn(0); 163532121132SShri Abhyankar } 163632121132SShri Abhyankar 163732121132SShri Abhyankar #undef __FUNCT__ 163806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 163906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1640f1af5d2fSBarry Smith { 1641f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1642f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 16436849ba73SBarry Smith PetscErrorCode ierr; 16445d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 16455d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1646690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1647f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 164887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 164987828ca2SBarry Smith PetscScalar *x,*b,*t; 1650f1af5d2fSBarry Smith 1651f1af5d2fSBarry Smith PetscFunctionBegin; 16521ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 16531ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 1654f1af5d2fSBarry Smith t = a->solve_work; 1655f1af5d2fSBarry Smith 1656f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1657f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1658f1af5d2fSBarry Smith 1659f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1660f1af5d2fSBarry Smith ii = 0; 1661f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1662f1af5d2fSBarry Smith ic = 6*c[i]; 1663f1af5d2fSBarry Smith t[ii] = b[ic]; 1664f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1665f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1666f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1667f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1668f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1669f1af5d2fSBarry Smith ii += 6; 1670f1af5d2fSBarry Smith } 1671f1af5d2fSBarry Smith 1672f1af5d2fSBarry Smith /* forward solve the U^T */ 1673f1af5d2fSBarry Smith idx = 0; 1674f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1675f1af5d2fSBarry Smith 1676f1af5d2fSBarry Smith v = aa + 36*diag[i]; 1677f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1678f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1679f1af5d2fSBarry Smith x6 = t[5+idx]; 1680f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1681f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1682f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1683f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1684f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1685f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1686f1af5d2fSBarry Smith v += 36; 1687f1af5d2fSBarry Smith 1688f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1689f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 
1690f1af5d2fSBarry Smith while (nz--) { 1691f1af5d2fSBarry Smith oidx = 6*(*vi++); 1692f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1693f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1694f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1695f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1696f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1697f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1698f1af5d2fSBarry Smith v += 36; 1699f1af5d2fSBarry Smith } 1700f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1701f1af5d2fSBarry Smith t[5+idx] = s6; 1702f1af5d2fSBarry Smith idx += 6; 1703f1af5d2fSBarry Smith } 1704f1af5d2fSBarry Smith /* backward solve the L^T */ 1705f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1706f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 1707f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1708f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1709f1af5d2fSBarry Smith idt = 6*i; 1710f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1711f1af5d2fSBarry Smith s6 = t[5+idt]; 1712f1af5d2fSBarry Smith while (nz--) { 1713f1af5d2fSBarry Smith idx = 6*(*vi--); 1714f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1715f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1716f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1717f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1718f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1719f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + 
v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1720f1af5d2fSBarry Smith v -= 36; 1721f1af5d2fSBarry Smith } 1722f1af5d2fSBarry Smith } 1723f1af5d2fSBarry Smith 1724f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1725f1af5d2fSBarry Smith ii = 0; 1726f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1727f1af5d2fSBarry Smith ir = 6*r[i]; 1728f1af5d2fSBarry Smith x[ir] = t[ii]; 1729f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1730f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1731f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1732f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1733f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1734f1af5d2fSBarry Smith ii += 6; 1735f1af5d2fSBarry Smith } 1736f1af5d2fSBarry Smith 1737f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1738f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 17391ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 17401ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1741dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1742f1af5d2fSBarry Smith PetscFunctionReturn(0); 1743f1af5d2fSBarry Smith } 1744f1af5d2fSBarry Smith 17454a2ae208SSatish Balay #undef __FUNCT__ 17464dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 17474dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 174832121132SShri Abhyankar { 174932121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 175032121132SShri Abhyankar PetscErrorCode ierr; 175132121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 175232121132SShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 175332121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 175432121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 175532121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 175632121132SShri Abhyankar MatScalar *aa=a->a,*v; 175732121132SShri Abhyankar PetscScalar 
s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 175832121132SShri Abhyankar PetscScalar *x,*b,*t; 175932121132SShri Abhyankar 176032121132SShri Abhyankar PetscFunctionBegin; 176132121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 176232121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 176332121132SShri Abhyankar t = a->solve_work; 176432121132SShri Abhyankar 176532121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 176632121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 176732121132SShri Abhyankar 176832121132SShri Abhyankar /* copy b into temp work space according to permutation */ 176932121132SShri Abhyankar for(i=0;i<n;i++){ 177032121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 177132121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 177232121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 177332121132SShri Abhyankar } 177432121132SShri Abhyankar 177532121132SShri Abhyankar /* forward solve the U^T */ 177632121132SShri Abhyankar idx = 0; 177732121132SShri Abhyankar for (i=0; i<n; i++) { 177832121132SShri Abhyankar v = aa + bs2*diag[i]; 177932121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 178032121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 178132121132SShri Abhyankar x6 = t[5+idx]; 178232121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 178332121132SShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 178432121132SShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 178532121132SShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 178632121132SShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 178732121132SShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 178832121132SShri 
Abhyankar v -= bs2; 178932121132SShri Abhyankar 179032121132SShri Abhyankar vi = aj + diag[i] - 1; 179132121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 179232121132SShri Abhyankar for(j=0;j>-nz;j--){ 179332121132SShri Abhyankar oidx = bs*vi[j]; 179432121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 179532121132SShri Abhyankar t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 179632121132SShri Abhyankar t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 179732121132SShri Abhyankar t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 179832121132SShri Abhyankar t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 179932121132SShri Abhyankar t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 180032121132SShri Abhyankar v -= bs2; 180132121132SShri Abhyankar } 180232121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 180332121132SShri Abhyankar t[5+idx] = s6; 180432121132SShri Abhyankar idx += bs; 180532121132SShri Abhyankar } 180632121132SShri Abhyankar /* backward solve the L^T */ 180732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 180832121132SShri Abhyankar v = aa + bs2*ai[i]; 180932121132SShri Abhyankar vi = aj + ai[i]; 181032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 181132121132SShri Abhyankar idt = bs*i; 181232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 181332121132SShri Abhyankar s6 = t[5+idt]; 181432121132SShri Abhyankar for(j=0;j<nz;j++){ 181532121132SShri Abhyankar idx = bs*vi[j]; 181632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 181732121132SShri Abhyankar t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 181832121132SShri Abhyankar t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 
181932121132SShri Abhyankar t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 182032121132SShri Abhyankar t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 182132121132SShri Abhyankar t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 182232121132SShri Abhyankar v += bs2; 182332121132SShri Abhyankar } 182432121132SShri Abhyankar } 182532121132SShri Abhyankar 182632121132SShri Abhyankar /* copy t into x according to permutation */ 182732121132SShri Abhyankar for(i=0;i<n;i++){ 182832121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 182932121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 183032121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 183132121132SShri Abhyankar } 183232121132SShri Abhyankar 183332121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 183432121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 183532121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 183632121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 183732121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 183832121132SShri Abhyankar PetscFunctionReturn(0); 183932121132SShri Abhyankar } 184032121132SShri Abhyankar 184132121132SShri Abhyankar #undef __FUNCT__ 184206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 184306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1844f1af5d2fSBarry Smith { 1845f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1846f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 18476849ba73SBarry Smith PetscErrorCode ierr; 18485d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 18495d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1850690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1851f1af5d2fSBarry Smith 
MatScalar *aa=a->a,*v; 185287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 185387828ca2SBarry Smith PetscScalar *x,*b,*t; 1854f1af5d2fSBarry Smith 1855f1af5d2fSBarry Smith PetscFunctionBegin; 18561ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 18571ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1858f1af5d2fSBarry Smith t = a->solve_work; 1859f1af5d2fSBarry Smith 1860f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1861f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1862f1af5d2fSBarry Smith 1863f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1864f1af5d2fSBarry Smith ii = 0; 1865f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1866f1af5d2fSBarry Smith ic = 7*c[i]; 1867f1af5d2fSBarry Smith t[ii] = b[ic]; 1868f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1869f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1870f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1871f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1872f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1873f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1874f1af5d2fSBarry Smith ii += 7; 1875f1af5d2fSBarry Smith } 1876f1af5d2fSBarry Smith 1877f1af5d2fSBarry Smith /* forward solve the U^T */ 1878f1af5d2fSBarry Smith idx = 0; 1879f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1880f1af5d2fSBarry Smith 1881f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1882f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1883f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1884f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1885f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1886f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1887f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1888f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 
+ v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1889f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1890f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1891f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1892f1af5d2fSBarry Smith v += 49; 1893f1af5d2fSBarry Smith 1894f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1895f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1896f1af5d2fSBarry Smith while (nz--) { 1897f1af5d2fSBarry Smith oidx = 7*(*vi++); 1898f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1899f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1900f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1901f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1902f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1903f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1904f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1905f1af5d2fSBarry Smith v += 49; 1906f1af5d2fSBarry Smith } 1907f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1908f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1909f1af5d2fSBarry Smith idx += 7; 1910f1af5d2fSBarry Smith } 1911f1af5d2fSBarry Smith /* backward solve the L^T */ 1912f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1913f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1914f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1915f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1916f1af5d2fSBarry Smith idt = 7*i; 1917f1af5d2fSBarry Smith s1 = t[idt]; 
s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1918f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1919f1af5d2fSBarry Smith while (nz--) { 1920f1af5d2fSBarry Smith idx = 7*(*vi--); 1921f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1922f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1923f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1924f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1925f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1926f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1927f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1928f1af5d2fSBarry Smith v -= 49; 1929f1af5d2fSBarry Smith } 1930f1af5d2fSBarry Smith } 1931f1af5d2fSBarry Smith 1932f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1933f1af5d2fSBarry Smith ii = 0; 1934f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1935f1af5d2fSBarry Smith ir = 7*r[i]; 1936f1af5d2fSBarry Smith x[ir] = t[ii]; 1937f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1938f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1939f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1940f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1941f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1942f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1943f1af5d2fSBarry Smith ii += 7; 1944f1af5d2fSBarry Smith } 1945f1af5d2fSBarry Smith 1946f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1947f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 19481ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 19491ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1950dc0b31edSSatish Balay ierr = 
PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1951f1af5d2fSBarry Smith PetscFunctionReturn(0); 1952f1af5d2fSBarry Smith } 195332121132SShri Abhyankar #undef __FUNCT__ 19544dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 19554dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 195632121132SShri Abhyankar { 195732121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 195832121132SShri Abhyankar PetscErrorCode ierr; 195932121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 196032121132SShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 196132121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 196232121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 196332121132SShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 196432121132SShri Abhyankar MatScalar *aa=a->a,*v; 196532121132SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 196632121132SShri Abhyankar PetscScalar *x,*b,*t; 196732121132SShri Abhyankar 196832121132SShri Abhyankar PetscFunctionBegin; 196932121132SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 197032121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 197132121132SShri Abhyankar t = a->solve_work; 197232121132SShri Abhyankar 197332121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 197432121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 197532121132SShri Abhyankar 197632121132SShri Abhyankar /* copy b into temp work space according to permutation */ 197732121132SShri Abhyankar for(i=0;i<n;i++){ 197832121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 197932121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 198032121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 198132121132SShri Abhyankar } 198232121132SShri Abhyankar 198332121132SShri Abhyankar /* forward solve the U^T */ 
198432121132SShri Abhyankar idx = 0; 198532121132SShri Abhyankar for (i=0; i<n; i++) { 198632121132SShri Abhyankar v = aa + bs2*diag[i]; 198732121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 198832121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 198932121132SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 199032121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 199132121132SShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 199232121132SShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 199332121132SShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 199432121132SShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 199532121132SShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 199632121132SShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 199732121132SShri Abhyankar v -= bs2; 199832121132SShri Abhyankar 199932121132SShri Abhyankar vi = aj + diag[i] - 1; 200032121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 200132121132SShri Abhyankar for(j=0;j>-nz;j--){ 200232121132SShri Abhyankar oidx = bs*vi[j]; 200332121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 200432121132SShri Abhyankar t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 200532121132SShri Abhyankar t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 200632121132SShri Abhyankar t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 200732121132SShri Abhyankar t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 200832121132SShri Abhyankar 
t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 200932121132SShri Abhyankar t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 201032121132SShri Abhyankar v -= bs2; 201132121132SShri Abhyankar } 201232121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 201332121132SShri Abhyankar t[5+idx] = s6; t[6+idx] = s7; 201432121132SShri Abhyankar idx += bs; 201532121132SShri Abhyankar } 201632121132SShri Abhyankar /* backward solve the L^T */ 201732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 201832121132SShri Abhyankar v = aa + bs2*ai[i]; 201932121132SShri Abhyankar vi = aj + ai[i]; 202032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 202132121132SShri Abhyankar idt = bs*i; 202232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 202332121132SShri Abhyankar s6 = t[5+idt]; s7 = t[6+idt]; 202432121132SShri Abhyankar for(j=0;j<nz;j++){ 202532121132SShri Abhyankar idx = bs*vi[j]; 202632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 202732121132SShri Abhyankar t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 202832121132SShri Abhyankar t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 202932121132SShri Abhyankar t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 203032121132SShri Abhyankar t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 203132121132SShri Abhyankar t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 203232121132SShri Abhyankar t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 203332121132SShri Abhyankar v += bs2; 203432121132SShri Abhyankar } 203532121132SShri Abhyankar } 203632121132SShri Abhyankar 203732121132SShri Abhyankar /* 
copy t into x according to permutation */ 203832121132SShri Abhyankar for(i=0;i<n;i++){ 203932121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 204032121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 204132121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 204232121132SShri Abhyankar } 204332121132SShri Abhyankar 204432121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 204532121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 204632121132SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 204732121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 204832121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 204932121132SShri Abhyankar PetscFunctionReturn(0); 205032121132SShri Abhyankar } 2051f1af5d2fSBarry Smith 20524e2b4712SSatish Balay /* ----------------------------------------------------------- */ 20534a2ae208SSatish Balay #undef __FUNCT__ 205406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 205506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 20564e2b4712SSatish Balay { 20574e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 20584e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 20596849ba73SBarry Smith PetscErrorCode ierr; 20605d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 20615d0c19d7SBarry Smith PetscInt i,n=a->mbs; 20625d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 20633f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 206487828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 20654e2b4712SSatish Balay 20664e2b4712SSatish Balay PetscFunctionBegin; 20671ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 20681ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2069f1af5d2fSBarry Smith t = a->solve_work; 20704e2b4712SSatish Balay 20714e2b4712SSatish Balay ierr = 
ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 20724e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 20734e2b4712SSatish Balay 20744e2b4712SSatish Balay /* forward solve the lower triangular */ 207587828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 20764e2b4712SSatish Balay for (i=1; i<n; i++) { 20774e2b4712SSatish Balay v = aa + bs2*ai[i]; 20784e2b4712SSatish Balay vi = aj + ai[i]; 20794e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 2080f1af5d2fSBarry Smith s = t + bs*i; 208187828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 20824e2b4712SSatish Balay while (nz--) { 2083f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 20844e2b4712SSatish Balay v += bs2; 20854e2b4712SSatish Balay } 20864e2b4712SSatish Balay } 20874e2b4712SSatish Balay /* backward solve the upper triangular */ 2088d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 20894e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 20904e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 20914e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 20924e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 209387828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 20944e2b4712SSatish Balay while (nz--) { 2095f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 20964e2b4712SSatish Balay v += bs2; 20974e2b4712SSatish Balay } 2098f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 209987828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 21004e2b4712SSatish Balay } 21014e2b4712SSatish Balay 21024e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21034e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21041ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 21051ebc52fbSHong Zhang ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 2106dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 21074e2b4712SSatish Balay PetscFunctionReturn(0); 21084e2b4712SSatish Balay } 21094e2b4712SSatish Balay 21105c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 21115c42ef9dSBarry Smith #undef __FUNCT__ 211206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 211306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 21145c42ef9dSBarry Smith { 21155c42ef9dSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21165c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 21175c42ef9dSBarry Smith PetscErrorCode ierr; 21185c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 21195c42ef9dSBarry Smith PetscInt i,n=a->mbs,j; 21205c42ef9dSBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 21215c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 21225c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 21235c42ef9dSBarry Smith const PetscScalar *b; 21245c42ef9dSBarry Smith PetscFunctionBegin; 21255c42ef9dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21265c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21275c42ef9dSBarry Smith t = a->solve_work; 21285c42ef9dSBarry Smith 21295c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21305c42ef9dSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 21315c42ef9dSBarry Smith 21325c42ef9dSBarry Smith /* copy the b into temp work space according to permutation */ 21335c42ef9dSBarry Smith for (i=0; i<n; i++) { 21345c42ef9dSBarry Smith for (j=0; j<bs; j++) { 21355c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 21365c42ef9dSBarry Smith } 21375c42ef9dSBarry Smith } 21385c42ef9dSBarry Smith 21395c42ef9dSBarry Smith 21405c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 21415c42ef9dSBarry Smith ls = 
a->solve_work + A->cmap->n; 21425c42ef9dSBarry Smith for (i=0; i<n; i++){ 21435c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 21445c42ef9dSBarry Smith Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 21455c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 21465c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 21475c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 21485c42ef9dSBarry Smith while (nz--) { 21495c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 21505c42ef9dSBarry Smith v += bs2; 21515c42ef9dSBarry Smith } 21525c42ef9dSBarry Smith } 21535c42ef9dSBarry Smith 21545c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 21555c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 21565c42ef9dSBarry Smith v = aa + bs2*ai[i]; 21575c42ef9dSBarry Smith vi = aj + ai[i]; 21585c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 21595c42ef9dSBarry Smith while (nz--) { 21605c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 21615c42ef9dSBarry Smith v += bs2; 21625c42ef9dSBarry Smith } 21635c42ef9dSBarry Smith } 21645c42ef9dSBarry Smith 21655c42ef9dSBarry Smith /* copy t into x according to permutation */ 21665c42ef9dSBarry Smith for (i=0; i<n; i++) { 21675c42ef9dSBarry Smith for (j=0; j<bs; j++) { 21685c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 21695c42ef9dSBarry Smith } 21705c42ef9dSBarry Smith } 21715c42ef9dSBarry Smith 21725c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21735c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21745c42ef9dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21755c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 21765c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 21775c42ef9dSBarry Smith PetscFunctionReturn(0); 21785c42ef9dSBarry Smith } 21795c42ef9dSBarry Smith 21804a2ae208SSatish 
Balay #undef __FUNCT__ 21814dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 21824dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 21838499736aSShri Abhyankar { 21848499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21858499736aSShri Abhyankar IS iscol=a->col,isrow=a->row; 21868499736aSShri Abhyankar PetscErrorCode ierr; 21878499736aSShri Abhyankar const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 21888499736aSShri Abhyankar PetscInt i,n=a->mbs,j; 21898499736aSShri Abhyankar PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 21908499736aSShri Abhyankar const MatScalar *aa=a->a,*v; 21918499736aSShri Abhyankar PetscScalar *x,*t,*ls; 21928499736aSShri Abhyankar const PetscScalar *b; 21938499736aSShri Abhyankar PetscFunctionBegin; 21948499736aSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21958499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21968499736aSShri Abhyankar t = a->solve_work; 21978499736aSShri Abhyankar 21988499736aSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21998499736aSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22008499736aSShri Abhyankar 22018499736aSShri Abhyankar /* copy the b into temp work space according to permutation */ 22028499736aSShri Abhyankar for (i=0; i<n; i++) { 22038499736aSShri Abhyankar for (j=0; j<bs; j++) { 22048499736aSShri Abhyankar t[i*bs+j] = b[c[i]*bs+j]; 22058499736aSShri Abhyankar } 22068499736aSShri Abhyankar } 22078499736aSShri Abhyankar 22088499736aSShri Abhyankar 22098499736aSShri Abhyankar /* forward solve the upper triangular transpose */ 22108499736aSShri Abhyankar ls = a->solve_work + A->cmap->n; 22118499736aSShri Abhyankar for (i=0; i<n; i++){ 22128499736aSShri Abhyankar ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 22138499736aSShri Abhyankar Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 22148499736aSShri 
Abhyankar v = aa + bs2*(diag[i] - 1); 22158499736aSShri Abhyankar vi = aj + diag[i] - 1; 22168499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 22178499736aSShri Abhyankar for(j=0;j>-nz;j--){ 22188499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 22198499736aSShri Abhyankar v -= bs2; 22208499736aSShri Abhyankar } 22218499736aSShri Abhyankar } 22228499736aSShri Abhyankar 22238499736aSShri Abhyankar /* backward solve the lower triangular transpose */ 22248499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 22258499736aSShri Abhyankar v = aa + bs2*ai[i]; 22268499736aSShri Abhyankar vi = aj + ai[i]; 22278499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 22288499736aSShri Abhyankar for(j=0;j<nz;j++){ 22298499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 22308499736aSShri Abhyankar v += bs2; 22318499736aSShri Abhyankar } 22328499736aSShri Abhyankar } 22338499736aSShri Abhyankar 22348499736aSShri Abhyankar /* copy t into x according to permutation */ 22358499736aSShri Abhyankar for (i=0; i<n; i++) { 22368499736aSShri Abhyankar for (j=0; j<bs; j++) { 22378499736aSShri Abhyankar x[bs*r[i]+j] = t[bs*i+j]; 22388499736aSShri Abhyankar } 22398499736aSShri Abhyankar } 22408499736aSShri Abhyankar 22418499736aSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22428499736aSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 22438499736aSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22448499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 22458499736aSShri Abhyankar ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 22468499736aSShri Abhyankar PetscFunctionReturn(0); 22478499736aSShri Abhyankar } 22488499736aSShri Abhyankar 2249*2b0b2ea7SShri Abhyankar /* bs = 15 for PFLOTRAN */ 2250*2b0b2ea7SShri Abhyankar #undef __FUNCT__ 2251*2b0b2ea7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15" 
2252*2b0b2ea7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15(Mat A,Vec bb,Vec xx) 2253*2b0b2ea7SShri Abhyankar { 2254*2b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2255*2b0b2ea7SShri Abhyankar IS iscol=a->col,isrow=a->row; 2256*2b0b2ea7SShri Abhyankar PetscErrorCode ierr; 2257*2b0b2ea7SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi,bs=A->rmap->bs,bs2=a->bs2; 2258*2b0b2ea7SShri Abhyankar PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 2259*2b0b2ea7SShri Abhyankar MatScalar *aa=a->a,*v; 2260*2b0b2ea7SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 2261*2b0b2ea7SShri Abhyankar PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 2262*2b0b2ea7SShri Abhyankar PetscScalar *x,*b,*t; 2263*2b0b2ea7SShri Abhyankar 2264*2b0b2ea7SShri Abhyankar PetscFunctionBegin; 2265*2b0b2ea7SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2266*2b0b2ea7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2267*2b0b2ea7SShri Abhyankar t = a->solve_work; 2268*2b0b2ea7SShri Abhyankar 2269*2b0b2ea7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2270*2b0b2ea7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2271*2b0b2ea7SShri Abhyankar 2272*2b0b2ea7SShri Abhyankar /* forward solve the lower triangular */ 2273*2b0b2ea7SShri Abhyankar idx = bs*r[0]; 2274*2b0b2ea7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2275*2b0b2ea7SShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; t[7] = b[7+idx]; t[8] = b[8+idx]; t[9] = b[9+idx]; 2276*2b0b2ea7SShri Abhyankar t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx]; 2277*2b0b2ea7SShri Abhyankar 2278*2b0b2ea7SShri Abhyankar for (i=1; i<n; i++) { 2279*2b0b2ea7SShri Abhyankar v = aa + bs2*ai[i]; 2280*2b0b2ea7SShri Abhyankar vi = aj + ai[i]; 2281*2b0b2ea7SShri Abhyankar nz = ai[i+1] - ai[i]; 2282*2b0b2ea7SShri Abhyankar idx = 
bs*r[i]; 2283*2b0b2ea7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx]; s5 = b[4+idx]; 2284*2b0b2ea7SShri Abhyankar s6 = b[5+idx]; s7 = b[6+idx]; s8 = b[7+idx]; s9 = b[8+idx]; s10 = b[9+idx]; 2285*2b0b2ea7SShri Abhyankar s11 = b[10+idx]; s12 = b[11+idx]; s13 = b[12+idx]; s14 = b[13+idx]; s15 = b[14+idx]; 2286*2b0b2ea7SShri Abhyankar for(m=0;m<nz;m++){ 2287*2b0b2ea7SShri Abhyankar idx = bs*vi[m]; 2288*2b0b2ea7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2289*2b0b2ea7SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; x8 = t[7+idx]; x9 = t[8+idx]; x10 = t[9+idx]; 2290*2b0b2ea7SShri Abhyankar x11 = t[10+idx]; x12 = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx]; 2291*2b0b2ea7SShri Abhyankar 2292*2b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2293*2b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2294*2b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2295*2b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2296*2b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2297*2b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + 
v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2298*2b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2299*2b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2300*2b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2301*2b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2302*2b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2303*2b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2304*2b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2305*2b0b2ea7SShri Abhyankar s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2306*2b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2307*2b0b2ea7SShri 
Abhyankar 2308*2b0b2ea7SShri Abhyankar v += bs2; 2309*2b0b2ea7SShri Abhyankar } 2310*2b0b2ea7SShri Abhyankar idx = bs*i; 2311*2b0b2ea7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] = s5; 2312*2b0b2ea7SShri Abhyankar t[5+idx] = s6; t[6+idx] = s7; t[7+idx] = s8; t[8+idx] = s9; t[9+idx] = s10; 2313*2b0b2ea7SShri Abhyankar t[10+idx] = s11; t[11+idx] = s12; t[12+idx] = s13; t[13+idx] = s14; t[14+idx] = s15; 2314*2b0b2ea7SShri Abhyankar 2315*2b0b2ea7SShri Abhyankar } 2316*2b0b2ea7SShri Abhyankar /* backward solve the upper triangular */ 2317*2b0b2ea7SShri Abhyankar for (i=n-1; i>=0; i--){ 2318*2b0b2ea7SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 2319*2b0b2ea7SShri Abhyankar vi = aj + adiag[i+1]+1; 2320*2b0b2ea7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 2321*2b0b2ea7SShri Abhyankar idt = bs*i; 2322*2b0b2ea7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2323*2b0b2ea7SShri Abhyankar s6 = t[5+idt]; s7 = t[6+idt]; s8 = t[7+idt]; s9 = t[8+idt]; s10 = t[9+idt]; 2324*2b0b2ea7SShri Abhyankar s11 = t[10+idt]; s12 = t[11+idt]; s13 = t[12+idt]; s14 = t[13+idt]; s15 = t[14+idt]; 2325*2b0b2ea7SShri Abhyankar 2326*2b0b2ea7SShri Abhyankar for(m=0;m<nz;m++){ 2327*2b0b2ea7SShri Abhyankar idx = bs*vi[m]; 2328*2b0b2ea7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2329*2b0b2ea7SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; x8 = t[7+idx]; x9 = t[8+idx]; x10 = t[9+idx]; 2330*2b0b2ea7SShri Abhyankar x11 = t[10+idx]; x12 = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx]; 2331*2b0b2ea7SShri Abhyankar 2332*2b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2333*2b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + 
v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2334*2b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2335*2b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2336*2b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2337*2b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2338*2b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2339*2b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2340*2b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2341*2b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2342*2b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2343*2b0b2ea7SShri 
Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2344*2b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2345*2b0b2ea7SShri Abhyankar s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2346*2b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2347*2b0b2ea7SShri Abhyankar 2348*2b0b2ea7SShri Abhyankar v += bs2; 2349*2b0b2ea7SShri Abhyankar } 2350*2b0b2ea7SShri Abhyankar idc = bs*c[i]; 2351*2b0b2ea7SShri Abhyankar 2352*2b0b2ea7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 2353*2b0b2ea7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 2354*2b0b2ea7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 2355*2b0b2ea7SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 2356*2b0b2ea7SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1 + v[19]*s2 + 
v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 2357*2b0b2ea7SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 2358*2b0b2ea7SShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 2359*2b0b2ea7SShri Abhyankar x[7+idc] = t[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 2360*2b0b2ea7SShri Abhyankar x[8+idc] = t[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 2361*2b0b2ea7SShri Abhyankar x[9+idc] = t[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 2362*2b0b2ea7SShri Abhyankar x[10+idc] = t[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 2363*2b0b2ea7SShri Abhyankar x[11+idc] = t[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 2364*2b0b2ea7SShri Abhyankar x[12+idc] = t[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 
2365*2b0b2ea7SShri Abhyankar x[13+idc] = t[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 2366*2b0b2ea7SShri Abhyankar x[14+idc] = t[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 2367*2b0b2ea7SShri Abhyankar 2368*2b0b2ea7SShri Abhyankar } 2369*2b0b2ea7SShri Abhyankar 2370*2b0b2ea7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2371*2b0b2ea7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2372*2b0b2ea7SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2373*2b0b2ea7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2374*2b0b2ea7SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2375*2b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 2376*2b0b2ea7SShri Abhyankar } 2377*2b0b2ea7SShri Abhyankar 23788499736aSShri Abhyankar #undef __FUNCT__ 237906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 238006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 23814e2b4712SSatish Balay { 23824e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 23834e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 23846849ba73SBarry Smith PetscErrorCode ierr; 23855d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 23865d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 23873f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 238887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 238987828ca2SBarry Smith PetscScalar *x,*b,*t; 23904e2b4712SSatish Balay 23914e2b4712SSatish Balay PetscFunctionBegin; 23921ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 23931ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 2394f1af5d2fSBarry Smith t = a->solve_work; 23954e2b4712SSatish Balay 23964e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 23974e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 23984e2b4712SSatish Balay 23994e2b4712SSatish Balay /* forward solve the lower triangular */ 24004e2b4712SSatish Balay idx = 7*(*r++); 2401f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2402f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2403f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 24044e2b4712SSatish Balay 24054e2b4712SSatish Balay for (i=1; i<n; i++) { 24064e2b4712SSatish Balay v = aa + 49*ai[i]; 24074e2b4712SSatish Balay vi = aj + ai[i]; 24084e2b4712SSatish Balay nz = diag[i] - ai[i]; 24094e2b4712SSatish Balay idx = 7*(*r++); 2410f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2411f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 24124e2b4712SSatish Balay while (nz--) { 24134e2b4712SSatish Balay idx = 7*(*vi++); 2414f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2415f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2416f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 2417f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2418f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2419f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2420f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2421f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2422f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2423f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + 
v[48]*x7; 24244e2b4712SSatish Balay v += 49; 24254e2b4712SSatish Balay } 24264e2b4712SSatish Balay idx = 7*i; 2427f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2428f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2429f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 24304e2b4712SSatish Balay } 24314e2b4712SSatish Balay /* backward solve the upper triangular */ 24324e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 24334e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 24344e2b4712SSatish Balay vi = aj + diag[i] + 1; 24354e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 24364e2b4712SSatish Balay idt = 7*i; 2437f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2438f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2439f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 24404e2b4712SSatish Balay while (nz--) { 24414e2b4712SSatish Balay idx = 7*(*vi++); 2442f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2443f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2444f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 2445f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2446f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2447f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2448f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2449f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2450f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2451f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 24524e2b4712SSatish Balay v += 49; 24534e2b4712SSatish Balay } 24544e2b4712SSatish Balay idc = 7*(*c--); 24554e2b4712SSatish Balay v = aa + 49*diag[i]; 2456f1af5d2fSBarry Smith x[idc] = t[idt] = 
v[0]*s1+v[7]*s2+v[14]*s3+ 2457f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2458f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2459f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2460f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2461f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2462f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2463f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2464f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2465f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2466f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2467f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2468f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2469f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 24704e2b4712SSatish Balay } 24714e2b4712SSatish Balay 24724e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 24734e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 24741ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 24751ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2476dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 24774e2b4712SSatish Balay PetscFunctionReturn(0); 24784e2b4712SSatish Balay } 24794e2b4712SSatish Balay 24808f690400SShri Abhyankar #undef __FUNCT__ 24814dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7" 24824dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 248335aa4fcfSShri Abhyankar { 248435aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 248535aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 248635aa4fcfSShri Abhyankar PetscErrorCode ierr; 248735aa4fcfSShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 248835aa4fcfSShri Abhyankar PetscInt 
i,n=a->mbs,nz,idx,idt,idc,m; 248935aa4fcfSShri Abhyankar MatScalar *aa=a->a,*v; 249035aa4fcfSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 249135aa4fcfSShri Abhyankar PetscScalar *x,*b,*t; 249235aa4fcfSShri Abhyankar 249335aa4fcfSShri Abhyankar PetscFunctionBegin; 249435aa4fcfSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 249535aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 249635aa4fcfSShri Abhyankar t = a->solve_work; 249735aa4fcfSShri Abhyankar 249835aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 249935aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 250035aa4fcfSShri Abhyankar 250135aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 250235aa4fcfSShri Abhyankar idx = 7*r[0]; 250335aa4fcfSShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 250435aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 250535aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 250635aa4fcfSShri Abhyankar 250735aa4fcfSShri Abhyankar for (i=1; i<n; i++) { 250835aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 250935aa4fcfSShri Abhyankar vi = aj + ai[i]; 251035aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 251135aa4fcfSShri Abhyankar idx = 7*r[i]; 251235aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 251335aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 251435aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 251535aa4fcfSShri Abhyankar idx = 7*vi[m]; 251635aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 251735aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 251835aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 251935aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 252035aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 252135aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + 
v[30]*x5 + v[37]*x6 + v[44]*x7; 252235aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 252335aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 252435aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 252535aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 252635aa4fcfSShri Abhyankar v += 49; 252735aa4fcfSShri Abhyankar } 252835aa4fcfSShri Abhyankar idx = 7*i; 252935aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 253035aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 253135aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 253235aa4fcfSShri Abhyankar } 253335aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 253435aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 253535aa4fcfSShri Abhyankar v = aa + 49*(adiag[i+1]+1); 253635aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 253735aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 253835aa4fcfSShri Abhyankar idt = 7*i; 253935aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 254035aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 254135aa4fcfSShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 254235aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 254335aa4fcfSShri Abhyankar idx = 7*vi[m]; 254435aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 254535aa4fcfSShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 254635aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 254735aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 254835aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 254935aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 255035aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 
+ v[45]*x7; 255135aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 255235aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 255335aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 255435aa4fcfSShri Abhyankar v += 49; 255535aa4fcfSShri Abhyankar } 255635aa4fcfSShri Abhyankar idc = 7*c[i]; 255735aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 255835aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 255935aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 256035aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 256135aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 256235aa4fcfSShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 256335aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 256435aa4fcfSShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 256535aa4fcfSShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 256635aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 256735aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 256835aa4fcfSShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 256935aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 257035aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 257135aa4fcfSShri Abhyankar } 257235aa4fcfSShri Abhyankar 257335aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 257435aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 257535aa4fcfSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 257635aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 257735aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 257835aa4fcfSShri Abhyankar PetscFunctionReturn(0); 257935aa4fcfSShri Abhyankar } 258035aa4fcfSShri 
Abhyankar 258135aa4fcfSShri Abhyankar #undef __FUNCT__ 258206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 258306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 258415091d37SBarry Smith { 258515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2586690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2587dfbe8321SBarry Smith PetscErrorCode ierr; 2588690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2589d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2590d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2591d9fead3dSBarry Smith const PetscScalar *b; 259215091d37SBarry Smith 259315091d37SBarry Smith PetscFunctionBegin; 2594d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25951ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 259615091d37SBarry Smith /* forward solve the lower triangular */ 259715091d37SBarry Smith idx = 0; 259815091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 259915091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 260015091d37SBarry Smith x[6] = b[6+idx]; 260115091d37SBarry Smith for (i=1; i<n; i++) { 260215091d37SBarry Smith v = aa + 49*ai[i]; 260315091d37SBarry Smith vi = aj + ai[i]; 260415091d37SBarry Smith nz = diag[i] - ai[i]; 260515091d37SBarry Smith idx = 7*i; 2606f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2607f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2608f1af5d2fSBarry Smith s7 = b[6+idx]; 260915091d37SBarry Smith while (nz--) { 261015091d37SBarry Smith jdx = 7*(*vi++); 261115091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 261215091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 261315091d37SBarry Smith x7 = x[6+jdx]; 2614f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2615f1af5d2fSBarry Smith s2 -= 
v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2616f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2617f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2618f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2619f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2620f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 262115091d37SBarry Smith v += 49; 262215091d37SBarry Smith } 2623f1af5d2fSBarry Smith x[idx] = s1; 2624f1af5d2fSBarry Smith x[1+idx] = s2; 2625f1af5d2fSBarry Smith x[2+idx] = s3; 2626f1af5d2fSBarry Smith x[3+idx] = s4; 2627f1af5d2fSBarry Smith x[4+idx] = s5; 2628f1af5d2fSBarry Smith x[5+idx] = s6; 2629f1af5d2fSBarry Smith x[6+idx] = s7; 263015091d37SBarry Smith } 263115091d37SBarry Smith /* backward solve the upper triangular */ 263215091d37SBarry Smith for (i=n-1; i>=0; i--){ 263315091d37SBarry Smith v = aa + 49*diag[i] + 49; 263415091d37SBarry Smith vi = aj + diag[i] + 1; 263515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 263615091d37SBarry Smith idt = 7*i; 2637f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2638f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 2639f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 2640f1af5d2fSBarry Smith s7 = x[6+idt]; 264115091d37SBarry Smith while (nz--) { 264215091d37SBarry Smith idx = 7*(*vi++); 264315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 264415091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 264515091d37SBarry Smith x7 = x[6+idx]; 2646f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2647f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2648f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + 
v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2649f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2650f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2651f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2652f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 265315091d37SBarry Smith v += 49; 265415091d37SBarry Smith } 265515091d37SBarry Smith v = aa + 49*diag[i]; 2656f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2657f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 2658f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2659f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 2660f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2661f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 2662f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2663f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 2664f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2665f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 2666f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2667f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 2668f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2669f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 267015091d37SBarry Smith } 267115091d37SBarry Smith 2672d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26731ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2674dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 267515091d37SBarry Smith PetscFunctionReturn(0); 267615091d37SBarry Smith } 267715091d37SBarry Smith 2678cee9d6f2SShri Abhyankar #undef __FUNCT__ 26794dd39f65SShri 
Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 26804dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 268153cca76cSShri Abhyankar { 268253cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 268353cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 268453cca76cSShri Abhyankar PetscErrorCode ierr; 268553cca76cSShri Abhyankar PetscInt idx,jdx,idt; 268653cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 268753cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 268853cca76cSShri Abhyankar PetscScalar *x; 268953cca76cSShri Abhyankar const PetscScalar *b; 269053cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 269153cca76cSShri Abhyankar 269253cca76cSShri Abhyankar PetscFunctionBegin; 269353cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 269453cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 269553cca76cSShri Abhyankar /* forward solve the lower triangular */ 269653cca76cSShri Abhyankar idx = 0; 269753cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 269853cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 269953cca76cSShri Abhyankar for (i=1; i<n; i++) { 270053cca76cSShri Abhyankar v = aa + bs2*ai[i]; 270153cca76cSShri Abhyankar vi = aj + ai[i]; 270253cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 270353cca76cSShri Abhyankar idx = bs*i; 270453cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 270553cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 270653cca76cSShri Abhyankar for(k=0;k<nz;k++) { 270753cca76cSShri Abhyankar jdx = bs*vi[k]; 270853cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 270953cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 271053cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 
271153cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 271253cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 271353cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 271453cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 271553cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 271653cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 271753cca76cSShri Abhyankar v += bs2; 271853cca76cSShri Abhyankar } 271953cca76cSShri Abhyankar 272053cca76cSShri Abhyankar x[idx] = s1; 272153cca76cSShri Abhyankar x[1+idx] = s2; 272253cca76cSShri Abhyankar x[2+idx] = s3; 272353cca76cSShri Abhyankar x[3+idx] = s4; 272453cca76cSShri Abhyankar x[4+idx] = s5; 272553cca76cSShri Abhyankar x[5+idx] = s6; 272653cca76cSShri Abhyankar x[6+idx] = s7; 272753cca76cSShri Abhyankar } 272853cca76cSShri Abhyankar 272953cca76cSShri Abhyankar /* backward solve the upper triangular */ 273053cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 273153cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 273253cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 273353cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 273453cca76cSShri Abhyankar idt = bs*i; 273553cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 273653cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 273753cca76cSShri Abhyankar for(k=0;k<nz;k++) { 273853cca76cSShri Abhyankar idx = bs*vi[k]; 273953cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 274053cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 274153cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 274253cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + 
v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 274353cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 274453cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 274553cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 274653cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 274753cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 274853cca76cSShri Abhyankar v += bs2; 274953cca76cSShri Abhyankar } 275053cca76cSShri Abhyankar /* x = inv_diagonal*x */ 275153cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 275253cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 275353cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 275453cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 275553cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 275653cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 275753cca76cSShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 275853cca76cSShri Abhyankar } 275953cca76cSShri Abhyankar 276053cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 276153cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 276253cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 276353cca76cSShri Abhyankar PetscFunctionReturn(0); 276453cca76cSShri Abhyankar } 276553cca76cSShri Abhyankar 276653cca76cSShri Abhyankar #undef __FUNCT__ 
276706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 276806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 276915091d37SBarry Smith { 277015091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 277115091d37SBarry Smith IS iscol=a->col,isrow=a->row; 27726849ba73SBarry Smith PetscErrorCode ierr; 27735d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 27745d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2775d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2776d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2777d9fead3dSBarry Smith const PetscScalar *b; 277815091d37SBarry Smith PetscFunctionBegin; 2779d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27801ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2781f1af5d2fSBarry Smith t = a->solve_work; 278215091d37SBarry Smith 278315091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 278415091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 278515091d37SBarry Smith 278615091d37SBarry Smith /* forward solve the lower triangular */ 278715091d37SBarry Smith idx = 6*(*r++); 2788f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2789f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 2790f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 279115091d37SBarry Smith for (i=1; i<n; i++) { 279215091d37SBarry Smith v = aa + 36*ai[i]; 279315091d37SBarry Smith vi = aj + ai[i]; 279415091d37SBarry Smith nz = diag[i] - ai[i]; 279515091d37SBarry Smith idx = 6*(*r++); 2796f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2797f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 279815091d37SBarry Smith while (nz--) { 279915091d37SBarry Smith idx = 6*(*vi++); 2800f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2801f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = 
t[5+idx]; 2802f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2803f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2804f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2805f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2806f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2807f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 280815091d37SBarry Smith v += 36; 280915091d37SBarry Smith } 281015091d37SBarry Smith idx = 6*i; 2811f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2812f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 2813f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 281415091d37SBarry Smith } 281515091d37SBarry Smith /* backward solve the upper triangular */ 281615091d37SBarry Smith for (i=n-1; i>=0; i--){ 281715091d37SBarry Smith v = aa + 36*diag[i] + 36; 281815091d37SBarry Smith vi = aj + diag[i] + 1; 281915091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 282015091d37SBarry Smith idt = 6*i; 2821f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2822f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 2823f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 282415091d37SBarry Smith while (nz--) { 282515091d37SBarry Smith idx = 6*(*vi++); 2826f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2827f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2828f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 2829f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2830f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2831f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2832f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2833f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + 
v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2834f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 283515091d37SBarry Smith v += 36; 283615091d37SBarry Smith } 283715091d37SBarry Smith idc = 6*(*c--); 283815091d37SBarry Smith v = aa + 36*diag[i]; 2839f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2840f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 2841f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2842f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 2843f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2844f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 2845f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2846f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 2847f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2848f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 2849f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2850f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 285115091d37SBarry Smith } 285215091d37SBarry Smith 285315091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 285415091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2855d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28561ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2857dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 285815091d37SBarry Smith PetscFunctionReturn(0); 285915091d37SBarry Smith } 286015091d37SBarry Smith 28616506fda5SShri Abhyankar #undef __FUNCT__ 28624dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6" 28634dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 28646506fda5SShri Abhyankar { 28656506fda5SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 28666506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 28676506fda5SShri Abhyankar PetscErrorCode ierr; 
28686506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 28696506fda5SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 28706506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 28716506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 28726506fda5SShri Abhyankar const PetscScalar *b; 28736506fda5SShri Abhyankar PetscFunctionBegin; 28746506fda5SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28756506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28766506fda5SShri Abhyankar t = a->solve_work; 28776506fda5SShri Abhyankar 28786506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 28796506fda5SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 28806506fda5SShri Abhyankar 28816506fda5SShri Abhyankar /* forward solve the lower triangular */ 28826506fda5SShri Abhyankar idx = 6*r[0]; 28836506fda5SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 28846506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 28856506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 28866506fda5SShri Abhyankar for (i=1; i<n; i++) { 28876506fda5SShri Abhyankar v = aa + 36*ai[i]; 28886506fda5SShri Abhyankar vi = aj + ai[i]; 28896506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 28906506fda5SShri Abhyankar idx = 6*r[i]; 28916506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 28926506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 28936506fda5SShri Abhyankar for(m=0;m<nz;m++){ 28946506fda5SShri Abhyankar idx = 6*vi[m]; 28956506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 28966506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 28976506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 28986506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 28996506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + 
v[20]*x4 + v[26]*x5 + v[32]*x6; 29006506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 29016506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 29026506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 29036506fda5SShri Abhyankar v += 36; 29046506fda5SShri Abhyankar } 29056506fda5SShri Abhyankar idx = 6*i; 29066506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 29076506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 29086506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 29096506fda5SShri Abhyankar } 29106506fda5SShri Abhyankar /* backward solve the upper triangular */ 29116506fda5SShri Abhyankar for (i=n-1; i>=0; i--){ 29126506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 29136506fda5SShri Abhyankar vi = aj + adiag[i+1]+1; 29146506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 29156506fda5SShri Abhyankar idt = 6*i; 29166506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 29176506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 29186506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 29196506fda5SShri Abhyankar for(m=0;m<nz;m++){ 29206506fda5SShri Abhyankar idx = 6*vi[m]; 29216506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 29226506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 29236506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 29246506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 29256506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 29266506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 29276506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 29286506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 29296506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 29306506fda5SShri 
Abhyankar v += 36; 29316506fda5SShri Abhyankar } 29326506fda5SShri Abhyankar idc = 6*c[i]; 29336506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 29346506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 29356506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 29366506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 29376506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 29386506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 29396506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 29406506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 29416506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 29426506fda5SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 29436506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 29446506fda5SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 29456506fda5SShri Abhyankar } 29466506fda5SShri Abhyankar 29476506fda5SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 29486506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 29496506fda5SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29506506fda5SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 29516506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 29526506fda5SShri Abhyankar PetscFunctionReturn(0); 29536506fda5SShri Abhyankar } 29548f690400SShri Abhyankar 29558f690400SShri Abhyankar #undef __FUNCT__ 295606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 295706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 295815091d37SBarry Smith { 295915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2960690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2961dfbe8321SBarry Smith PetscErrorCode ierr; 2962690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2963d9fead3dSBarry Smith 
const MatScalar *aa=a->a,*v; 2964d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2965d9fead3dSBarry Smith const PetscScalar *b; 296615091d37SBarry Smith 296715091d37SBarry Smith PetscFunctionBegin; 2968d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29691ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 297015091d37SBarry Smith /* forward solve the lower triangular */ 297115091d37SBarry Smith idx = 0; 297215091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 297315091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 297415091d37SBarry Smith for (i=1; i<n; i++) { 297515091d37SBarry Smith v = aa + 36*ai[i]; 297615091d37SBarry Smith vi = aj + ai[i]; 297715091d37SBarry Smith nz = diag[i] - ai[i]; 297815091d37SBarry Smith idx = 6*i; 2979f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2980f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 298115091d37SBarry Smith while (nz--) { 298215091d37SBarry Smith jdx = 6*(*vi++); 298315091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 298415091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2985f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2986f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2987f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2988f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2989f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2990f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 299115091d37SBarry Smith v += 36; 299215091d37SBarry Smith } 2993f1af5d2fSBarry Smith x[idx] = s1; 2994f1af5d2fSBarry Smith x[1+idx] = s2; 2995f1af5d2fSBarry Smith x[2+idx] = s3; 2996f1af5d2fSBarry Smith x[3+idx] = s4; 2997f1af5d2fSBarry 
Smith x[4+idx] = s5; 2998f1af5d2fSBarry Smith x[5+idx] = s6; 299915091d37SBarry Smith } 300015091d37SBarry Smith /* backward solve the upper triangular */ 300115091d37SBarry Smith for (i=n-1; i>=0; i--){ 300215091d37SBarry Smith v = aa + 36*diag[i] + 36; 300315091d37SBarry Smith vi = aj + diag[i] + 1; 300415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 300515091d37SBarry Smith idt = 6*i; 3006f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3007f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 3008f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 300915091d37SBarry Smith while (nz--) { 301015091d37SBarry Smith idx = 6*(*vi++); 301115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 301215091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3013f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3014f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3015f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3016f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3017f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3018f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 301915091d37SBarry Smith v += 36; 302015091d37SBarry Smith } 302115091d37SBarry Smith v = aa + 36*diag[i]; 3022f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3023f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3024f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3025f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3026f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3027f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + 
v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 302815091d37SBarry Smith } 302915091d37SBarry Smith 3030d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30311ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3032dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 303315091d37SBarry Smith PetscFunctionReturn(0); 303415091d37SBarry Smith } 303515091d37SBarry Smith 3036cee9d6f2SShri Abhyankar #undef __FUNCT__ 30374dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 30384dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 303953cca76cSShri Abhyankar { 304053cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 304153cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 304253cca76cSShri Abhyankar PetscErrorCode ierr; 304353cca76cSShri Abhyankar PetscInt idx,jdx,idt; 304453cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 304553cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 304653cca76cSShri Abhyankar PetscScalar *x; 304753cca76cSShri Abhyankar const PetscScalar *b; 304853cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 304953cca76cSShri Abhyankar 305053cca76cSShri Abhyankar PetscFunctionBegin; 305153cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 305253cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 305353cca76cSShri Abhyankar /* forward solve the lower triangular */ 305453cca76cSShri Abhyankar idx = 0; 305553cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 305653cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 305753cca76cSShri Abhyankar for (i=1; i<n; i++) { 305853cca76cSShri Abhyankar v = aa + bs2*ai[i]; 305953cca76cSShri Abhyankar vi = aj + ai[i]; 306053cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 306153cca76cSShri Abhyankar idx = bs*i; 306253cca76cSShri 
Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 306353cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 306453cca76cSShri Abhyankar for(k=0;k<nz;k++){ 306553cca76cSShri Abhyankar jdx = bs*vi[k]; 306653cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 306753cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 306853cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 306953cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 307053cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 307153cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 307253cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 307353cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 307453cca76cSShri Abhyankar v += bs2; 307553cca76cSShri Abhyankar } 307653cca76cSShri Abhyankar 307753cca76cSShri Abhyankar x[idx] = s1; 307853cca76cSShri Abhyankar x[1+idx] = s2; 307953cca76cSShri Abhyankar x[2+idx] = s3; 308053cca76cSShri Abhyankar x[3+idx] = s4; 308153cca76cSShri Abhyankar x[4+idx] = s5; 308253cca76cSShri Abhyankar x[5+idx] = s6; 308353cca76cSShri Abhyankar } 308453cca76cSShri Abhyankar 308553cca76cSShri Abhyankar /* backward solve the upper triangular */ 308653cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 308753cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 308853cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 308953cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 309053cca76cSShri Abhyankar idt = bs*i; 309153cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 309253cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 309353cca76cSShri Abhyankar for(k=0;k<nz;k++){ 309453cca76cSShri Abhyankar idx = bs*vi[k]; 309553cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 
309653cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 309753cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 309853cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 309953cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 310053cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 310153cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 310253cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 310353cca76cSShri Abhyankar v += bs2; 310453cca76cSShri Abhyankar } 310553cca76cSShri Abhyankar /* x = inv_diagonal*x */ 310653cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 310753cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 310853cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 310953cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 311053cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 311153cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 311253cca76cSShri Abhyankar } 311353cca76cSShri Abhyankar 311453cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 311553cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 311653cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 311753cca76cSShri Abhyankar PetscFunctionReturn(0); 311853cca76cSShri Abhyankar } 311953cca76cSShri Abhyankar 312053cca76cSShri Abhyankar #undef __FUNCT__ 312106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 312206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec 
xx) 31234e2b4712SSatish Balay { 31244e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 31254e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 31266849ba73SBarry Smith PetscErrorCode ierr; 31275d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 31285d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3129d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3130d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3131d9fead3dSBarry Smith const PetscScalar *b; 31324e2b4712SSatish Balay 31334e2b4712SSatish Balay PetscFunctionBegin; 3134d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 31351ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3136f1af5d2fSBarry Smith t = a->solve_work; 31374e2b4712SSatish Balay 31384e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 31394e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 31404e2b4712SSatish Balay 31414e2b4712SSatish Balay /* forward solve the lower triangular */ 31424e2b4712SSatish Balay idx = 5*(*r++); 3143f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3144f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 31454e2b4712SSatish Balay for (i=1; i<n; i++) { 31464e2b4712SSatish Balay v = aa + 25*ai[i]; 31474e2b4712SSatish Balay vi = aj + ai[i]; 31484e2b4712SSatish Balay nz = diag[i] - ai[i]; 31494e2b4712SSatish Balay idx = 5*(*r++); 3150f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3151f1af5d2fSBarry Smith s5 = b[4+idx]; 31524e2b4712SSatish Balay while (nz--) { 31534e2b4712SSatish Balay idx = 5*(*vi++); 3154f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3155f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 3156f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3157f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3158f1af5d2fSBarry 
Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3159f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3160f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 31614e2b4712SSatish Balay v += 25; 31624e2b4712SSatish Balay } 31634e2b4712SSatish Balay idx = 5*i; 3164f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3165f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 31664e2b4712SSatish Balay } 31674e2b4712SSatish Balay /* backward solve the upper triangular */ 31684e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 31694e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 31704e2b4712SSatish Balay vi = aj + diag[i] + 1; 31714e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 31724e2b4712SSatish Balay idt = 5*i; 3173f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3174f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 31754e2b4712SSatish Balay while (nz--) { 31764e2b4712SSatish Balay idx = 5*(*vi++); 3177f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3178f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3179f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3180f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3181f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3182f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3183f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 31844e2b4712SSatish Balay v += 25; 31854e2b4712SSatish Balay } 31864e2b4712SSatish Balay idc = 5*(*c--); 31874e2b4712SSatish Balay v = aa + 25*diag[i]; 3188f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3189f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 3190f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3191f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 3192f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 
3193f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 3194f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3195f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 3196f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3197f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 31984e2b4712SSatish Balay } 31994e2b4712SSatish Balay 32004e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 32014e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3202d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 32031ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3204dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 32054e2b4712SSatish Balay PetscFunctionReturn(0); 32064e2b4712SSatish Balay } 32074e2b4712SSatish Balay 320878bb4007SShri Abhyankar #undef __FUNCT__ 32094dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5" 32104dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 321178bb4007SShri Abhyankar { 321278bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 321378bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 321478bb4007SShri Abhyankar PetscErrorCode ierr; 321578bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 321678bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 321778bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 321878bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 321978bb4007SShri Abhyankar const PetscScalar *b; 322078bb4007SShri Abhyankar 322178bb4007SShri Abhyankar PetscFunctionBegin; 322278bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 322378bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 322478bb4007SShri Abhyankar t = a->solve_work; 322578bb4007SShri Abhyankar 322678bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 322778bb4007SShri 
Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 322878bb4007SShri Abhyankar 322978bb4007SShri Abhyankar /* forward solve the lower triangular */ 323078bb4007SShri Abhyankar idx = 5*r[0]; 323178bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 323278bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 323378bb4007SShri Abhyankar for (i=1; i<n; i++) { 323478bb4007SShri Abhyankar v = aa + 25*ai[i]; 323578bb4007SShri Abhyankar vi = aj + ai[i]; 323678bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 323778bb4007SShri Abhyankar idx = 5*r[i]; 323878bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 323978bb4007SShri Abhyankar s5 = b[4+idx]; 324078bb4007SShri Abhyankar for(m=0;m<nz;m++){ 324178bb4007SShri Abhyankar idx = 5*vi[m]; 324278bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 324378bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 324478bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 324578bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 324678bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 324778bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 324878bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 324978bb4007SShri Abhyankar v += 25; 325078bb4007SShri Abhyankar } 325178bb4007SShri Abhyankar idx = 5*i; 325278bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 325378bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 325478bb4007SShri Abhyankar } 325578bb4007SShri Abhyankar /* backward solve the upper triangular */ 325678bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 325778bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 325878bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 325978bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 326078bb4007SShri Abhyankar idt = 5*i; 326178bb4007SShri Abhyankar s1 = t[idt]; s2 = 
t[1+idt]; 326278bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 326378bb4007SShri Abhyankar for(m=0;m<nz;m++){ 326478bb4007SShri Abhyankar idx = 5*vi[m]; 326578bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 326678bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 326778bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 326878bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 326978bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 327078bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 327178bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 327278bb4007SShri Abhyankar v += 25; 327378bb4007SShri Abhyankar } 327478bb4007SShri Abhyankar idc = 5*c[i]; 327578bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 327678bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 327778bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 327878bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 327978bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 328078bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 328178bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 328278bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 328378bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 328478bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 328578bb4007SShri Abhyankar } 328678bb4007SShri Abhyankar 328778bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 328878bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 328978bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 329078bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 329178bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 329278bb4007SShri Abhyankar PetscFunctionReturn(0); 329378bb4007SShri 
Abhyankar } 329478bb4007SShri Abhyankar 32958f690400SShri Abhyankar #undef __FUNCT__ 329606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 329706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 329815091d37SBarry Smith { 329915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3300690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 3301dfbe8321SBarry Smith PetscErrorCode ierr; 3302690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 3303d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3304d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3305d9fead3dSBarry Smith const PetscScalar *b; 330615091d37SBarry Smith 330715091d37SBarry Smith PetscFunctionBegin; 3308d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 33091ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 331015091d37SBarry Smith /* forward solve the lower triangular */ 331115091d37SBarry Smith idx = 0; 331215091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 331315091d37SBarry Smith for (i=1; i<n; i++) { 331415091d37SBarry Smith v = aa + 25*ai[i]; 331515091d37SBarry Smith vi = aj + ai[i]; 331615091d37SBarry Smith nz = diag[i] - ai[i]; 331715091d37SBarry Smith idx = 5*i; 3318f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 331915091d37SBarry Smith while (nz--) { 332015091d37SBarry Smith jdx = 5*(*vi++); 332115091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3322f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3323f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3324f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3325f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3326f1af5d2fSBarry Smith s5 -= v[4]*x1 
+ v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 332715091d37SBarry Smith v += 25; 332815091d37SBarry Smith } 3329f1af5d2fSBarry Smith x[idx] = s1; 3330f1af5d2fSBarry Smith x[1+idx] = s2; 3331f1af5d2fSBarry Smith x[2+idx] = s3; 3332f1af5d2fSBarry Smith x[3+idx] = s4; 3333f1af5d2fSBarry Smith x[4+idx] = s5; 333415091d37SBarry Smith } 333515091d37SBarry Smith /* backward solve the upper triangular */ 333615091d37SBarry Smith for (i=n-1; i>=0; i--){ 333715091d37SBarry Smith v = aa + 25*diag[i] + 25; 333815091d37SBarry Smith vi = aj + diag[i] + 1; 333915091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 334015091d37SBarry Smith idt = 5*i; 3341f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3342f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 334315091d37SBarry Smith while (nz--) { 334415091d37SBarry Smith idx = 5*(*vi++); 334515091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3346f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3347f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3348f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3349f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3350f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 335115091d37SBarry Smith v += 25; 335215091d37SBarry Smith } 335315091d37SBarry Smith v = aa + 25*diag[i]; 3354f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3355f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3356f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3357f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3358f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 335915091d37SBarry Smith } 336015091d37SBarry Smith 3361d9fead3dSBarry Smith ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 33621ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3363dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 336415091d37SBarry Smith PetscFunctionReturn(0); 336515091d37SBarry Smith } 336615091d37SBarry Smith 3367cee9d6f2SShri Abhyankar #undef __FUNCT__ 33684dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 33694dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 337053cca76cSShri Abhyankar { 337153cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 337253cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 337353cca76cSShri Abhyankar PetscErrorCode ierr; 337453cca76cSShri Abhyankar PetscInt jdx; 337553cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 337653cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 337753cca76cSShri Abhyankar const PetscScalar *b; 337853cca76cSShri Abhyankar 337953cca76cSShri Abhyankar PetscFunctionBegin; 338053cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 338153cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 338253cca76cSShri Abhyankar /* forward solve the lower triangular */ 338353cca76cSShri Abhyankar idx = 0; 338453cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 338553cca76cSShri Abhyankar for (i=1; i<n; i++) { 338653cca76cSShri Abhyankar v = aa + 25*ai[i]; 338753cca76cSShri Abhyankar vi = aj + ai[i]; 338853cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 338953cca76cSShri Abhyankar idx = 5*i; 339053cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 339153cca76cSShri Abhyankar for(k=0;k<nz;k++) { 339253cca76cSShri Abhyankar jdx = 5*vi[k]; 339353cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 339453cca76cSShri 
Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 339553cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 339653cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 339753cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 339853cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 339953cca76cSShri Abhyankar v += 25; 340053cca76cSShri Abhyankar } 340153cca76cSShri Abhyankar x[idx] = s1; 340253cca76cSShri Abhyankar x[1+idx] = s2; 340353cca76cSShri Abhyankar x[2+idx] = s3; 340453cca76cSShri Abhyankar x[3+idx] = s4; 340553cca76cSShri Abhyankar x[4+idx] = s5; 340653cca76cSShri Abhyankar } 340753cca76cSShri Abhyankar 340853cca76cSShri Abhyankar /* backward solve the upper triangular */ 340953cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 341053cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 341153cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 341253cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 341353cca76cSShri Abhyankar idt = 5*i; 341453cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 341553cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 341653cca76cSShri Abhyankar for(k=0;k<nz;k++){ 341753cca76cSShri Abhyankar idx = 5*vi[k]; 341853cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 341953cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 342053cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 342153cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 342253cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 342353cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 342453cca76cSShri Abhyankar v += 25; 342553cca76cSShri Abhyankar } 342653cca76cSShri Abhyankar /* x = inv_diagonal*x */ 342753cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 
+ v[10]*s3 + v[15]*s4 + v[20]*s5; 342853cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 342953cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 343053cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 343153cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 343253cca76cSShri Abhyankar } 343353cca76cSShri Abhyankar 343453cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 343553cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 343653cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 343753cca76cSShri Abhyankar PetscFunctionReturn(0); 343853cca76cSShri Abhyankar } 343953cca76cSShri Abhyankar 344053cca76cSShri Abhyankar #undef __FUNCT__ 344106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 344206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 34434e2b4712SSatish Balay { 34444e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 34454e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 34466849ba73SBarry Smith PetscErrorCode ierr; 34475d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 34485d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3449d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3450d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3451d9fead3dSBarry Smith const PetscScalar *b; 34524e2b4712SSatish Balay 34534e2b4712SSatish Balay PetscFunctionBegin; 3454d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 34551ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3456f1af5d2fSBarry Smith t = a->solve_work; 34574e2b4712SSatish Balay 34584e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 34594e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c 
= cout + (n-1); 34604e2b4712SSatish Balay 34614e2b4712SSatish Balay /* forward solve the lower triangular */ 34624e2b4712SSatish Balay idx = 4*(*r++); 3463f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3464f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 34654e2b4712SSatish Balay for (i=1; i<n; i++) { 34664e2b4712SSatish Balay v = aa + 16*ai[i]; 34674e2b4712SSatish Balay vi = aj + ai[i]; 34684e2b4712SSatish Balay nz = diag[i] - ai[i]; 34694e2b4712SSatish Balay idx = 4*(*r++); 3470f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 34714e2b4712SSatish Balay while (nz--) { 34724e2b4712SSatish Balay idx = 4*(*vi++); 3473f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3474f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3475f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3476f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3477f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 34784e2b4712SSatish Balay v += 16; 34794e2b4712SSatish Balay } 34804e2b4712SSatish Balay idx = 4*i; 3481f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3482f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 34834e2b4712SSatish Balay } 34844e2b4712SSatish Balay /* backward solve the upper triangular */ 34854e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 34864e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 34874e2b4712SSatish Balay vi = aj + diag[i] + 1; 34884e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 34894e2b4712SSatish Balay idt = 4*i; 3490f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3491f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 34924e2b4712SSatish Balay while (nz--) { 34934e2b4712SSatish Balay idx = 4*(*vi++); 3494f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3495f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 3496f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3497f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + 
v[13]*x4; 3498f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3499f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 35004e2b4712SSatish Balay v += 16; 35014e2b4712SSatish Balay } 35024e2b4712SSatish Balay idc = 4*(*c--); 35034e2b4712SSatish Balay v = aa + 16*diag[i]; 3504f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3505f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3506f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3507f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 35084e2b4712SSatish Balay } 35094e2b4712SSatish Balay 35104e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 35114e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3512d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 35131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3514dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 35154e2b4712SSatish Balay PetscFunctionReturn(0); 35164e2b4712SSatish Balay } 3517f26ec98cSKris Buschelman 35188f690400SShri Abhyankar #undef __FUNCT__ 35194dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4" 35204dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 352178bb4007SShri Abhyankar { 352278bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 352378bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 352478bb4007SShri Abhyankar PetscErrorCode ierr; 352578bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 352678bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 352778bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 352878bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 352978bb4007SShri Abhyankar const PetscScalar *b; 353078bb4007SShri Abhyankar 353178bb4007SShri Abhyankar 
PetscFunctionBegin; 353278bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 353378bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 353478bb4007SShri Abhyankar t = a->solve_work; 353578bb4007SShri Abhyankar 353678bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 353778bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 353878bb4007SShri Abhyankar 353978bb4007SShri Abhyankar /* forward solve the lower triangular */ 354078bb4007SShri Abhyankar idx = 4*r[0]; 354178bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 354278bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 354378bb4007SShri Abhyankar for (i=1; i<n; i++) { 354478bb4007SShri Abhyankar v = aa + 16*ai[i]; 354578bb4007SShri Abhyankar vi = aj + ai[i]; 354678bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 354778bb4007SShri Abhyankar idx = 4*r[i]; 354878bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 354978bb4007SShri Abhyankar for(m=0;m<nz;m++){ 355078bb4007SShri Abhyankar idx = 4*vi[m]; 355178bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 355278bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 355378bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 355478bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 355578bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 355678bb4007SShri Abhyankar v += 16; 355778bb4007SShri Abhyankar } 355878bb4007SShri Abhyankar idx = 4*i; 355978bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 356078bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 356178bb4007SShri Abhyankar } 356278bb4007SShri Abhyankar /* backward solve the upper triangular */ 356378bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 356478bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 356578bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 356678bb4007SShri Abhyankar nz = adiag[i] - 
adiag[i+1] - 1; 356778bb4007SShri Abhyankar idt = 4*i; 356878bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 356978bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 357078bb4007SShri Abhyankar for(m=0;m<nz;m++){ 357178bb4007SShri Abhyankar idx = 4*vi[m]; 357278bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 357378bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 357478bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 357578bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 357678bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 357778bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 357878bb4007SShri Abhyankar v += 16; 357978bb4007SShri Abhyankar } 358078bb4007SShri Abhyankar idc = 4*c[i]; 358178bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 358278bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 358378bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 358478bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 358578bb4007SShri Abhyankar } 358678bb4007SShri Abhyankar 358778bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 358878bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 358978bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 359078bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 359178bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 359278bb4007SShri Abhyankar PetscFunctionReturn(0); 359378bb4007SShri Abhyankar } 359478bb4007SShri Abhyankar 359578bb4007SShri Abhyankar #undef __FUNCT__ 3596f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3597dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3598f26ec98cSKris Buschelman { 3599f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ 
*)A->data; 3600f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 36016849ba73SBarry Smith PetscErrorCode ierr; 36025d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 36035d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3604d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3605d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3606d9fead3dSBarry Smith PetscScalar *x; 3607d9fead3dSBarry Smith const PetscScalar *b; 3608f26ec98cSKris Buschelman 3609f26ec98cSKris Buschelman PetscFunctionBegin; 3610d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 36111ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3612f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 3613f26ec98cSKris Buschelman 3614f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3615f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3616f26ec98cSKris Buschelman 3617f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3618f26ec98cSKris Buschelman idx = 4*(*r++); 3619f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 3620f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 3621f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 3622f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 3623f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3624f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3625f26ec98cSKris Buschelman vi = aj + ai[i]; 3626f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3627f26ec98cSKris Buschelman idx = 4*(*r++); 3628f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3629f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3630f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3631f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3632f26ec98cSKris Buschelman while (nz--) { 3633f26ec98cSKris Buschelman idx = 4*(*vi++); 3634f26ec98cSKris Buschelman x1 = t[idx]; 3635f26ec98cSKris Buschelman x2 = t[1+idx]; 3636f26ec98cSKris 
Buschelman x3 = t[2+idx]; 3637f26ec98cSKris Buschelman x4 = t[3+idx]; 3638f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3639f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3640f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3641f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3642f26ec98cSKris Buschelman v += 16; 3643f26ec98cSKris Buschelman } 3644f26ec98cSKris Buschelman idx = 4*i; 3645f26ec98cSKris Buschelman t[idx] = s1; 3646f26ec98cSKris Buschelman t[1+idx] = s2; 3647f26ec98cSKris Buschelman t[2+idx] = s3; 3648f26ec98cSKris Buschelman t[3+idx] = s4; 3649f26ec98cSKris Buschelman } 3650f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3651f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3652f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 3653f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3654f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3655f26ec98cSKris Buschelman idt = 4*i; 3656f26ec98cSKris Buschelman s1 = t[idt]; 3657f26ec98cSKris Buschelman s2 = t[1+idt]; 3658f26ec98cSKris Buschelman s3 = t[2+idt]; 3659f26ec98cSKris Buschelman s4 = t[3+idt]; 3660f26ec98cSKris Buschelman while (nz--) { 3661f26ec98cSKris Buschelman idx = 4*(*vi++); 3662f26ec98cSKris Buschelman x1 = t[idx]; 3663f26ec98cSKris Buschelman x2 = t[1+idx]; 3664f26ec98cSKris Buschelman x3 = t[2+idx]; 3665f26ec98cSKris Buschelman x4 = t[3+idx]; 3666f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3667f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3668f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3669f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3670f26ec98cSKris Buschelman v += 16; 3671f26ec98cSKris Buschelman } 3672f26ec98cSKris Buschelman idc = 4*(*c--); 3673f26ec98cSKris Buschelman v = aa + 16*diag[i]; 3674f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 
#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - forward and backward block triangular
   solves for block size 4 with row and column permutations, using the SSE
   macros on single precision data.

   Input:   A  - matrix whose data holds the factors and the index sets a->row, a->col
            bb - right-hand side
   Output:  xx - solution

   The right-hand side is gathered through the row index set, both sweeps keep
   their intermediate blocks in the work array a->solve_work (t), and each
   finished block is streamed into xx at the position given by the column index
   set.  Operands are converted to float in the 16-byte aligned scratch arrays
   tmps/tmpx before each SSE section; XMM7 holds the running 4-vector of the
   current block row.

   NOTE(review): ai[1] and diag[n-1] are read unconditionally, so this routine
   appears to assume at least one block row - confirm callers never pass an
   empty matrix.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  /*
     Note: This code uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
  const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
  MatScalar      *aa=a->a,*v;
  PetscScalar    *x,*b,*t;

  /* Make space in temp stack for 16 Byte Aligned arrays */
  float           ssealignedspace[11],*tmps,*tmpx;
  unsigned long   offset;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

    /* Skip up to 3 floats so that tmps and tmpx (4 floats each) start on 16 byte boundaries */
    offset = (unsigned long)ssealignedspace % 16;
    if (offset) offset = (16 - offset)/4;
    tmps = &ssealignedspace[offset];
    tmpx = &ssealignedspace[offset+4];
    PREFETCH_NTA(aa+16*ai[1]);

    ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
    ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
    t  = a->solve_work;

    ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
    ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

    /* forward solve the lower triangular */
    idx  = 4*(*r++);
    t[0] = b[idx];   t[1] = b[1+idx];
    t[2] = b[2+idx]; t[3] = b[3+idx];
    v    =  aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx  =  4*(*r++);

      /* Demote sum from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /*
           Demote solution (so far) from double to float.
           The forward sweep stores its results in the work array t (see the
           promotion at the end of this loop body); x is not written until the
           backward sweep, so the previously solved blocks must be read from t.
        */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      idx = 4*i;
      v   = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(tmps,XMM7);

      /* Promote result from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
    }
    /* backward solve the upper triangular */
    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;){
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      /* Demote accumulator from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      v    = aa + ai16;
      /*
         Look ahead to the next row to be processed (i-1); its offset is only
         used for prefetching below and as the starting point of the next
         iteration.  After row 0 there is no next row, so keep the current
         offset rather than reading diag[-1].
      */
      if (--i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      /* Promote solution from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

      /* Apply reordering to t and stream into x.    */
      /* This way, x doesn't pollute the cache.      */
      /* Be careful with size: 2 doubles = 4 floats! */
      idc  = 4*(*c--);
      SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
        /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
        /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
      SSE_INLINE_END_2
      v    = aa + ai16 + 16;
      idt -= 4;
    }

    ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
    ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
    ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif

39194e2b4712SSatish Balay */ 39204a2ae208SSatish Balay #undef __FUNCT__ 392106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 392206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 39234e2b4712SSatish Balay { 39244e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3925356650c2SBarry Smith PetscInt n=a->mbs; 3926356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 3927dfbe8321SBarry Smith PetscErrorCode ierr; 3928356650c2SBarry Smith const PetscInt *diag = a->diag; 3929d9fead3dSBarry Smith const MatScalar *aa=a->a; 3930d9fead3dSBarry Smith PetscScalar *x; 3931d9fead3dSBarry Smith const PetscScalar *b; 39324e2b4712SSatish Balay 39334e2b4712SSatish Balay PetscFunctionBegin; 3934d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39351ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 39364e2b4712SSatish Balay 3937aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 39382853dc0eSBarry Smith { 393987828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 39402853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 39412853dc0eSBarry Smith } 3942aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 39432853dc0eSBarry Smith { 394487828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 39452853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 39462853dc0eSBarry Smith } 3947aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 39482853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3949e1293385SBarry Smith #else 395030d4dcafSBarry Smith { 395187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3952d9fead3dSBarry Smith const MatScalar *v; 3953356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 3954356650c2SBarry Smith const PetscInt *vi; 3955e1293385SBarry Smith 39564e2b4712SSatish Balay /* forward solve 
the lower triangular */ 39574e2b4712SSatish Balay idx = 0; 3958e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 39594e2b4712SSatish Balay for (i=1; i<n; i++) { 39604e2b4712SSatish Balay v = aa + 16*ai[i]; 39614e2b4712SSatish Balay vi = aj + ai[i]; 39624e2b4712SSatish Balay nz = diag[i] - ai[i]; 3963e1293385SBarry Smith idx += 4; 3964f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 39654e2b4712SSatish Balay while (nz--) { 39664e2b4712SSatish Balay jdx = 4*(*vi++); 39674e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3968f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3969f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3970f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3971f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 39724e2b4712SSatish Balay v += 16; 39734e2b4712SSatish Balay } 3974f1af5d2fSBarry Smith x[idx] = s1; 3975f1af5d2fSBarry Smith x[1+idx] = s2; 3976f1af5d2fSBarry Smith x[2+idx] = s3; 3977f1af5d2fSBarry Smith x[3+idx] = s4; 39784e2b4712SSatish Balay } 39794e2b4712SSatish Balay /* backward solve the upper triangular */ 39804e555682SBarry Smith idt = 4*(n-1); 39814e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 39824e555682SBarry Smith ai16 = 16*diag[i]; 39834e555682SBarry Smith v = aa + ai16 + 16; 39844e2b4712SSatish Balay vi = aj + diag[i] + 1; 39854e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3986f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3987f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 39884e2b4712SSatish Balay while (nz--) { 39894e2b4712SSatish Balay idx = 4*(*vi++); 39904e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3991f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3992f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3993f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3994f1af5d2fSBarry Smith 
s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 39954e2b4712SSatish Balay v += 16; 39964e2b4712SSatish Balay } 39974e555682SBarry Smith v = aa + ai16; 3998f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3999f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4000f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4001f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4002329f5518SBarry Smith idt -= 4; 40034e2b4712SSatish Balay } 400430d4dcafSBarry Smith } 4005e1293385SBarry Smith #endif 40064e2b4712SSatish Balay 4007d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 40081ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4009dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 40104e2b4712SSatish Balay PetscFunctionReturn(0); 40114e2b4712SSatish Balay } 40124e2b4712SSatish Balay 4013b2b2dd24SShri Abhyankar #undef __FUNCT__ 40144dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 40154dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4016b2b2dd24SShri Abhyankar { 4017b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4018b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4019b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4020b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 4021b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4022b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4023b2b2dd24SShri Abhyankar PetscScalar *x; 4024b2b2dd24SShri Abhyankar const PetscScalar *b; 4025b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4026cee9d6f2SShri Abhyankar 4027b2b2dd24SShri Abhyankar PetscFunctionBegin; 4028b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4029b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
4030b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4031b2b2dd24SShri Abhyankar idx = 0; 4032b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4033b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4034b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4035b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4036b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4037b2b2dd24SShri Abhyankar idx = bs*i; 4038b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4039b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 4040b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4041b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4042b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4043b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4044b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4045b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4046b2b2dd24SShri Abhyankar 4047b2b2dd24SShri Abhyankar v += bs2; 4048b2b2dd24SShri Abhyankar } 4049b2b2dd24SShri Abhyankar 4050b2b2dd24SShri Abhyankar x[idx] = s1; 4051b2b2dd24SShri Abhyankar x[1+idx] = s2; 4052b2b2dd24SShri Abhyankar x[2+idx] = s3; 4053b2b2dd24SShri Abhyankar x[3+idx] = s4; 4054b2b2dd24SShri Abhyankar } 4055b2b2dd24SShri Abhyankar 4056b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4057b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4058b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4059b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4060b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4061b2b2dd24SShri Abhyankar idt = bs*i; 4062b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4063b2b2dd24SShri Abhyankar 4064b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4065b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4066b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4067b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + 
v[4]*x2 + v[8]*x3 + v[12]*x4; 4068b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4069b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4070b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4071b2b2dd24SShri Abhyankar 4072b2b2dd24SShri Abhyankar v += bs2; 4073b2b2dd24SShri Abhyankar } 4074b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4075b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4076b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4077b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4078b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4079b2b2dd24SShri Abhyankar 4080b2b2dd24SShri Abhyankar } 4081b2b2dd24SShri Abhyankar 4082b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4083b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4084b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4085b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4086b2b2dd24SShri Abhyankar } 4087cee9d6f2SShri Abhyankar 4088cee9d6f2SShri Abhyankar #undef __FUNCT__ 4089f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4090dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4091f26ec98cSKris Buschelman { 4092f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4093690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4094dfbe8321SBarry Smith PetscErrorCode ierr; 4095690b6cddSBarry Smith PetscInt *diag = a->diag; 4096f26ec98cSKris Buschelman MatScalar *aa=a->a; 4097f26ec98cSKris Buschelman PetscScalar *x,*b; 4098f26ec98cSKris Buschelman 4099f26ec98cSKris Buschelman PetscFunctionBegin; 41001ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 41011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4102f26ec98cSKris 
Buschelman 4103f26ec98cSKris Buschelman { 4104f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4105f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 4106690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 4107f26ec98cSKris Buschelman 4108f26ec98cSKris Buschelman /* forward solve the lower triangular */ 4109f26ec98cSKris Buschelman idx = 0; 4110f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 4111f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 4112f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 4113f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 4114f26ec98cSKris Buschelman for (i=1; i<n; i++) { 4115f26ec98cSKris Buschelman v = aa + 16*ai[i]; 4116f26ec98cSKris Buschelman vi = aj + ai[i]; 4117f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 4118f26ec98cSKris Buschelman idx += 4; 4119f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 4120f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 4121f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 4122f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 4123f26ec98cSKris Buschelman while (nz--) { 4124f26ec98cSKris Buschelman jdx = 4*(*vi++); 4125f26ec98cSKris Buschelman x1 = t[jdx]; 4126f26ec98cSKris Buschelman x2 = t[1+jdx]; 4127f26ec98cSKris Buschelman x3 = t[2+jdx]; 4128f26ec98cSKris Buschelman x4 = t[3+jdx]; 4129f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4130f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4131f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4132f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4133f26ec98cSKris Buschelman v += 16; 4134f26ec98cSKris Buschelman } 4135f26ec98cSKris Buschelman t[idx] = s1; 4136f26ec98cSKris Buschelman t[1+idx] = s2; 4137f26ec98cSKris Buschelman t[2+idx] = s3; 4138f26ec98cSKris Buschelman t[3+idx] = s4; 4139f26ec98cSKris Buschelman } 4140f26ec98cSKris Buschelman /* backward solve the upper triangular */ 4141f26ec98cSKris Buschelman idt = 4*(n-1); 
4142f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 4143f26ec98cSKris Buschelman ai16 = 16*diag[i]; 4144f26ec98cSKris Buschelman v = aa + ai16 + 16; 4145f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 4146f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 4147f26ec98cSKris Buschelman s1 = t[idt]; 4148f26ec98cSKris Buschelman s2 = t[1+idt]; 4149f26ec98cSKris Buschelman s3 = t[2+idt]; 4150f26ec98cSKris Buschelman s4 = t[3+idt]; 4151f26ec98cSKris Buschelman while (nz--) { 4152f26ec98cSKris Buschelman idx = 4*(*vi++); 4153f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 4154f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 4155f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 4156f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 4157f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4158f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4159f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4160f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4161f26ec98cSKris Buschelman v += 16; 4162f26ec98cSKris Buschelman } 4163f26ec98cSKris Buschelman v = aa + ai16; 4164f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4165f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4166f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4167f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4168f26ec98cSKris Buschelman idt -= 4; 4169f26ec98cSKris Buschelman } 4170f26ec98cSKris Buschelman } 4171f26ec98cSKris Buschelman 41721ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 41731ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4174dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4175f26ec98cSKris Buschelman PetscFunctionReturn(0); 4176f26ec98cSKris Buschelman } 
4177f26ec98cSKris Buschelman 41783660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 41793660e330SKris Buschelman 41803660e330SKris Buschelman #include PETSC_HAVE_SSE 41813660e330SKris Buschelman #undef __FUNCT__ 41827cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4183dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 41843660e330SKris Buschelman { 41853660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 41862aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 4187dfbe8321SBarry Smith PetscErrorCode ierr; 4188dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 41893660e330SKris Buschelman MatScalar *aa=a->a; 419087828ca2SBarry Smith PetscScalar *x,*b; 41913660e330SKris Buschelman 41923660e330SKris Buschelman PetscFunctionBegin; 41933660e330SKris Buschelman SSE_SCOPE_BEGIN; 41943660e330SKris Buschelman /* 41953660e330SKris Buschelman Note: This code currently uses demotion of double 41963660e330SKris Buschelman to float when performing the mixed-mode computation. 41973660e330SKris Buschelman This may not be numerically reasonable for all applications. 41983660e330SKris Buschelman */ 41993660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 42003660e330SKris Buschelman 42011ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 42021ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 42033660e330SKris Buschelman { 4204eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 4205eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 42062aa5897fSKris Buschelman int nz,i,idt,ai16; 42072aa5897fSKris Buschelman unsigned int jdx,idx; 42082aa5897fSKris Buschelman unsigned short *vi; 4209eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 42103660e330SKris Buschelman 4211eb05f457SKris Buschelman /* First block is the identity. 
*/ 42123660e330SKris Buschelman idx = 0; 4213eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 42142aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 42153660e330SKris Buschelman 42163660e330SKris Buschelman for (i=1; i<n;) { 42173660e330SKris Buschelman PREFETCH_NTA(&v[8]); 42183660e330SKris Buschelman vi = aj + ai[i]; 42193660e330SKris Buschelman nz = diag[i] - ai[i]; 42203660e330SKris Buschelman idx += 4; 42213660e330SKris Buschelman 4222eb05f457SKris Buschelman /* Demote RHS from double to float. */ 4223eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4224eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 42253660e330SKris Buschelman 42263660e330SKris Buschelman while (nz--) { 42273660e330SKris Buschelman PREFETCH_NTA(&v[16]); 42282aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 42293660e330SKris Buschelman 42303660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 4231eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 42323660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 42333660e330SKris Buschelman 42343660e330SKris Buschelman /* First Column */ 42353660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 42363660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 42373660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 42383660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 42393660e330SKris Buschelman 42403660e330SKris Buschelman /* Second Column */ 42413660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 42423660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 42433660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 42443660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 42453660e330SKris Buschelman 42463660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 42473660e330SKris Buschelman 42483660e330SKris Buschelman /* Third Column */ 42493660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 42503660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 42513660e330SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 42523660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 42533660e330SKris Buschelman 42543660e330SKris Buschelman /* Fourth Column */ 42553660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 42563660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 42573660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 42583660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 42593660e330SKris Buschelman SSE_INLINE_END_2 42603660e330SKris Buschelman 42613660e330SKris Buschelman v += 16; 42623660e330SKris Buschelman } 42633660e330SKris Buschelman v = aa + 16*ai[++i]; 42643660e330SKris Buschelman PREFETCH_NTA(v); 4265eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 42663660e330SKris Buschelman } 4267eb05f457SKris Buschelman 4268eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 4269eb05f457SKris Buschelman 42703660e330SKris Buschelman idt = 4*(n-1); 42713660e330SKris Buschelman ai16 = 16*diag[n-1]; 42723660e330SKris Buschelman v = aa + ai16 + 16; 42733660e330SKris Buschelman for (i=n-1; i>=0;){ 42743660e330SKris Buschelman PREFETCH_NTA(&v[8]); 42753660e330SKris Buschelman vi = aj + diag[i] + 1; 42763660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 42773660e330SKris Buschelman 4278eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 42793660e330SKris Buschelman 42803660e330SKris Buschelman while (nz--) { 42813660e330SKris Buschelman PREFETCH_NTA(&v[16]); 42822aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 42833660e330SKris Buschelman 42843660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 4285eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 42863660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 42873660e330SKris Buschelman 42883660e330SKris Buschelman /* First Column */ 42893660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 42903660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 42913660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 42923660e330SKris Buschelman 
SSE_SUB_PS(XMM7,XMM0) 42933660e330SKris Buschelman 42943660e330SKris Buschelman /* Second Column */ 42953660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 42963660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 42973660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 42983660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 42993660e330SKris Buschelman 43003660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 43013660e330SKris Buschelman 43023660e330SKris Buschelman /* Third Column */ 43033660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 43043660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 43053660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 43063660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 43073660e330SKris Buschelman 43083660e330SKris Buschelman /* Fourth Column */ 43093660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 43103660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 43113660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 43123660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 43133660e330SKris Buschelman SSE_INLINE_END_2 43143660e330SKris Buschelman v += 16; 43153660e330SKris Buschelman } 43163660e330SKris Buschelman v = aa + ai16; 43173660e330SKris Buschelman ai16 = 16*diag[--i]; 43183660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 43193660e330SKris Buschelman /* 43203660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 43213660e330SKris Buschelman which was inverted as part of the factorization 43223660e330SKris Buschelman */ 4323eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 43243660e330SKris Buschelman /* First Column */ 43253660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 43263660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 43273660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 43283660e330SKris Buschelman 43293660e330SKris Buschelman /* Second Column */ 43303660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 43313660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 
43323660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 43333660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 43343660e330SKris Buschelman 43353660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 43363660e330SKris Buschelman 43373660e330SKris Buschelman /* Third Column */ 43383660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 43393660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 43403660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 43413660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 43423660e330SKris Buschelman 43433660e330SKris Buschelman /* Fourth Column */ 43443660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 43453660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 43463660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 43473660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 43483660e330SKris Buschelman 43493660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 43503660e330SKris Buschelman SSE_INLINE_END_3 43513660e330SKris Buschelman 43523660e330SKris Buschelman v = aa + ai16 + 16; 43533660e330SKris Buschelman idt -= 4; 43543660e330SKris Buschelman } 4355eb05f457SKris Buschelman 4356eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 4357eb05f457SKris Buschelman idt = 4*(n-1); 4358eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 4359eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4360eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ 4361eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 4362eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 4363eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 4364eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 4365eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 4366eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 436754693613SKris Buschelman idt -= 4; 43683660e330SKris Buschelman } 4369eb05f457SKris Buschelman 4370eb05f457SKris Buschelman } /* End of artificial scope. */ 43711ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 43721ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4373dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 43743660e330SKris Buschelman SSE_SCOPE_END; 43753660e330SKris Buschelman PetscFunctionReturn(0); 43763660e330SKris Buschelman } 43773660e330SKris Buschelman 43787cf1b8d3SKris Buschelman #undef __FUNCT__ 43797cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4380dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 43817cf1b8d3SKris Buschelman { 43827cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 43837cf1b8d3SKris Buschelman int *aj=a->j; 4384dfbe8321SBarry Smith PetscErrorCode ierr; 4385dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 43867cf1b8d3SKris Buschelman MatScalar *aa=a->a; 43877cf1b8d3SKris Buschelman PetscScalar *x,*b; 43887cf1b8d3SKris Buschelman 43897cf1b8d3SKris Buschelman PetscFunctionBegin; 43907cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 43917cf1b8d3SKris Buschelman /* 43927cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 43937cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 43947cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
43957cf1b8d3SKris Buschelman */ 43967cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 43977cf1b8d3SKris Buschelman 43981ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 43991ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 44007cf1b8d3SKris Buschelman { 44017cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 44027cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 44037cf1b8d3SKris Buschelman int nz,i,idt,ai16; 44047cf1b8d3SKris Buschelman int jdx,idx; 44057cf1b8d3SKris Buschelman int *vi; 44067cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 44077cf1b8d3SKris Buschelman 44087cf1b8d3SKris Buschelman /* First block is the identity. */ 44097cf1b8d3SKris Buschelman idx = 0; 44107cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 44117cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 44127cf1b8d3SKris Buschelman 44137cf1b8d3SKris Buschelman for (i=1; i<n;) { 44147cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 44157cf1b8d3SKris Buschelman vi = aj + ai[i]; 44167cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 44177cf1b8d3SKris Buschelman idx += 4; 44187cf1b8d3SKris Buschelman 44197cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 44207cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 44217cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 44227cf1b8d3SKris Buschelman 44237cf1b8d3SKris Buschelman while (nz--) { 44247cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 44257cf1b8d3SKris Buschelman jdx = 4*(*vi++); 44267cf1b8d3SKris Buschelman /* jdx = *vi++; */ 44277cf1b8d3SKris Buschelman 44287cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 44297cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 44307cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 44317cf1b8d3SKris Buschelman 44327cf1b8d3SKris Buschelman /* First Column */ 44337cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 44347cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 44357cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 44367cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 44377cf1b8d3SKris Buschelman 44387cf1b8d3SKris Buschelman /* Second Column */ 44397cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 44407cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44417cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 44427cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 44437cf1b8d3SKris Buschelman 44447cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 44457cf1b8d3SKris Buschelman 44467cf1b8d3SKris Buschelman /* Third Column */ 44477cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 44487cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 44497cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 44507cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 44517cf1b8d3SKris Buschelman 44527cf1b8d3SKris Buschelman /* Fourth Column */ 44537cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 44547cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 44557cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 44567cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 44577cf1b8d3SKris Buschelman SSE_INLINE_END_2 44587cf1b8d3SKris Buschelman 44597cf1b8d3SKris 
Buschelman v += 16; 44607cf1b8d3SKris Buschelman } 44617cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 44627cf1b8d3SKris Buschelman PREFETCH_NTA(v); 44637cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 44647cf1b8d3SKris Buschelman } 44657cf1b8d3SKris Buschelman 44667cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 44677cf1b8d3SKris Buschelman 44687cf1b8d3SKris Buschelman idt = 4*(n-1); 44697cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 44707cf1b8d3SKris Buschelman v = aa + ai16 + 16; 44717cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 44727cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 44737cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 44747cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 44757cf1b8d3SKris Buschelman 44767cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 44777cf1b8d3SKris Buschelman 44787cf1b8d3SKris Buschelman while (nz--) { 44797cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 44807cf1b8d3SKris Buschelman idx = 4*(*vi++); 44817cf1b8d3SKris Buschelman /* idx = *vi++; */ 44827cf1b8d3SKris Buschelman 44837cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 44847cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 44857cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 44867cf1b8d3SKris Buschelman 44877cf1b8d3SKris Buschelman /* First Column */ 44887cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 44897cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 44907cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 44917cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 44927cf1b8d3SKris Buschelman 44937cf1b8d3SKris Buschelman /* Second Column */ 44947cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 44957cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44967cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 44977cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 44987cf1b8d3SKris Buschelman 44997cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 45007cf1b8d3SKris Buschelman 
45017cf1b8d3SKris Buschelman /* Third Column */ 45027cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 45037cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 45047cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 45057cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 45067cf1b8d3SKris Buschelman 45077cf1b8d3SKris Buschelman /* Fourth Column */ 45087cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 45097cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45107cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 45117cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 45127cf1b8d3SKris Buschelman SSE_INLINE_END_2 45137cf1b8d3SKris Buschelman v += 16; 45147cf1b8d3SKris Buschelman } 45157cf1b8d3SKris Buschelman v = aa + ai16; 45167cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 45177cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 45187cf1b8d3SKris Buschelman /* 45197cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 45207cf1b8d3SKris Buschelman which was inverted as part of the factorization 45217cf1b8d3SKris Buschelman */ 45227cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 45237cf1b8d3SKris Buschelman /* First Column */ 45247cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 45257cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 45267cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 45277cf1b8d3SKris Buschelman 45287cf1b8d3SKris Buschelman /* Second Column */ 45297cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 45307cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 45317cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 45327cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 45337cf1b8d3SKris Buschelman 45347cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 45357cf1b8d3SKris Buschelman 45367cf1b8d3SKris Buschelman /* Third Column */ 45377cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 45387cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 45397cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 45407cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 45417cf1b8d3SKris Buschelman 45427cf1b8d3SKris Buschelman /* Fourth Column */ 45437cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 45447cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45457cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 45467cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 45477cf1b8d3SKris Buschelman 45487cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 45497cf1b8d3SKris Buschelman SSE_INLINE_END_3 45507cf1b8d3SKris Buschelman 45517cf1b8d3SKris Buschelman v = aa + ai16 + 16; 45527cf1b8d3SKris Buschelman idt -= 4; 45537cf1b8d3SKris Buschelman } 45547cf1b8d3SKris Buschelman 45557cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 45567cf1b8d3SKris Buschelman idt = 4*(n-1); 45577cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 45587cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 45597cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 45607cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 45617cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 45627cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 45637cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 45647cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 45657cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 45667cf1b8d3SKris Buschelman idt -= 4; 45677cf1b8d3SKris Buschelman } 45687cf1b8d3SKris Buschelman 45697cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 45701ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 45711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4572dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 45737cf1b8d3SKris Buschelman SSE_SCOPE_END; 45747cf1b8d3SKris Buschelman PetscFunctionReturn(0); 45757cf1b8d3SKris Buschelman } 45767cf1b8d3SKris Buschelman 45773660e330SKris Buschelman #endif 45788f690400SShri Abhyankar 45794a2ae208SSatish Balay #undef __FUNCT__ 458006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 458106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 45824e2b4712SSatish Balay { 45834e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 45844e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 45856849ba73SBarry Smith PetscErrorCode ierr; 45865d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 45875d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4588d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4589d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4590d9fead3dSBarry Smith const PetscScalar *b; 45914e2b4712SSatish Balay 45924e2b4712SSatish Balay PetscFunctionBegin; 4593d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4595f1af5d2fSBarry Smith t = a->solve_work; 45964e2b4712SSatish Balay 45974e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 45984e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 45994e2b4712SSatish Balay 46004e2b4712SSatish Balay /* forward solve the lower triangular */ 46014e2b4712SSatish Balay idx = 3*(*r++); 4602f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 46034e2b4712SSatish Balay for (i=1; i<n; i++) { 46044e2b4712SSatish Balay v = aa + 9*ai[i]; 46054e2b4712SSatish Balay vi = aj + ai[i]; 
46064e2b4712SSatish Balay nz = diag[i] - ai[i]; 46074e2b4712SSatish Balay idx = 3*(*r++); 4608f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 46094e2b4712SSatish Balay while (nz--) { 46104e2b4712SSatish Balay idx = 3*(*vi++); 4611f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4612f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4613f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4614f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 46154e2b4712SSatish Balay v += 9; 46164e2b4712SSatish Balay } 46174e2b4712SSatish Balay idx = 3*i; 4618f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 46194e2b4712SSatish Balay } 46204e2b4712SSatish Balay /* backward solve the upper triangular */ 46214e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 46224e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 46234e2b4712SSatish Balay vi = aj + diag[i] + 1; 46244e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 46254e2b4712SSatish Balay idt = 3*i; 4626f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 46274e2b4712SSatish Balay while (nz--) { 46284e2b4712SSatish Balay idx = 3*(*vi++); 4629f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4630f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4631f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4632f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 46334e2b4712SSatish Balay v += 9; 46344e2b4712SSatish Balay } 46354e2b4712SSatish Balay idc = 3*(*c--); 46364e2b4712SSatish Balay v = aa + 9*diag[i]; 4637f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4638f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4639f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 46404e2b4712SSatish Balay } 46414e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 46424e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4643d9fead3dSBarry Smith ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46441ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4645dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 46464e2b4712SSatish Balay PetscFunctionReturn(0); 46474e2b4712SSatish Balay } 46484e2b4712SSatish Balay 46490c4413a7SShri Abhyankar #undef __FUNCT__ 46504dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3" 46514dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 46520c4413a7SShri Abhyankar { 46530c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 46540c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 46550c4413a7SShri Abhyankar PetscErrorCode ierr; 46560c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 46570c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 46580c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 46590c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 46600c4413a7SShri Abhyankar const PetscScalar *b; 46610c4413a7SShri Abhyankar 46620c4413a7SShri Abhyankar PetscFunctionBegin; 46630c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46640c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 46650c4413a7SShri Abhyankar t = a->solve_work; 46660c4413a7SShri Abhyankar 46670c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 46680c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 46690c4413a7SShri Abhyankar 46700c4413a7SShri Abhyankar /* forward solve the lower triangular */ 46710c4413a7SShri Abhyankar idx = 3*r[0]; 46720c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 46730c4413a7SShri Abhyankar for (i=1; i<n; i++) { 46740c4413a7SShri Abhyankar v = aa + 9*ai[i]; 46750c4413a7SShri Abhyankar vi = aj + ai[i]; 46760c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 46770c4413a7SShri Abhyankar idx = 3*r[i]; 46780c4413a7SShri 
Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 46790c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 46800c4413a7SShri Abhyankar idx = 3*vi[m]; 46810c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 46820c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 46830c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 46840c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 46850c4413a7SShri Abhyankar v += 9; 46860c4413a7SShri Abhyankar } 46870c4413a7SShri Abhyankar idx = 3*i; 46880c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 46890c4413a7SShri Abhyankar } 46900c4413a7SShri Abhyankar /* backward solve the upper triangular */ 46910c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 46920c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 46930c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 46940c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 46950c4413a7SShri Abhyankar idt = 3*i; 46960c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 46970c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 46980c4413a7SShri Abhyankar idx = 3*vi[m]; 46990c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 47000c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 47010c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 47020c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 47030c4413a7SShri Abhyankar v += 9; 47040c4413a7SShri Abhyankar } 47050c4413a7SShri Abhyankar idc = 3*c[i]; 47060c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 47070c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 47080c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 47090c4413a7SShri Abhyankar } 47100c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 47110c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 47120c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47130c4413a7SShri 
Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 47140c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 47150c4413a7SShri Abhyankar PetscFunctionReturn(0); 47160c4413a7SShri Abhyankar } 47170c4413a7SShri Abhyankar 471815091d37SBarry Smith /* 471915091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 472015091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 472115091d37SBarry Smith */ 47224a2ae208SSatish Balay #undef __FUNCT__ 472306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 472406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 472515091d37SBarry Smith { 472615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4727690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4728dfbe8321SBarry Smith PetscErrorCode ierr; 4729690b6cddSBarry Smith PetscInt *diag = a->diag; 4730d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4731d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4732d9fead3dSBarry Smith const PetscScalar *b; 4733690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 473415091d37SBarry Smith 473515091d37SBarry Smith PetscFunctionBegin; 4736d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47371ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 473815091d37SBarry Smith 473915091d37SBarry Smith /* forward solve the lower triangular */ 474015091d37SBarry Smith idx = 0; 474115091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 474215091d37SBarry Smith for (i=1; i<n; i++) { 474315091d37SBarry Smith v = aa + 9*ai[i]; 474415091d37SBarry Smith vi = aj + ai[i]; 474515091d37SBarry Smith nz = diag[i] - ai[i]; 474615091d37SBarry Smith idx += 3; 4747f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 474815091d37SBarry Smith while (nz--) { 474915091d37SBarry Smith jdx = 3*(*vi++); 475015091d37SBarry Smith x1 
= x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4751f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4752f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4753f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 475415091d37SBarry Smith v += 9; 475515091d37SBarry Smith } 4756f1af5d2fSBarry Smith x[idx] = s1; 4757f1af5d2fSBarry Smith x[1+idx] = s2; 4758f1af5d2fSBarry Smith x[2+idx] = s3; 475915091d37SBarry Smith } 476015091d37SBarry Smith /* backward solve the upper triangular */ 476115091d37SBarry Smith for (i=n-1; i>=0; i--){ 476215091d37SBarry Smith v = aa + 9*diag[i] + 9; 476315091d37SBarry Smith vi = aj + diag[i] + 1; 476415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 476515091d37SBarry Smith idt = 3*i; 4766f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4767f1af5d2fSBarry Smith s3 = x[2+idt]; 476815091d37SBarry Smith while (nz--) { 476915091d37SBarry Smith idx = 3*(*vi++); 477015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4771f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4772f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4773f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 477415091d37SBarry Smith v += 9; 477515091d37SBarry Smith } 477615091d37SBarry Smith v = aa + 9*diag[i]; 4777f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4778f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4779f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 478015091d37SBarry Smith } 478115091d37SBarry Smith 4782d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47831ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4784dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 478515091d37SBarry Smith PetscFunctionReturn(0); 478615091d37SBarry Smith } 478715091d37SBarry Smith 4788cee9d6f2SShri Abhyankar #undef __FUNCT__ 47894dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 47904dd39f65SShri 
Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4791b2b2dd24SShri Abhyankar { 4792b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4793b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4794b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4795b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 4796b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4797b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4798b2b2dd24SShri Abhyankar PetscScalar *x; 4799b2b2dd24SShri Abhyankar const PetscScalar *b; 4800b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4801b2b2dd24SShri Abhyankar 4802b2b2dd24SShri Abhyankar PetscFunctionBegin; 4803b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4804b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4805b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4806b2b2dd24SShri Abhyankar idx = 0; 4807b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4808b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4809b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4810b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4811b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4812b2b2dd24SShri Abhyankar idx = bs*i; 4813b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4814b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4815b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4816b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4817b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4818b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4819b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4820b2b2dd24SShri Abhyankar 4821b2b2dd24SShri Abhyankar v += bs2; 4822b2b2dd24SShri Abhyankar } 4823b2b2dd24SShri Abhyankar 4824b2b2dd24SShri Abhyankar x[idx] = s1; 4825b2b2dd24SShri Abhyankar x[1+idx] = s2; 4826b2b2dd24SShri Abhyankar x[2+idx] = s3; 4827b2b2dd24SShri Abhyankar } 
4828b2b2dd24SShri Abhyankar 4829b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4830b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4831b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4832b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4833b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4834b2b2dd24SShri Abhyankar idt = bs*i; 4835b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4836b2b2dd24SShri Abhyankar 4837b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4838b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4839b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4840b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4841b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4842b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4843b2b2dd24SShri Abhyankar 4844b2b2dd24SShri Abhyankar v += bs2; 4845b2b2dd24SShri Abhyankar } 4846b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4847b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4848b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4849b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4850b2b2dd24SShri Abhyankar 4851b2b2dd24SShri Abhyankar } 4852b2b2dd24SShri Abhyankar 4853b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4854b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4855b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4856b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4857b2b2dd24SShri Abhyankar } 4858b2b2dd24SShri Abhyankar 4859b2b2dd24SShri Abhyankar #undef __FUNCT__ 486006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 486106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 48624e2b4712SSatish Balay { 48634e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 48644e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 48656849ba73SBarry Smith 
PetscErrorCode ierr; 48665d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 48675d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4868d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4869d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 4870d9fead3dSBarry Smith const PetscScalar *b; 48714e2b4712SSatish Balay 48724e2b4712SSatish Balay PetscFunctionBegin; 4873d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 48741ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4875f1af5d2fSBarry Smith t = a->solve_work; 48764e2b4712SSatish Balay 48774e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 48784e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 48794e2b4712SSatish Balay 48804e2b4712SSatish Balay /* forward solve the lower triangular */ 48814e2b4712SSatish Balay idx = 2*(*r++); 4882f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 48834e2b4712SSatish Balay for (i=1; i<n; i++) { 48844e2b4712SSatish Balay v = aa + 4*ai[i]; 48854e2b4712SSatish Balay vi = aj + ai[i]; 48864e2b4712SSatish Balay nz = diag[i] - ai[i]; 48874e2b4712SSatish Balay idx = 2*(*r++); 4888f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 48894e2b4712SSatish Balay while (nz--) { 48904e2b4712SSatish Balay idx = 2*(*vi++); 4891f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4892f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4893f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 48944e2b4712SSatish Balay v += 4; 48954e2b4712SSatish Balay } 48964e2b4712SSatish Balay idx = 2*i; 4897f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 48984e2b4712SSatish Balay } 48994e2b4712SSatish Balay /* backward solve the upper triangular */ 49004e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 49014e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 49024e2b4712SSatish Balay vi = aj + diag[i] + 1; 49034e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 49044e2b4712SSatish Balay idt = 2*i; 
4905f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 49064e2b4712SSatish Balay while (nz--) { 49074e2b4712SSatish Balay idx = 2*(*vi++); 4908f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4909f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4910f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 49114e2b4712SSatish Balay v += 4; 49124e2b4712SSatish Balay } 49134e2b4712SSatish Balay idc = 2*(*c--); 49144e2b4712SSatish Balay v = aa + 4*diag[i]; 4915f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4916f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 49174e2b4712SSatish Balay } 49184e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 49194e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4920d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 49211ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4922dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 49234e2b4712SSatish Balay PetscFunctionReturn(0); 49244e2b4712SSatish Balay } 49254e2b4712SSatish Balay 49260c4413a7SShri Abhyankar #undef __FUNCT__ 49274dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2" 49284dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 49290c4413a7SShri Abhyankar { 49300c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 49310c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 49320c4413a7SShri Abhyankar PetscErrorCode ierr; 49330c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 49340c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 49350c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 49360c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 49370c4413a7SShri Abhyankar const PetscScalar *b; 49380c4413a7SShri Abhyankar 49390c4413a7SShri Abhyankar PetscFunctionBegin; 49400c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 
49410c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 49420c4413a7SShri Abhyankar t = a->solve_work; 49430c4413a7SShri Abhyankar 49440c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 49450c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 49460c4413a7SShri Abhyankar 49470c4413a7SShri Abhyankar /* forward solve the lower triangular */ 49480c4413a7SShri Abhyankar idx = 2*r[0]; 49490c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 49500c4413a7SShri Abhyankar for (i=1; i<n; i++) { 49510c4413a7SShri Abhyankar v = aa + 4*ai[i]; 49520c4413a7SShri Abhyankar vi = aj + ai[i]; 49530c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 49540c4413a7SShri Abhyankar idx = 2*r[i]; 49550c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 49560c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 49570c4413a7SShri Abhyankar jdx = 2*vi[m]; 49580c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 49590c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 49600c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 49610c4413a7SShri Abhyankar v += 4; 49620c4413a7SShri Abhyankar } 49630c4413a7SShri Abhyankar idx = 2*i; 49640c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 49650c4413a7SShri Abhyankar } 49660c4413a7SShri Abhyankar /* backward solve the upper triangular */ 49670c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 49680c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 49690c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 49700c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 49710c4413a7SShri Abhyankar idt = 2*i; 49720c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 49730c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 49740c4413a7SShri Abhyankar idx = 2*vi[m]; 49750c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 49760c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 49770c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 49780c4413a7SShri Abhyankar v += 4; 49790c4413a7SShri Abhyankar } 49800c4413a7SShri Abhyankar idc = 2*c[i]; 
49810c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 49820c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 49830c4413a7SShri Abhyankar } 49840c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 49850c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 49860c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 49870c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 49880c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 49890c4413a7SShri Abhyankar PetscFunctionReturn(0); 49900c4413a7SShri Abhyankar } 49918f690400SShri Abhyankar 499215091d37SBarry Smith /* 499315091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 499415091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 499515091d37SBarry Smith */ 49964a2ae208SSatish Balay #undef __FUNCT__ 499706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 499806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 499915091d37SBarry Smith { 500015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5001690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 5002dfbe8321SBarry Smith PetscErrorCode ierr; 5003690b6cddSBarry Smith PetscInt *diag = a->diag; 5004d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 5005d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 5006d9fead3dSBarry Smith const PetscScalar *b; 5007690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 500815091d37SBarry Smith 500915091d37SBarry Smith PetscFunctionBegin; 5010d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 50111ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 501215091d37SBarry Smith 501315091d37SBarry Smith /* forward solve the lower triangular */ 501415091d37SBarry Smith idx = 0; 501515091d37SBarry Smith x[0] 
= b[0]; x[1] = b[1]; 501615091d37SBarry Smith for (i=1; i<n; i++) { 501715091d37SBarry Smith v = aa + 4*ai[i]; 501815091d37SBarry Smith vi = aj + ai[i]; 501915091d37SBarry Smith nz = diag[i] - ai[i]; 502015091d37SBarry Smith idx += 2; 5021f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 502215091d37SBarry Smith while (nz--) { 502315091d37SBarry Smith jdx = 2*(*vi++); 502415091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 5025f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5026f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 502715091d37SBarry Smith v += 4; 502815091d37SBarry Smith } 5029f1af5d2fSBarry Smith x[idx] = s1; 5030f1af5d2fSBarry Smith x[1+idx] = s2; 503115091d37SBarry Smith } 503215091d37SBarry Smith /* backward solve the upper triangular */ 503315091d37SBarry Smith for (i=n-1; i>=0; i--){ 503415091d37SBarry Smith v = aa + 4*diag[i] + 4; 503515091d37SBarry Smith vi = aj + diag[i] + 1; 503615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 503715091d37SBarry Smith idt = 2*i; 5038f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 503915091d37SBarry Smith while (nz--) { 504015091d37SBarry Smith idx = 2*(*vi++); 504115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 5042f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5043f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 504415091d37SBarry Smith v += 4; 504515091d37SBarry Smith } 504615091d37SBarry Smith v = aa + 4*diag[i]; 5047f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 5048f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 504915091d37SBarry Smith } 505015091d37SBarry Smith 5051d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 50521ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5053dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 505415091d37SBarry Smith PetscFunctionReturn(0); 505515091d37SBarry Smith } 505615091d37SBarry Smith 5057cee9d6f2SShri Abhyankar #undef __FUNCT__ 50584dd39f65SShri Abhyankar #define __FUNCT__ 
"MatSolve_SeqBAIJ_2_NaturalOrdering" 50594dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5060b2b2dd24SShri Abhyankar { 5061b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5062b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 5063b2b2dd24SShri Abhyankar PetscErrorCode ierr; 5064b2b2dd24SShri Abhyankar PetscInt jdx; 5065b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 5066b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 5067b2b2dd24SShri Abhyankar const PetscScalar *b; 5068b2b2dd24SShri Abhyankar 5069b2b2dd24SShri Abhyankar PetscFunctionBegin; 5070b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5071b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5072b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 5073b2b2dd24SShri Abhyankar idx = 0; 5074b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 5075b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 5076b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 5077b2b2dd24SShri Abhyankar vi = aj + ai[i]; 5078b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 5079b2b2dd24SShri Abhyankar idx = 2*i; 5080b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 5081b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5082b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 5083b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 5084b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5085b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5086b2b2dd24SShri Abhyankar v += 4; 5087b2b2dd24SShri Abhyankar } 5088b2b2dd24SShri Abhyankar x[idx] = s1; 5089b2b2dd24SShri Abhyankar x[1+idx] = s2; 5090b2b2dd24SShri Abhyankar } 5091b2b2dd24SShri Abhyankar 5092b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 5093b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 5094b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 5095b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 5096b2b2dd24SShri Abhyankar nz = adiag[i] - 
adiag[i+1]-1; 5097b2b2dd24SShri Abhyankar idt = 2*i; 5098b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 5099b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5100b2b2dd24SShri Abhyankar idx = 2*vi[k]; 5101b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 5102b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5103b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5104b2b2dd24SShri Abhyankar v += 4; 5105b2b2dd24SShri Abhyankar } 5106b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 5107b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 5108b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 5109b2b2dd24SShri Abhyankar } 5110b2b2dd24SShri Abhyankar 5111b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5112b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5113b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5114b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 5115b2b2dd24SShri Abhyankar } 5116b2b2dd24SShri Abhyankar 5117b2b2dd24SShri Abhyankar #undef __FUNCT__ 511806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 511906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 51204e2b4712SSatish Balay { 51214e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 51224e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 51236849ba73SBarry Smith PetscErrorCode ierr; 51245d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 51255d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 51263f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 512787828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 51284e2b4712SSatish Balay 51294e2b4712SSatish Balay PetscFunctionBegin; 51304e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 51314e2b4712SSatish Balay 51321ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 51331ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5134f1af5d2fSBarry Smith t = a->solve_work; 
51354e2b4712SSatish Balay 51364e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 51374e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 51384e2b4712SSatish Balay 51394e2b4712SSatish Balay /* forward solve the lower triangular */ 5140f1af5d2fSBarry Smith t[0] = b[*r++]; 51414e2b4712SSatish Balay for (i=1; i<n; i++) { 51424e2b4712SSatish Balay v = aa + ai[i]; 51434e2b4712SSatish Balay vi = aj + ai[i]; 51444e2b4712SSatish Balay nz = diag[i] - ai[i]; 5145f1af5d2fSBarry Smith s1 = b[*r++]; 51464e2b4712SSatish Balay while (nz--) { 5147f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 51484e2b4712SSatish Balay } 5149f1af5d2fSBarry Smith t[i] = s1; 51504e2b4712SSatish Balay } 51514e2b4712SSatish Balay /* backward solve the upper triangular */ 51524e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 51534e2b4712SSatish Balay v = aa + diag[i] + 1; 51544e2b4712SSatish Balay vi = aj + diag[i] + 1; 51554e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 5156f1af5d2fSBarry Smith s1 = t[i]; 51574e2b4712SSatish Balay while (nz--) { 5158f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 51594e2b4712SSatish Balay } 5160f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 51614e2b4712SSatish Balay } 51624e2b4712SSatish Balay 51634e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 51644e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 51651ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 51661ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5167dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 51684e2b4712SSatish Balay PetscFunctionReturn(0); 51694e2b4712SSatish Balay } 517015091d37SBarry Smith /* 517115091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 517215091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
517315091d37SBarry Smith */ 51744a2ae208SSatish Balay #undef __FUNCT__ 517506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 517606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 517715091d37SBarry Smith { 517815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5179690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 5180dfbe8321SBarry Smith PetscErrorCode ierr; 5181690b6cddSBarry Smith PetscInt *diag = a->diag; 518215091d37SBarry Smith MatScalar *aa=a->a; 518387828ca2SBarry Smith PetscScalar *x,*b; 518487828ca2SBarry Smith PetscScalar s1,x1; 518515091d37SBarry Smith MatScalar *v; 5186690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 518715091d37SBarry Smith 518815091d37SBarry Smith PetscFunctionBegin; 51891ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 51901ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 519115091d37SBarry Smith 519215091d37SBarry Smith /* forward solve the lower triangular */ 519315091d37SBarry Smith idx = 0; 519415091d37SBarry Smith x[0] = b[0]; 519515091d37SBarry Smith for (i=1; i<n; i++) { 519615091d37SBarry Smith v = aa + ai[i]; 519715091d37SBarry Smith vi = aj + ai[i]; 519815091d37SBarry Smith nz = diag[i] - ai[i]; 519915091d37SBarry Smith idx += 1; 5200f1af5d2fSBarry Smith s1 = b[idx]; 520115091d37SBarry Smith while (nz--) { 520215091d37SBarry Smith jdx = *vi++; 520315091d37SBarry Smith x1 = x[jdx]; 5204f1af5d2fSBarry Smith s1 -= v[0]*x1; 520515091d37SBarry Smith v += 1; 520615091d37SBarry Smith } 5207f1af5d2fSBarry Smith x[idx] = s1; 520815091d37SBarry Smith } 520915091d37SBarry Smith /* backward solve the upper triangular */ 521015091d37SBarry Smith for (i=n-1; i>=0; i--){ 521115091d37SBarry Smith v = aa + diag[i] + 1; 521215091d37SBarry Smith vi = aj + diag[i] + 1; 521315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 521415091d37SBarry Smith idt = i; 5215f1af5d2fSBarry Smith s1 = x[idt]; 521615091d37SBarry Smith 
while (nz--) { 521715091d37SBarry Smith idx = *vi++; 521815091d37SBarry Smith x1 = x[idx]; 5219f1af5d2fSBarry Smith s1 -= v[0]*x1; 522015091d37SBarry Smith v += 1; 522115091d37SBarry Smith } 522215091d37SBarry Smith v = aa + diag[i]; 5223f1af5d2fSBarry Smith x[idt] = v[0]*s1; 522415091d37SBarry Smith } 52251ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 52261ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5227dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 522815091d37SBarry Smith PetscFunctionReturn(0); 522915091d37SBarry Smith } 52304e2b4712SSatish Balay 52314e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 523216a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 52338b1456e3SHong Zhang //EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_inplace(Mat,PetscTruth); 52344dd39f65SShri Abhyankar //EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 52356bce7ff8SHong Zhang 5236*2b0b2ea7SShri Abhyankar /* bs = 15 for PFLOTRAN */ 5237*2b0b2ea7SShri Abhyankar #undef __FUNCT__ 5238*2b0b2ea7SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15" 5239*2b0b2ea7SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15(Mat B,Mat A,const MatFactorInfo *info) 5240*2b0b2ea7SShri Abhyankar { 5241*2b0b2ea7SShri Abhyankar Mat C=B; 5242*2b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5243*2b0b2ea7SShri Abhyankar IS isrow = b->row,isicol = b->icol; 5244*2b0b2ea7SShri Abhyankar PetscErrorCode ierr; 5245*2b0b2ea7SShri Abhyankar const PetscInt *r,*ic,*ics; 5246*2b0b2ea7SShri Abhyankar PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5247*2b0b2ea7SShri Abhyankar PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj,*v_pivots; 5248*2b0b2ea7SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*v_work,*pv,*aa=a->a; 5249*2b0b2ea7SShri Abhyankar 
PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 5250*2b0b2ea7SShri Abhyankar PetscReal shift = info->shiftinblocks; 5251*2b0b2ea7SShri Abhyankar 5252*2b0b2ea7SShri Abhyankar PetscFunctionBegin; 5253*2b0b2ea7SShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5254*2b0b2ea7SShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5255*2b0b2ea7SShri Abhyankar 5256*2b0b2ea7SShri Abhyankar 5257*2b0b2ea7SShri Abhyankar /* generate work space needed by the factorization */ 5258*2b0b2ea7SShri Abhyankar ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 5259*2b0b2ea7SShri Abhyankar ierr = PetscMalloc2(bs,MatScalar,&v_work,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5260*2b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5261*2b0b2ea7SShri Abhyankar ics = ic; 5262*2b0b2ea7SShri Abhyankar 5263*2b0b2ea7SShri Abhyankar for (i=0; i<n; i++){ 5264*2b0b2ea7SShri Abhyankar /* zero rtmp */ 5265*2b0b2ea7SShri Abhyankar /* L part */ 5266*2b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 5267*2b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 5268*2b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 5269*2b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5270*2b0b2ea7SShri Abhyankar } 5271*2b0b2ea7SShri Abhyankar 5272*2b0b2ea7SShri Abhyankar /* U part */ 5273*2b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 5274*2b0b2ea7SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 5275*2b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 5276*2b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5277*2b0b2ea7SShri Abhyankar } 5278*2b0b2ea7SShri Abhyankar 5279*2b0b2ea7SShri Abhyankar /* load in initial (unfactored row) */ 5280*2b0b2ea7SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 5281*2b0b2ea7SShri Abhyankar ajtmp = aj + ai[r[i]]; 5282*2b0b2ea7SShri Abhyankar v = aa + bs2*ai[r[i]]; 5283*2b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 5284*2b0b2ea7SShri Abhyankar ierr = 
PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5285*2b0b2ea7SShri Abhyankar } 5286*2b0b2ea7SShri Abhyankar 5287*2b0b2ea7SShri Abhyankar /* elimination */ 5288*2b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 5289*2b0b2ea7SShri Abhyankar nzL = bi[i+1] - bi[i]; 5290*2b0b2ea7SShri Abhyankar for(k=0;k < nzL;k++) { 5291*2b0b2ea7SShri Abhyankar row = bjtmp[k]; 5292*2b0b2ea7SShri Abhyankar pc = rtmp + bs2*row; 5293*2b0b2ea7SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5294*2b0b2ea7SShri Abhyankar if (flg) { 5295*2b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[row]; 5296*2b0b2ea7SShri Abhyankar /* ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr); */ 5297*2b0b2ea7SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); 5298*2b0b2ea7SShri Abhyankar 5299*2b0b2ea7SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5300*2b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 5301*2b0b2ea7SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5302*2b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 5303*2b0b2ea7SShri Abhyankar v = rtmp + bs2*pj[j]; 5304*2b0b2ea7SShri Abhyankar /* ierr = Kernel_A_gets_A_minus_B_times_C_15(v,pc,pv);CHKERRQ(ierr); */ 5305*2b0b2ea7SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,v,pc,pv); 5306*2b0b2ea7SShri Abhyankar pv += bs2; 5307*2b0b2ea7SShri Abhyankar } 5308*2b0b2ea7SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*nz+2*bs2*bs-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5309*2b0b2ea7SShri Abhyankar } 5310*2b0b2ea7SShri Abhyankar } 5311*2b0b2ea7SShri Abhyankar 5312*2b0b2ea7SShri Abhyankar /* finished row so stick it into b->a */ 5313*2b0b2ea7SShri Abhyankar /* L part */ 5314*2b0b2ea7SShri Abhyankar pv = b->a + bs2*bi[i] ; 5315*2b0b2ea7SShri Abhyankar pj = b->j + bi[i] ; 5316*2b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 5317*2b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 5318*2b0b2ea7SShri 
Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5319*2b0b2ea7SShri Abhyankar } 5320*2b0b2ea7SShri Abhyankar 5321*2b0b2ea7SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 5322*2b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[i]; 5323*2b0b2ea7SShri Abhyankar pj = b->j + bdiag[i]; 5324*2b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5325*2b0b2ea7SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5326*2b0b2ea7SShri Abhyankar /*ierr = Kernel_A_gets_inverse_A_7(pv,shift);CHKERRQ(ierr); */ 5327*2b0b2ea7SShri Abhyankar 5328*2b0b2ea7SShri Abhyankar /* U part */ 5329*2b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 5330*2b0b2ea7SShri Abhyankar pj = b->j + bdiag[i+1]+1; 5331*2b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 5332*2b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 5333*2b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5334*2b0b2ea7SShri Abhyankar } 5335*2b0b2ea7SShri Abhyankar } 5336*2b0b2ea7SShri Abhyankar 5337*2b0b2ea7SShri Abhyankar ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5338*2b0b2ea7SShri Abhyankar ierr = PetscFree2(v_work,v_pivots);CHKERRQ(ierr); 5339*2b0b2ea7SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5340*2b0b2ea7SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5341*2b0b2ea7SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_15; 5342*2b0b2ea7SShri Abhyankar C->ops->solvetranspose = 0; 5343*2b0b2ea7SShri Abhyankar C->assembled = PETSC_TRUE; 5344*2b0b2ea7SShri Abhyankar ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5345*2b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 5346*2b0b2ea7SShri Abhyankar } 5347*2b0b2ea7SShri Abhyankar 53486bce7ff8SHong Zhang #undef __FUNCT__ 53494dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 
53504dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 53516bce7ff8SHong Zhang { 53526bce7ff8SHong Zhang Mat C=B; 53536bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 53546bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 53556bce7ff8SHong Zhang PetscErrorCode ierr; 53566bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 53576bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 53586bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5359b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5360914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5361914a18a2SHong Zhang MatScalar *v_work; 5362ae3d28f0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 53636bce7ff8SHong Zhang 53646bce7ff8SHong Zhang PetscFunctionBegin; 53656bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 53666bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5367ae3d28f0SHong Zhang 5368fca92195SBarry Smith ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5369fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 53706bce7ff8SHong Zhang ics = ic; 53716bce7ff8SHong Zhang 5372914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 5373fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5374914a18a2SHong Zhang 53756bce7ff8SHong Zhang for (i=0; i<n; i++){ 53766bce7ff8SHong Zhang /* zero rtmp */ 53776bce7ff8SHong Zhang /* L part */ 53786bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 53796bce7ff8SHong Zhang bjtmp = bj + bi[i]; 5380914a18a2SHong Zhang for (j=0; j<nz; j++){ 5381914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5382914a18a2SHong Zhang } 53836bce7ff8SHong Zhang 53846bce7ff8SHong Zhang /* U part */ 
53851a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 53861a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 53871a83e813SShri Abhyankar for (j=0; j<nz; j++){ 53881a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 53891a83e813SShri Abhyankar } 53901a83e813SShri Abhyankar 53911a83e813SShri Abhyankar /* load in initial (unfactored row) */ 53921a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 53931a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 53941a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 53951a83e813SShri Abhyankar for (j=0; j<nz; j++) { 53961a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 53971a83e813SShri Abhyankar } 53981a83e813SShri Abhyankar 53991a83e813SShri Abhyankar /* elimination */ 54001a83e813SShri Abhyankar bjtmp = bj + bi[i]; 54011a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 54021a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 54031a83e813SShri Abhyankar row = bjtmp[k]; 54041a83e813SShri Abhyankar pc = rtmp + bs2*row; 54051a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 54061a83e813SShri Abhyankar if (flg) { 54071a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 54081a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 54091a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 54101a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 54111a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 54121a83e813SShri Abhyankar for (j=0; j<nz; j++) { 54131a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 54141a83e813SShri Abhyankar } 54151a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 54161a83e813SShri Abhyankar } 54171a83e813SShri Abhyankar } 54181a83e813SShri Abhyankar 
54191a83e813SShri Abhyankar /* finished row so stick it into b->a */ 54201a83e813SShri Abhyankar /* L part */ 54211a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 54221a83e813SShri Abhyankar pj = b->j + bi[i] ; 54231a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 54241a83e813SShri Abhyankar for (j=0; j<nz; j++) { 54251a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 54261a83e813SShri Abhyankar } 54271a83e813SShri Abhyankar 54281a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 54291a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 54301a83e813SShri Abhyankar pj = b->j + bdiag[i]; 54311a83e813SShri Abhyankar /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 54321a83e813SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 54331a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 54341a83e813SShri Abhyankar 54351a83e813SShri Abhyankar /* U part */ 54361a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 54371a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 54381a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 54391a83e813SShri Abhyankar for (j=0; j<nz; j++){ 54401a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 54411a83e813SShri Abhyankar } 54421a83e813SShri Abhyankar } 54431a83e813SShri Abhyankar 54441a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 5445fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 54461a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 54471a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 54481a83e813SShri Abhyankar 5449ae3d28f0SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5450ae3d28f0SHong Zhang ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5451ae3d28f0SHong Zhang both_identity = 
(PetscTruth) (row_identity && col_identity); 5452ae3d28f0SHong Zhang if (both_identity){ 54534dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5454ae3d28f0SHong Zhang } else { 54554dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N; 5456ae3d28f0SHong Zhang } 54574dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5458ae3d28f0SHong Zhang 54591a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 54601a83e813SShri Abhyankar ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 54611a83e813SShri Abhyankar PetscFunctionReturn(0); 54621a83e813SShri Abhyankar } 54631a83e813SShri Abhyankar 54646bce7ff8SHong Zhang /* 54656bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 54664dd39f65SShri Abhyankar See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 54674dd39f65SShri Abhyankar because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 54686bce7ff8SHong Zhang */ 5469c0c7eb62SShri Abhyankar 54706bce7ff8SHong Zhang #undef __FUNCT__ 54714dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 54724dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 54736bce7ff8SHong Zhang { 54746bce7ff8SHong Zhang 54756bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 54766bce7ff8SHong Zhang PetscErrorCode ierr; 547716a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 547835aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 547935aa4fcfSShri Abhyankar 548035aa4fcfSShri Abhyankar PetscFunctionBegin; 548135aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 548235aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 548335aa4fcfSShri Abhyankar 548435aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 
548535aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 548635aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 548735aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 548835aa4fcfSShri Abhyankar if (!b->diag){ 548935aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 549035aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 549135aa4fcfSShri Abhyankar } 549235aa4fcfSShri Abhyankar bdiag = b->diag; 549335aa4fcfSShri Abhyankar 549435aa4fcfSShri Abhyankar if (n > 0) { 549535aa4fcfSShri Abhyankar ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 549635aa4fcfSShri Abhyankar } 549735aa4fcfSShri Abhyankar 549835aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 549935aa4fcfSShri Abhyankar bi = b->i; 550035aa4fcfSShri Abhyankar bj = b->j; 550135aa4fcfSShri Abhyankar 550235aa4fcfSShri Abhyankar /* L part */ 550335aa4fcfSShri Abhyankar bi[0] = 0; 550435aa4fcfSShri Abhyankar for (i=0; i<n; i++){ 550535aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 550635aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 550735aa4fcfSShri Abhyankar aj = a->j + ai[i]; 550835aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 550935aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 551035aa4fcfSShri Abhyankar } 551135aa4fcfSShri Abhyankar } 551235aa4fcfSShri Abhyankar 551335aa4fcfSShri Abhyankar /* U part */ 551435aa4fcfSShri Abhyankar bi_temp = bi[n]; 551535aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 551635aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 551735aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 551835aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 551935aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 552035aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 552135aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 552235aa4fcfSShri Abhyankar } 
552335aa4fcfSShri Abhyankar /* diag[i] */ 552435aa4fcfSShri Abhyankar *bj = i; bj++; 552535aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 552635aa4fcfSShri Abhyankar } 552735aa4fcfSShri Abhyankar PetscFunctionReturn(0); 552835aa4fcfSShri Abhyankar } 552935aa4fcfSShri Abhyankar 553035aa4fcfSShri Abhyankar #undef __FUNCT__ 55314dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 55324dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 553316a2bf60SHong Zhang { 553416a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 553516a2bf60SHong Zhang IS isicol; 553616a2bf60SHong Zhang PetscErrorCode ierr; 553716a2bf60SHong Zhang const PetscInt *r,*ic; 55387fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 553916a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 554016a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 554116a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 55427fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 554316a2bf60SHong Zhang PetscReal f; 554416a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 554516a2bf60SHong Zhang PetscBT lnkbt; 554616a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 554716a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 554816a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 554916a2bf60SHong Zhang PetscTruth missing; 55507fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 55514dd39f65SShri Abhyankar PetscTruth olddatastruct = PETSC_FALSE; 555216a2bf60SHong Zhang 555316a2bf60SHong Zhang PetscFunctionBegin; 55544dd39f65SShri Abhyankar ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_old",&olddatastruct,PETSC_NULL);CHKERRQ(ierr); 55554dd39f65SShri Abhyankar if (olddatastruct){ 555606e38f1dSHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_inplace(fact,A,isrow,iscol,info);CHKERRQ(ierr); 
555706e38f1dSHong Zhang PetscFunctionReturn(0); 555806e38f1dSHong Zhang } 555916a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 556016a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 556116a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 556216a2bf60SHong Zhang 556316a2bf60SHong Zhang f = info->fill; 556416a2bf60SHong Zhang levels = (PetscInt)info->levels; 556516a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 556616a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 556716a2bf60SHong Zhang 556816a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 556916a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 55707fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 557116a2bf60SHong Zhang 55727fa3a6a0SHong Zhang if (!levels && both_identity) { 557316a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 55744dd39f65SShri Abhyankar ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 55754dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 557635aa4fcfSShri Abhyankar 557735aa4fcfSShri Abhyankar fact->factor = MAT_FACTOR_ILU; 557835aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 557935aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 558035aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 558135aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 558235aa4fcfSShri Abhyankar b->row = isrow; 558335aa4fcfSShri Abhyankar b->col = iscol; 558435aa4fcfSShri Abhyankar b->icol = isicol; 558535aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 558635aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 
558735aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 558835aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 558935aa4fcfSShri Abhyankar PetscFunctionReturn(0); 559035aa4fcfSShri Abhyankar } 559135aa4fcfSShri Abhyankar 559235aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 559335aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 559435aa4fcfSShri Abhyankar 559535aa4fcfSShri Abhyankar /* get new row pointers */ 559635aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 559735aa4fcfSShri Abhyankar bi[0] = 0; 559835aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 559935aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 560035aa4fcfSShri Abhyankar bdiag[0] = 0; 560135aa4fcfSShri Abhyankar 5602fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 560335aa4fcfSShri Abhyankar 560435aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active row */ 560535aa4fcfSShri Abhyankar nlnk = n + 1; 560635aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 560735aa4fcfSShri Abhyankar 560835aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 560935aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 561035aa4fcfSShri Abhyankar current_space = free_space; 561135aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 561235aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 561335aa4fcfSShri Abhyankar 561435aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 561535aa4fcfSShri Abhyankar nzi = 0; 561635aa4fcfSShri Abhyankar /* copy current row into linked list */ 561735aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 561835aa4fcfSShri Abhyankar if (!nnz) 
SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 561935aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 562035aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 562135aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 562235aa4fcfSShri Abhyankar nzi += nlnk; 562335aa4fcfSShri Abhyankar 562435aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 562535aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 562635aa4fcfSShri Abhyankar fm = n; 562735aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 562835aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 562935aa4fcfSShri Abhyankar lnk[fm] = i; 563035aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 563135aa4fcfSShri Abhyankar nzi++; dcount++; 563235aa4fcfSShri Abhyankar } 563335aa4fcfSShri Abhyankar 563435aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 563535aa4fcfSShri Abhyankar nzbd = 0; 563635aa4fcfSShri Abhyankar prow = lnk[n]; 563735aa4fcfSShri Abhyankar while (prow < i) { 563835aa4fcfSShri Abhyankar nnz = bdiag[prow]; 563935aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 564035aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 564135aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 564235aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 564335aa4fcfSShri Abhyankar nzi += nlnk; 564435aa4fcfSShri Abhyankar prow = lnk[prow]; 564535aa4fcfSShri Abhyankar nzbd++; 564635aa4fcfSShri Abhyankar } 564735aa4fcfSShri Abhyankar bdiag[i] = nzbd; 564835aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 564935aa4fcfSShri Abhyankar 565035aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 565135aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 565235aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated 
and max additional space needed */ 565335aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr); 565435aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr); 565535aa4fcfSShri Abhyankar reallocs++; 565635aa4fcfSShri Abhyankar } 565735aa4fcfSShri Abhyankar 565835aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 565935aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 566035aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 566135aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 566235aa4fcfSShri Abhyankar 566335aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 566435aa4fcfSShri Abhyankar if (*(bj_ptr[i]+bdiag[i]) != i) { 566535aa4fcfSShri Abhyankar SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 566635aa4fcfSShri Abhyankar try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 566735aa4fcfSShri Abhyankar } 566835aa4fcfSShri Abhyankar 566935aa4fcfSShri Abhyankar current_space->array += nzi; 567035aa4fcfSShri Abhyankar current_space->local_used += nzi; 567135aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 567235aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 567335aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 567435aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 567535aa4fcfSShri Abhyankar } 567635aa4fcfSShri Abhyankar 567735aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 567835aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 567935aa4fcfSShri Abhyankar 568035aa4fcfSShri Abhyankar /* destroy list of free space and other temporary arrays */ 568135aa4fcfSShri Abhyankar ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 568235aa4fcfSShri Abhyankar 568335aa4fcfSShri Abhyankar /* 
copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 56842ce24eb6SHong Zhang ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 568535aa4fcfSShri Abhyankar 568635aa4fcfSShri Abhyankar ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 568735aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5688fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 568935aa4fcfSShri Abhyankar 569035aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 569135aa4fcfSShri Abhyankar { 569235aa4fcfSShri Abhyankar PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 569335aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 569435aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 569535aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 569635aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 569735aa4fcfSShri Abhyankar if (diagonal_fill) { 569835aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 569935aa4fcfSShri Abhyankar } 570035aa4fcfSShri Abhyankar } 570135aa4fcfSShri Abhyankar #endif 570235aa4fcfSShri Abhyankar 570335aa4fcfSShri Abhyankar /* put together the new matrix */ 570435aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 570535aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 570635aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 570735aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 570835aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 570935aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 571035aa4fcfSShri Abhyankar ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 571135aa4fcfSShri 
Abhyankar b->j = bj; 571235aa4fcfSShri Abhyankar b->i = bi; 571335aa4fcfSShri Abhyankar b->diag = bdiag; 571435aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 571535aa4fcfSShri Abhyankar b->ilen = 0; 571635aa4fcfSShri Abhyankar b->imax = 0; 571735aa4fcfSShri Abhyankar b->row = isrow; 571835aa4fcfSShri Abhyankar b->col = iscol; 571935aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 572035aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 572135aa4fcfSShri Abhyankar b->icol = isicol; 572235aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 572335aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 572435aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 572535aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 572635aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 5727ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 5728ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 5729ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 57304dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 573135aa4fcfSShri Abhyankar PetscFunctionReturn(0); 573235aa4fcfSShri Abhyankar } 573335aa4fcfSShri Abhyankar 573435aa4fcfSShri Abhyankar 57354e2b4712SSatish Balay /* 57364e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 57374e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 57384e2b4712SSatish Balay Not a good example of code reuse. 
57394e2b4712SSatish Balay */ 57404a2ae208SSatish Balay #undef __FUNCT__ 574106e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 574206e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 57434e2b4712SSatish Balay { 57444e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 57454e2b4712SSatish Balay IS isicol; 57466849ba73SBarry Smith PetscErrorCode ierr; 57475d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 57485d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5749a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5750d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 575141df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 5752329f5518SBarry Smith PetscReal f; 57534e2b4712SSatish Balay 57544e2b4712SSatish Balay PetscFunctionBegin; 57556bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 57566bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 57576bce7ff8SHong Zhang 5758435faa5fSBarry Smith f = info->fill; 5759690b6cddSBarry Smith levels = (PetscInt)info->levels; 5760690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 57614c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 576216a2bf60SHong Zhang 5763667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5764667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 57657d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 5766309c388cSBarry Smith 576741df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 576816a2bf60SHong Zhang ierr = 
MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 57698b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 57706bce7ff8SHong Zhang 5771719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 5772ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5773bb3d539aSBarry Smith b->row = isrow; 5774bb3d539aSBarry Smith b->col = iscol; 5775bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5776bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5777bb3d539aSBarry Smith b->icol = isicol; 5778bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5779b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 57806bce7ff8SHong Zhang PetscFunctionReturn(0); 57816bce7ff8SHong Zhang } 57826bce7ff8SHong Zhang 57836bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 57844e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 57854e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 57864e2b4712SSatish Balay 57874e2b4712SSatish Balay /* get new row pointers */ 5788690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 57894e2b4712SSatish Balay ainew[0] = 0; 57904e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 5791690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 5792690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 57934e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 5794690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 57954e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 5796690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 57974e2b4712SSatish Balay /* im is level for each filled value */ 
5798690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 57994e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 5800690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 58014e2b4712SSatish Balay dloc[0] = 0; 58024e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 5803435faa5fSBarry Smith 5804435faa5fSBarry Smith /* copy prow into linked list */ 58054e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 58063b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 58074e2b4712SSatish Balay xi = aj + ai[r[prow]]; 58084e2b4712SSatish Balay fill[n] = n; 5809435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 58104e2b4712SSatish Balay while (nz--) { 58114e2b4712SSatish Balay fm = n; 58124e2b4712SSatish Balay idx = ic[*xi++]; 58134e2b4712SSatish Balay do { 58144e2b4712SSatish Balay m = fm; 58154e2b4712SSatish Balay fm = fill[m]; 58164e2b4712SSatish Balay } while (fm < idx); 58174e2b4712SSatish Balay fill[m] = idx; 58184e2b4712SSatish Balay fill[idx] = fm; 58194e2b4712SSatish Balay im[idx] = 0; 58204e2b4712SSatish Balay } 5821435faa5fSBarry Smith 5822435faa5fSBarry Smith /* make sure diagonal entry is included */ 5823435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 5824435faa5fSBarry Smith fm = n; 5825435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 5826435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5827435faa5fSBarry Smith fill[fm] = prow; 5828435faa5fSBarry Smith im[prow] = 0; 5829435faa5fSBarry Smith nzf++; 5830335d9088SBarry Smith dcount++; 5831435faa5fSBarry Smith } 5832435faa5fSBarry Smith 58334e2b4712SSatish Balay nzi = 0; 58344e2b4712SSatish Balay row = fill[n]; 58354e2b4712SSatish Balay while (row < prow) { 58364e2b4712SSatish Balay incrlev = im[row] + 1; 58374e2b4712SSatish Balay nz = dloc[row]; 
5838435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 58394e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 58404e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 58414e2b4712SSatish Balay fm = row; 58424e2b4712SSatish Balay while (nnz-- > 0) { 58434e2b4712SSatish Balay idx = *xi++; 58444e2b4712SSatish Balay if (*flev + incrlev > levels) { 58454e2b4712SSatish Balay flev++; 58464e2b4712SSatish Balay continue; 58474e2b4712SSatish Balay } 58484e2b4712SSatish Balay do { 58494e2b4712SSatish Balay m = fm; 58504e2b4712SSatish Balay fm = fill[m]; 58514e2b4712SSatish Balay } while (fm < idx); 58524e2b4712SSatish Balay if (fm != idx) { 58534e2b4712SSatish Balay im[idx] = *flev + incrlev; 58544e2b4712SSatish Balay fill[m] = idx; 58554e2b4712SSatish Balay fill[idx] = fm; 58564e2b4712SSatish Balay fm = idx; 58574e2b4712SSatish Balay nzf++; 5858ecf371e4SBarry Smith } else { 58594e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 58604e2b4712SSatish Balay } 58614e2b4712SSatish Balay flev++; 58624e2b4712SSatish Balay } 58634e2b4712SSatish Balay row = fill[row]; 58644e2b4712SSatish Balay nzi++; 58654e2b4712SSatish Balay } 58664e2b4712SSatish Balay /* copy new filled row into permanent storage */ 58674e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 58684e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 5869ecf371e4SBarry Smith 5870ecf371e4SBarry Smith /* estimate how much additional space we will need */ 5871ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5872ecf371e4SBarry Smith /* just double the memory each time */ 5873690b6cddSBarry Smith PetscInt maxadd = jmax; 5874ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 58754e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 58764e2b4712SSatish Balay jmax += maxadd; 5877ecf371e4SBarry Smith 5878ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 58795d0c19d7SBarry Smith ierr = 
PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 58805d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5881606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 58825d0c19d7SBarry Smith ajnew = xitmp; 58835d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 58845d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5885606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 58865d0c19d7SBarry Smith ajfill = xitmp; 5887eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 58884e2b4712SSatish Balay } 58895d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 58904e2b4712SSatish Balay flev = ajfill + ainew[prow]; 58914e2b4712SSatish Balay dloc[prow] = nzi; 58924e2b4712SSatish Balay fm = fill[n]; 58934e2b4712SSatish Balay while (nzf--) { 58945d0c19d7SBarry Smith *xitmp++ = fm; 58954e2b4712SSatish Balay *flev++ = im[fm]; 58964e2b4712SSatish Balay fm = fill[fm]; 58974e2b4712SSatish Balay } 5898435faa5fSBarry Smith /* make sure row has diagonal entry */ 5899435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 590077431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 59012401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5902435faa5fSBarry Smith } 59034e2b4712SSatish Balay } 5904606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 59054e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 59064e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5907606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 5908606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 59094e2b4712SSatish Balay 59106cf91177SBarry Smith #if defined(PETSC_USE_INFO) 59114e2b4712SSatish Balay { 5912329f5518SBarry Smith PetscReal af = 
((PetscReal)ainew[n])/((PetscReal)ai[n]); 5913ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5914ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5915ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5916ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5917335d9088SBarry Smith if (diagonal_fill) { 5918ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5919335d9088SBarry Smith } 59204e2b4712SSatish Balay } 592163ba0a88SBarry Smith #endif 59224e2b4712SSatish Balay 59234e2b4712SSatish Balay /* put together the new matrix */ 5924719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5925719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5926ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5927e6b907acSBarry Smith b->free_a = PETSC_TRUE; 5928e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 59297c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 5930a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 59314e2b4712SSatish Balay b->j = ajnew; 59324e2b4712SSatish Balay b->i = ainew; 59334e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 59344e2b4712SSatish Balay b->diag = dloc; 59357f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 59364e2b4712SSatish Balay b->ilen = 0; 59374e2b4712SSatish Balay b->imax = 0; 59384e2b4712SSatish Balay b->row = isrow; 59394e2b4712SSatish Balay b->col = iscol; 5940bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5941c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5942c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5943e51c0b9cSSatish Balay b->icol = isicol; 594487828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 59454e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 59464e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 5947719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 59484e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 59494e2b4712SSatish Balay 5950ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 5951ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 5952ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 59536bce7ff8SHong Zhang 59548b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 59558661488fSKris Buschelman PetscFunctionReturn(0); 59568661488fSKris Buschelman } 59578661488fSKris Buschelman 5958732ee342SKris Buschelman #undef __FUNCT__ 59597e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5960dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 59617e7071cdSKris Buschelman { 596212272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 596312272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 59645a9542e3SKris Buschelman PetscFunctionBegin; 59657cf1b8d3SKris Buschelman /* Undo Column scaling */ 59667cf1b8d3SKris Buschelman /* while (nz--) { */ 59677cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 59687cf1b8d3SKris Buschelman /* } */ 5969c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 5970c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 59717cf1b8d3SKris Buschelman PetscFunctionReturn(0); 59727cf1b8d3SKris Buschelman } 59737cf1b8d3SKris Buschelman 59747cf1b8d3SKris Buschelman #undef __FUNCT__ 59757cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5976dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 59777cf1b8d3SKris Buschelman { 59787cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5979b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 59802aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 59815a9542e3SKris Buschelman PetscFunctionBegin; 59820b9da03eSKris Buschelman /* Is this really necessary? */ 598320235379SKris Buschelman while (nz--) { 59840b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 59857e7071cdSKris Buschelman } 5986c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 59877e7071cdSKris Buschelman PetscFunctionReturn(0); 59887e7071cdSKris Buschelman } 59897e7071cdSKris Buschelman 5990732ee342SKris Buschelman 5991