14e2b4712SSatish Balay /* 24e2b4712SSatish Balay Factorization code for BAIJ format. 34e2b4712SSatish Balay */ 44e2b4712SSatish Balay 54e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h" 64e2b4712SSatish Balay #include "src/inline/ilu.h" 774c49faeSBarry Smith #include "src/inline/dot.h" 84e2b4712SSatish Balay 94a2ae208SSatish Balay #undef __FUNCT__ 104a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 11dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 12f1af5d2fSBarry Smith { 13f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 14dfbe8321SBarry Smith PetscErrorCode ierr; 15dfbe8321SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 16f1af5d2fSBarry Smith int *diag = a->diag; 17f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 1887828ca2SBarry Smith PetscScalar s1,*x,*b; 19f1af5d2fSBarry Smith 20f1af5d2fSBarry Smith PetscFunctionBegin; 21ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 221ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 231ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24f1af5d2fSBarry Smith 25f1af5d2fSBarry Smith /* forward solve the U^T */ 26f1af5d2fSBarry Smith for (i=0; i<n; i++) { 27f1af5d2fSBarry Smith 28f1af5d2fSBarry Smith v = aa + diag[i]; 29f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 30ef66eb69SBarry Smith s1 = (*v++)*x[i]; 31f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 32f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 33f1af5d2fSBarry Smith while (nz--) { 34f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 35f1af5d2fSBarry Smith } 36f1af5d2fSBarry Smith x[i] = s1; 37f1af5d2fSBarry Smith } 38f1af5d2fSBarry Smith /* backward solve the L^T */ 39f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 40f1af5d2fSBarry Smith v = aa + diag[i] - 1; 41f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 42f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 43f1af5d2fSBarry Smith s1 = x[i]; 44f1af5d2fSBarry Smith while (nz--) { 45f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 46f1af5d2fSBarry Smith } 47f1af5d2fSBarry Smith } 481ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 491ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 50b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 51f1af5d2fSBarry Smith PetscFunctionReturn(0); 52f1af5d2fSBarry Smith } 53f1af5d2fSBarry Smith 544a2ae208SSatish Balay #undef __FUNCT__ 554a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 56dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 57f1af5d2fSBarry Smith { 58f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 59dfbe8321SBarry Smith PetscErrorCode ierr; 60dfbe8321SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 61f1af5d2fSBarry Smith int *diag = a->diag,oidx; 62f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6387828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6487828ca2SBarry Smith PetscScalar *x,*b; 65f1af5d2fSBarry Smith 66f1af5d2fSBarry Smith PetscFunctionBegin; 67ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 681ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 691ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 70f1af5d2fSBarry Smith 71f1af5d2fSBarry Smith /* forward solve the U^T */ 72f1af5d2fSBarry Smith idx = 0; 73f1af5d2fSBarry Smith for (i=0; i<n; i++) { 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith v = aa + 4*diag[i]; 76f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 77ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 78f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 79f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 80f1af5d2fSBarry Smith v += 4; 81f1af5d2fSBarry Smith 82f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 83f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 84f1af5d2fSBarry Smith while (nz--) { 85f1af5d2fSBarry Smith oidx = 2*(*vi++); 86f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 87f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 88f1af5d2fSBarry Smith v += 4; 89f1af5d2fSBarry Smith } 90f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 91f1af5d2fSBarry Smith idx += 2; 92f1af5d2fSBarry Smith } 93f1af5d2fSBarry Smith /* backward solve the L^T */ 94f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 95f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 96f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 97f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 98f1af5d2fSBarry Smith idt = 2*i; 99f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 100f1af5d2fSBarry Smith while (nz--) { 101f1af5d2fSBarry Smith idx = 2*(*vi--); 102f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 103f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 104f1af5d2fSBarry Smith v -= 4; 105f1af5d2fSBarry Smith } 106f1af5d2fSBarry Smith } 1071ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1081ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 110f1af5d2fSBarry Smith PetscFunctionReturn(0); 111f1af5d2fSBarry Smith } 112f1af5d2fSBarry Smith 1134a2ae208SSatish Balay #undef __FUNCT__ 1144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 115dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 116f1af5d2fSBarry Smith { 117f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 118dfbe8321SBarry Smith PetscErrorCode ierr; 119dfbe8321SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 120f1af5d2fSBarry Smith int *diag = a->diag,oidx; 121f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12287828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12387828ca2SBarry Smith PetscScalar *x,*b; 124f1af5d2fSBarry Smith 125f1af5d2fSBarry Smith PetscFunctionBegin; 126ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1271ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1281ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 129f1af5d2fSBarry Smith 130f1af5d2fSBarry Smith /* forward solve the U^T */ 131f1af5d2fSBarry Smith idx = 0; 132f1af5d2fSBarry Smith for (i=0; i<n; i++) { 133f1af5d2fSBarry Smith 134f1af5d2fSBarry Smith v = aa + 9*diag[i]; 135f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 136ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 137f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 138f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 139f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 140f1af5d2fSBarry Smith v += 9; 141f1af5d2fSBarry Smith 142f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 143f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 144f1af5d2fSBarry Smith while (nz--) { 145f1af5d2fSBarry Smith oidx = 3*(*vi++); 146f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 147f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 148f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 149f1af5d2fSBarry Smith v += 9; 150f1af5d2fSBarry Smith } 151f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 152f1af5d2fSBarry Smith idx += 3; 153f1af5d2fSBarry Smith } 154f1af5d2fSBarry Smith /* backward solve the L^T */ 155f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 156f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 157f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 158f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 159f1af5d2fSBarry Smith idt = 3*i; 160f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 161f1af5d2fSBarry Smith while (nz--) { 162f1af5d2fSBarry Smith idx = 3*(*vi--); 163f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 164f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 165f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 166f1af5d2fSBarry Smith v -= 9; 167f1af5d2fSBarry Smith } 168f1af5d2fSBarry Smith } 1691ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1701ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 171b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 172f1af5d2fSBarry Smith PetscFunctionReturn(0); 173f1af5d2fSBarry Smith } 174f1af5d2fSBarry Smith 1754a2ae208SSatish Balay #undef __FUNCT__ 1764a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 177dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 178f1af5d2fSBarry Smith { 179f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 180dfbe8321SBarry Smith PetscErrorCode ierr; 181dfbe8321SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 182f1af5d2fSBarry Smith int *diag = a->diag,oidx; 183f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18587828ca2SBarry Smith PetscScalar *x,*b; 186f1af5d2fSBarry Smith 187f1af5d2fSBarry Smith PetscFunctionBegin; 188ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1891ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1901ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 191f1af5d2fSBarry Smith 192f1af5d2fSBarry Smith /* forward solve the U^T */ 193f1af5d2fSBarry Smith idx = 0; 194f1af5d2fSBarry Smith for (i=0; i<n; i++) { 195f1af5d2fSBarry Smith 196f1af5d2fSBarry Smith v = aa + 16*diag[i]; 197f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 198ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 199f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 200f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 201f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 202f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 203f1af5d2fSBarry Smith v += 16; 204f1af5d2fSBarry Smith 205f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 206f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 207f1af5d2fSBarry Smith while (nz--) { 208f1af5d2fSBarry Smith oidx = 4*(*vi++); 209f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 210f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 211f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 212f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 213f1af5d2fSBarry Smith v += 16; 214f1af5d2fSBarry Smith } 215f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 216f1af5d2fSBarry Smith idx += 4; 217f1af5d2fSBarry Smith } 218f1af5d2fSBarry Smith /* backward solve the L^T */ 219f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 220f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 221f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 222f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 223f1af5d2fSBarry Smith idt = 4*i; 224f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 225f1af5d2fSBarry Smith while (nz--) { 226f1af5d2fSBarry Smith idx = 4*(*vi--); 227f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 228f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 229f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 230f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 231f1af5d2fSBarry Smith v -= 16; 232f1af5d2fSBarry Smith } 233f1af5d2fSBarry Smith } 2341ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2351ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 236b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 237f1af5d2fSBarry Smith PetscFunctionReturn(0); 238f1af5d2fSBarry Smith } 239f1af5d2fSBarry Smith 2404a2ae208SSatish Balay #undef __FUNCT__ 2414a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 242dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 243f1af5d2fSBarry Smith { 244f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 245dfbe8321SBarry Smith PetscErrorCode ierr; 246dfbe8321SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 247f1af5d2fSBarry Smith int *diag = a->diag,oidx; 248f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 24987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25087828ca2SBarry Smith PetscScalar *x,*b; 251f1af5d2fSBarry Smith 252f1af5d2fSBarry Smith PetscFunctionBegin; 253ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2541ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2551ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 256f1af5d2fSBarry Smith 257f1af5d2fSBarry Smith /* forward solve the U^T */ 258f1af5d2fSBarry Smith idx = 0; 259f1af5d2fSBarry Smith for (i=0; i<n; i++) { 260f1af5d2fSBarry Smith 261f1af5d2fSBarry Smith v = aa + 25*diag[i]; 262f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 263ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 264f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 265f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 266f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 267f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 268f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 269f1af5d2fSBarry Smith v += 25; 270f1af5d2fSBarry Smith 271f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 272f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 273f1af5d2fSBarry Smith while (nz--) { 274f1af5d2fSBarry Smith oidx = 5*(*vi++); 275f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 276f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 277f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 278f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 279f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 280f1af5d2fSBarry Smith v += 25; 281f1af5d2fSBarry Smith } 282f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 283f1af5d2fSBarry Smith idx += 5; 284f1af5d2fSBarry Smith } 285f1af5d2fSBarry Smith /* backward solve the L^T */ 286f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 287f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 288f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 289f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 290f1af5d2fSBarry Smith idt = 5*i; 291f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 292f1af5d2fSBarry Smith while (nz--) { 293f1af5d2fSBarry Smith idx = 5*(*vi--); 294f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 295f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 296f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 297f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 298f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 299f1af5d2fSBarry Smith v -= 25; 300f1af5d2fSBarry Smith } 301f1af5d2fSBarry Smith } 3021ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3031ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 304b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 305f1af5d2fSBarry Smith PetscFunctionReturn(0); 306f1af5d2fSBarry Smith } 307f1af5d2fSBarry Smith 3084a2ae208SSatish Balay #undef __FUNCT__ 3094a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 310dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 311f1af5d2fSBarry Smith { 312f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 313dfbe8321SBarry Smith PetscErrorCode ierr; 314dfbe8321SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 315f1af5d2fSBarry Smith int *diag = a->diag,oidx; 316f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 31787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 31887828ca2SBarry Smith PetscScalar *x,*b; 319f1af5d2fSBarry Smith 320f1af5d2fSBarry Smith PetscFunctionBegin; 321ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3221ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3231ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 324f1af5d2fSBarry Smith 325f1af5d2fSBarry Smith /* forward solve the U^T */ 326f1af5d2fSBarry Smith idx = 0; 327f1af5d2fSBarry Smith for (i=0; i<n; i++) { 328f1af5d2fSBarry Smith 329f1af5d2fSBarry Smith v = aa + 36*diag[i]; 330f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 331ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 332ef66eb69SBarry Smith x6 = x[5+idx]; 333f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 334f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 335f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 336f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 337f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 338f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 339f1af5d2fSBarry Smith v += 36; 340f1af5d2fSBarry Smith 341f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 342f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 343f1af5d2fSBarry Smith while (nz--) { 344f1af5d2fSBarry Smith oidx = 6*(*vi++); 345f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 346f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 347f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 348f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 349f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 350f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 351f1af5d2fSBarry Smith v += 36; 352f1af5d2fSBarry Smith } 353f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 354f1af5d2fSBarry Smith x[5+idx] = s6; 355f1af5d2fSBarry Smith idx += 6; 356f1af5d2fSBarry Smith } 357f1af5d2fSBarry Smith /* backward solve the L^T */ 358f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 359f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 360f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 361f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 362f1af5d2fSBarry Smith idt = 6*i; 363f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 364f1af5d2fSBarry Smith s6 = x[5+idt]; 365f1af5d2fSBarry Smith while (nz--) { 366f1af5d2fSBarry Smith idx = 6*(*vi--); 367f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 368f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 369f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 370f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 371f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 372f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 373f1af5d2fSBarry Smith v -= 36; 374f1af5d2fSBarry Smith } 375f1af5d2fSBarry Smith } 3761ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3771ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 378b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 379f1af5d2fSBarry Smith PetscFunctionReturn(0); 380f1af5d2fSBarry Smith } 381f1af5d2fSBarry Smith 3824a2ae208SSatish Balay #undef __FUNCT__ 3834a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 384dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 385f1af5d2fSBarry Smith { 386f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 387dfbe8321SBarry Smith PetscErrorCode ierr; 388dfbe8321SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 389f1af5d2fSBarry Smith int *diag = a->diag,oidx; 390f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 39187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 39287828ca2SBarry Smith PetscScalar *x,*b; 393f1af5d2fSBarry Smith 394f1af5d2fSBarry Smith PetscFunctionBegin; 395ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3961ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3971ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 398f1af5d2fSBarry Smith 399f1af5d2fSBarry Smith /* forward solve the U^T */ 400f1af5d2fSBarry Smith idx = 0; 401f1af5d2fSBarry Smith for (i=0; i<n; i++) { 402f1af5d2fSBarry Smith 403f1af5d2fSBarry Smith v = aa + 49*diag[i]; 404f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 405ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 406ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 407f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 408f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 409f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 410f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 411f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 412f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 413f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 414f1af5d2fSBarry Smith v += 49; 415f1af5d2fSBarry Smith 416f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 417f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 418f1af5d2fSBarry Smith while (nz--) { 419f1af5d2fSBarry Smith oidx = 7*(*vi++); 420f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 421f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 422f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 423f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 424f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 425f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 426f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 427f1af5d2fSBarry Smith v += 49; 428f1af5d2fSBarry Smith } 429f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 430f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 431f1af5d2fSBarry Smith idx += 7; 432f1af5d2fSBarry Smith } 433f1af5d2fSBarry Smith /* backward solve the L^T */ 434f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 435f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 436f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 437f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 438f1af5d2fSBarry Smith idt = 7*i; 439f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 440f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 441f1af5d2fSBarry Smith while (nz--) { 442f1af5d2fSBarry Smith idx = 7*(*vi--); 443f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 444f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 445f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 446f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 447f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 448f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 449f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 450f1af5d2fSBarry Smith v -= 49; 451f1af5d2fSBarry Smith } 452f1af5d2fSBarry Smith } 4531ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4541ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 455b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 456f1af5d2fSBarry Smith PetscFunctionReturn(0); 457f1af5d2fSBarry Smith } 458f1af5d2fSBarry Smith 459f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4604a2ae208SSatish Balay #undef __FUNCT__ 4614a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 462dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 463f1af5d2fSBarry Smith { 464f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 465f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 466*6849ba73SBarry Smith PetscErrorCode ierr; 467*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 468f1af5d2fSBarry Smith int *diag = a->diag; 469f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 47087828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 471f1af5d2fSBarry Smith 472f1af5d2fSBarry Smith PetscFunctionBegin; 4731ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4741ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 475f1af5d2fSBarry Smith t = a->solve_work; 476f1af5d2fSBarry Smith 477f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 478f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 479f1af5d2fSBarry Smith 480f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 481f1af5d2fSBarry Smith for (i=0; i<n; i++) { 482f1af5d2fSBarry Smith t[i] = b[c[i]]; 483f1af5d2fSBarry Smith } 484f1af5d2fSBarry Smith 485f1af5d2fSBarry Smith /* forward solve the U^T */ 486f1af5d2fSBarry Smith for (i=0; i<n; i++) { 487f1af5d2fSBarry Smith 488f1af5d2fSBarry Smith v = aa + diag[i]; 489f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 490f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 491f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 492f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 493f1af5d2fSBarry Smith while (nz--) { 494f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 495f1af5d2fSBarry Smith } 496f1af5d2fSBarry Smith t[i] = s1; 497f1af5d2fSBarry Smith } 498f1af5d2fSBarry Smith /* backward solve the L^T */ 499f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 500f1af5d2fSBarry Smith v = aa + diag[i] - 1; 501f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 502f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 503f1af5d2fSBarry Smith s1 = t[i]; 504f1af5d2fSBarry Smith while (nz--) { 505f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 506f1af5d2fSBarry Smith } 507f1af5d2fSBarry Smith } 508f1af5d2fSBarry Smith 509f1af5d2fSBarry Smith /* copy t into x according to permutation */ 510f1af5d2fSBarry Smith for (i=0; i<n; i++) { 511f1af5d2fSBarry Smith x[r[i]] = t[i]; 512f1af5d2fSBarry Smith } 513f1af5d2fSBarry Smith 514f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 515f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5161ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5171ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 518b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 519f1af5d2fSBarry Smith PetscFunctionReturn(0); 520f1af5d2fSBarry Smith } 521f1af5d2fSBarry Smith 5224a2ae208SSatish Balay #undef __FUNCT__ 5234a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 524dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 525f1af5d2fSBarry Smith { 526f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 527f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 528*6849ba73SBarry Smith PetscErrorCode ierr; 529*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 530f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 531f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53287828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 53387828ca2SBarry Smith PetscScalar *x,*b,*t; 534f1af5d2fSBarry Smith 535f1af5d2fSBarry Smith PetscFunctionBegin; 5361ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5371ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 538f1af5d2fSBarry Smith t = a->solve_work; 539f1af5d2fSBarry Smith 540f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 541f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 542f1af5d2fSBarry Smith 543f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 544f1af5d2fSBarry Smith ii = 0; 545f1af5d2fSBarry Smith for (i=0; i<n; i++) { 546f1af5d2fSBarry Smith ic = 2*c[i]; 547f1af5d2fSBarry Smith t[ii] = b[ic]; 548f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 549f1af5d2fSBarry Smith ii += 2; 550f1af5d2fSBarry Smith } 551f1af5d2fSBarry Smith 552f1af5d2fSBarry Smith /* forward solve the U^T */ 553f1af5d2fSBarry Smith idx = 0; 554f1af5d2fSBarry Smith for (i=0; i<n; i++) { 555f1af5d2fSBarry Smith 556f1af5d2fSBarry Smith v = aa + 4*diag[i]; 557f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 558f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 559f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 560f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 561f1af5d2fSBarry Smith v += 4; 562f1af5d2fSBarry Smith 563f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 564f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 565f1af5d2fSBarry Smith while (nz--) { 566f1af5d2fSBarry Smith oidx = 2*(*vi++); 567f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 568f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 569f1af5d2fSBarry Smith v += 4; 570f1af5d2fSBarry Smith } 571f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 572f1af5d2fSBarry Smith idx += 2; 573f1af5d2fSBarry Smith } 574f1af5d2fSBarry Smith /* backward solve the L^T */ 575f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 576f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 577f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 578f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 579f1af5d2fSBarry Smith idt = 2*i; 580f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 581f1af5d2fSBarry Smith while (nz--) { 582f1af5d2fSBarry Smith idx = 2*(*vi--); 583f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 584f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 585f1af5d2fSBarry Smith v -= 4; 586f1af5d2fSBarry Smith } 587f1af5d2fSBarry Smith } 588f1af5d2fSBarry Smith 589f1af5d2fSBarry Smith /* copy t into x according to permutation */ 590f1af5d2fSBarry Smith ii = 0; 591f1af5d2fSBarry Smith for (i=0; i<n; i++) { 592f1af5d2fSBarry Smith ir = 2*r[i]; 593f1af5d2fSBarry Smith x[ir] = t[ii]; 594f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 595f1af5d2fSBarry Smith ii += 2; 596f1af5d2fSBarry Smith } 597f1af5d2fSBarry Smith 598f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 599f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6001ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6011ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 602b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 603f1af5d2fSBarry Smith PetscFunctionReturn(0); 604f1af5d2fSBarry Smith } 605f1af5d2fSBarry Smith 6064a2ae208SSatish Balay #undef __FUNCT__ 6074a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 608dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 609f1af5d2fSBarry Smith { 610f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 611f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 612*6849ba73SBarry Smith PetscErrorCode ierr; 613*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 614f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 615f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 61687828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 61787828ca2SBarry Smith PetscScalar *x,*b,*t; 618f1af5d2fSBarry Smith 619f1af5d2fSBarry Smith PetscFunctionBegin; 6201ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6211ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 622f1af5d2fSBarry Smith t = a->solve_work; 623f1af5d2fSBarry Smith 624f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 625f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 626f1af5d2fSBarry Smith 627f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 628f1af5d2fSBarry Smith ii = 0; 629f1af5d2fSBarry Smith for (i=0; i<n; i++) { 630f1af5d2fSBarry Smith ic = 3*c[i]; 631f1af5d2fSBarry Smith t[ii] = b[ic]; 632f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 633f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 634f1af5d2fSBarry Smith ii += 3; 635f1af5d2fSBarry Smith } 636f1af5d2fSBarry Smith 637f1af5d2fSBarry Smith /* forward solve the U^T */ 638f1af5d2fSBarry Smith idx = 0; 639f1af5d2fSBarry Smith for (i=0; i<n; i++) { 640f1af5d2fSBarry Smith 641f1af5d2fSBarry Smith v = aa + 9*diag[i]; 642f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 643f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 644f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 645f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 646f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 647f1af5d2fSBarry Smith v += 9; 648f1af5d2fSBarry Smith 649f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 650f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 651f1af5d2fSBarry Smith while (nz--) { 652f1af5d2fSBarry Smith oidx = 3*(*vi++); 653f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 654f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 655f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 656f1af5d2fSBarry Smith v += 9; 657f1af5d2fSBarry Smith } 658f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 659f1af5d2fSBarry Smith idx += 3; 660f1af5d2fSBarry Smith } 661f1af5d2fSBarry Smith /* backward solve the L^T */ 662f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 663f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 664f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 665f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 666f1af5d2fSBarry Smith idt = 3*i; 667f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 668f1af5d2fSBarry Smith while (nz--) { 669f1af5d2fSBarry Smith idx = 3*(*vi--); 670f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 671f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 672f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 673f1af5d2fSBarry Smith v -= 9; 674f1af5d2fSBarry Smith } 675f1af5d2fSBarry Smith } 676f1af5d2fSBarry Smith 677f1af5d2fSBarry Smith /* copy t into x according to permutation */ 678f1af5d2fSBarry Smith ii = 0; 679f1af5d2fSBarry Smith for (i=0; i<n; i++) { 680f1af5d2fSBarry Smith ir = 3*r[i]; 681f1af5d2fSBarry Smith x[ir] = t[ii]; 682f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 683f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 684f1af5d2fSBarry Smith ii += 3; 685f1af5d2fSBarry Smith } 686f1af5d2fSBarry Smith 687f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 688f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6891ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6901ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 691b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 692f1af5d2fSBarry Smith PetscFunctionReturn(0); 693f1af5d2fSBarry Smith } 694f1af5d2fSBarry Smith 6954a2ae208SSatish Balay #undef __FUNCT__ 6964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 697dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 698f1af5d2fSBarry Smith { 699f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 700f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 701*6849ba73SBarry Smith PetscErrorCode ierr; 702*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 703f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 704f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 70587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 70687828ca2SBarry Smith PetscScalar *x,*b,*t; 707f1af5d2fSBarry Smith 708f1af5d2fSBarry Smith PetscFunctionBegin; 7091ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7101ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 711f1af5d2fSBarry Smith t = a->solve_work; 712f1af5d2fSBarry Smith 713f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 714f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 715f1af5d2fSBarry Smith 716f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 717f1af5d2fSBarry Smith ii = 0; 718f1af5d2fSBarry Smith for (i=0; i<n; i++) { 719f1af5d2fSBarry Smith ic = 4*c[i]; 720f1af5d2fSBarry Smith t[ii] = b[ic]; 721f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 722f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 723f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 724f1af5d2fSBarry Smith ii += 4; 725f1af5d2fSBarry Smith } 726f1af5d2fSBarry Smith 727f1af5d2fSBarry Smith /* forward solve the U^T */ 728f1af5d2fSBarry Smith idx = 0; 729f1af5d2fSBarry Smith for (i=0; i<n; i++) { 730f1af5d2fSBarry Smith 731f1af5d2fSBarry Smith v = aa + 16*diag[i]; 732f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 733f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 734f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 735f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 736f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 737f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 738f1af5d2fSBarry Smith v += 16; 739f1af5d2fSBarry Smith 740f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 741f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 742f1af5d2fSBarry Smith while (nz--) { 743f1af5d2fSBarry Smith oidx = 4*(*vi++); 744f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 745f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 746f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 747f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 748f1af5d2fSBarry Smith v += 16; 749f1af5d2fSBarry Smith } 750f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 751f1af5d2fSBarry Smith idx += 4; 752f1af5d2fSBarry Smith } 753f1af5d2fSBarry Smith /* backward solve the L^T */ 754f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 755f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 756f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 757f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 758f1af5d2fSBarry Smith idt = 4*i; 759f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 760f1af5d2fSBarry Smith while (nz--) { 761f1af5d2fSBarry Smith idx = 4*(*vi--); 762f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 763f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 764f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 765f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 766f1af5d2fSBarry Smith v -= 16; 767f1af5d2fSBarry Smith } 768f1af5d2fSBarry Smith } 769f1af5d2fSBarry Smith 770f1af5d2fSBarry Smith /* copy t into x according to permutation */ 771f1af5d2fSBarry Smith ii = 0; 772f1af5d2fSBarry Smith for (i=0; i<n; i++) { 773f1af5d2fSBarry Smith ir = 4*r[i]; 774f1af5d2fSBarry Smith x[ir] = t[ii]; 775f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 776f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 777f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 778f1af5d2fSBarry Smith ii += 4; 779f1af5d2fSBarry Smith } 780f1af5d2fSBarry Smith 781f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 782f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7831ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7841ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 785b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 786f1af5d2fSBarry Smith PetscFunctionReturn(0); 787f1af5d2fSBarry Smith } 788f1af5d2fSBarry Smith 7894a2ae208SSatish Balay #undef __FUNCT__ 7904a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 791dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 792f1af5d2fSBarry Smith { 793f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 794f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 795*6849ba73SBarry Smith PetscErrorCode ierr; 796*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 797f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 798f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 79987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 80087828ca2SBarry Smith PetscScalar *x,*b,*t; 801f1af5d2fSBarry Smith 802f1af5d2fSBarry Smith PetscFunctionBegin; 8031ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8041ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 805f1af5d2fSBarry Smith t = a->solve_work; 806f1af5d2fSBarry Smith 807f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 808f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 809f1af5d2fSBarry Smith 810f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 811f1af5d2fSBarry Smith ii = 0; 812f1af5d2fSBarry Smith for (i=0; i<n; i++) { 813f1af5d2fSBarry Smith ic = 5*c[i]; 814f1af5d2fSBarry Smith t[ii] = b[ic]; 815f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 816f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 817f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 818f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 819f1af5d2fSBarry Smith ii += 5; 820f1af5d2fSBarry Smith } 821f1af5d2fSBarry Smith 822f1af5d2fSBarry Smith /* forward solve the U^T */ 823f1af5d2fSBarry Smith idx = 0; 824f1af5d2fSBarry Smith for (i=0; i<n; i++) { 825f1af5d2fSBarry Smith 826f1af5d2fSBarry Smith v = aa + 25*diag[i]; 827f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 828f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 829f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 830f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 831f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 832f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 833f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 834f1af5d2fSBarry Smith v += 25; 835f1af5d2fSBarry Smith 836f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 837f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 838f1af5d2fSBarry Smith while (nz--) { 839f1af5d2fSBarry Smith oidx = 5*(*vi++); 840f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 841f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 842f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 843f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 844f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 845f1af5d2fSBarry Smith v += 25; 846f1af5d2fSBarry Smith } 847f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 848f1af5d2fSBarry Smith idx += 5; 849f1af5d2fSBarry Smith } 850f1af5d2fSBarry Smith /* backward solve the L^T */ 851f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 852f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 853f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 854f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 855f1af5d2fSBarry Smith idt = 5*i; 856f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 857f1af5d2fSBarry Smith while (nz--) { 858f1af5d2fSBarry Smith idx = 5*(*vi--); 859f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 860f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 861f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 862f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 863f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 864f1af5d2fSBarry Smith v -= 25; 865f1af5d2fSBarry Smith } 866f1af5d2fSBarry Smith } 867f1af5d2fSBarry Smith 868f1af5d2fSBarry Smith /* copy t into x according to permutation */ 869f1af5d2fSBarry Smith ii = 0; 870f1af5d2fSBarry Smith for (i=0; i<n; i++) { 871f1af5d2fSBarry Smith ir = 5*r[i]; 872f1af5d2fSBarry Smith x[ir] = t[ii]; 873f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 874f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 875f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 876f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 877f1af5d2fSBarry Smith ii += 5; 878f1af5d2fSBarry Smith } 879f1af5d2fSBarry Smith 880f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 881f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8821ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8831ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 884b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 885f1af5d2fSBarry Smith PetscFunctionReturn(0); 886f1af5d2fSBarry Smith } 887f1af5d2fSBarry Smith 8884a2ae208SSatish Balay #undef __FUNCT__ 8894a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 890dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 891f1af5d2fSBarry Smith { 892f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 893f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 894*6849ba73SBarry Smith PetscErrorCode ierr; 895*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 896f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 897f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 89887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 89987828ca2SBarry Smith PetscScalar *x,*b,*t; 900f1af5d2fSBarry Smith 901f1af5d2fSBarry Smith PetscFunctionBegin; 9021ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9031ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 904f1af5d2fSBarry Smith t = a->solve_work; 905f1af5d2fSBarry Smith 906f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 907f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 908f1af5d2fSBarry Smith 909f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 910f1af5d2fSBarry Smith ii = 0; 911f1af5d2fSBarry Smith for (i=0; i<n; i++) { 912f1af5d2fSBarry Smith ic = 6*c[i]; 913f1af5d2fSBarry Smith t[ii] = b[ic]; 914f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 915f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 916f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 917f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 918f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 919f1af5d2fSBarry Smith ii += 6; 920f1af5d2fSBarry Smith } 921f1af5d2fSBarry Smith 922f1af5d2fSBarry Smith /* forward solve the U^T */ 923f1af5d2fSBarry Smith idx = 0; 924f1af5d2fSBarry Smith for (i=0; i<n; i++) { 925f1af5d2fSBarry Smith 926f1af5d2fSBarry Smith v = aa + 36*diag[i]; 927f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 928f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 929f1af5d2fSBarry Smith x6 = t[5+idx]; 930f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 931f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 932f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 933f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 934f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 935f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 936f1af5d2fSBarry Smith v += 36; 937f1af5d2fSBarry Smith 938f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 939f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 940f1af5d2fSBarry Smith while (nz--) { 941f1af5d2fSBarry Smith oidx = 6*(*vi++); 942f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 943f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 944f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 945f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 946f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 947f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 948f1af5d2fSBarry Smith v += 36; 949f1af5d2fSBarry Smith } 950f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 951f1af5d2fSBarry Smith t[5+idx] = s6; 952f1af5d2fSBarry Smith idx += 6; 953f1af5d2fSBarry Smith } 954f1af5d2fSBarry Smith /* backward solve the L^T */ 955f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 956f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 957f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 958f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 959f1af5d2fSBarry Smith idt = 6*i; 960f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 961f1af5d2fSBarry Smith s6 = t[5+idt]; 962f1af5d2fSBarry Smith while (nz--) { 963f1af5d2fSBarry Smith idx = 6*(*vi--); 964f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 965f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 966f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 967f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 968f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 969f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 970f1af5d2fSBarry Smith v -= 36; 971f1af5d2fSBarry Smith } 972f1af5d2fSBarry Smith } 973f1af5d2fSBarry Smith 974f1af5d2fSBarry Smith /* copy t into x according to permutation */ 975f1af5d2fSBarry Smith ii = 0; 976f1af5d2fSBarry Smith for (i=0; i<n; i++) { 977f1af5d2fSBarry Smith ir = 6*r[i]; 978f1af5d2fSBarry Smith x[ir] = t[ii]; 979f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 980f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 981f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 982f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 983f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 984f1af5d2fSBarry Smith ii += 6; 985f1af5d2fSBarry Smith } 986f1af5d2fSBarry Smith 987f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 988f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9891ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 9901ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 991b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 992f1af5d2fSBarry Smith PetscFunctionReturn(0); 993f1af5d2fSBarry Smith } 994f1af5d2fSBarry Smith 9954a2ae208SSatish Balay #undef __FUNCT__ 9964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 997dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 998f1af5d2fSBarry Smith { 999f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1000f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 1001*6849ba73SBarry Smith PetscErrorCode ierr; 1002*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 1003f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 1004f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 100587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 100687828ca2SBarry Smith PetscScalar *x,*b,*t; 1007f1af5d2fSBarry Smith 1008f1af5d2fSBarry Smith PetscFunctionBegin; 10091ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10101ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1011f1af5d2fSBarry Smith t = a->solve_work; 1012f1af5d2fSBarry Smith 1013f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1014f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1015f1af5d2fSBarry Smith 1016f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1017f1af5d2fSBarry Smith ii = 0; 1018f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1019f1af5d2fSBarry Smith ic = 7*c[i]; 1020f1af5d2fSBarry Smith t[ii] = b[ic]; 1021f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1022f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1023f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1024f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1025f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1026f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1027f1af5d2fSBarry Smith ii += 7; 1028f1af5d2fSBarry Smith } 1029f1af5d2fSBarry Smith 1030f1af5d2fSBarry Smith /* forward solve the U^T */ 1031f1af5d2fSBarry Smith idx = 0; 1032f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1033f1af5d2fSBarry Smith 1034f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1035f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1036f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1037f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1038f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1039f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1040f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1041f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1042f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1043f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1044f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1045f1af5d2fSBarry Smith v += 49; 1046f1af5d2fSBarry Smith 1047f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1048f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1049f1af5d2fSBarry Smith while (nz--) { 1050f1af5d2fSBarry Smith oidx = 7*(*vi++); 1051f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1052f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1053f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1054f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1055f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1056f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1057f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1058f1af5d2fSBarry Smith v += 49; 1059f1af5d2fSBarry Smith } 1060f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1061f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1062f1af5d2fSBarry Smith idx += 7; 1063f1af5d2fSBarry Smith } 1064f1af5d2fSBarry Smith /* backward solve the L^T */ 1065f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1066f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1067f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1068f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1069f1af5d2fSBarry Smith idt = 7*i; 1070f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1071f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1072f1af5d2fSBarry Smith while (nz--) { 1073f1af5d2fSBarry Smith idx = 7*(*vi--); 1074f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1075f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1076f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1077f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1078f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1079f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1080f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1081f1af5d2fSBarry Smith v -= 49; 1082f1af5d2fSBarry Smith } 1083f1af5d2fSBarry Smith } 1084f1af5d2fSBarry Smith 1085f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1086f1af5d2fSBarry Smith ii = 0; 1087f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1088f1af5d2fSBarry Smith ir = 7*r[i]; 1089f1af5d2fSBarry Smith x[ir] = t[ii]; 1090f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1091f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1092f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1093f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1094f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1095f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1096f1af5d2fSBarry Smith ii += 7; 1097f1af5d2fSBarry Smith } 1098f1af5d2fSBarry Smith 1099f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1100f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11011ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11021ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1103b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 1104f1af5d2fSBarry Smith PetscFunctionReturn(0); 1105f1af5d2fSBarry Smith } 1106f1af5d2fSBarry Smith 11074e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11084a2ae208SSatish Balay #undef __FUNCT__ 11094a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1110dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11114e2b4712SSatish Balay { 11124e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11134e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 1114*6849ba73SBarry Smith PetscErrorCode ierr; 1115*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 11164e2b4712SSatish Balay int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 11173f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 111887828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11194e2b4712SSatish Balay 11204e2b4712SSatish Balay PetscFunctionBegin; 11211ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11221ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1123f1af5d2fSBarry Smith t = a->solve_work; 11244e2b4712SSatish Balay 11254e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11264e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11274e2b4712SSatish Balay 11284e2b4712SSatish Balay /* forward solve the lower triangular */ 112987828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11304e2b4712SSatish Balay for (i=1; i<n; i++) { 11314e2b4712SSatish Balay v = aa + bs2*ai[i]; 11324e2b4712SSatish Balay vi = aj + ai[i]; 11334e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1134f1af5d2fSBarry Smith s = t + bs*i; 113587828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11364e2b4712SSatish Balay while (nz--) { 1137f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11384e2b4712SSatish Balay v += bs2; 11394e2b4712SSatish Balay } 11404e2b4712SSatish Balay } 11414e2b4712SSatish Balay /* backward solve the upper triangular */ 1142273d9f13SBarry Smith ls = a->solve_work + A->n; 11434e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 11444e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11454e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11464e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 114787828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11484e2b4712SSatish Balay while (nz--) { 1149f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11504e2b4712SSatish Balay v += bs2; 11514e2b4712SSatish Balay } 1152f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 115387828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11544e2b4712SSatish Balay } 11554e2b4712SSatish Balay 11564e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11574e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11581ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11591ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1160b0a32e0cSBarry Smith PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 11614e2b4712SSatish Balay PetscFunctionReturn(0); 11624e2b4712SSatish Balay } 11634e2b4712SSatish Balay 11644a2ae208SSatish Balay #undef __FUNCT__ 11654a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1166dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11674e2b4712SSatish Balay { 11684e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11694e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 1170*6849ba73SBarry Smith PetscErrorCode ierr; 1171*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 11724e2b4712SSatish Balay int *diag = a->diag; 11733f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 117487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 117587828ca2SBarry Smith PetscScalar *x,*b,*t; 11764e2b4712SSatish Balay 11774e2b4712SSatish Balay PetscFunctionBegin; 11781ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1180f1af5d2fSBarry Smith t = a->solve_work; 11814e2b4712SSatish Balay 11824e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11834e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11844e2b4712SSatish Balay 11854e2b4712SSatish Balay /* forward solve the lower triangular */ 11864e2b4712SSatish Balay idx = 7*(*r++); 1187f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1188f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1189f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 11904e2b4712SSatish Balay 11914e2b4712SSatish Balay for (i=1; i<n; i++) { 11924e2b4712SSatish Balay v = aa + 49*ai[i]; 11934e2b4712SSatish Balay vi = aj + ai[i]; 11944e2b4712SSatish Balay nz = diag[i] - ai[i]; 11954e2b4712SSatish Balay idx = 7*(*r++); 1196f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1197f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 11984e2b4712SSatish Balay while (nz--) { 11994e2b4712SSatish Balay idx = 7*(*vi++); 1200f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1201f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1202f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1203f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1204f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1205f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1206f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1207f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1208f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1209f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12104e2b4712SSatish Balay v += 49; 12114e2b4712SSatish Balay } 12124e2b4712SSatish Balay idx = 7*i; 1213f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1214f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1215f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12164e2b4712SSatish Balay } 12174e2b4712SSatish Balay /* backward solve the upper triangular */ 12184e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12194e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12204e2b4712SSatish Balay vi = aj + diag[i] + 1; 12214e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12224e2b4712SSatish Balay idt = 7*i; 1223f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1224f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1225f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12264e2b4712SSatish Balay while (nz--) { 12274e2b4712SSatish Balay idx = 7*(*vi++); 1228f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1229f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1230f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1231f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1232f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1233f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1234f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1235f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1236f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1237f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12384e2b4712SSatish Balay v += 49; 12394e2b4712SSatish Balay } 12404e2b4712SSatish Balay idc = 7*(*c--); 12414e2b4712SSatish Balay v = aa + 49*diag[i]; 1242f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1243f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1244f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1245f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1246f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1247f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1248f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1249f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1250f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1251f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1252f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1253f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1254f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1255f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12564e2b4712SSatish Balay } 12574e2b4712SSatish Balay 12584e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12594e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12601ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12611ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1262b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 12634e2b4712SSatish Balay PetscFunctionReturn(0); 12644e2b4712SSatish Balay } 12654e2b4712SSatish Balay 12664a2ae208SSatish Balay #undef __FUNCT__ 12674a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1268dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 126915091d37SBarry Smith { 127015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 127115091d37SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1272dfbe8321SBarry Smith PetscErrorCode ierr; 1273dfbe8321SBarry Smith int *diag = a->diag,jdx; 127415091d37SBarry Smith MatScalar *aa=a->a,*v; 127587828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 127615091d37SBarry Smith 127715091d37SBarry Smith PetscFunctionBegin; 12781ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 12791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 128015091d37SBarry Smith /* forward solve the lower triangular */ 128115091d37SBarry Smith idx = 0; 128215091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 128315091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 128415091d37SBarry Smith x[6] = b[6+idx]; 128515091d37SBarry Smith for (i=1; i<n; i++) { 128615091d37SBarry Smith v = aa + 49*ai[i]; 128715091d37SBarry Smith vi = aj + ai[i]; 128815091d37SBarry Smith nz = diag[i] - ai[i]; 128915091d37SBarry Smith idx = 7*i; 1290f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1291f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1292f1af5d2fSBarry Smith s7 = b[6+idx]; 129315091d37SBarry Smith while (nz--) { 129415091d37SBarry Smith jdx = 7*(*vi++); 129515091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 129615091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 129715091d37SBarry Smith x7 = x[6+jdx]; 1298f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1299f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1300f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1301f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1302f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1303f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1304f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 130515091d37SBarry Smith v += 49; 130615091d37SBarry Smith } 1307f1af5d2fSBarry Smith x[idx] = s1; 1308f1af5d2fSBarry Smith x[1+idx] = s2; 1309f1af5d2fSBarry Smith x[2+idx] = s3; 1310f1af5d2fSBarry Smith x[3+idx] = s4; 1311f1af5d2fSBarry Smith x[4+idx] = s5; 1312f1af5d2fSBarry Smith x[5+idx] = s6; 1313f1af5d2fSBarry Smith x[6+idx] = s7; 131415091d37SBarry Smith } 131515091d37SBarry Smith /* backward solve the upper triangular */ 131615091d37SBarry Smith for (i=n-1; i>=0; i--){ 131715091d37SBarry Smith v = aa + 49*diag[i] + 49; 131815091d37SBarry Smith vi = aj + diag[i] + 1; 131915091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 132015091d37SBarry Smith idt = 7*i; 1321f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1322f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1323f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1324f1af5d2fSBarry Smith s7 = x[6+idt]; 132515091d37SBarry Smith while (nz--) { 132615091d37SBarry Smith idx = 7*(*vi++); 132715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 132815091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 132915091d37SBarry Smith x7 = x[6+idx]; 1330f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1331f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1332f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1333f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1334f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1335f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1336f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 133715091d37SBarry Smith v += 49; 133815091d37SBarry Smith } 133915091d37SBarry Smith v = aa + 49*diag[i]; 1340f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1341f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1342f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1343f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1344f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1345f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1346f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1347f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1348f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1349f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1350f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1351f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1352f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1353f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 135415091d37SBarry Smith } 135515091d37SBarry Smith 13561ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 13571ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1358b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 135915091d37SBarry Smith PetscFunctionReturn(0); 136015091d37SBarry Smith } 136115091d37SBarry Smith 13624a2ae208SSatish Balay #undef __FUNCT__ 13634a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1364dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 136515091d37SBarry Smith { 136615091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 136715091d37SBarry Smith IS iscol=a->col,isrow=a->row; 1368*6849ba73SBarry Smith PetscErrorCode ierr; 1369*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 137015091d37SBarry Smith int *diag = a->diag; 137115091d37SBarry Smith MatScalar *aa=a->a,*v; 137287828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 137315091d37SBarry Smith 137415091d37SBarry Smith PetscFunctionBegin; 13751ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 13761ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1377f1af5d2fSBarry Smith t = a->solve_work; 137815091d37SBarry Smith 137915091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 138015091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 138115091d37SBarry Smith 138215091d37SBarry Smith /* forward solve the lower triangular */ 138315091d37SBarry Smith idx = 6*(*r++); 1384f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1385f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1386f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 138715091d37SBarry Smith for (i=1; i<n; i++) { 138815091d37SBarry Smith v = aa + 36*ai[i]; 138915091d37SBarry Smith vi = aj + ai[i]; 139015091d37SBarry Smith nz = diag[i] - ai[i]; 139115091d37SBarry Smith idx = 6*(*r++); 1392f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1393f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 139415091d37SBarry Smith while (nz--) { 139515091d37SBarry Smith idx = 6*(*vi++); 1396f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1397f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1398f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1399f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1400f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1401f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1402f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1403f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 140415091d37SBarry Smith v += 36; 140515091d37SBarry Smith } 140615091d37SBarry Smith idx = 6*i; 1407f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1408f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1409f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 141015091d37SBarry Smith } 141115091d37SBarry Smith /* backward solve the upper triangular */ 141215091d37SBarry Smith for (i=n-1; i>=0; i--){ 141315091d37SBarry Smith v = aa + 36*diag[i] + 36; 141415091d37SBarry Smith vi = aj + diag[i] + 1; 141515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 141615091d37SBarry Smith idt = 6*i; 1417f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1418f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1419f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 142015091d37SBarry Smith while (nz--) { 142115091d37SBarry Smith idx = 6*(*vi++); 1422f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1423f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1424f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1425f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1426f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1427f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1428f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1429f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1430f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 143115091d37SBarry Smith v += 36; 143215091d37SBarry Smith } 143315091d37SBarry Smith idc = 6*(*c--); 143415091d37SBarry Smith v = aa + 36*diag[i]; 1435f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1436f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1437f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1438f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1439f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1440f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1441f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1442f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1443f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1444f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1445f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1446f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 144715091d37SBarry Smith } 144815091d37SBarry Smith 144915091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 145015091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 14511ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 14521ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1453b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 145415091d37SBarry Smith PetscFunctionReturn(0); 145515091d37SBarry Smith } 145615091d37SBarry Smith 14574a2ae208SSatish Balay #undef __FUNCT__ 14584a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1459dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 146015091d37SBarry Smith { 146115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 146215091d37SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1463dfbe8321SBarry Smith PetscErrorCode ierr; 1464dfbe8321SBarry Smith int *diag = a->diag,jdx; 146515091d37SBarry Smith MatScalar *aa=a->a,*v; 146687828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 146715091d37SBarry Smith 146815091d37SBarry Smith PetscFunctionBegin; 14691ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 14701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 147115091d37SBarry Smith /* forward solve the lower triangular */ 147215091d37SBarry Smith idx = 0; 147315091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 147415091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 147515091d37SBarry Smith for (i=1; i<n; i++) { 147615091d37SBarry Smith v = aa + 36*ai[i]; 147715091d37SBarry Smith vi = aj + ai[i]; 147815091d37SBarry Smith nz = diag[i] - ai[i]; 147915091d37SBarry Smith idx = 6*i; 1480f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1481f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 148215091d37SBarry Smith while (nz--) { 148315091d37SBarry Smith jdx = 6*(*vi++); 148415091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 148515091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1486f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1487f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1488f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1489f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1490f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1491f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 149215091d37SBarry Smith v += 36; 149315091d37SBarry Smith } 1494f1af5d2fSBarry Smith x[idx] = s1; 1495f1af5d2fSBarry Smith x[1+idx] = s2; 1496f1af5d2fSBarry Smith x[2+idx] = s3; 1497f1af5d2fSBarry Smith x[3+idx] = s4; 1498f1af5d2fSBarry Smith x[4+idx] = s5; 1499f1af5d2fSBarry Smith x[5+idx] = s6; 150015091d37SBarry Smith } 150115091d37SBarry Smith /* backward solve the upper triangular */ 150215091d37SBarry Smith for (i=n-1; i>=0; i--){ 150315091d37SBarry Smith v = aa + 36*diag[i] + 36; 150415091d37SBarry Smith vi = aj + diag[i] + 1; 150515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 150615091d37SBarry Smith idt = 6*i; 1507f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1508f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1509f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 151015091d37SBarry Smith while (nz--) { 151115091d37SBarry Smith idx = 6*(*vi++); 151215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 151315091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1514f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1515f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1516f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1517f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1518f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1519f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 152015091d37SBarry Smith v += 36; 152115091d37SBarry Smith } 152215091d37SBarry Smith v = aa + 36*diag[i]; 1523f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1524f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1525f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1526f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1527f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1528f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 152915091d37SBarry Smith } 153015091d37SBarry Smith 15311ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 15321ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1533b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 153415091d37SBarry Smith PetscFunctionReturn(0); 153515091d37SBarry Smith } 153615091d37SBarry Smith 15374a2ae208SSatish Balay #undef __FUNCT__ 15384a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1539dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 15404e2b4712SSatish Balay { 15414e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15424e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 1543*6849ba73SBarry Smith PetscErrorCode ierr; 1544*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 15454e2b4712SSatish Balay int *diag = a->diag; 15463f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 154787828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 15484e2b4712SSatish Balay 15494e2b4712SSatish Balay PetscFunctionBegin; 15501ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 15511ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1552f1af5d2fSBarry Smith t = a->solve_work; 15534e2b4712SSatish Balay 15544e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 15554e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 15564e2b4712SSatish Balay 15574e2b4712SSatish Balay /* forward solve the lower triangular */ 15584e2b4712SSatish Balay idx = 5*(*r++); 1559f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1560f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 15614e2b4712SSatish Balay for (i=1; i<n; i++) { 15624e2b4712SSatish Balay v = aa + 25*ai[i]; 15634e2b4712SSatish Balay vi = aj + ai[i]; 15644e2b4712SSatish Balay nz = diag[i] - ai[i]; 15654e2b4712SSatish Balay idx = 5*(*r++); 1566f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1567f1af5d2fSBarry Smith s5 = b[4+idx]; 15684e2b4712SSatish Balay while (nz--) { 15694e2b4712SSatish Balay idx = 5*(*vi++); 1570f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1571f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1572f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1573f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1574f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1575f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1576f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 15774e2b4712SSatish Balay v += 25; 15784e2b4712SSatish Balay } 15794e2b4712SSatish Balay idx = 5*i; 1580f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1581f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 15824e2b4712SSatish Balay } 15834e2b4712SSatish Balay /* backward solve the upper triangular */ 15844e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 15854e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 15864e2b4712SSatish Balay vi = aj + diag[i] + 1; 15874e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 15884e2b4712SSatish Balay idt = 5*i; 1589f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1590f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 15914e2b4712SSatish Balay while (nz--) { 15924e2b4712SSatish Balay idx = 5*(*vi++); 1593f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1594f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1595f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1596f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1597f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1598f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1599f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 16004e2b4712SSatish Balay v += 25; 16014e2b4712SSatish Balay } 16024e2b4712SSatish Balay idc = 5*(*c--); 16034e2b4712SSatish Balay v = aa + 25*diag[i]; 1604f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1605f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 1606f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1607f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 1608f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1609f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 1610f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1611f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 1612f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1613f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 16144e2b4712SSatish Balay } 16154e2b4712SSatish Balay 16164e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 16174e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 16181ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 16191ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1620b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 16214e2b4712SSatish Balay PetscFunctionReturn(0); 16224e2b4712SSatish Balay } 16234e2b4712SSatish Balay 16244a2ae208SSatish Balay #undef __FUNCT__ 16254a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1626dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 162715091d37SBarry Smith { 162815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 162915091d37SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1630dfbe8321SBarry Smith PetscErrorCode ierr; 1631dfbe8321SBarry Smith int *diag = a->diag,jdx; 163215091d37SBarry Smith MatScalar *aa=a->a,*v; 163387828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 163415091d37SBarry Smith 163515091d37SBarry Smith PetscFunctionBegin; 16361ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 16371ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 163815091d37SBarry Smith /* forward solve the lower triangular */ 163915091d37SBarry Smith idx = 0; 164015091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 164115091d37SBarry Smith for (i=1; i<n; i++) { 164215091d37SBarry Smith v = aa + 25*ai[i]; 164315091d37SBarry Smith vi = aj + ai[i]; 164415091d37SBarry Smith nz = diag[i] - ai[i]; 164515091d37SBarry Smith idx = 5*i; 1646f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 164715091d37SBarry Smith while (nz--) { 164815091d37SBarry Smith jdx = 5*(*vi++); 164915091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1650f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1651f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1652f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1653f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1654f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 165515091d37SBarry Smith v += 25; 165615091d37SBarry Smith } 1657f1af5d2fSBarry Smith x[idx] = s1; 1658f1af5d2fSBarry Smith x[1+idx] = s2; 1659f1af5d2fSBarry Smith x[2+idx] = s3; 1660f1af5d2fSBarry Smith x[3+idx] = s4; 1661f1af5d2fSBarry Smith x[4+idx] = s5; 166215091d37SBarry Smith } 166315091d37SBarry Smith /* backward solve the upper triangular */ 166415091d37SBarry Smith for (i=n-1; i>=0; i--){ 166515091d37SBarry Smith v = aa + 25*diag[i] + 25; 166615091d37SBarry Smith vi = aj + diag[i] + 1; 166715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 166815091d37SBarry Smith idt = 5*i; 1669f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1670f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 167115091d37SBarry Smith while (nz--) { 167215091d37SBarry Smith idx = 5*(*vi++); 167315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1674f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1675f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1676f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1677f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1678f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 167915091d37SBarry Smith v += 25; 168015091d37SBarry Smith } 168115091d37SBarry Smith v = aa + 25*diag[i]; 1682f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1683f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1684f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1685f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1686f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 168715091d37SBarry Smith } 168815091d37SBarry Smith 16891ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 16901ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1691b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 169215091d37SBarry Smith PetscFunctionReturn(0); 169315091d37SBarry Smith } 169415091d37SBarry Smith 16954a2ae208SSatish Balay #undef __FUNCT__ 16964a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1697dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 16984e2b4712SSatish Balay { 16994e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 17004e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 1701*6849ba73SBarry Smith PetscErrorCode ierr; 1702*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 17034e2b4712SSatish Balay int *diag = a->diag; 17043f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 170587828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 17064e2b4712SSatish Balay 17074e2b4712SSatish Balay PetscFunctionBegin; 17081ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 17091ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1710f1af5d2fSBarry Smith t = a->solve_work; 17114e2b4712SSatish Balay 17124e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 17134e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 17144e2b4712SSatish Balay 17154e2b4712SSatish Balay /* forward solve the lower triangular */ 17164e2b4712SSatish Balay idx = 4*(*r++); 1717f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1718f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 17194e2b4712SSatish Balay for (i=1; i<n; i++) { 17204e2b4712SSatish Balay v = aa + 16*ai[i]; 17214e2b4712SSatish Balay vi = aj + ai[i]; 17224e2b4712SSatish Balay nz = diag[i] - ai[i]; 17234e2b4712SSatish Balay idx = 4*(*r++); 1724f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 17254e2b4712SSatish Balay while (nz--) { 17264e2b4712SSatish Balay idx = 4*(*vi++); 1727f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1728f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1729f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1730f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1731f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17324e2b4712SSatish Balay v += 16; 17334e2b4712SSatish Balay } 17344e2b4712SSatish Balay idx = 4*i; 1735f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1736f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 17374e2b4712SSatish Balay } 17384e2b4712SSatish Balay /* backward solve the upper triangular */ 17394e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 17404e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 17414e2b4712SSatish Balay vi = aj + diag[i] + 1; 17424e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 17434e2b4712SSatish Balay idt = 4*i; 1744f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1745f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 17464e2b4712SSatish Balay while (nz--) { 17474e2b4712SSatish Balay idx = 4*(*vi++); 1748f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1749f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1750f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1751f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1752f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1753f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17544e2b4712SSatish Balay v += 16; 17554e2b4712SSatish Balay } 17564e2b4712SSatish Balay idc = 4*(*c--); 17574e2b4712SSatish Balay v = aa + 16*diag[i]; 1758f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1759f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1760f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1761f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 17624e2b4712SSatish Balay } 17634e2b4712SSatish Balay 17644e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 17654e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 17661ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 17671ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1768b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 17694e2b4712SSatish Balay PetscFunctionReturn(0); 17704e2b4712SSatish Balay } 1771f26ec98cSKris Buschelman 1772f26ec98cSKris Buschelman #undef __FUNCT__ 1773f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1774dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1775f26ec98cSKris Buschelman { 1776f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1777f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 1778*6849ba73SBarry Smith PetscErrorCode ierr; 1779*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1780f26ec98cSKris Buschelman int *diag = a->diag; 1781f26ec98cSKris Buschelman MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1782f26ec98cSKris Buschelman PetscScalar *x,*b; 1783f26ec98cSKris Buschelman 1784f26ec98cSKris Buschelman PetscFunctionBegin; 17851ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 17861ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1787f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 1788f26ec98cSKris Buschelman 1789f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1790f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1791f26ec98cSKris Buschelman 1792f26ec98cSKris Buschelman /* forward solve the lower triangular */ 1793f26ec98cSKris Buschelman idx = 4*(*r++); 1794f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 1795f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 1796f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 1797f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 1798f26ec98cSKris Buschelman for (i=1; i<n; i++) { 1799f26ec98cSKris Buschelman v = aa + 16*ai[i]; 1800f26ec98cSKris Buschelman vi = aj + ai[i]; 1801f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 1802f26ec98cSKris Buschelman idx = 4*(*r++); 1803f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 1804f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 1805f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 1806f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 1807f26ec98cSKris Buschelman while (nz--) { 1808f26ec98cSKris Buschelman idx = 4*(*vi++); 1809f26ec98cSKris Buschelman x1 = t[idx]; 1810f26ec98cSKris Buschelman x2 = t[1+idx]; 1811f26ec98cSKris Buschelman x3 = t[2+idx]; 1812f26ec98cSKris Buschelman x4 = t[3+idx]; 1813f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1814f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1815f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1816f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1817f26ec98cSKris Buschelman v += 16; 1818f26ec98cSKris Buschelman } 1819f26ec98cSKris Buschelman idx = 4*i; 1820f26ec98cSKris Buschelman t[idx] = s1; 1821f26ec98cSKris Buschelman t[1+idx] = s2; 1822f26ec98cSKris Buschelman t[2+idx] = s3; 1823f26ec98cSKris Buschelman t[3+idx] = s4; 1824f26ec98cSKris Buschelman } 1825f26ec98cSKris Buschelman /* backward solve the upper triangular */ 1826f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 1827f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 1828f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 1829f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 1830f26ec98cSKris Buschelman idt = 4*i; 1831f26ec98cSKris Buschelman s1 = t[idt]; 1832f26ec98cSKris Buschelman s2 = t[1+idt]; 1833f26ec98cSKris Buschelman s3 = t[2+idt]; 1834f26ec98cSKris Buschelman s4 = t[3+idt]; 1835f26ec98cSKris Buschelman while (nz--) { 1836f26ec98cSKris Buschelman idx = 4*(*vi++); 1837f26ec98cSKris Buschelman x1 = t[idx]; 1838f26ec98cSKris Buschelman x2 = t[1+idx]; 1839f26ec98cSKris Buschelman x3 = t[2+idx]; 1840f26ec98cSKris Buschelman x4 = t[3+idx]; 1841f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1842f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1843f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1844f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1845f26ec98cSKris Buschelman v += 16; 1846f26ec98cSKris Buschelman } 1847f26ec98cSKris Buschelman idc = 4*(*c--); 1848f26ec98cSKris Buschelman v = aa + 16*diag[i]; 1849f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1850f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1851f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1852f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1853f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 1854f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 1855f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 1856f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 1857f26ec98cSKris Buschelman } 1858f26ec98cSKris Buschelman 1859f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1860f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 18611ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 18621ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1863f26ec98cSKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 1864f26ec98cSKris Buschelman PetscFunctionReturn(0); 1865f26ec98cSKris Buschelman } 1866f26ec98cSKris Buschelman 186724c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 186824c233c2SKris Buschelman 186924c233c2SKris Buschelman #include PETSC_HAVE_SSE 187024c233c2SKris Buschelman 187124c233c2SKris Buschelman #undef __FUNCT__ 187224c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1873dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 187424c233c2SKris Buschelman { 187524c233c2SKris Buschelman /* 187624c233c2SKris Buschelman Note: This code uses demotion of double 187724c233c2SKris Buschelman to float when performing the mixed-mode computation. 187824c233c2SKris Buschelman This may not be numerically reasonable for all applications. 187924c233c2SKris Buschelman */ 188024c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 188124c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 1882*6849ba73SBarry Smith PetscErrorCode ierr; 1883*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 188424c233c2SKris Buschelman int *diag = a->diag,ai16; 188524c233c2SKris Buschelman MatScalar *aa=a->a,*v; 188687828ca2SBarry Smith PetscScalar *x,*b,*t; 188724c233c2SKris Buschelman 188824c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 188924c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 189024c233c2SKris Buschelman unsigned long offset; 189124c233c2SKris Buschelman 189224c233c2SKris Buschelman PetscFunctionBegin; 189324c233c2SKris Buschelman SSE_SCOPE_BEGIN; 189424c233c2SKris Buschelman 189524c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 189624c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 189724c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 189824c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 189924c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 190024c233c2SKris Buschelman 19011ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 19021ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 190324c233c2SKris Buschelman t = a->solve_work; 190424c233c2SKris Buschelman 190524c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 190624c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 190724c233c2SKris Buschelman 190824c233c2SKris Buschelman /* forward solve the lower triangular */ 190924c233c2SKris Buschelman idx = 4*(*r++); 191024c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 191124c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 191224c233c2SKris Buschelman v = aa + 16*ai[1]; 191324c233c2SKris Buschelman 191424c233c2SKris Buschelman for (i=1; i<n;) { 191524c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 191624c233c2SKris Buschelman vi = aj + ai[i]; 191724c233c2SKris Buschelman nz = diag[i] - ai[i]; 191824c233c2SKris Buschelman idx = 4*(*r++); 191924c233c2SKris Buschelman 192024c233c2SKris Buschelman /* Demote sum from double to float */ 192124c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 192224c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 192324c233c2SKris Buschelman 192424c233c2SKris Buschelman while (nz--) { 192524c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 192624c233c2SKris Buschelman idx = 4*(*vi++); 192724c233c2SKris Buschelman 192824c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 192924c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 193024c233c2SKris Buschelman 193124c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 193224c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 193324c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 193424c233c2SKris Buschelman 193524c233c2SKris Buschelman /* First Column */ 193624c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 193724c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 193824c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 193924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 194024c233c2SKris Buschelman 194124c233c2SKris Buschelman /* Second Column */ 194224c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 194324c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 194424c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 194524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 194624c233c2SKris Buschelman 194724c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 194824c233c2SKris Buschelman 194924c233c2SKris Buschelman /* Third Column */ 195024c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 195124c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 195224c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 195324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 195424c233c2SKris Buschelman 195524c233c2SKris Buschelman /* Fourth Column */ 195624c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 195724c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 195824c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 195924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 196024c233c2SKris Buschelman SSE_INLINE_END_2 196124c233c2SKris Buschelman 196224c233c2SKris Buschelman v += 16; 196324c233c2SKris Buschelman } 196424c233c2SKris Buschelman idx = 4*i; 196524c233c2SKris Buschelman v = aa + 16*ai[++i]; 196624c233c2SKris Buschelman PREFETCH_NTA(v); 196724c233c2SKris Buschelman STORE_PS(tmps,XMM7); 196824c233c2SKris Buschelman 196924c233c2SKris Buschelman /* Promote result from float to double */ 197024c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 197124c233c2SKris Buschelman } 197224c233c2SKris Buschelman /* backward solve the upper triangular */ 197324c233c2SKris Buschelman idt = 4*(n-1); 197424c233c2SKris Buschelman ai16 = 16*diag[n-1]; 197524c233c2SKris Buschelman v = aa + ai16 + 16; 197624c233c2SKris Buschelman for (i=n-1; i>=0;){ 197724c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 197824c233c2SKris Buschelman vi = aj + diag[i] + 1; 197924c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 198024c233c2SKris Buschelman 198124c233c2SKris Buschelman /* Demote accumulator from double to float */ 198224c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 198324c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 198424c233c2SKris Buschelman 198524c233c2SKris Buschelman while (nz--) { 198624c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 198724c233c2SKris Buschelman idx = 4*(*vi++); 198824c233c2SKris Buschelman 198924c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 199024c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 199124c233c2SKris Buschelman 199224c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 199324c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 199424c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 199524c233c2SKris Buschelman 199624c233c2SKris Buschelman /* First Column */ 199724c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 199824c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 199924c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 200024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 200124c233c2SKris Buschelman 200224c233c2SKris Buschelman /* Second Column */ 200324c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 200424c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 200524c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 200624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 200724c233c2SKris Buschelman 200824c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 200924c233c2SKris Buschelman 201024c233c2SKris Buschelman /* Third Column */ 201124c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 201224c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 201324c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 201424c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 201524c233c2SKris Buschelman 201624c233c2SKris Buschelman /* Fourth Column */ 201724c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 201824c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 201924c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 202024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 202124c233c2SKris Buschelman SSE_INLINE_END_2 202224c233c2SKris Buschelman v += 16; 202324c233c2SKris Buschelman } 202424c233c2SKris Buschelman v = aa + ai16; 202524c233c2SKris Buschelman ai16 = 16*diag[--i]; 202624c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 202724c233c2SKris Buschelman /* 202824c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 202924c233c2SKris Buschelman which was inverted as part of the factorization 203024c233c2SKris Buschelman */ 203124c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 203224c233c2SKris Buschelman /* First Column */ 203324c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 203424c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 203524c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 203624c233c2SKris Buschelman 203724c233c2SKris Buschelman /* Second Column */ 203824c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 203924c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 204024c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 204124c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 204224c233c2SKris Buschelman 204324c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 204424c233c2SKris Buschelman 204524c233c2SKris Buschelman /* Third Column */ 204624c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 204724c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 204824c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 204924c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 205024c233c2SKris Buschelman 205124c233c2SKris Buschelman /* Fourth Column */ 205224c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 205324c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 205424c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 205524c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 205624c233c2SKris Buschelman 205724c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 205824c233c2SKris Buschelman SSE_INLINE_END_3 205924c233c2SKris Buschelman 206024c233c2SKris Buschelman /* Promote solution from float to double */ 206124c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 206224c233c2SKris Buschelman 206324c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 206424c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 206524c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 206624c233c2SKris Buschelman idc = 4*(*c--); 206724c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 206824c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 206924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 207024c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 207124c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 207224c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 207324c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 207424c233c2SKris Buschelman SSE_INLINE_END_2 207524c233c2SKris Buschelman v = aa + ai16 + 16; 207624c233c2SKris Buschelman idt -= 4; 207724c233c2SKris Buschelman } 207824c233c2SKris Buschelman 207924c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 208024c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 20811ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 20821ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 208324c233c2SKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 208424c233c2SKris Buschelman SSE_SCOPE_END; 208524c233c2SKris Buschelman PetscFunctionReturn(0); 208624c233c2SKris Buschelman } 208724c233c2SKris Buschelman 208824c233c2SKris Buschelman #endif 20890ef38995SBarry Smith 20900ef38995SBarry Smith 20914e2b4712SSatish Balay /* 20924e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 20934e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 20944e2b4712SSatish Balay */ 20954a2ae208SSatish Balay #undef __FUNCT__ 20964a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2097dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 20984e2b4712SSatish Balay { 20994e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 210030d4dcafSBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 2101dfbe8321SBarry Smith PetscErrorCode ierr; 2102dfbe8321SBarry Smith int *diag = a->diag; 21033f1db9ecSBarry Smith MatScalar *aa=a->a; 210487828ca2SBarry Smith PetscScalar *x,*b; 21054e2b4712SSatish Balay 21064e2b4712SSatish Balay PetscFunctionBegin; 21071ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 21081ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21094e2b4712SSatish Balay 2110aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 21112853dc0eSBarry Smith { 211287828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 21132853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 21142853dc0eSBarry Smith } 2115aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 21162853dc0eSBarry Smith { 211787828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 21182853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 21192853dc0eSBarry Smith } 2120aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 21212853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2122e1293385SBarry Smith #else 212330d4dcafSBarry Smith { 212487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 21253f1db9ecSBarry Smith MatScalar *v; 21264e555682SBarry Smith int jdx,idt,idx,nz,*vi,i,ai16; 2127e1293385SBarry Smith 21284e2b4712SSatish Balay /* forward solve the lower triangular */ 21294e2b4712SSatish Balay idx = 0; 2130e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 21314e2b4712SSatish Balay for (i=1; i<n; i++) { 21324e2b4712SSatish Balay v = aa + 16*ai[i]; 21334e2b4712SSatish Balay vi = aj + ai[i]; 21344e2b4712SSatish Balay nz = diag[i] - ai[i]; 2135e1293385SBarry Smith idx += 4; 2136f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 21374e2b4712SSatish Balay while (nz--) { 21384e2b4712SSatish Balay jdx = 4*(*vi++); 21394e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2140f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2141f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2142f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2143f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 21444e2b4712SSatish Balay v += 16; 21454e2b4712SSatish Balay } 2146f1af5d2fSBarry Smith x[idx] = s1; 2147f1af5d2fSBarry Smith x[1+idx] = s2; 2148f1af5d2fSBarry Smith x[2+idx] = s3; 2149f1af5d2fSBarry Smith x[3+idx] = s4; 21504e2b4712SSatish Balay } 21514e2b4712SSatish Balay /* backward solve the upper triangular */ 21524e555682SBarry Smith idt = 4*(n-1); 21534e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 21544e555682SBarry Smith ai16 = 16*diag[i]; 21554e555682SBarry Smith v = aa + ai16 + 16; 21564e2b4712SSatish Balay vi = aj + diag[i] + 1; 21574e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2158f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2159f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 21604e2b4712SSatish Balay while (nz--) { 21614e2b4712SSatish Balay idx = 4*(*vi++); 21624e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2163f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2164f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2165f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2166f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 21674e2b4712SSatish Balay v += 16; 21684e2b4712SSatish Balay } 21694e555682SBarry Smith v = aa + ai16; 2170f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2171f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2172f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2173f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2174329f5518SBarry Smith idt -= 4; 21754e2b4712SSatish Balay } 217630d4dcafSBarry Smith } 2177e1293385SBarry Smith #endif 21784e2b4712SSatish Balay 21791ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 21801ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2181b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 21824e2b4712SSatish Balay PetscFunctionReturn(0); 21834e2b4712SSatish Balay } 21844e2b4712SSatish Balay 2185f26ec98cSKris Buschelman #undef __FUNCT__ 2186f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2187dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2188f26ec98cSKris Buschelman { 2189f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2190f26ec98cSKris Buschelman int n=a->mbs,*ai=a->i,*aj=a->j; 2191dfbe8321SBarry Smith PetscErrorCode ierr; 2192dfbe8321SBarry Smith int *diag = a->diag; 2193f26ec98cSKris Buschelman MatScalar *aa=a->a; 2194f26ec98cSKris Buschelman PetscScalar *x,*b; 2195f26ec98cSKris Buschelman 2196f26ec98cSKris Buschelman PetscFunctionBegin; 21971ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 21981ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2199f26ec98cSKris Buschelman 2200f26ec98cSKris Buschelman { 2201f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2202f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 2203f26ec98cSKris Buschelman int jdx,idt,idx,nz,*vi,i,ai16; 2204f26ec98cSKris Buschelman 2205f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2206f26ec98cSKris Buschelman idx = 0; 2207f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 2208f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 2209f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 2210f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 2211f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2212f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2213f26ec98cSKris Buschelman vi = aj + ai[i]; 2214f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2215f26ec98cSKris Buschelman idx += 4; 2216f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2217f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2218f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2219f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2220f26ec98cSKris Buschelman while (nz--) { 2221f26ec98cSKris Buschelman jdx = 4*(*vi++); 2222f26ec98cSKris Buschelman x1 = t[jdx]; 2223f26ec98cSKris Buschelman x2 = t[1+jdx]; 2224f26ec98cSKris Buschelman x3 = t[2+jdx]; 2225f26ec98cSKris Buschelman x4 = t[3+jdx]; 2226f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2227f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2228f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2229f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2230f26ec98cSKris Buschelman v += 16; 2231f26ec98cSKris Buschelman } 2232f26ec98cSKris Buschelman t[idx] = s1; 2233f26ec98cSKris Buschelman t[1+idx] = s2; 2234f26ec98cSKris Buschelman t[2+idx] = s3; 2235f26ec98cSKris Buschelman t[3+idx] = s4; 2236f26ec98cSKris Buschelman } 2237f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2238f26ec98cSKris Buschelman idt = 4*(n-1); 2239f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2240f26ec98cSKris Buschelman ai16 = 16*diag[i]; 2241f26ec98cSKris Buschelman v = aa + ai16 + 16; 2242f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2243f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2244f26ec98cSKris Buschelman s1 = t[idt]; 2245f26ec98cSKris Buschelman s2 = t[1+idt]; 2246f26ec98cSKris Buschelman s3 = t[2+idt]; 2247f26ec98cSKris Buschelman s4 = t[3+idt]; 2248f26ec98cSKris Buschelman while (nz--) { 2249f26ec98cSKris Buschelman idx = 4*(*vi++); 2250f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 2251f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 2252f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 2253f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 2254f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2255f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2256f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2257f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2258f26ec98cSKris Buschelman v += 16; 2259f26ec98cSKris Buschelman } 2260f26ec98cSKris Buschelman v = aa + ai16; 2261f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2262f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2263f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2264f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2265f26ec98cSKris Buschelman idt -= 4; 2266f26ec98cSKris Buschelman } 2267f26ec98cSKris Buschelman } 2268f26ec98cSKris Buschelman 22691ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 22701ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2271f26ec98cSKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 2272f26ec98cSKris Buschelman PetscFunctionReturn(0); 2273f26ec98cSKris Buschelman } 2274f26ec98cSKris Buschelman 22753660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 22763660e330SKris Buschelman 22773660e330SKris Buschelman #include PETSC_HAVE_SSE 22783660e330SKris Buschelman #undef __FUNCT__ 22797cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2280dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 22813660e330SKris Buschelman { 22823660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 22832aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 2284dfbe8321SBarry Smith PetscErrorCode ierr; 2285dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 22863660e330SKris Buschelman MatScalar *aa=a->a; 228787828ca2SBarry Smith PetscScalar *x,*b; 22883660e330SKris Buschelman 22893660e330SKris Buschelman PetscFunctionBegin; 22903660e330SKris Buschelman SSE_SCOPE_BEGIN; 22913660e330SKris Buschelman /* 22923660e330SKris Buschelman Note: This code currently uses demotion of double 22933660e330SKris Buschelman to float when performing the mixed-mode computation. 22943660e330SKris Buschelman This may not be numerically reasonable for all applications. 22953660e330SKris Buschelman */ 22963660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 22973660e330SKris Buschelman 22981ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 22991ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23003660e330SKris Buschelman { 2301eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 2302eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 23032aa5897fSKris Buschelman int nz,i,idt,ai16; 23042aa5897fSKris Buschelman unsigned int jdx,idx; 23052aa5897fSKris Buschelman unsigned short *vi; 2306eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 23073660e330SKris Buschelman 2308eb05f457SKris Buschelman /* First block is the identity. */ 23093660e330SKris Buschelman idx = 0; 2310eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 23112aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 23123660e330SKris Buschelman 23133660e330SKris Buschelman for (i=1; i<n;) { 23143660e330SKris Buschelman PREFETCH_NTA(&v[8]); 23153660e330SKris Buschelman vi = aj + ai[i]; 23163660e330SKris Buschelman nz = diag[i] - ai[i]; 23173660e330SKris Buschelman idx += 4; 23183660e330SKris Buschelman 2319eb05f457SKris Buschelman /* Demote RHS from double to float. */ 2320eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2321eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 23223660e330SKris Buschelman 23233660e330SKris Buschelman while (nz--) { 23243660e330SKris Buschelman PREFETCH_NTA(&v[16]); 23252aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 23263660e330SKris Buschelman 23273660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 2328eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 23293660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 23303660e330SKris Buschelman 23313660e330SKris Buschelman /* First Column */ 23323660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 23333660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 23343660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 23353660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 23363660e330SKris Buschelman 23373660e330SKris Buschelman /* Second Column */ 23383660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 23393660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 23403660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 23413660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 23423660e330SKris Buschelman 23433660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 23443660e330SKris Buschelman 23453660e330SKris Buschelman /* Third Column */ 23463660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 23473660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 23483660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 23493660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 23503660e330SKris Buschelman 23513660e330SKris Buschelman /* Fourth Column */ 23523660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 23533660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 23543660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 23553660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 23563660e330SKris Buschelman SSE_INLINE_END_2 23573660e330SKris Buschelman 23583660e330SKris Buschelman v += 16; 23593660e330SKris Buschelman } 23603660e330SKris Buschelman v = aa + 16*ai[++i]; 23613660e330SKris Buschelman PREFETCH_NTA(v); 2362eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 23633660e330SKris Buschelman } 2364eb05f457SKris Buschelman 2365eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 2366eb05f457SKris Buschelman 23673660e330SKris Buschelman idt = 4*(n-1); 23683660e330SKris Buschelman ai16 = 16*diag[n-1]; 23693660e330SKris Buschelman v = aa + ai16 + 16; 23703660e330SKris Buschelman for (i=n-1; i>=0;){ 23713660e330SKris Buschelman PREFETCH_NTA(&v[8]); 23723660e330SKris Buschelman vi = aj + diag[i] + 1; 23733660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 23743660e330SKris Buschelman 2375eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 23763660e330SKris Buschelman 23773660e330SKris Buschelman while (nz--) { 23783660e330SKris Buschelman PREFETCH_NTA(&v[16]); 23792aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 23803660e330SKris Buschelman 23813660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 2382eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 23833660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 23843660e330SKris Buschelman 23853660e330SKris Buschelman /* First Column */ 23863660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 23873660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 23883660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 23893660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 23903660e330SKris Buschelman 23913660e330SKris Buschelman /* Second Column */ 23923660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 23933660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 23943660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 23953660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 23963660e330SKris Buschelman 23973660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 23983660e330SKris Buschelman 23993660e330SKris Buschelman /* Third Column */ 24003660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 24013660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24023660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 24033660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 24043660e330SKris Buschelman 24053660e330SKris Buschelman /* Fourth Column */ 24063660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 24073660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 24083660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 24093660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 24103660e330SKris Buschelman SSE_INLINE_END_2 24113660e330SKris Buschelman v += 16; 24123660e330SKris Buschelman } 24133660e330SKris Buschelman v = aa + ai16; 24143660e330SKris Buschelman ai16 = 16*diag[--i]; 24153660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 24163660e330SKris Buschelman /* 24173660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 24183660e330SKris Buschelman which was inverted as part of the factorization 24193660e330SKris Buschelman */ 2420eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 24213660e330SKris Buschelman /* First Column */ 24223660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 24233660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 24243660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 24253660e330SKris Buschelman 24263660e330SKris Buschelman /* Second Column */ 24273660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 24283660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 24293660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 24303660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 24313660e330SKris Buschelman 24323660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 24333660e330SKris Buschelman 24343660e330SKris Buschelman /* Third Column */ 24353660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 24363660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24373660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 24383660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 24393660e330SKris Buschelman 24403660e330SKris Buschelman /* Fourth Column */ 24413660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 24423660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 24433660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 24443660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 24453660e330SKris Buschelman 24463660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 24473660e330SKris Buschelman SSE_INLINE_END_3 24483660e330SKris Buschelman 24493660e330SKris Buschelman v = aa + ai16 + 16; 24503660e330SKris Buschelman idt -= 4; 24513660e330SKris Buschelman } 2452eb05f457SKris Buschelman 2453eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 2454eb05f457SKris Buschelman idt = 4*(n-1); 2455eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 2456eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2457eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2458eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 2459eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 2460eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 2461eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 2462eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 2463eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 246454693613SKris Buschelman idt -= 4; 24653660e330SKris Buschelman } 2466eb05f457SKris Buschelman 2467eb05f457SKris Buschelman } /* End of artificial scope. */ 24681ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 24691ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 24703660e330SKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 24713660e330SKris Buschelman SSE_SCOPE_END; 24723660e330SKris Buschelman PetscFunctionReturn(0); 24733660e330SKris Buschelman } 24743660e330SKris Buschelman 24757cf1b8d3SKris Buschelman #undef __FUNCT__ 24767cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2477dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 24787cf1b8d3SKris Buschelman { 24797cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 24807cf1b8d3SKris Buschelman int *aj=a->j; 2481dfbe8321SBarry Smith PetscErrorCode ierr; 2482dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 24837cf1b8d3SKris Buschelman MatScalar *aa=a->a; 24847cf1b8d3SKris Buschelman PetscScalar *x,*b; 24857cf1b8d3SKris Buschelman 24867cf1b8d3SKris Buschelman PetscFunctionBegin; 24877cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 24887cf1b8d3SKris Buschelman /* 24897cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 24907cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 24917cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 24927cf1b8d3SKris Buschelman */ 24937cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 24947cf1b8d3SKris Buschelman 24951ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 24961ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24977cf1b8d3SKris Buschelman { 24987cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 24997cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 25007cf1b8d3SKris Buschelman int nz,i,idt,ai16; 25017cf1b8d3SKris Buschelman int jdx,idx; 25027cf1b8d3SKris Buschelman int *vi; 25037cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 25047cf1b8d3SKris Buschelman 25057cf1b8d3SKris Buschelman /* First block is the identity. */ 25067cf1b8d3SKris Buschelman idx = 0; 25077cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 25087cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 25097cf1b8d3SKris Buschelman 25107cf1b8d3SKris Buschelman for (i=1; i<n;) { 25117cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 25127cf1b8d3SKris Buschelman vi = aj + ai[i]; 25137cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 25147cf1b8d3SKris Buschelman idx += 4; 25157cf1b8d3SKris Buschelman 25167cf1b8d3SKris Buschelman /* Demote RHS from double to float. */ 25177cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 25187cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 25197cf1b8d3SKris Buschelman 25207cf1b8d3SKris Buschelman while (nz--) { 25217cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 25227cf1b8d3SKris Buschelman jdx = 4*(*vi++); 25237cf1b8d3SKris Buschelman /* jdx = *vi++; */ 25247cf1b8d3SKris Buschelman 25257cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 25267cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 25277cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 25287cf1b8d3SKris Buschelman 25297cf1b8d3SKris Buschelman /* First Column */ 25307cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 25317cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25327cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 25337cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 25347cf1b8d3SKris Buschelman 25357cf1b8d3SKris Buschelman /* Second Column */ 25367cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 25377cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25387cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 25397cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 25407cf1b8d3SKris Buschelman 25417cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 25427cf1b8d3SKris Buschelman 25437cf1b8d3SKris Buschelman /* Third Column */ 25447cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 25457cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 25467cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 25477cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 25487cf1b8d3SKris Buschelman 25497cf1b8d3SKris Buschelman /* Fourth Column */ 25507cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 25517cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25527cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 25537cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 25547cf1b8d3SKris Buschelman SSE_INLINE_END_2 25557cf1b8d3SKris Buschelman 25567cf1b8d3SKris Buschelman v += 16; 25577cf1b8d3SKris Buschelman } 25587cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 25597cf1b8d3SKris Buschelman PREFETCH_NTA(v); 25607cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 25617cf1b8d3SKris Buschelman } 25627cf1b8d3SKris Buschelman 25637cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 25647cf1b8d3SKris Buschelman 25657cf1b8d3SKris Buschelman idt = 4*(n-1); 25667cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 25677cf1b8d3SKris Buschelman v = aa + ai16 + 16; 25687cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 25697cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 25707cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 25717cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 25727cf1b8d3SKris Buschelman 25737cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 25747cf1b8d3SKris Buschelman 25757cf1b8d3SKris Buschelman while (nz--) { 25767cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 25777cf1b8d3SKris Buschelman idx = 4*(*vi++); 25787cf1b8d3SKris Buschelman /* idx = *vi++; */ 25797cf1b8d3SKris Buschelman 25807cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 25817cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 25827cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 25837cf1b8d3SKris Buschelman 25847cf1b8d3SKris Buschelman /* First Column */ 25857cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 25867cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25877cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 25887cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 25897cf1b8d3SKris Buschelman 25907cf1b8d3SKris Buschelman /* Second Column */ 25917cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 25927cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25937cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 25947cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 25957cf1b8d3SKris Buschelman 25967cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 25977cf1b8d3SKris Buschelman 25987cf1b8d3SKris Buschelman /* Third Column */ 25997cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 26007cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26017cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 26027cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 26037cf1b8d3SKris Buschelman 26047cf1b8d3SKris Buschelman /* Fourth Column */ 26057cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 26067cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26077cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 26087cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 26097cf1b8d3SKris Buschelman SSE_INLINE_END_2 26107cf1b8d3SKris Buschelman v += 16; 26117cf1b8d3SKris Buschelman } 26127cf1b8d3SKris Buschelman v = aa + ai16; 26137cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 26147cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 26157cf1b8d3SKris Buschelman /* 26167cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 26177cf1b8d3SKris Buschelman which was inverted as part of the factorization 26187cf1b8d3SKris Buschelman */ 26197cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 26207cf1b8d3SKris Buschelman /* First Column */ 26217cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 26227cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 26237cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 26247cf1b8d3SKris Buschelman 26257cf1b8d3SKris Buschelman /* Second Column */ 26267cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 26277cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26287cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 26297cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 26307cf1b8d3SKris Buschelman 26317cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 26327cf1b8d3SKris Buschelman 26337cf1b8d3SKris Buschelman /* Third Column */ 26347cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 26357cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26367cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 26377cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 26387cf1b8d3SKris Buschelman 26397cf1b8d3SKris Buschelman /* Fourth Column */ 26407cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 26417cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26427cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 26437cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 26447cf1b8d3SKris Buschelman 26457cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 26467cf1b8d3SKris Buschelman SSE_INLINE_END_3 26477cf1b8d3SKris Buschelman 26487cf1b8d3SKris Buschelman v = aa + ai16 + 16; 26497cf1b8d3SKris Buschelman idt -= 4; 26507cf1b8d3SKris Buschelman } 26517cf1b8d3SKris Buschelman 26527cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 26537cf1b8d3SKris Buschelman idt = 4*(n-1); 26547cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 26557cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 26567cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 26577cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 26587cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 26597cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 26607cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 26617cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 26627cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 26637cf1b8d3SKris Buschelman idt -= 4; 26647cf1b8d3SKris Buschelman } 26657cf1b8d3SKris Buschelman 26667cf1b8d3SKris Buschelman } /* End of artificial scope. */ 26671ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 26681ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 26697cf1b8d3SKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 26707cf1b8d3SKris Buschelman SSE_SCOPE_END; 26717cf1b8d3SKris Buschelman PetscFunctionReturn(0); 26727cf1b8d3SKris Buschelman } 26737cf1b8d3SKris Buschelman 26743660e330SKris Buschelman #endif 26754a2ae208SSatish Balay #undef __FUNCT__ 26764a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2677dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 26784e2b4712SSatish Balay { 26794e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 26804e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 2681*6849ba73SBarry Smith PetscErrorCode ierr; 2682*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 26834e2b4712SSatish Balay int *diag = a->diag; 26843f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 268587828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 26864e2b4712SSatish Balay 26874e2b4712SSatish Balay PetscFunctionBegin; 26881ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 26891ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2690f1af5d2fSBarry Smith t = a->solve_work; 26914e2b4712SSatish Balay 26924e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 26934e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 26944e2b4712SSatish Balay 26954e2b4712SSatish Balay /* forward solve the lower triangular */ 26964e2b4712SSatish Balay idx = 3*(*r++); 2697f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 26984e2b4712SSatish Balay for (i=1; i<n; i++) { 26994e2b4712SSatish Balay v = aa + 9*ai[i]; 27004e2b4712SSatish Balay vi = aj + ai[i]; 27014e2b4712SSatish Balay nz = diag[i] - ai[i]; 27024e2b4712SSatish Balay idx = 3*(*r++); 2703f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 27044e2b4712SSatish Balay while (nz--) { 27054e2b4712SSatish Balay idx = 3*(*vi++); 2706f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2707f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2708f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2709f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 27104e2b4712SSatish Balay v += 9; 27114e2b4712SSatish Balay } 27124e2b4712SSatish Balay idx = 3*i; 2713f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 27144e2b4712SSatish Balay } 27154e2b4712SSatish Balay /* backward solve the upper triangular */ 27164e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 27174e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 27184e2b4712SSatish Balay vi = aj + diag[i] + 1; 27194e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 27204e2b4712SSatish Balay idt = 3*i; 2721f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 27224e2b4712SSatish Balay while (nz--) { 27234e2b4712SSatish Balay idx = 3*(*vi++); 2724f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2725f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2726f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2727f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 27284e2b4712SSatish Balay v += 9; 27294e2b4712SSatish Balay } 27304e2b4712SSatish Balay idc = 3*(*c--); 27314e2b4712SSatish Balay v = aa + 9*diag[i]; 2732f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2733f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2734f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 27354e2b4712SSatish Balay } 27364e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 27374e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 27381ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 27391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2740b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 27414e2b4712SSatish Balay PetscFunctionReturn(0); 27424e2b4712SSatish Balay } 27434e2b4712SSatish Balay 274415091d37SBarry Smith /* 274515091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 274615091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 274715091d37SBarry Smith */ 27484a2ae208SSatish Balay #undef __FUNCT__ 27494a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2750dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 275115091d37SBarry Smith { 275215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 275315091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 2754dfbe8321SBarry Smith PetscErrorCode ierr; 2755dfbe8321SBarry Smith int *diag = a->diag; 275615091d37SBarry Smith MatScalar *aa=a->a,*v; 275787828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 275815091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 275915091d37SBarry Smith 276015091d37SBarry Smith PetscFunctionBegin; 27611ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27621ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 276315091d37SBarry Smith 276415091d37SBarry Smith 276515091d37SBarry Smith /* forward solve the lower triangular */ 276615091d37SBarry Smith idx = 0; 276715091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 276815091d37SBarry Smith for (i=1; i<n; i++) { 276915091d37SBarry Smith v = aa + 9*ai[i]; 277015091d37SBarry Smith vi = aj + ai[i]; 277115091d37SBarry Smith nz = diag[i] - ai[i]; 277215091d37SBarry Smith idx += 3; 2773f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 277415091d37SBarry Smith while (nz--) { 277515091d37SBarry Smith jdx = 3*(*vi++); 277615091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2777f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2778f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2779f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 278015091d37SBarry Smith v += 9; 278115091d37SBarry Smith } 2782f1af5d2fSBarry Smith x[idx] = s1; 2783f1af5d2fSBarry Smith x[1+idx] = s2; 2784f1af5d2fSBarry Smith x[2+idx] = s3; 278515091d37SBarry Smith } 278615091d37SBarry Smith /* backward solve the upper triangular */ 278715091d37SBarry Smith for (i=n-1; i>=0; i--){ 278815091d37SBarry Smith v = aa + 9*diag[i] + 9; 278915091d37SBarry Smith vi = aj + diag[i] + 1; 279015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 279115091d37SBarry Smith idt = 3*i; 2792f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2793f1af5d2fSBarry Smith s3 = x[2+idt]; 279415091d37SBarry Smith while (nz--) { 279515091d37SBarry Smith idx = 3*(*vi++); 279615091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2797f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2798f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2799f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 280015091d37SBarry Smith v += 9; 280115091d37SBarry Smith } 280215091d37SBarry Smith v = aa + 9*diag[i]; 2803f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2804f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2805f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 280615091d37SBarry Smith } 280715091d37SBarry Smith 28081ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 28091ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2810b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 281115091d37SBarry Smith PetscFunctionReturn(0); 281215091d37SBarry Smith } 281315091d37SBarry Smith 28144a2ae208SSatish Balay #undef __FUNCT__ 28154a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2816dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 28174e2b4712SSatish Balay { 28184e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 28194e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 2820*6849ba73SBarry Smith PetscErrorCode ierr; 2821*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 28224e2b4712SSatish Balay int *diag = a->diag; 28233f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 282487828ca2SBarry Smith PetscScalar *x,*b,s1,s2,x1,x2,*t; 28254e2b4712SSatish Balay 28264e2b4712SSatish Balay PetscFunctionBegin; 28271ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 28281ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2829f1af5d2fSBarry Smith t = a->solve_work; 28304e2b4712SSatish Balay 28314e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 28324e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 28334e2b4712SSatish Balay 28344e2b4712SSatish Balay /* forward solve the lower triangular */ 28354e2b4712SSatish Balay idx = 2*(*r++); 2836f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 28374e2b4712SSatish Balay for (i=1; i<n; i++) { 28384e2b4712SSatish Balay v = aa + 4*ai[i]; 28394e2b4712SSatish Balay vi = aj + ai[i]; 28404e2b4712SSatish Balay nz = diag[i] - ai[i]; 28414e2b4712SSatish Balay idx = 2*(*r++); 2842f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 28434e2b4712SSatish Balay while (nz--) { 28444e2b4712SSatish Balay idx = 2*(*vi++); 2845f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2846f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2847f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 28484e2b4712SSatish Balay v += 4; 28494e2b4712SSatish Balay } 28504e2b4712SSatish Balay idx = 2*i; 2851f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 28524e2b4712SSatish Balay } 28534e2b4712SSatish Balay /* backward solve the upper triangular */ 28544e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 28554e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 28564e2b4712SSatish Balay vi = aj + diag[i] + 1; 28574e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 28584e2b4712SSatish Balay idt = 2*i; 2859f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 28604e2b4712SSatish Balay while (nz--) { 28614e2b4712SSatish Balay idx = 2*(*vi++); 2862f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2863f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2864f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 28654e2b4712SSatish Balay v += 4; 28664e2b4712SSatish Balay } 28674e2b4712SSatish Balay idc = 2*(*c--); 28684e2b4712SSatish Balay v = aa + 4*diag[i]; 2869f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2870f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 28714e2b4712SSatish Balay } 28724e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28734e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 28741ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 28751ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2876b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 28774e2b4712SSatish Balay PetscFunctionReturn(0); 28784e2b4712SSatish Balay } 28794e2b4712SSatish Balay 288015091d37SBarry Smith /* 288115091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 288215091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 288315091d37SBarry Smith */ 28844a2ae208SSatish Balay #undef __FUNCT__ 28854a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2886dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 288715091d37SBarry Smith { 288815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 288915091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 2890dfbe8321SBarry Smith PetscErrorCode ierr; 2891dfbe8321SBarry Smith int *diag = a->diag; 289215091d37SBarry Smith MatScalar *aa=a->a,*v; 289387828ca2SBarry Smith PetscScalar *x,*b,s1,s2,x1,x2; 289415091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 289515091d37SBarry Smith 289615091d37SBarry Smith PetscFunctionBegin; 28971ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 28981ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 289915091d37SBarry Smith 290015091d37SBarry Smith /* forward solve the lower triangular */ 290115091d37SBarry Smith idx = 0; 290215091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 290315091d37SBarry Smith for (i=1; i<n; i++) { 290415091d37SBarry Smith v = aa + 4*ai[i]; 290515091d37SBarry Smith vi = aj + ai[i]; 290615091d37SBarry Smith nz = diag[i] - ai[i]; 290715091d37SBarry Smith idx += 2; 2908f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 290915091d37SBarry Smith while (nz--) { 291015091d37SBarry Smith jdx = 2*(*vi++); 291115091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 2912f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2913f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 291415091d37SBarry Smith v += 4; 291515091d37SBarry Smith } 2916f1af5d2fSBarry Smith x[idx] = s1; 2917f1af5d2fSBarry Smith x[1+idx] = s2; 291815091d37SBarry Smith } 291915091d37SBarry Smith /* backward solve the upper triangular */ 292015091d37SBarry Smith for (i=n-1; i>=0; i--){ 292115091d37SBarry Smith v = aa + 4*diag[i] + 4; 292215091d37SBarry Smith vi = aj + diag[i] + 1; 292315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 292415091d37SBarry Smith idt = 2*i; 2925f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 292615091d37SBarry Smith while (nz--) { 292715091d37SBarry Smith idx = 2*(*vi++); 292815091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 2929f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2930f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 293115091d37SBarry Smith v += 4; 293215091d37SBarry Smith } 293315091d37SBarry Smith v = aa + 4*diag[i]; 2934f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 2935f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 293615091d37SBarry Smith } 293715091d37SBarry Smith 29381ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 29391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2940b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 294115091d37SBarry Smith PetscFunctionReturn(0); 294215091d37SBarry Smith } 294315091d37SBarry Smith 29444a2ae208SSatish Balay #undef __FUNCT__ 29454a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2946dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 29474e2b4712SSatish Balay { 29484e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 29494e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 2950*6849ba73SBarry Smith PetscErrorCode ierr; 2951*6849ba73SBarry Smith int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 29524e2b4712SSatish Balay int *diag = a->diag; 29533f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 295487828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 29554e2b4712SSatish Balay 29564e2b4712SSatish Balay PetscFunctionBegin; 29574e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 29584e2b4712SSatish Balay 29591ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 29601ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2961f1af5d2fSBarry Smith t = a->solve_work; 29624e2b4712SSatish Balay 29634e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 29644e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 29654e2b4712SSatish Balay 29664e2b4712SSatish Balay /* forward solve the lower triangular */ 2967f1af5d2fSBarry Smith t[0] = b[*r++]; 29684e2b4712SSatish Balay for (i=1; i<n; i++) { 29694e2b4712SSatish Balay v = aa + ai[i]; 29704e2b4712SSatish Balay vi = aj + ai[i]; 29714e2b4712SSatish Balay nz = diag[i] - ai[i]; 2972f1af5d2fSBarry Smith s1 = b[*r++]; 29734e2b4712SSatish Balay while (nz--) { 2974f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 29754e2b4712SSatish Balay } 2976f1af5d2fSBarry Smith t[i] = s1; 29774e2b4712SSatish Balay } 29784e2b4712SSatish Balay /* backward solve the upper triangular */ 29794e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 29804e2b4712SSatish Balay v = aa + diag[i] + 1; 29814e2b4712SSatish Balay vi = aj + diag[i] + 1; 29824e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2983f1af5d2fSBarry Smith s1 = t[i]; 29844e2b4712SSatish Balay while (nz--) { 2985f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 29864e2b4712SSatish Balay } 2987f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 29884e2b4712SSatish Balay } 29894e2b4712SSatish Balay 29904e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 29914e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 29921ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 29931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2994b0a32e0cSBarry Smith PetscLogFlops(2*1*(a->nz) - A->n); 29954e2b4712SSatish Balay PetscFunctionReturn(0); 29964e2b4712SSatish Balay } 299715091d37SBarry Smith /* 299815091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 299915091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 300015091d37SBarry Smith */ 30014a2ae208SSatish Balay #undef __FUNCT__ 30024a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3003dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 300415091d37SBarry Smith { 300515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 300615091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 3007dfbe8321SBarry Smith PetscErrorCode ierr; 3008dfbe8321SBarry Smith int *diag = a->diag; 300915091d37SBarry Smith MatScalar *aa=a->a; 301087828ca2SBarry Smith PetscScalar *x,*b; 301187828ca2SBarry Smith PetscScalar s1,x1; 301215091d37SBarry Smith MatScalar *v; 301315091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 301415091d37SBarry Smith 301515091d37SBarry Smith PetscFunctionBegin; 30161ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 30171ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 301815091d37SBarry Smith 301915091d37SBarry Smith /* forward solve the lower triangular */ 302015091d37SBarry Smith idx = 0; 302115091d37SBarry Smith x[0] = b[0]; 302215091d37SBarry Smith for (i=1; i<n; i++) { 302315091d37SBarry Smith v = aa + ai[i]; 302415091d37SBarry Smith vi = aj + ai[i]; 302515091d37SBarry Smith nz = diag[i] - ai[i]; 302615091d37SBarry Smith idx += 1; 3027f1af5d2fSBarry Smith s1 = b[idx]; 302815091d37SBarry Smith while (nz--) { 302915091d37SBarry Smith jdx = *vi++; 303015091d37SBarry Smith x1 = x[jdx]; 3031f1af5d2fSBarry Smith s1 -= v[0]*x1; 303215091d37SBarry Smith v += 1; 303315091d37SBarry Smith } 3034f1af5d2fSBarry Smith x[idx] = s1; 303515091d37SBarry Smith } 303615091d37SBarry Smith /* backward solve the upper triangular */ 303715091d37SBarry Smith for (i=n-1; i>=0; i--){ 303815091d37SBarry Smith v = aa + diag[i] + 1; 303915091d37SBarry Smith vi = aj + diag[i] + 1; 304015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 304115091d37SBarry Smith idt = i; 3042f1af5d2fSBarry Smith s1 = x[idt]; 304315091d37SBarry Smith while (nz--) { 304415091d37SBarry Smith idx = *vi++; 304515091d37SBarry Smith x1 = x[idx]; 3046f1af5d2fSBarry Smith s1 -= v[0]*x1; 304715091d37SBarry Smith v += 1; 304815091d37SBarry Smith } 304915091d37SBarry Smith v = aa + diag[i]; 3050f1af5d2fSBarry Smith x[idt] = v[0]*s1; 305115091d37SBarry Smith } 30521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 30531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3054b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 305515091d37SBarry Smith PetscFunctionReturn(0); 305615091d37SBarry Smith } 30574e2b4712SSatish Balay 30584e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 30594e2b4712SSatish Balay /* 30604e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 30614e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 30624e2b4712SSatish Balay Not a good example of code reuse. 30634e2b4712SSatish Balay */ 3064dfbe8321SBarry Smith EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat); 3065435faa5fSBarry Smith 30664a2ae208SSatish Balay #undef __FUNCT__ 30674a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3068dfbe8321SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 30694e2b4712SSatish Balay { 30704e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 30714e2b4712SSatish Balay IS isicol; 3072*6849ba73SBarry Smith PetscErrorCode ierr; 3073*6849ba73SBarry Smith int *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j; 30744e2b4712SSatish Balay int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3075eb150c5cSKris Buschelman int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0; 3076435faa5fSBarry Smith int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 30774533b203SBarry Smith PetscTruth col_identity,row_identity; 3078329f5518SBarry Smith PetscReal f; 30794e2b4712SSatish Balay 30804e2b4712SSatish Balay PetscFunctionBegin; 3081435faa5fSBarry Smith f = info->fill; 3082335d9088SBarry Smith levels = (int)info->levels; 3083335d9088SBarry Smith diagonal_fill = (int)info->diagonal_fill; 30844c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3085667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3086667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3087309c388cSBarry Smith 3088309c388cSBarry Smith if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3089bb3d539aSBarry Smith ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3090bb3d539aSBarry Smith (*fact)->factor = FACTOR_LU; 3091bb3d539aSBarry Smith b = (Mat_SeqBAIJ*)(*fact)->data; 3092bb3d539aSBarry Smith if (!b->diag) { 3093bb3d539aSBarry Smith ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3094bb3d539aSBarry Smith } 3095bb3d539aSBarry Smith ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3096bb3d539aSBarry Smith b->row = isrow; 3097bb3d539aSBarry Smith b->col = iscol; 3098bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3099bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3100bb3d539aSBarry Smith b->icol = isicol; 3101bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 310287828ca2SBarry Smith ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3103309c388cSBarry Smith } else { /* general case perform the symbolic factorization */ 31044e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 31054e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 31064e2b4712SSatish Balay 31074e2b4712SSatish Balay /* get new row pointers */ 3108b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 31094e2b4712SSatish Balay ainew[0] = 0; 31104e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 31114e2b4712SSatish Balay jmax = (int)(f*ai[n] + 1); 311282502324SSatish Balay ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 31134e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 311482502324SSatish Balay ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 31154e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 3116b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 31174e2b4712SSatish Balay /* im is level for each filled value */ 3118b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 31194e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 3120b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 31214e2b4712SSatish Balay dloc[0] = 0; 31224e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 3123435faa5fSBarry Smith 3124435faa5fSBarry Smith /* copy prow into linked list */ 31254e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 312629bbc08cSBarry Smith if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 31274e2b4712SSatish Balay xi = aj + ai[r[prow]]; 31284e2b4712SSatish Balay fill[n] = n; 3129435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 31304e2b4712SSatish Balay while (nz--) { 31314e2b4712SSatish Balay fm = n; 31324e2b4712SSatish Balay idx = ic[*xi++]; 31334e2b4712SSatish Balay do { 31344e2b4712SSatish Balay m = fm; 31354e2b4712SSatish Balay fm = fill[m]; 31364e2b4712SSatish Balay } while (fm < idx); 31374e2b4712SSatish Balay fill[m] = idx; 31384e2b4712SSatish Balay fill[idx] = fm; 31394e2b4712SSatish Balay im[idx] = 0; 31404e2b4712SSatish Balay } 3141435faa5fSBarry Smith 3142435faa5fSBarry Smith /* make sure diagonal entry is included */ 3143435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 3144435faa5fSBarry Smith fm = n; 3145435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 3146435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3147435faa5fSBarry Smith fill[fm] = prow; 3148435faa5fSBarry Smith im[prow] = 0; 3149435faa5fSBarry Smith nzf++; 3150335d9088SBarry Smith dcount++; 3151435faa5fSBarry Smith } 3152435faa5fSBarry Smith 31534e2b4712SSatish Balay nzi = 0; 31544e2b4712SSatish Balay row = fill[n]; 31554e2b4712SSatish Balay while (row < prow) { 31564e2b4712SSatish Balay incrlev = im[row] + 1; 31574e2b4712SSatish Balay nz = dloc[row]; 3158435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 31594e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 31604e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 31614e2b4712SSatish Balay fm = row; 31624e2b4712SSatish Balay while (nnz-- > 0) { 31634e2b4712SSatish Balay idx = *xi++; 31644e2b4712SSatish Balay if (*flev + incrlev > levels) { 31654e2b4712SSatish Balay flev++; 31664e2b4712SSatish Balay continue; 31674e2b4712SSatish Balay } 31684e2b4712SSatish Balay do { 31694e2b4712SSatish Balay m = fm; 31704e2b4712SSatish Balay fm = fill[m]; 31714e2b4712SSatish Balay } while (fm < idx); 31724e2b4712SSatish Balay if (fm != idx) { 31734e2b4712SSatish Balay im[idx] = *flev + incrlev; 31744e2b4712SSatish Balay fill[m] = idx; 31754e2b4712SSatish Balay fill[idx] = fm; 31764e2b4712SSatish Balay fm = idx; 31774e2b4712SSatish Balay nzf++; 3178ecf371e4SBarry Smith } else { 31794e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 31804e2b4712SSatish Balay } 31814e2b4712SSatish Balay flev++; 31824e2b4712SSatish Balay } 31834e2b4712SSatish Balay row = fill[row]; 31844e2b4712SSatish Balay nzi++; 31854e2b4712SSatish Balay } 31864e2b4712SSatish Balay /* copy new filled row into permanent storage */ 31874e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 31884e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 3189ecf371e4SBarry Smith 3190ecf371e4SBarry Smith /* estimate how much additional space we will need */ 3191ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3192ecf371e4SBarry Smith /* just double the memory each time */ 3193ecf371e4SBarry Smith int maxadd = jmax; 3194ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 31954e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 31964e2b4712SSatish Balay jmax += maxadd; 3197ecf371e4SBarry Smith 3198ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 319982502324SSatish Balay ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3200549d3d68SSatish Balay ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3201606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 32024e2b4712SSatish Balay ajnew = xi; 320382502324SSatish Balay ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3204549d3d68SSatish Balay ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3205606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 32064e2b4712SSatish Balay ajfill = xi; 3207eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 32084e2b4712SSatish Balay } 32094e2b4712SSatish Balay xi = ajnew + ainew[prow]; 32104e2b4712SSatish Balay flev = ajfill + ainew[prow]; 32114e2b4712SSatish Balay dloc[prow] = nzi; 32124e2b4712SSatish Balay fm = fill[n]; 32134e2b4712SSatish Balay while (nzf--) { 32144e2b4712SSatish Balay *xi++ = fm; 32154e2b4712SSatish Balay *flev++ = im[fm]; 32164e2b4712SSatish Balay fm = fill[fm]; 32174e2b4712SSatish Balay } 3218435faa5fSBarry Smith /* make sure row has diagonal entry */ 3219435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 322029bbc08cSBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 3221435faa5fSBarry Smith try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 3222435faa5fSBarry Smith } 32234e2b4712SSatish Balay } 3224606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 32254e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 32264e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3227606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 3228606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 32294e2b4712SSatish Balay 32304e2b4712SSatish Balay { 3231329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3232eb150c5cSKris Buschelman PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af); 3233b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 3234b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 3235b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 3236335d9088SBarry Smith if (diagonal_fill) { 3237b1bcba4aSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount); 3238335d9088SBarry Smith } 32394e2b4712SSatish Balay } 32404e2b4712SSatish Balay 32414e2b4712SSatish Balay /* put together the new matrix */ 3242f204ca49SKris Buschelman ierr = MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);CHKERRQ(ierr); 3243f204ca49SKris Buschelman ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr); 3244f204ca49SKris Buschelman ierr = MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);CHKERRQ(ierr); 3245b0a32e0cSBarry Smith PetscLogObjectParent(*fact,isicol); 32464e2b4712SSatish Balay b = (Mat_SeqBAIJ*)(*fact)->data; 3247606d414cSSatish Balay ierr = PetscFree(b->imax);CHKERRQ(ierr); 32487c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 32493f1db9ecSBarry Smith len = bs2*ainew[n]*sizeof(MatScalar); 32504e2b4712SSatish Balay /* the next line frees the default space generated by the Create() */ 3251606d414cSSatish Balay ierr = PetscFree(b->a);CHKERRQ(ierr); 3252606d414cSSatish Balay ierr = PetscFree(b->ilen);CHKERRQ(ierr); 325382502324SSatish Balay ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 32544e2b4712SSatish Balay b->j = ajnew; 32554e2b4712SSatish Balay b->i = ainew; 32564e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 32574e2b4712SSatish Balay b->diag = dloc; 32584e2b4712SSatish Balay b->ilen = 0; 32594e2b4712SSatish Balay b->imax = 0; 32604e2b4712SSatish Balay b->row = isrow; 32614e2b4712SSatish Balay b->col = iscol; 3262bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3263c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3264c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3265e51c0b9cSSatish Balay b->icol = isicol; 326687828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 32674e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 32684e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 326987828ca2SBarry Smith PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 32704e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 32714e2b4712SSatish Balay (*fact)->factor = FACTOR_LU; 32724e2b4712SSatish Balay 3273eb150c5cSKris Buschelman (*fact)->info.factor_mallocs = reallocate; 32744e2b4712SSatish Balay (*fact)->info.fill_ratio_given = f; 3275329f5518SBarry Smith (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3276309c388cSBarry Smith } 32774e2b4712SSatish Balay 3278309c388cSBarry Smith if (row_identity && col_identity) { 3279732ee342SKris Buschelman ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 32808661488fSKris Buschelman } 32818661488fSKris Buschelman PetscFunctionReturn(0); 32828661488fSKris Buschelman } 32838661488fSKris Buschelman 3284732ee342SKris Buschelman #undef __FUNCT__ 32857e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3286dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 32877e7071cdSKris Buschelman { 328812272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 328912272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 32905a9542e3SKris Buschelman PetscFunctionBegin; 32917cf1b8d3SKris Buschelman /* Undo Column scaling */ 32927cf1b8d3SKris Buschelman /* while (nz--) { */ 32937cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 32947cf1b8d3SKris Buschelman /* } */ 3295c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. */ 3296c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 32977cf1b8d3SKris Buschelman PetscFunctionReturn(0); 32987cf1b8d3SKris Buschelman } 32997cf1b8d3SKris Buschelman 33007cf1b8d3SKris Buschelman #undef __FUNCT__ 33017cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3302dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 33037cf1b8d3SKris Buschelman { 33047cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33050b9da03eSKris Buschelman int *AJ=a->j,nz=a->nz; 33062aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 33075a9542e3SKris Buschelman PetscFunctionBegin; 33080b9da03eSKris Buschelman /* Is this really necessary? */ 330920235379SKris Buschelman while (nz--) { 33100b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 33117e7071cdSKris Buschelman } 3312c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 33137e7071cdSKris Buschelman PetscFunctionReturn(0); 33147e7071cdSKris Buschelman } 33157e7071cdSKris Buschelman 33167e7071cdSKris Buschelman #undef __FUNCT__ 3317732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3318dfbe8321SBarry Smith PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 33198661488fSKris Buschelman { 33208661488fSKris Buschelman /* 33218661488fSKris Buschelman Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 33228661488fSKris Buschelman with natural ordering 33238661488fSKris Buschelman */ 33248661488fSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 33258661488fSKris Buschelman 33268661488fSKris Buschelman PetscFunctionBegin; 3327a7ba9c3cSKris Buschelman inA->ops->solve = MatSolve_SeqBAIJ_Update; 3328a7ba9c3cSKris Buschelman inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 33298661488fSKris Buschelman switch (a->bs) { 33308661488fSKris Buschelman case 1: 33318661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3332732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 3333732ee342SKris Buschelman break; 3334309c388cSBarry Smith case 2: 33358661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3336732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 3337309c388cSBarry Smith break; 3338309c388cSBarry Smith case 3: 33398661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3340732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 3341309c388cSBarry Smith break; 3342309c388cSBarry Smith case 4: 3343a7d8d0baSKris Buschelman #if defined(PETSC_USE_MAT_SINGLE) 3344a7d8d0baSKris Buschelman { 3345a7d8d0baSKris Buschelman PetscTruth sse_enabled_local; 3346dfbe8321SBarry Smith PetscErrorCode ierr; 3347ccaa8a1bSKris Buschelman ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 33486b7cc795SKris Buschelman if (sse_enabled_local) { 3349b988c221SKris Buschelman # if defined(PETSC_HAVE_SSE) 33507cf1b8d3SKris Buschelman int i,*AJ=a->j,nz=a->nz,n=a->mbs; 33517cf1b8d3SKris Buschelman if (n==(unsigned short)n) { 33522aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 335313c7ffeeSKris Buschelman for (i=0;i<nz;i++) { 33542aa5897fSKris Buschelman aj[i] = (unsigned short)AJ[i]; 335513c7ffeeSKris Buschelman } 33567cf1b8d3SKris Buschelman inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 33577cf1b8d3SKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 335886b4ebfeSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"); 33597cf1b8d3SKris Buschelman } else { 33607cf1b8d3SKris Buschelman /* Scale the column indices for easier indexing in MatSolve. */ 33617cf1b8d3SKris Buschelman /* for (i=0;i<nz;i++) { */ 33627cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]*4; */ 33637cf1b8d3SKris Buschelman /* } */ 33647e7071cdSKris Buschelman inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 33658661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 336686b4ebfeSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n"); 33677cf1b8d3SKris Buschelman } 3368b988c221SKris Buschelman # else 3369b988c221SKris Buschelman /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3370b988c221SKris Buschelman SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3371b988c221SKris Buschelman # endif 33723ba47ebaSKris Buschelman } else { 33738661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3374732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 33753ba47ebaSKris Buschelman } 3376a7d8d0baSKris Buschelman } 3377a7d8d0baSKris Buschelman #else 3378a7d8d0baSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3379a7d8d0baSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3380a7d8d0baSKris Buschelman #endif 3381309c388cSBarry Smith break; 3382309c388cSBarry Smith case 5: 33838661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3384732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 3385309c388cSBarry Smith break; 3386309c388cSBarry Smith case 6: 33878661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3388732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 3389309c388cSBarry Smith break; 3390309c388cSBarry Smith case 7: 33918661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3392732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 3393309c388cSBarry Smith break; 3394309c388cSBarry Smith } 33954e2b4712SSatish Balay PetscFunctionReturn(0); 33964e2b4712SSatish Balay } 3397732ee342SKris Buschelman 3398732ee342SKris Buschelman #undef __FUNCT__ 3399732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3400dfbe8321SBarry Smith PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A) 3401732ee342SKris Buschelman { 3402732ee342SKris Buschelman /* 3403732ee342SKris Buschelman Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3404732ee342SKris Buschelman with natural ordering 3405732ee342SKris Buschelman */ 3406732ee342SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3407732ee342SKris Buschelman IS row = a->row, col = a->col; 3408732ee342SKris Buschelman PetscTruth row_identity, col_identity; 340923c42b7cSKris Buschelman PetscTruth use_natural; 3410dfbe8321SBarry Smith PetscErrorCode ierr; 3411732ee342SKris Buschelman 3412732ee342SKris Buschelman PetscFunctionBegin; 3413cf242676SKris Buschelman 341494ee7fc8SKris Buschelman use_natural = PETSC_FALSE; 341521360622SBarry Smith if (row && col) { 3416732ee342SKris Buschelman ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3417732ee342SKris Buschelman ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3418732ee342SKris Buschelman 3419732ee342SKris Buschelman if (row_identity && col_identity) { 3420732ee342SKris Buschelman use_natural = PETSC_TRUE; 3421732ee342SKris Buschelman } 342221360622SBarry Smith } else { 342321360622SBarry Smith use_natural = PETSC_TRUE; 342421360622SBarry Smith } 342521360622SBarry Smith 3426732ee342SKris Buschelman switch (a->bs) { 3427732ee342SKris Buschelman case 1: 3428732ee342SKris Buschelman if (use_natural) { 3429732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3430732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3431732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 3432732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3433732ee342SKris Buschelman } else { 3434732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_1; 3435732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3436732ee342SKris Buschelman } 3437732ee342SKris Buschelman break; 3438732ee342SKris Buschelman case 2: 3439732ee342SKris Buschelman if (use_natural) { 3440732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3441732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3442732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 3443732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3444732ee342SKris Buschelman } else { 3445732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_2; 3446732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3447732ee342SKris Buschelman } 3448732ee342SKris Buschelman break; 3449732ee342SKris Buschelman case 3: 3450732ee342SKris Buschelman if (use_natural) { 3451732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3452732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3453732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 3454732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3455732ee342SKris Buschelman } else { 3456732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_3; 3457732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3458732ee342SKris Buschelman } 3459732ee342SKris Buschelman break; 3460732ee342SKris Buschelman case 4: 3461f26ec98cSKris Buschelman { 3462123145dfSKris Buschelman PetscTruth sse_enabled_local; 3463ccaa8a1bSKris Buschelman ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3464732ee342SKris Buschelman if (use_natural) { 34652859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE) 3466f26ec98cSKris Buschelman if (sse_enabled_local) { /* Natural + Single + SSE */ 3467eb150c5cSKris Buschelman # if defined(PETSC_HAVE_SSE) 3468995eb297SKris Buschelman int n=a->mbs; 3469995eb297SKris Buschelman if (n==(unsigned short)n) { 3470995eb297SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3471995eb297SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n"); 3472995eb297SKris Buschelman } else { 3473732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 347486b4ebfeSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n"); 3475995eb297SKris Buschelman } 3476eb150c5cSKris Buschelman # else 3477eb150c5cSKris Buschelman /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3478eb150c5cSKris Buschelman SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3479eb150c5cSKris Buschelman # endif 3480f26ec98cSKris Buschelman } else { /* Natural + Single */ 3481f26ec98cSKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3482123145dfSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"); 3483f26ec98cSKris Buschelman } 34842859b196SKris Buschelman #else 34852859b196SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3486123145dfSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 34872859b196SKris Buschelman #endif 3488732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3489123145dfSKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3490f26ec98cSKris Buschelman } else { /* Arbitrary ordering */ 34912859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE) 3492f26ec98cSKris Buschelman if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3493eb150c5cSKris Buschelman # if defined(PETSC_HAVE_SSE) 3494732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3495732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3496eb150c5cSKris Buschelman # else 3497eb150c5cSKris Buschelman /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3498eb150c5cSKris Buschelman SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3499eb150c5cSKris Buschelman # endif 3500f26ec98cSKris Buschelman } else { /* Arbitrary + Single */ 3501f26ec98cSKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3502f26ec98cSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"); 3503732ee342SKris Buschelman } 35042859b196SKris Buschelman #else 35052859b196SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4; 35062859b196SKris Buschelman #endif 3507732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3508732ee342SKris Buschelman } 3509f26ec98cSKris Buschelman } 3510732ee342SKris Buschelman break; 3511732ee342SKris Buschelman case 5: 3512732ee342SKris Buschelman if (use_natural) { 3513732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3514732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3515732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3516732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3517732ee342SKris Buschelman } else { 3518732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_5; 3519732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3520732ee342SKris Buschelman } 3521732ee342SKris Buschelman break; 3522732ee342SKris Buschelman case 6: 3523732ee342SKris Buschelman if (use_natural) { 3524732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3525732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3526732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3527732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3528732ee342SKris Buschelman } else { 3529732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_6; 3530732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3531732ee342SKris Buschelman } 3532732ee342SKris Buschelman break; 3533732ee342SKris Buschelman case 7: 3534732ee342SKris Buschelman if (use_natural) { 3535732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3536732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3537732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3538732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3539732ee342SKris Buschelman } else { 3540732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_7; 3541732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3542732ee342SKris Buschelman } 3543732ee342SKris Buschelman break; 354431801e53SKris Buschelman default: 354531801e53SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_N; 354631801e53SKris Buschelman break; 3547732ee342SKris Buschelman } 3548732ee342SKris Buschelman PetscFunctionReturn(0); 3549732ee342SKris Buschelman } 3550732ee342SKris Buschelman 3551732ee342SKris Buschelman #undef __FUNCT__ 3552732ee342SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3553dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3554dfbe8321SBarry Smith PetscErrorCode ierr; 3555732ee342SKris Buschelman 3556732ee342SKris Buschelman PetscFunctionBegin; 3557732ee342SKris Buschelman ierr = MatSeqBAIJ_UpdateSolvers(A); 3558cf242676SKris Buschelman if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3559732ee342SKris Buschelman ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3560cf242676SKris Buschelman } else { 3561cf242676SKris Buschelman SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3562cf242676SKris Buschelman } 3563732ee342SKris Buschelman PetscFunctionReturn(0); 3564732ee342SKris Buschelman } 3565732ee342SKris Buschelman 3566732ee342SKris Buschelman #undef __FUNCT__ 3567732ee342SKris Buschelman #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3568dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3569dfbe8321SBarry Smith PetscErrorCode ierr; 3570732ee342SKris Buschelman 3571732ee342SKris Buschelman PetscFunctionBegin; 3572732ee342SKris Buschelman ierr = MatSeqBAIJ_UpdateSolvers(A); 3573732ee342SKris Buschelman ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3574732ee342SKris Buschelman PetscFunctionReturn(0); 3575732ee342SKris Buschelman } 3576732ee342SKris Buschelman 3577732ee342SKris Buschelman 3578