1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 34e2b4712SSatish Balay /* 44e2b4712SSatish Balay Factorization code for BAIJ format. 54e2b4712SSatish Balay */ 64e2b4712SSatish Balay 74e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h" 84e2b4712SSatish Balay #include "src/inline/ilu.h" 974c49faeSBarry Smith #include "src/inline/dot.h" 104e2b4712SSatish Balay 114a2ae208SSatish Balay #undef __FUNCT__ 124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14f1af5d2fSBarry Smith { 15f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16dfbe8321SBarry Smith PetscErrorCode ierr; 17690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18690b6cddSBarry Smith PetscInt *diag = a->diag; 19f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2087828ca2SBarry Smith PetscScalar s1,*x,*b; 21f1af5d2fSBarry Smith 22f1af5d2fSBarry Smith PetscFunctionBegin; 23ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 241ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 251ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26f1af5d2fSBarry Smith 27f1af5d2fSBarry Smith /* forward solve the U^T */ 28f1af5d2fSBarry Smith for (i=0; i<n; i++) { 29f1af5d2fSBarry Smith 30f1af5d2fSBarry Smith v = aa + diag[i]; 31f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 32ef66eb69SBarry Smith s1 = (*v++)*x[i]; 33f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 34f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 35f1af5d2fSBarry Smith while (nz--) { 36f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 37f1af5d2fSBarry Smith } 38f1af5d2fSBarry Smith x[i] = s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith /* backward solve the L^T */ 41f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 42f1af5d2fSBarry Smith v = aa + diag[i] - 1; 43f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 44f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 45f1af5d2fSBarry Smith s1 = x[i]; 46f1af5d2fSBarry Smith while (nz--) { 47f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 48f1af5d2fSBarry Smith } 49f1af5d2fSBarry Smith } 501ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 511ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52d0f46423SBarry Smith ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 53f1af5d2fSBarry Smith PetscFunctionReturn(0); 54f1af5d2fSBarry Smith } 55f1af5d2fSBarry Smith 564a2ae208SSatish Balay #undef __FUNCT__ 574a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 58dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 59f1af5d2fSBarry Smith { 60f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 61dfbe8321SBarry Smith PetscErrorCode ierr; 62690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 63690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 64f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6587828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6687828ca2SBarry Smith PetscScalar *x,*b; 67f1af5d2fSBarry Smith 68f1af5d2fSBarry Smith PetscFunctionBegin; 69ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 701ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 711ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 72f1af5d2fSBarry Smith 73f1af5d2fSBarry Smith /* forward solve the U^T */ 74f1af5d2fSBarry Smith idx = 0; 75f1af5d2fSBarry Smith for (i=0; i<n; i++) { 76f1af5d2fSBarry Smith 77f1af5d2fSBarry Smith v = aa + 4*diag[i]; 78f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 79ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 80f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 81f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 82f1af5d2fSBarry Smith v += 4; 83f1af5d2fSBarry Smith 84f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 85f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 86f1af5d2fSBarry Smith while (nz--) { 87f1af5d2fSBarry Smith oidx = 2*(*vi++); 88f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 89f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 90f1af5d2fSBarry Smith v += 4; 91f1af5d2fSBarry Smith } 92f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 93f1af5d2fSBarry Smith idx += 2; 94f1af5d2fSBarry Smith } 95f1af5d2fSBarry Smith /* backward solve the L^T */ 96f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 97f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 98f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 99f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 100f1af5d2fSBarry Smith idt = 2*i; 101f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 102f1af5d2fSBarry Smith while (nz--) { 103f1af5d2fSBarry Smith idx = 2*(*vi--); 104f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 105f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 106f1af5d2fSBarry Smith v -= 4; 107f1af5d2fSBarry Smith } 108f1af5d2fSBarry Smith } 1091ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1101ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 111d0f46423SBarry Smith ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 112f1af5d2fSBarry Smith PetscFunctionReturn(0); 113f1af5d2fSBarry Smith } 114f1af5d2fSBarry Smith 1154a2ae208SSatish Balay #undef __FUNCT__ 1164a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 117dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 118f1af5d2fSBarry Smith { 119f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 120dfbe8321SBarry Smith PetscErrorCode ierr; 121690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 122690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 123f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12487828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12587828ca2SBarry Smith PetscScalar *x,*b; 126f1af5d2fSBarry Smith 127f1af5d2fSBarry Smith PetscFunctionBegin; 128ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1291ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1301ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 131f1af5d2fSBarry Smith 132f1af5d2fSBarry Smith /* forward solve the U^T */ 133f1af5d2fSBarry Smith idx = 0; 134f1af5d2fSBarry Smith for (i=0; i<n; i++) { 135f1af5d2fSBarry Smith 136f1af5d2fSBarry Smith v = aa + 9*diag[i]; 137f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 138ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 139f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 140f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 141f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 142f1af5d2fSBarry Smith v += 9; 143f1af5d2fSBarry Smith 144f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 145f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 146f1af5d2fSBarry Smith while (nz--) { 147f1af5d2fSBarry Smith oidx = 3*(*vi++); 148f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 149f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 150f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 151f1af5d2fSBarry Smith v += 9; 152f1af5d2fSBarry Smith } 153f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 154f1af5d2fSBarry Smith idx += 3; 155f1af5d2fSBarry Smith } 156f1af5d2fSBarry Smith /* backward solve the L^T */ 157f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 158f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 159f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 160f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 161f1af5d2fSBarry Smith idt = 3*i; 162f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 163f1af5d2fSBarry Smith while (nz--) { 164f1af5d2fSBarry Smith idx = 3*(*vi--); 165f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 166f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 167f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 168f1af5d2fSBarry Smith v -= 9; 169f1af5d2fSBarry Smith } 170f1af5d2fSBarry Smith } 1711ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1721ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 173d0f46423SBarry Smith ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 174f1af5d2fSBarry Smith PetscFunctionReturn(0); 175f1af5d2fSBarry Smith } 176f1af5d2fSBarry Smith 1774a2ae208SSatish Balay #undef __FUNCT__ 1784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 179dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 180f1af5d2fSBarry Smith { 181f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182dfbe8321SBarry Smith PetscErrorCode ierr; 183690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 184690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 185f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18787828ca2SBarry Smith PetscScalar *x,*b; 188f1af5d2fSBarry Smith 189f1af5d2fSBarry Smith PetscFunctionBegin; 190ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1911ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1921ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 193f1af5d2fSBarry Smith 194f1af5d2fSBarry Smith /* forward solve the U^T */ 195f1af5d2fSBarry Smith idx = 0; 196f1af5d2fSBarry Smith for (i=0; i<n; i++) { 197f1af5d2fSBarry Smith 198f1af5d2fSBarry Smith v = aa + 16*diag[i]; 199f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 200ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 201f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 202f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 203f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 204f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 205f1af5d2fSBarry Smith v += 16; 206f1af5d2fSBarry Smith 207f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 208f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 209f1af5d2fSBarry Smith while (nz--) { 210f1af5d2fSBarry Smith oidx = 4*(*vi++); 211f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 212f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 213f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 214f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 215f1af5d2fSBarry Smith v += 16; 216f1af5d2fSBarry Smith } 217f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 218f1af5d2fSBarry Smith idx += 4; 219f1af5d2fSBarry Smith } 220f1af5d2fSBarry Smith /* backward solve the L^T */ 221f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 222f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 223f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 224f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 225f1af5d2fSBarry Smith idt = 4*i; 226f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 227f1af5d2fSBarry Smith while (nz--) { 228f1af5d2fSBarry Smith idx = 4*(*vi--); 229f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 230f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 231f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 232f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 233f1af5d2fSBarry Smith v -= 16; 234f1af5d2fSBarry Smith } 235f1af5d2fSBarry Smith } 2361ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2371ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 238d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 239f1af5d2fSBarry Smith PetscFunctionReturn(0); 240f1af5d2fSBarry Smith } 241f1af5d2fSBarry Smith 2424a2ae208SSatish Balay #undef __FUNCT__ 2434a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 244dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 245f1af5d2fSBarry Smith { 246f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 247dfbe8321SBarry Smith PetscErrorCode ierr; 248690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 249690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 250f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 25187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25287828ca2SBarry Smith PetscScalar *x,*b; 253f1af5d2fSBarry Smith 254f1af5d2fSBarry Smith PetscFunctionBegin; 255ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2561ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2571ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 258f1af5d2fSBarry Smith 259f1af5d2fSBarry Smith /* forward solve the U^T */ 260f1af5d2fSBarry Smith idx = 0; 261f1af5d2fSBarry Smith for (i=0; i<n; i++) { 262f1af5d2fSBarry Smith 263f1af5d2fSBarry Smith v = aa + 25*diag[i]; 264f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 265ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 266f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 267f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 268f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 269f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 270f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 271f1af5d2fSBarry Smith v += 25; 272f1af5d2fSBarry Smith 273f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 274f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 275f1af5d2fSBarry Smith while (nz--) { 276f1af5d2fSBarry Smith oidx = 5*(*vi++); 277f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 278f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 279f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 280f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 281f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 282f1af5d2fSBarry Smith v += 25; 283f1af5d2fSBarry Smith } 284f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 285f1af5d2fSBarry Smith idx += 5; 286f1af5d2fSBarry Smith } 287f1af5d2fSBarry Smith /* backward solve the L^T */ 288f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 289f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 290f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 291f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 292f1af5d2fSBarry Smith idt = 5*i; 293f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 294f1af5d2fSBarry Smith while (nz--) { 295f1af5d2fSBarry Smith idx = 5*(*vi--); 296f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 297f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 298f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 299f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 300f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 301f1af5d2fSBarry Smith v -= 25; 302f1af5d2fSBarry Smith } 303f1af5d2fSBarry Smith } 3041ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3051ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 306d0f46423SBarry Smith ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 307f1af5d2fSBarry Smith PetscFunctionReturn(0); 308f1af5d2fSBarry Smith } 309f1af5d2fSBarry Smith 3104a2ae208SSatish Balay #undef __FUNCT__ 3114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 312dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 313f1af5d2fSBarry Smith { 314f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 315dfbe8321SBarry Smith PetscErrorCode ierr; 316690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 317690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 318f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 31987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 32087828ca2SBarry Smith PetscScalar *x,*b; 321f1af5d2fSBarry Smith 322f1af5d2fSBarry Smith PetscFunctionBegin; 323ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3241ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3251ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 326f1af5d2fSBarry Smith 327f1af5d2fSBarry Smith /* forward solve the U^T */ 328f1af5d2fSBarry Smith idx = 0; 329f1af5d2fSBarry Smith for (i=0; i<n; i++) { 330f1af5d2fSBarry Smith 331f1af5d2fSBarry Smith v = aa + 36*diag[i]; 332f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 333ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 334ef66eb69SBarry Smith x6 = x[5+idx]; 335f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 336f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 337f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 338f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 339f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 340f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 341f1af5d2fSBarry Smith v += 36; 342f1af5d2fSBarry Smith 343f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 344f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 345f1af5d2fSBarry Smith while (nz--) { 346f1af5d2fSBarry Smith oidx = 6*(*vi++); 347f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 348f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 349f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 350f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 351f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 352f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 353f1af5d2fSBarry Smith v += 36; 354f1af5d2fSBarry Smith } 355f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 356f1af5d2fSBarry Smith x[5+idx] = s6; 357f1af5d2fSBarry Smith idx += 6; 358f1af5d2fSBarry Smith } 359f1af5d2fSBarry Smith /* backward solve the L^T */ 360f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 361f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 362f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 363f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 364f1af5d2fSBarry Smith idt = 6*i; 365f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 366f1af5d2fSBarry Smith s6 = x[5+idt]; 367f1af5d2fSBarry Smith while (nz--) { 368f1af5d2fSBarry Smith idx = 6*(*vi--); 369f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 370f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 371f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 372f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 373f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 374f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 375f1af5d2fSBarry Smith v -= 36; 376f1af5d2fSBarry Smith } 377f1af5d2fSBarry Smith } 3781ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3791ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 380d0f46423SBarry Smith ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 381f1af5d2fSBarry Smith PetscFunctionReturn(0); 382f1af5d2fSBarry Smith } 383f1af5d2fSBarry Smith 3844a2ae208SSatish Balay #undef __FUNCT__ 3854a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 386dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 387f1af5d2fSBarry Smith { 388f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 389dfbe8321SBarry Smith PetscErrorCode ierr; 390690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 391690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 392f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 39387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 39487828ca2SBarry Smith PetscScalar *x,*b; 395f1af5d2fSBarry Smith 396f1af5d2fSBarry Smith PetscFunctionBegin; 397ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3981ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3991ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 400f1af5d2fSBarry Smith 401f1af5d2fSBarry Smith /* forward solve the U^T */ 402f1af5d2fSBarry Smith idx = 0; 403f1af5d2fSBarry Smith for (i=0; i<n; i++) { 404f1af5d2fSBarry Smith 405f1af5d2fSBarry Smith v = aa + 49*diag[i]; 406f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 407ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 408ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 409f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 410f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 411f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 412f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 413f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 414f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 415f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 416f1af5d2fSBarry Smith v += 49; 417f1af5d2fSBarry Smith 418f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 419f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 420f1af5d2fSBarry Smith while (nz--) { 421f1af5d2fSBarry Smith oidx = 7*(*vi++); 422f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 423f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 424f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 425f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 426f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 427f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 428f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 429f1af5d2fSBarry Smith v += 49; 430f1af5d2fSBarry Smith } 431f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 432f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 433f1af5d2fSBarry Smith idx += 7; 434f1af5d2fSBarry Smith } 435f1af5d2fSBarry Smith /* backward solve the L^T */ 436f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 437f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 438f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 439f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 440f1af5d2fSBarry Smith idt = 7*i; 441f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 442f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 443f1af5d2fSBarry Smith while (nz--) { 444f1af5d2fSBarry Smith idx = 7*(*vi--); 445f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 446f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 447f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 448f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 449f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 450f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 451f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 452f1af5d2fSBarry Smith v -= 49; 453f1af5d2fSBarry Smith } 454f1af5d2fSBarry Smith } 4551ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4561ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457d0f46423SBarry Smith ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 458f1af5d2fSBarry Smith PetscFunctionReturn(0); 459f1af5d2fSBarry Smith } 460f1af5d2fSBarry Smith 461f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4624a2ae208SSatish Balay #undef __FUNCT__ 4634a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 464dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 465f1af5d2fSBarry Smith { 466f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 467f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 4686849ba73SBarry Smith PetscErrorCode ierr; 469690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 470690b6cddSBarry Smith PetscInt *diag = a->diag; 471f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 47287828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 473f1af5d2fSBarry Smith 474f1af5d2fSBarry Smith PetscFunctionBegin; 4751ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4761ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 477f1af5d2fSBarry Smith t = a->solve_work; 478f1af5d2fSBarry Smith 479f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 480f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 481f1af5d2fSBarry Smith 482f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 483f1af5d2fSBarry Smith for (i=0; i<n; i++) { 484f1af5d2fSBarry Smith t[i] = b[c[i]]; 485f1af5d2fSBarry Smith } 486f1af5d2fSBarry Smith 487f1af5d2fSBarry Smith /* forward solve the U^T */ 488f1af5d2fSBarry Smith for (i=0; i<n; i++) { 489f1af5d2fSBarry Smith 490f1af5d2fSBarry Smith v = aa + diag[i]; 491f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 492f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 493f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 494f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 495f1af5d2fSBarry Smith while (nz--) { 496f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 497f1af5d2fSBarry Smith } 498f1af5d2fSBarry Smith t[i] = s1; 499f1af5d2fSBarry Smith } 500f1af5d2fSBarry Smith /* backward solve the L^T */ 501f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 502f1af5d2fSBarry Smith v = aa + diag[i] - 1; 503f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 504f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 505f1af5d2fSBarry Smith s1 = t[i]; 506f1af5d2fSBarry Smith while (nz--) { 507f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 508f1af5d2fSBarry Smith } 509f1af5d2fSBarry Smith } 510f1af5d2fSBarry Smith 511f1af5d2fSBarry Smith /* copy t into x according to permutation */ 512f1af5d2fSBarry Smith for (i=0; i<n; i++) { 513f1af5d2fSBarry Smith x[r[i]] = t[i]; 514f1af5d2fSBarry Smith } 515f1af5d2fSBarry Smith 516f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 517f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5181ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5191ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 520d0f46423SBarry Smith ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 521f1af5d2fSBarry Smith PetscFunctionReturn(0); 522f1af5d2fSBarry Smith } 523f1af5d2fSBarry Smith 5244a2ae208SSatish Balay #undef __FUNCT__ 5254a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 526dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 527f1af5d2fSBarry Smith { 528f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 529f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5306849ba73SBarry Smith PetscErrorCode ierr; 531690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 532690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 533f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53487828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 53587828ca2SBarry Smith PetscScalar *x,*b,*t; 536f1af5d2fSBarry Smith 537f1af5d2fSBarry Smith PetscFunctionBegin; 5381ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5391ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 540f1af5d2fSBarry Smith t = a->solve_work; 541f1af5d2fSBarry Smith 542f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 543f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 544f1af5d2fSBarry Smith 545f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 546f1af5d2fSBarry Smith ii = 0; 547f1af5d2fSBarry Smith for (i=0; i<n; i++) { 548f1af5d2fSBarry Smith ic = 2*c[i]; 549f1af5d2fSBarry Smith t[ii] = b[ic]; 550f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 551f1af5d2fSBarry Smith ii += 2; 552f1af5d2fSBarry Smith } 553f1af5d2fSBarry Smith 554f1af5d2fSBarry Smith /* forward solve the U^T */ 555f1af5d2fSBarry Smith idx = 0; 556f1af5d2fSBarry Smith for (i=0; i<n; i++) { 557f1af5d2fSBarry Smith 558f1af5d2fSBarry Smith v = aa + 4*diag[i]; 559f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 560f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 561f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 562f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 563f1af5d2fSBarry Smith v += 4; 564f1af5d2fSBarry Smith 565f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 566f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 567f1af5d2fSBarry Smith while (nz--) { 568f1af5d2fSBarry Smith oidx = 2*(*vi++); 569f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 570f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 571f1af5d2fSBarry Smith v += 4; 572f1af5d2fSBarry Smith } 573f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 574f1af5d2fSBarry Smith idx += 2; 575f1af5d2fSBarry Smith } 576f1af5d2fSBarry Smith /* backward solve the L^T */ 577f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 578f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 579f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 580f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 581f1af5d2fSBarry Smith idt = 2*i; 582f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 583f1af5d2fSBarry Smith while (nz--) { 584f1af5d2fSBarry Smith idx = 2*(*vi--); 585f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 586f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 587f1af5d2fSBarry Smith v -= 4; 588f1af5d2fSBarry Smith } 589f1af5d2fSBarry Smith } 590f1af5d2fSBarry Smith 591f1af5d2fSBarry Smith /* copy t into x according to permutation */ 592f1af5d2fSBarry Smith ii = 0; 593f1af5d2fSBarry Smith for (i=0; i<n; i++) { 594f1af5d2fSBarry Smith ir = 2*r[i]; 595f1af5d2fSBarry Smith x[ir] = t[ii]; 596f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 597f1af5d2fSBarry Smith ii += 2; 598f1af5d2fSBarry Smith } 599f1af5d2fSBarry Smith 600f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 601f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6021ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6031ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 604d0f46423SBarry Smith ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 605f1af5d2fSBarry Smith PetscFunctionReturn(0); 606f1af5d2fSBarry Smith } 607f1af5d2fSBarry Smith 6084a2ae208SSatish Balay #undef __FUNCT__ 6094a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 610dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 611f1af5d2fSBarry Smith { 612f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 613f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6146849ba73SBarry Smith PetscErrorCode ierr; 615690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 616690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 617f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 61887828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 61987828ca2SBarry Smith PetscScalar *x,*b,*t; 620f1af5d2fSBarry Smith 621f1af5d2fSBarry Smith PetscFunctionBegin; 6221ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6231ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 624f1af5d2fSBarry Smith t = a->solve_work; 625f1af5d2fSBarry Smith 626f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 627f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 628f1af5d2fSBarry Smith 629f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 630f1af5d2fSBarry Smith ii = 0; 631f1af5d2fSBarry Smith for (i=0; i<n; i++) { 632f1af5d2fSBarry Smith ic = 3*c[i]; 633f1af5d2fSBarry Smith t[ii] = b[ic]; 634f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 635f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 636f1af5d2fSBarry Smith ii += 3; 637f1af5d2fSBarry Smith } 638f1af5d2fSBarry Smith 639f1af5d2fSBarry Smith /* forward solve the U^T */ 640f1af5d2fSBarry Smith idx = 0; 641f1af5d2fSBarry Smith for (i=0; i<n; i++) { 642f1af5d2fSBarry Smith 643f1af5d2fSBarry Smith v = aa + 9*diag[i]; 644f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 645f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 646f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 647f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 648f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 649f1af5d2fSBarry Smith v += 9; 650f1af5d2fSBarry Smith 651f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 652f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 653f1af5d2fSBarry Smith while (nz--) { 654f1af5d2fSBarry Smith oidx = 3*(*vi++); 655f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 656f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 657f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 658f1af5d2fSBarry Smith v += 9; 659f1af5d2fSBarry Smith } 660f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 661f1af5d2fSBarry Smith idx += 3; 662f1af5d2fSBarry Smith } 663f1af5d2fSBarry Smith /* backward solve the L^T */ 664f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 665f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 666f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 667f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 668f1af5d2fSBarry Smith idt = 3*i; 669f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 670f1af5d2fSBarry Smith while (nz--) { 671f1af5d2fSBarry Smith idx = 3*(*vi--); 672f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 673f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 674f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 675f1af5d2fSBarry Smith v -= 9; 676f1af5d2fSBarry Smith } 677f1af5d2fSBarry Smith } 678f1af5d2fSBarry Smith 679f1af5d2fSBarry Smith /* copy t into x according to permutation */ 680f1af5d2fSBarry Smith ii = 0; 681f1af5d2fSBarry Smith for (i=0; i<n; i++) { 682f1af5d2fSBarry Smith ir = 3*r[i]; 683f1af5d2fSBarry Smith x[ir] = t[ii]; 684f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 685f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 686f1af5d2fSBarry Smith ii += 3; 687f1af5d2fSBarry Smith } 688f1af5d2fSBarry Smith 689f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 690f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 693d0f46423SBarry Smith ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 694f1af5d2fSBarry Smith PetscFunctionReturn(0); 695f1af5d2fSBarry Smith } 696f1af5d2fSBarry Smith 6974a2ae208SSatish Balay #undef __FUNCT__ 6984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 699dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 700f1af5d2fSBarry Smith { 701f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 702f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7036849ba73SBarry Smith PetscErrorCode ierr; 704690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 705690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 706f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 70787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 70887828ca2SBarry Smith PetscScalar *x,*b,*t; 709f1af5d2fSBarry Smith 710f1af5d2fSBarry Smith PetscFunctionBegin; 7111ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7121ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 713f1af5d2fSBarry Smith t = a->solve_work; 714f1af5d2fSBarry Smith 715f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 716f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 717f1af5d2fSBarry Smith 718f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 719f1af5d2fSBarry Smith ii = 0; 720f1af5d2fSBarry Smith for (i=0; i<n; i++) { 721f1af5d2fSBarry Smith ic = 4*c[i]; 722f1af5d2fSBarry Smith t[ii] = b[ic]; 723f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 724f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 725f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 726f1af5d2fSBarry Smith ii += 4; 727f1af5d2fSBarry Smith } 728f1af5d2fSBarry Smith 729f1af5d2fSBarry Smith /* forward solve the U^T */ 730f1af5d2fSBarry Smith idx = 0; 731f1af5d2fSBarry Smith for (i=0; i<n; i++) { 732f1af5d2fSBarry Smith 733f1af5d2fSBarry Smith v = aa + 16*diag[i]; 734f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 735f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 736f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 737f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 738f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 739f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 740f1af5d2fSBarry Smith v += 16; 741f1af5d2fSBarry Smith 742f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 743f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 744f1af5d2fSBarry Smith while (nz--) { 745f1af5d2fSBarry Smith oidx = 4*(*vi++); 746f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 747f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 748f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 749f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 750f1af5d2fSBarry Smith v += 16; 751f1af5d2fSBarry Smith } 752f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 753f1af5d2fSBarry Smith idx += 4; 754f1af5d2fSBarry Smith } 755f1af5d2fSBarry Smith /* backward solve the L^T */ 756f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 757f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 758f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 759f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 760f1af5d2fSBarry Smith idt = 4*i; 761f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 762f1af5d2fSBarry Smith while (nz--) { 763f1af5d2fSBarry Smith idx = 4*(*vi--); 764f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 765f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 766f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 767f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 768f1af5d2fSBarry Smith v -= 16; 769f1af5d2fSBarry Smith } 770f1af5d2fSBarry Smith } 771f1af5d2fSBarry Smith 772f1af5d2fSBarry Smith /* copy t into x according to permutation */ 773f1af5d2fSBarry Smith ii = 0; 774f1af5d2fSBarry Smith for (i=0; i<n; i++) { 775f1af5d2fSBarry Smith ir = 4*r[i]; 776f1af5d2fSBarry Smith x[ir] = t[ii]; 777f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 778f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 779f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 780f1af5d2fSBarry Smith ii += 4; 781f1af5d2fSBarry Smith } 782f1af5d2fSBarry Smith 783f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 784f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7851ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7861ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 787d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 788f1af5d2fSBarry Smith PetscFunctionReturn(0); 789f1af5d2fSBarry Smith } 790f1af5d2fSBarry Smith 7914a2ae208SSatish Balay #undef __FUNCT__ 7924a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 793dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 794f1af5d2fSBarry Smith { 795f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 796f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7976849ba73SBarry Smith PetscErrorCode ierr; 798690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 799690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 800f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 80187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 80287828ca2SBarry Smith PetscScalar *x,*b,*t; 803f1af5d2fSBarry Smith 804f1af5d2fSBarry Smith PetscFunctionBegin; 8051ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8061ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 807f1af5d2fSBarry Smith t = a->solve_work; 808f1af5d2fSBarry Smith 809f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 810f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 811f1af5d2fSBarry Smith 812f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 813f1af5d2fSBarry Smith ii = 0; 814f1af5d2fSBarry Smith for (i=0; i<n; i++) { 815f1af5d2fSBarry Smith ic = 5*c[i]; 816f1af5d2fSBarry Smith t[ii] = b[ic]; 817f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 818f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 819f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 820f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 821f1af5d2fSBarry Smith ii += 5; 822f1af5d2fSBarry Smith } 823f1af5d2fSBarry Smith 824f1af5d2fSBarry Smith /* forward solve the U^T */ 825f1af5d2fSBarry Smith idx = 0; 826f1af5d2fSBarry Smith for (i=0; i<n; i++) { 827f1af5d2fSBarry Smith 828f1af5d2fSBarry Smith v = aa + 25*diag[i]; 829f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 830f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 831f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 832f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 833f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 834f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 835f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 836f1af5d2fSBarry Smith v += 25; 837f1af5d2fSBarry Smith 838f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 839f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 840f1af5d2fSBarry Smith while (nz--) { 841f1af5d2fSBarry Smith oidx = 5*(*vi++); 842f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 843f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 844f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 845f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 846f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 847f1af5d2fSBarry Smith v += 25; 848f1af5d2fSBarry Smith } 849f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 850f1af5d2fSBarry Smith idx += 5; 851f1af5d2fSBarry Smith } 852f1af5d2fSBarry Smith /* backward solve the L^T */ 853f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 854f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 855f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 856f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 857f1af5d2fSBarry Smith idt = 5*i; 858f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 859f1af5d2fSBarry Smith while (nz--) { 860f1af5d2fSBarry Smith idx = 5*(*vi--); 861f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 862f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 863f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 864f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 865f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 866f1af5d2fSBarry Smith v -= 25; 867f1af5d2fSBarry Smith } 868f1af5d2fSBarry Smith } 869f1af5d2fSBarry Smith 870f1af5d2fSBarry Smith /* copy t into x according to permutation */ 871f1af5d2fSBarry Smith ii = 0; 872f1af5d2fSBarry Smith for (i=0; i<n; i++) { 873f1af5d2fSBarry Smith ir = 5*r[i]; 874f1af5d2fSBarry Smith x[ir] = t[ii]; 875f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 876f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 877f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 878f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 879f1af5d2fSBarry Smith ii += 5; 880f1af5d2fSBarry Smith } 881f1af5d2fSBarry Smith 882f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 883f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8841ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8851ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 886d0f46423SBarry Smith ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 887f1af5d2fSBarry Smith PetscFunctionReturn(0); 888f1af5d2fSBarry Smith } 889f1af5d2fSBarry Smith 8904a2ae208SSatish Balay #undef __FUNCT__ 8914a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 892dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 893f1af5d2fSBarry Smith { 894f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 895f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 8966849ba73SBarry Smith PetscErrorCode ierr; 897690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 898690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 899f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 90087828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 90187828ca2SBarry Smith PetscScalar *x,*b,*t; 902f1af5d2fSBarry Smith 903f1af5d2fSBarry Smith PetscFunctionBegin; 9041ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9051ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 906f1af5d2fSBarry Smith t = a->solve_work; 907f1af5d2fSBarry Smith 908f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 909f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 910f1af5d2fSBarry Smith 911f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 912f1af5d2fSBarry Smith ii = 0; 913f1af5d2fSBarry Smith for (i=0; i<n; i++) { 914f1af5d2fSBarry Smith ic = 6*c[i]; 915f1af5d2fSBarry Smith t[ii] = b[ic]; 916f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 917f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 918f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 919f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 920f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 921f1af5d2fSBarry Smith ii += 6; 922f1af5d2fSBarry Smith } 923f1af5d2fSBarry Smith 924f1af5d2fSBarry Smith /* forward solve the U^T */ 925f1af5d2fSBarry Smith idx = 0; 926f1af5d2fSBarry Smith for (i=0; i<n; i++) { 927f1af5d2fSBarry Smith 928f1af5d2fSBarry Smith v = aa + 36*diag[i]; 929f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 930f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 931f1af5d2fSBarry Smith x6 = t[5+idx]; 932f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 933f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 934f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 935f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 936f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 937f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 938f1af5d2fSBarry Smith v += 36; 939f1af5d2fSBarry Smith 940f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 941f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 942f1af5d2fSBarry Smith while (nz--) { 943f1af5d2fSBarry Smith oidx = 6*(*vi++); 944f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 945f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 946f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 947f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 948f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 949f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 950f1af5d2fSBarry Smith v += 36; 951f1af5d2fSBarry Smith } 952f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 953f1af5d2fSBarry Smith t[5+idx] = s6; 954f1af5d2fSBarry Smith idx += 6; 955f1af5d2fSBarry Smith } 956f1af5d2fSBarry Smith /* backward solve the L^T */ 957f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 958f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 959f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 960f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 961f1af5d2fSBarry Smith idt = 6*i; 962f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 963f1af5d2fSBarry Smith s6 = t[5+idt]; 964f1af5d2fSBarry Smith while (nz--) { 965f1af5d2fSBarry Smith idx = 6*(*vi--); 966f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 967f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 968f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 969f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 970f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 971f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 972f1af5d2fSBarry Smith v -= 36; 973f1af5d2fSBarry Smith } 974f1af5d2fSBarry Smith } 975f1af5d2fSBarry Smith 976f1af5d2fSBarry Smith /* copy t into x according to permutation */ 977f1af5d2fSBarry Smith ii = 0; 978f1af5d2fSBarry Smith for (i=0; i<n; i++) { 979f1af5d2fSBarry Smith ir = 6*r[i]; 980f1af5d2fSBarry Smith x[ir] = t[ii]; 981f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 982f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 983f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 984f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 985f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 986f1af5d2fSBarry Smith ii += 6; 987f1af5d2fSBarry Smith } 988f1af5d2fSBarry Smith 989f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 990f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 9921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 993d0f46423SBarry Smith ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 994f1af5d2fSBarry Smith PetscFunctionReturn(0); 995f1af5d2fSBarry Smith } 996f1af5d2fSBarry Smith 9974a2ae208SSatish Balay #undef __FUNCT__ 9984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 999dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1000f1af5d2fSBarry Smith { 1001f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1002f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10036849ba73SBarry Smith PetscErrorCode ierr; 1004690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 1005690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1006f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 100787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 100887828ca2SBarry Smith PetscScalar *x,*b,*t; 1009f1af5d2fSBarry Smith 1010f1af5d2fSBarry Smith PetscFunctionBegin; 10111ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10121ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1013f1af5d2fSBarry Smith t = a->solve_work; 1014f1af5d2fSBarry Smith 1015f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1016f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1017f1af5d2fSBarry Smith 1018f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1019f1af5d2fSBarry Smith ii = 0; 1020f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1021f1af5d2fSBarry Smith ic = 7*c[i]; 1022f1af5d2fSBarry Smith t[ii] = b[ic]; 1023f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1024f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1025f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1026f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1027f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1028f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1029f1af5d2fSBarry Smith ii += 7; 1030f1af5d2fSBarry Smith } 1031f1af5d2fSBarry Smith 1032f1af5d2fSBarry Smith /* forward solve the U^T */ 1033f1af5d2fSBarry Smith idx = 0; 1034f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1035f1af5d2fSBarry Smith 1036f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1037f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1038f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1039f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1040f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1041f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1042f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1043f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1044f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1045f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1046f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1047f1af5d2fSBarry Smith v += 49; 1048f1af5d2fSBarry Smith 1049f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1050f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1051f1af5d2fSBarry Smith while (nz--) { 1052f1af5d2fSBarry Smith oidx = 7*(*vi++); 1053f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1054f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1055f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1056f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1057f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1058f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1059f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1060f1af5d2fSBarry Smith v += 49; 1061f1af5d2fSBarry Smith } 1062f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1063f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1064f1af5d2fSBarry Smith idx += 7; 1065f1af5d2fSBarry Smith } 1066f1af5d2fSBarry Smith /* backward solve the L^T */ 1067f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1068f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1069f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1070f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1071f1af5d2fSBarry Smith idt = 7*i; 1072f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1073f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1074f1af5d2fSBarry Smith while (nz--) { 1075f1af5d2fSBarry Smith idx = 7*(*vi--); 1076f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1077f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1078f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1079f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1080f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1081f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1082f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1083f1af5d2fSBarry Smith v -= 49; 1084f1af5d2fSBarry Smith } 1085f1af5d2fSBarry Smith } 1086f1af5d2fSBarry Smith 1087f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1088f1af5d2fSBarry Smith ii = 0; 1089f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1090f1af5d2fSBarry Smith ir = 7*r[i]; 1091f1af5d2fSBarry Smith x[ir] = t[ii]; 1092f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1093f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1094f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1095f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1096f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1097f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1098f1af5d2fSBarry Smith ii += 7; 1099f1af5d2fSBarry Smith } 1100f1af5d2fSBarry Smith 1101f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1102f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11031ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11041ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1105d0f46423SBarry Smith ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 1106f1af5d2fSBarry Smith PetscFunctionReturn(0); 1107f1af5d2fSBarry Smith } 1108f1af5d2fSBarry Smith 11094e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11104a2ae208SSatish Balay #undef __FUNCT__ 11114a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1112dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11134e2b4712SSatish Balay { 11144e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11154e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11166849ba73SBarry Smith PetscErrorCode ierr; 1117690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1118d0f46423SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2,*rout,*cout; 11193f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 112087828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11214e2b4712SSatish Balay 11224e2b4712SSatish Balay PetscFunctionBegin; 11231ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11241ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1125f1af5d2fSBarry Smith t = a->solve_work; 11264e2b4712SSatish Balay 11274e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11284e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11294e2b4712SSatish Balay 11304e2b4712SSatish Balay /* forward solve the lower triangular */ 113187828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11324e2b4712SSatish Balay for (i=1; i<n; i++) { 11334e2b4712SSatish Balay v = aa + bs2*ai[i]; 11344e2b4712SSatish Balay vi = aj + ai[i]; 11354e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1136f1af5d2fSBarry Smith s = t + bs*i; 113787828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11384e2b4712SSatish Balay while (nz--) { 1139f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11404e2b4712SSatish Balay v += bs2; 11414e2b4712SSatish Balay } 11424e2b4712SSatish Balay } 11434e2b4712SSatish Balay /* backward solve the upper triangular */ 1144d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 11454e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 11464e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11474e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11484e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 114987828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11504e2b4712SSatish Balay while (nz--) { 1151f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11524e2b4712SSatish Balay v += bs2; 11534e2b4712SSatish Balay } 1154f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 115587828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11564e2b4712SSatish Balay } 11574e2b4712SSatish Balay 11584e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11594e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11601ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11611ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1162d0f46423SBarry Smith ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 11634e2b4712SSatish Balay PetscFunctionReturn(0); 11644e2b4712SSatish Balay } 11654e2b4712SSatish Balay 11664a2ae208SSatish Balay #undef __FUNCT__ 11674a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1168dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11694e2b4712SSatish Balay { 11704e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11714e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11726849ba73SBarry Smith PetscErrorCode ierr; 1173690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1174690b6cddSBarry Smith PetscInt *diag = a->diag; 11753f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 117687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 117787828ca2SBarry Smith PetscScalar *x,*b,*t; 11784e2b4712SSatish Balay 11794e2b4712SSatish Balay PetscFunctionBegin; 11801ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11811ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1182f1af5d2fSBarry Smith t = a->solve_work; 11834e2b4712SSatish Balay 11844e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11854e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11864e2b4712SSatish Balay 11874e2b4712SSatish Balay /* forward solve the lower triangular */ 11884e2b4712SSatish Balay idx = 7*(*r++); 1189f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1190f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1191f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 11924e2b4712SSatish Balay 11934e2b4712SSatish Balay for (i=1; i<n; i++) { 11944e2b4712SSatish Balay v = aa + 49*ai[i]; 11954e2b4712SSatish Balay vi = aj + ai[i]; 11964e2b4712SSatish Balay nz = diag[i] - ai[i]; 11974e2b4712SSatish Balay idx = 7*(*r++); 1198f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1199f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 12004e2b4712SSatish Balay while (nz--) { 12014e2b4712SSatish Balay idx = 7*(*vi++); 1202f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1203f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1204f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1205f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1206f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1207f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1208f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1209f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1210f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1211f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12124e2b4712SSatish Balay v += 49; 12134e2b4712SSatish Balay } 12144e2b4712SSatish Balay idx = 7*i; 1215f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1216f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1217f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12184e2b4712SSatish Balay } 12194e2b4712SSatish Balay /* backward solve the upper triangular */ 12204e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12214e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12224e2b4712SSatish Balay vi = aj + diag[i] + 1; 12234e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12244e2b4712SSatish Balay idt = 7*i; 1225f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1226f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1227f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12284e2b4712SSatish Balay while (nz--) { 12294e2b4712SSatish Balay idx = 7*(*vi++); 1230f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1231f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1232f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1233f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1234f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1235f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1236f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1237f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1238f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1239f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12404e2b4712SSatish Balay v += 49; 12414e2b4712SSatish Balay } 12424e2b4712SSatish Balay idc = 7*(*c--); 12434e2b4712SSatish Balay v = aa + 49*diag[i]; 1244f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1245f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1246f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1247f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1248f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1249f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1250f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1251f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1252f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1253f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1254f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1255f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1256f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1257f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12584e2b4712SSatish Balay } 12594e2b4712SSatish Balay 12604e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12614e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12621ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12631ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1264d0f46423SBarry Smith ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 12654e2b4712SSatish Balay PetscFunctionReturn(0); 12664e2b4712SSatish Balay } 12674e2b4712SSatish Balay 12684a2ae208SSatish Balay #undef __FUNCT__ 12694a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1270dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 127115091d37SBarry Smith { 127215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1273690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1274dfbe8321SBarry Smith PetscErrorCode ierr; 1275690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1276d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1277d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1278d9fead3dSBarry Smith const PetscScalar *b; 127915091d37SBarry Smith 128015091d37SBarry Smith PetscFunctionBegin; 1281d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 12821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 128315091d37SBarry Smith /* forward solve the lower triangular */ 128415091d37SBarry Smith idx = 0; 128515091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 128615091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 128715091d37SBarry Smith x[6] = b[6+idx]; 128815091d37SBarry Smith for (i=1; i<n; i++) { 128915091d37SBarry Smith v = aa + 49*ai[i]; 129015091d37SBarry Smith vi = aj + ai[i]; 129115091d37SBarry Smith nz = diag[i] - ai[i]; 129215091d37SBarry Smith idx = 7*i; 1293f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1294f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1295f1af5d2fSBarry Smith s7 = b[6+idx]; 129615091d37SBarry Smith while (nz--) { 129715091d37SBarry Smith jdx = 7*(*vi++); 129815091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 129915091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 130015091d37SBarry Smith x7 = x[6+jdx]; 1301f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1302f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1303f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1304f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1305f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1306f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1307f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 130815091d37SBarry Smith v += 49; 130915091d37SBarry Smith } 1310f1af5d2fSBarry Smith x[idx] = s1; 1311f1af5d2fSBarry Smith x[1+idx] = s2; 1312f1af5d2fSBarry Smith x[2+idx] = s3; 1313f1af5d2fSBarry Smith x[3+idx] = s4; 1314f1af5d2fSBarry Smith x[4+idx] = s5; 1315f1af5d2fSBarry Smith x[5+idx] = s6; 1316f1af5d2fSBarry Smith x[6+idx] = s7; 131715091d37SBarry Smith } 131815091d37SBarry Smith /* backward solve the upper triangular */ 131915091d37SBarry Smith for (i=n-1; i>=0; i--){ 132015091d37SBarry Smith v = aa + 49*diag[i] + 49; 132115091d37SBarry Smith vi = aj + diag[i] + 1; 132215091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 132315091d37SBarry Smith idt = 7*i; 1324f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1325f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1326f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1327f1af5d2fSBarry Smith s7 = x[6+idt]; 132815091d37SBarry Smith while (nz--) { 132915091d37SBarry Smith idx = 7*(*vi++); 133015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 133115091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 133215091d37SBarry Smith x7 = x[6+idx]; 1333f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1334f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1335f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1336f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1337f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1338f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1339f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 134015091d37SBarry Smith v += 49; 134115091d37SBarry Smith } 134215091d37SBarry Smith v = aa + 49*diag[i]; 1343f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1344f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1345f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1346f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1347f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1348f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1349f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1350f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1351f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1352f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1353f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1354f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1355f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1356f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 135715091d37SBarry Smith } 135815091d37SBarry Smith 1359d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 13601ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1361d0f46423SBarry Smith ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 136215091d37SBarry Smith PetscFunctionReturn(0); 136315091d37SBarry Smith } 136415091d37SBarry Smith 13654a2ae208SSatish Balay #undef __FUNCT__ 13664a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1367dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 136815091d37SBarry Smith { 136915091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 137015091d37SBarry Smith IS iscol=a->col,isrow=a->row; 13716849ba73SBarry Smith PetscErrorCode ierr; 1372690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1373690b6cddSBarry Smith PetscInt *diag = a->diag; 1374d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1375d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1376d9fead3dSBarry Smith const PetscScalar *b; 137715091d37SBarry Smith PetscFunctionBegin; 1378d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 13791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1380f1af5d2fSBarry Smith t = a->solve_work; 138115091d37SBarry Smith 138215091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 138315091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 138415091d37SBarry Smith 138515091d37SBarry Smith /* forward solve the lower triangular */ 138615091d37SBarry Smith idx = 6*(*r++); 1387f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1388f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1389f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 139015091d37SBarry Smith for (i=1; i<n; i++) { 139115091d37SBarry Smith v = aa + 36*ai[i]; 139215091d37SBarry Smith vi = aj + ai[i]; 139315091d37SBarry Smith nz = diag[i] - ai[i]; 139415091d37SBarry Smith idx = 6*(*r++); 1395f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1396f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 139715091d37SBarry Smith while (nz--) { 139815091d37SBarry Smith idx = 6*(*vi++); 1399f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1400f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1401f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1402f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1403f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1404f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1405f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1406f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 140715091d37SBarry Smith v += 36; 140815091d37SBarry Smith } 140915091d37SBarry Smith idx = 6*i; 1410f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1411f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1412f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 141315091d37SBarry Smith } 141415091d37SBarry Smith /* backward solve the upper triangular */ 141515091d37SBarry Smith for (i=n-1; i>=0; i--){ 141615091d37SBarry Smith v = aa + 36*diag[i] + 36; 141715091d37SBarry Smith vi = aj + diag[i] + 1; 141815091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 141915091d37SBarry Smith idt = 6*i; 1420f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1421f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1422f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 142315091d37SBarry Smith while (nz--) { 142415091d37SBarry Smith idx = 6*(*vi++); 1425f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1426f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1427f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1428f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1429f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1430f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1431f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1432f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1433f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 143415091d37SBarry Smith v += 36; 143515091d37SBarry Smith } 143615091d37SBarry Smith idc = 6*(*c--); 143715091d37SBarry Smith v = aa + 36*diag[i]; 1438f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1439f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1440f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1441f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1442f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1443f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1444f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1445f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1446f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1447f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1448f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1449f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 145015091d37SBarry Smith } 145115091d37SBarry Smith 145215091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 145315091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1454d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14551ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1456d0f46423SBarry Smith ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 145715091d37SBarry Smith PetscFunctionReturn(0); 145815091d37SBarry Smith } 145915091d37SBarry Smith 14604a2ae208SSatish Balay #undef __FUNCT__ 14614a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1462dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 146315091d37SBarry Smith { 146415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1465690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1466dfbe8321SBarry Smith PetscErrorCode ierr; 1467690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1468d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1469d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1470d9fead3dSBarry Smith const PetscScalar *b; 147115091d37SBarry Smith 147215091d37SBarry Smith PetscFunctionBegin; 1473d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14741ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 147515091d37SBarry Smith /* forward solve the lower triangular */ 147615091d37SBarry Smith idx = 0; 147715091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 147815091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 147915091d37SBarry Smith for (i=1; i<n; i++) { 148015091d37SBarry Smith v = aa + 36*ai[i]; 148115091d37SBarry Smith vi = aj + ai[i]; 148215091d37SBarry Smith nz = diag[i] - ai[i]; 148315091d37SBarry Smith idx = 6*i; 1484f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1485f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 148615091d37SBarry Smith while (nz--) { 148715091d37SBarry Smith jdx = 6*(*vi++); 148815091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 148915091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1490f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1491f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1492f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1493f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1494f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1495f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 149615091d37SBarry Smith v += 36; 149715091d37SBarry Smith } 1498f1af5d2fSBarry Smith x[idx] = s1; 1499f1af5d2fSBarry Smith x[1+idx] = s2; 1500f1af5d2fSBarry Smith x[2+idx] = s3; 1501f1af5d2fSBarry Smith x[3+idx] = s4; 1502f1af5d2fSBarry Smith x[4+idx] = s5; 1503f1af5d2fSBarry Smith x[5+idx] = s6; 150415091d37SBarry Smith } 150515091d37SBarry Smith /* backward solve the upper triangular */ 150615091d37SBarry Smith for (i=n-1; i>=0; i--){ 150715091d37SBarry Smith v = aa + 36*diag[i] + 36; 150815091d37SBarry Smith vi = aj + diag[i] + 1; 150915091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 151015091d37SBarry Smith idt = 6*i; 1511f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1512f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1513f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 151415091d37SBarry Smith while (nz--) { 151515091d37SBarry Smith idx = 6*(*vi++); 151615091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 151715091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1518f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1519f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1520f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1521f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1522f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1523f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 152415091d37SBarry Smith v += 36; 152515091d37SBarry Smith } 152615091d37SBarry Smith v = aa + 36*diag[i]; 1527f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1528f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1529f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1530f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1531f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1532f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 153315091d37SBarry Smith } 153415091d37SBarry Smith 1535d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15361ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1537d0f46423SBarry Smith ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 153815091d37SBarry Smith PetscFunctionReturn(0); 153915091d37SBarry Smith } 154015091d37SBarry Smith 15414a2ae208SSatish Balay #undef __FUNCT__ 15424a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1543dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 15444e2b4712SSatish Balay { 15454e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15464e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 15476849ba73SBarry Smith PetscErrorCode ierr; 1548690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1549690b6cddSBarry Smith PetscInt *diag = a->diag; 1550d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1551d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1552d9fead3dSBarry Smith const PetscScalar *b; 15534e2b4712SSatish Balay 15544e2b4712SSatish Balay PetscFunctionBegin; 1555d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15561ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1557f1af5d2fSBarry Smith t = a->solve_work; 15584e2b4712SSatish Balay 15594e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 15604e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 15614e2b4712SSatish Balay 15624e2b4712SSatish Balay /* forward solve the lower triangular */ 15634e2b4712SSatish Balay idx = 5*(*r++); 1564f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1565f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 15664e2b4712SSatish Balay for (i=1; i<n; i++) { 15674e2b4712SSatish Balay v = aa + 25*ai[i]; 15684e2b4712SSatish Balay vi = aj + ai[i]; 15694e2b4712SSatish Balay nz = diag[i] - ai[i]; 15704e2b4712SSatish Balay idx = 5*(*r++); 1571f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1572f1af5d2fSBarry Smith s5 = b[4+idx]; 15734e2b4712SSatish Balay while (nz--) { 15744e2b4712SSatish Balay idx = 5*(*vi++); 1575f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1576f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1577f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1578f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1579f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1580f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1581f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 15824e2b4712SSatish Balay v += 25; 15834e2b4712SSatish Balay } 15844e2b4712SSatish Balay idx = 5*i; 1585f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1586f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 15874e2b4712SSatish Balay } 15884e2b4712SSatish Balay /* backward solve the upper triangular */ 15894e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 15904e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 15914e2b4712SSatish Balay vi = aj + diag[i] + 1; 15924e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 15934e2b4712SSatish Balay idt = 5*i; 1594f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1595f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 15964e2b4712SSatish Balay while (nz--) { 15974e2b4712SSatish Balay idx = 5*(*vi++); 1598f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1599f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1600f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1601f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1602f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1603f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1604f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 16054e2b4712SSatish Balay v += 25; 16064e2b4712SSatish Balay } 16074e2b4712SSatish Balay idc = 5*(*c--); 16084e2b4712SSatish Balay v = aa + 25*diag[i]; 1609f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1610f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 1611f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1612f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 1613f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1614f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 1615f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1616f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 1617f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1618f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 16194e2b4712SSatish Balay } 16204e2b4712SSatish Balay 16214e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 16224e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1623d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16241ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1625d0f46423SBarry Smith ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 16264e2b4712SSatish Balay PetscFunctionReturn(0); 16274e2b4712SSatish Balay } 16284e2b4712SSatish Balay 16294a2ae208SSatish Balay #undef __FUNCT__ 16304a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1631dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 163215091d37SBarry Smith { 163315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1634690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1635dfbe8321SBarry Smith PetscErrorCode ierr; 1636690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1637d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1638d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1639d9fead3dSBarry Smith const PetscScalar *b; 164015091d37SBarry Smith 164115091d37SBarry Smith PetscFunctionBegin; 1642d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16431ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 164415091d37SBarry Smith /* forward solve the lower triangular */ 164515091d37SBarry Smith idx = 0; 164615091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 164715091d37SBarry Smith for (i=1; i<n; i++) { 164815091d37SBarry Smith v = aa + 25*ai[i]; 164915091d37SBarry Smith vi = aj + ai[i]; 165015091d37SBarry Smith nz = diag[i] - ai[i]; 165115091d37SBarry Smith idx = 5*i; 1652f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 165315091d37SBarry Smith while (nz--) { 165415091d37SBarry Smith jdx = 5*(*vi++); 165515091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1656f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1657f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1658f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1659f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1660f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 166115091d37SBarry Smith v += 25; 166215091d37SBarry Smith } 1663f1af5d2fSBarry Smith x[idx] = s1; 1664f1af5d2fSBarry Smith x[1+idx] = s2; 1665f1af5d2fSBarry Smith x[2+idx] = s3; 1666f1af5d2fSBarry Smith x[3+idx] = s4; 1667f1af5d2fSBarry Smith x[4+idx] = s5; 166815091d37SBarry Smith } 166915091d37SBarry Smith /* backward solve the upper triangular */ 167015091d37SBarry Smith for (i=n-1; i>=0; i--){ 167115091d37SBarry Smith v = aa + 25*diag[i] + 25; 167215091d37SBarry Smith vi = aj + diag[i] + 1; 167315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 167415091d37SBarry Smith idt = 5*i; 1675f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1676f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 167715091d37SBarry Smith while (nz--) { 167815091d37SBarry Smith idx = 5*(*vi++); 167915091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1680f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1681f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1682f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1683f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1684f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 168515091d37SBarry Smith v += 25; 168615091d37SBarry Smith } 168715091d37SBarry Smith v = aa + 25*diag[i]; 1688f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1689f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1690f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1691f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1692f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 169315091d37SBarry Smith } 169415091d37SBarry Smith 1695d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16961ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1697d0f46423SBarry Smith ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 169815091d37SBarry Smith PetscFunctionReturn(0); 169915091d37SBarry Smith } 170015091d37SBarry Smith 17014a2ae208SSatish Balay #undef __FUNCT__ 17024a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1703dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 17044e2b4712SSatish Balay { 17054e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 17064e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 17076849ba73SBarry Smith PetscErrorCode ierr; 1708690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1709690b6cddSBarry Smith PetscInt *diag = a->diag; 1710d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1711d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1712d9fead3dSBarry Smith const PetscScalar *b; 17134e2b4712SSatish Balay 17144e2b4712SSatish Balay PetscFunctionBegin; 1715d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17161ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1717f1af5d2fSBarry Smith t = a->solve_work; 17184e2b4712SSatish Balay 17194e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 17204e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 17214e2b4712SSatish Balay 17224e2b4712SSatish Balay /* forward solve the lower triangular */ 17234e2b4712SSatish Balay idx = 4*(*r++); 1724f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1725f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 17264e2b4712SSatish Balay for (i=1; i<n; i++) { 17274e2b4712SSatish Balay v = aa + 16*ai[i]; 17284e2b4712SSatish Balay vi = aj + ai[i]; 17294e2b4712SSatish Balay nz = diag[i] - ai[i]; 17304e2b4712SSatish Balay idx = 4*(*r++); 1731f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 17324e2b4712SSatish Balay while (nz--) { 17334e2b4712SSatish Balay idx = 4*(*vi++); 1734f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1735f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1736f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1737f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1738f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17394e2b4712SSatish Balay v += 16; 17404e2b4712SSatish Balay } 17414e2b4712SSatish Balay idx = 4*i; 1742f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1743f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 17444e2b4712SSatish Balay } 17454e2b4712SSatish Balay /* backward solve the upper triangular */ 17464e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 17474e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 17484e2b4712SSatish Balay vi = aj + diag[i] + 1; 17494e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 17504e2b4712SSatish Balay idt = 4*i; 1751f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1752f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 17534e2b4712SSatish Balay while (nz--) { 17544e2b4712SSatish Balay idx = 4*(*vi++); 1755f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1756f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1757f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1758f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1759f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1760f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17614e2b4712SSatish Balay v += 16; 17624e2b4712SSatish Balay } 17634e2b4712SSatish Balay idc = 4*(*c--); 17644e2b4712SSatish Balay v = aa + 16*diag[i]; 1765f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1766f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1767f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1768f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 17694e2b4712SSatish Balay } 17704e2b4712SSatish Balay 17714e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 17724e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1773d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1775d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 17764e2b4712SSatish Balay PetscFunctionReturn(0); 17774e2b4712SSatish Balay } 1778f26ec98cSKris Buschelman 1779f26ec98cSKris Buschelman #undef __FUNCT__ 1780f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1781dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1782f26ec98cSKris Buschelman { 1783f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1784f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 17856849ba73SBarry Smith PetscErrorCode ierr; 1786690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1787690b6cddSBarry Smith PetscInt *diag = a->diag; 1788d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1789d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1790d9fead3dSBarry Smith PetscScalar *x; 1791d9fead3dSBarry Smith const PetscScalar *b; 1792f26ec98cSKris Buschelman 1793f26ec98cSKris Buschelman PetscFunctionBegin; 1794d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17951ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1796f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 1797f26ec98cSKris Buschelman 1798f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1799f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1800f26ec98cSKris Buschelman 1801f26ec98cSKris Buschelman /* forward solve the lower triangular */ 1802f26ec98cSKris Buschelman idx = 4*(*r++); 1803f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 1804f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 1805f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 1806f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 1807f26ec98cSKris Buschelman for (i=1; i<n; i++) { 1808f26ec98cSKris Buschelman v = aa + 16*ai[i]; 1809f26ec98cSKris Buschelman vi = aj + ai[i]; 1810f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 1811f26ec98cSKris Buschelman idx = 4*(*r++); 1812f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 1813f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 1814f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 1815f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 1816f26ec98cSKris Buschelman while (nz--) { 1817f26ec98cSKris Buschelman idx = 4*(*vi++); 1818f26ec98cSKris Buschelman x1 = t[idx]; 1819f26ec98cSKris Buschelman x2 = t[1+idx]; 1820f26ec98cSKris Buschelman x3 = t[2+idx]; 1821f26ec98cSKris Buschelman x4 = t[3+idx]; 1822f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1823f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1824f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1825f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1826f26ec98cSKris Buschelman v += 16; 1827f26ec98cSKris Buschelman } 1828f26ec98cSKris Buschelman idx = 4*i; 1829f26ec98cSKris Buschelman t[idx] = s1; 1830f26ec98cSKris Buschelman t[1+idx] = s2; 1831f26ec98cSKris Buschelman t[2+idx] = s3; 1832f26ec98cSKris Buschelman t[3+idx] = s4; 1833f26ec98cSKris Buschelman } 1834f26ec98cSKris Buschelman /* backward solve the upper triangular */ 1835f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 1836f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 1837f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 1838f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 1839f26ec98cSKris Buschelman idt = 4*i; 1840f26ec98cSKris Buschelman s1 = t[idt]; 1841f26ec98cSKris Buschelman s2 = t[1+idt]; 1842f26ec98cSKris Buschelman s3 = t[2+idt]; 1843f26ec98cSKris Buschelman s4 = t[3+idt]; 1844f26ec98cSKris Buschelman while (nz--) { 1845f26ec98cSKris Buschelman idx = 4*(*vi++); 1846f26ec98cSKris Buschelman x1 = t[idx]; 1847f26ec98cSKris Buschelman x2 = t[1+idx]; 1848f26ec98cSKris Buschelman x3 = t[2+idx]; 1849f26ec98cSKris Buschelman x4 = t[3+idx]; 1850f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1851f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1852f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1853f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1854f26ec98cSKris Buschelman v += 16; 1855f26ec98cSKris Buschelman } 1856f26ec98cSKris Buschelman idc = 4*(*c--); 1857f26ec98cSKris Buschelman v = aa + 16*diag[i]; 1858f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1859f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1860f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1861f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1862f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 1863f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 1864f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 1865f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 1866f26ec98cSKris Buschelman } 1867f26ec98cSKris Buschelman 1868f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1869f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1870d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1872d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 1873f26ec98cSKris Buschelman PetscFunctionReturn(0); 1874f26ec98cSKris Buschelman } 1875f26ec98cSKris Buschelman 187624c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 187724c233c2SKris Buschelman 187824c233c2SKris Buschelman #include PETSC_HAVE_SSE 187924c233c2SKris Buschelman 188024c233c2SKris Buschelman #undef __FUNCT__ 188124c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1882dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 188324c233c2SKris Buschelman { 188424c233c2SKris Buschelman /* 188524c233c2SKris Buschelman Note: This code uses demotion of double 188624c233c2SKris Buschelman to float when performing the mixed-mode computation. 188724c233c2SKris Buschelman This may not be numerically reasonable for all applications. 188824c233c2SKris Buschelman */ 188924c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 189024c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 18916849ba73SBarry Smith PetscErrorCode ierr; 1892690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1893690b6cddSBarry Smith PetscInt *diag = a->diag,ai16; 189424c233c2SKris Buschelman MatScalar *aa=a->a,*v; 189587828ca2SBarry Smith PetscScalar *x,*b,*t; 189624c233c2SKris Buschelman 189724c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 189824c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 189924c233c2SKris Buschelman unsigned long offset; 190024c233c2SKris Buschelman 190124c233c2SKris Buschelman PetscFunctionBegin; 190224c233c2SKris Buschelman SSE_SCOPE_BEGIN; 190324c233c2SKris Buschelman 190424c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 190524c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 190624c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 190724c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 190824c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 190924c233c2SKris Buschelman 19101ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 19111ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 191224c233c2SKris Buschelman t = a->solve_work; 191324c233c2SKris Buschelman 191424c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 191524c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 191624c233c2SKris Buschelman 191724c233c2SKris Buschelman /* forward solve the lower triangular */ 191824c233c2SKris Buschelman idx = 4*(*r++); 191924c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 192024c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 192124c233c2SKris Buschelman v = aa + 16*ai[1]; 192224c233c2SKris Buschelman 192324c233c2SKris Buschelman for (i=1; i<n;) { 192424c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 192524c233c2SKris Buschelman vi = aj + ai[i]; 192624c233c2SKris Buschelman nz = diag[i] - ai[i]; 192724c233c2SKris Buschelman idx = 4*(*r++); 192824c233c2SKris Buschelman 192924c233c2SKris Buschelman /* Demote sum from double to float */ 193024c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 193124c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 193224c233c2SKris Buschelman 193324c233c2SKris Buschelman while (nz--) { 193424c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 193524c233c2SKris Buschelman idx = 4*(*vi++); 193624c233c2SKris Buschelman 193724c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 193824c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 193924c233c2SKris Buschelman 194024c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 194124c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 194224c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 194324c233c2SKris Buschelman 194424c233c2SKris Buschelman /* First Column */ 194524c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 194624c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 194724c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 194824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 194924c233c2SKris Buschelman 195024c233c2SKris Buschelman /* Second Column */ 195124c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 195224c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 195324c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 195424c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 195524c233c2SKris Buschelman 195624c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 195724c233c2SKris Buschelman 195824c233c2SKris Buschelman /* Third Column */ 195924c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 196024c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 196124c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 196224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 196324c233c2SKris Buschelman 196424c233c2SKris Buschelman /* Fourth Column */ 196524c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 196624c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 196724c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 196824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 196924c233c2SKris Buschelman SSE_INLINE_END_2 197024c233c2SKris Buschelman 197124c233c2SKris Buschelman v += 16; 197224c233c2SKris Buschelman } 197324c233c2SKris Buschelman idx = 4*i; 197424c233c2SKris Buschelman v = aa + 16*ai[++i]; 197524c233c2SKris Buschelman PREFETCH_NTA(v); 197624c233c2SKris Buschelman STORE_PS(tmps,XMM7); 197724c233c2SKris Buschelman 197824c233c2SKris Buschelman /* Promote result from float to double */ 197924c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 198024c233c2SKris Buschelman } 198124c233c2SKris Buschelman /* backward solve the upper triangular */ 198224c233c2SKris Buschelman idt = 4*(n-1); 198324c233c2SKris Buschelman ai16 = 16*diag[n-1]; 198424c233c2SKris Buschelman v = aa + ai16 + 16; 198524c233c2SKris Buschelman for (i=n-1; i>=0;){ 198624c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 198724c233c2SKris Buschelman vi = aj + diag[i] + 1; 198824c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 198924c233c2SKris Buschelman 199024c233c2SKris Buschelman /* Demote accumulator from double to float */ 199124c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 199224c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 199324c233c2SKris Buschelman 199424c233c2SKris Buschelman while (nz--) { 199524c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 199624c233c2SKris Buschelman idx = 4*(*vi++); 199724c233c2SKris Buschelman 199824c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 199924c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 200024c233c2SKris Buschelman 200124c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 200224c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 200324c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 200424c233c2SKris Buschelman 200524c233c2SKris Buschelman /* First Column */ 200624c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 200724c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 200824c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 200924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 201024c233c2SKris Buschelman 201124c233c2SKris Buschelman /* Second Column */ 201224c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 201324c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 201424c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 201524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 201624c233c2SKris Buschelman 201724c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 201824c233c2SKris Buschelman 201924c233c2SKris Buschelman /* Third Column */ 202024c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 202124c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 202224c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 202324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 202424c233c2SKris Buschelman 202524c233c2SKris Buschelman /* Fourth Column */ 202624c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 202724c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 202824c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 202924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 203024c233c2SKris Buschelman SSE_INLINE_END_2 203124c233c2SKris Buschelman v += 16; 203224c233c2SKris Buschelman } 203324c233c2SKris Buschelman v = aa + ai16; 203424c233c2SKris Buschelman ai16 = 16*diag[--i]; 203524c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 203624c233c2SKris Buschelman /* 203724c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 203824c233c2SKris Buschelman which was inverted as part of the factorization 203924c233c2SKris Buschelman */ 204024c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 204124c233c2SKris Buschelman /* First Column */ 204224c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 204324c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 204424c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 204524c233c2SKris Buschelman 204624c233c2SKris Buschelman /* Second Column */ 204724c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 204824c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 204924c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 205024c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 205124c233c2SKris Buschelman 205224c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 205324c233c2SKris Buschelman 205424c233c2SKris Buschelman /* Third Column */ 205524c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 205624c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 205724c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 205824c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 205924c233c2SKris Buschelman 206024c233c2SKris Buschelman /* Fourth Column */ 206124c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 206224c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 206324c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 206424c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 206524c233c2SKris Buschelman 206624c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 206724c233c2SKris Buschelman SSE_INLINE_END_3 206824c233c2SKris Buschelman 206924c233c2SKris Buschelman /* Promote solution from float to double */ 207024c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 207124c233c2SKris Buschelman 207224c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 207324c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 207424c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 207524c233c2SKris Buschelman idc = 4*(*c--); 207624c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 207724c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 207824c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 207924c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 208024c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 208124c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 208224c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 208324c233c2SKris Buschelman SSE_INLINE_END_2 208424c233c2SKris Buschelman v = aa + ai16 + 16; 208524c233c2SKris Buschelman idt -= 4; 208624c233c2SKris Buschelman } 208724c233c2SKris Buschelman 208824c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 208924c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 20901ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 20911ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2092d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 209324c233c2SKris Buschelman SSE_SCOPE_END; 209424c233c2SKris Buschelman PetscFunctionReturn(0); 209524c233c2SKris Buschelman } 209624c233c2SKris Buschelman 209724c233c2SKris Buschelman #endif 20980ef38995SBarry Smith 20990ef38995SBarry Smith 21004e2b4712SSatish Balay /* 21014e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 21024e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 21034e2b4712SSatish Balay */ 21044a2ae208SSatish Balay #undef __FUNCT__ 21054a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2106dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 21074e2b4712SSatish Balay { 21084e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2109356650c2SBarry Smith PetscInt n=a->mbs; 2110356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 2111dfbe8321SBarry Smith PetscErrorCode ierr; 2112356650c2SBarry Smith const PetscInt *diag = a->diag; 2113d9fead3dSBarry Smith const MatScalar *aa=a->a; 2114d9fead3dSBarry Smith PetscScalar *x; 2115d9fead3dSBarry Smith const PetscScalar *b; 21164e2b4712SSatish Balay 21174e2b4712SSatish Balay PetscFunctionBegin; 2118d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21191ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21204e2b4712SSatish Balay 2121aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 21222853dc0eSBarry Smith { 212387828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 21242853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 21252853dc0eSBarry Smith } 2126aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 21272853dc0eSBarry Smith { 212887828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 21292853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 21302853dc0eSBarry Smith } 2131aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 21322853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2133e1293385SBarry Smith #else 213430d4dcafSBarry Smith { 213587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2136d9fead3dSBarry Smith const MatScalar *v; 2137356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 2138356650c2SBarry Smith const PetscInt *vi; 2139e1293385SBarry Smith 21404e2b4712SSatish Balay /* forward solve the lower triangular */ 21414e2b4712SSatish Balay idx = 0; 2142e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 21434e2b4712SSatish Balay for (i=1; i<n; i++) { 21444e2b4712SSatish Balay v = aa + 16*ai[i]; 21454e2b4712SSatish Balay vi = aj + ai[i]; 21464e2b4712SSatish Balay nz = diag[i] - ai[i]; 2147e1293385SBarry Smith idx += 4; 2148f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 21494e2b4712SSatish Balay while (nz--) { 21504e2b4712SSatish Balay jdx = 4*(*vi++); 21514e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2152f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2153f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2154f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2155f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 21564e2b4712SSatish Balay v += 16; 21574e2b4712SSatish Balay } 2158f1af5d2fSBarry Smith x[idx] = s1; 2159f1af5d2fSBarry Smith x[1+idx] = s2; 2160f1af5d2fSBarry Smith x[2+idx] = s3; 2161f1af5d2fSBarry Smith x[3+idx] = s4; 21624e2b4712SSatish Balay } 21634e2b4712SSatish Balay /* backward solve the upper triangular */ 21644e555682SBarry Smith idt = 4*(n-1); 21654e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 21664e555682SBarry Smith ai16 = 16*diag[i]; 21674e555682SBarry Smith v = aa + ai16 + 16; 21684e2b4712SSatish Balay vi = aj + diag[i] + 1; 21694e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2170f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2171f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 21724e2b4712SSatish Balay while (nz--) { 21734e2b4712SSatish Balay idx = 4*(*vi++); 21744e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2175f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2176f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2177f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2178f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 21794e2b4712SSatish Balay v += 16; 21804e2b4712SSatish Balay } 21814e555682SBarry Smith v = aa + ai16; 2182f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2183f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2184f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2185f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2186329f5518SBarry Smith idt -= 4; 21874e2b4712SSatish Balay } 218830d4dcafSBarry Smith } 2189e1293385SBarry Smith #endif 21904e2b4712SSatish Balay 2191d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2193d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 21944e2b4712SSatish Balay PetscFunctionReturn(0); 21954e2b4712SSatish Balay } 21964e2b4712SSatish Balay 2197f26ec98cSKris Buschelman #undef __FUNCT__ 2198f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2199dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2200f26ec98cSKris Buschelman { 2201f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2202690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2203dfbe8321SBarry Smith PetscErrorCode ierr; 2204690b6cddSBarry Smith PetscInt *diag = a->diag; 2205f26ec98cSKris Buschelman MatScalar *aa=a->a; 2206f26ec98cSKris Buschelman PetscScalar *x,*b; 2207f26ec98cSKris Buschelman 2208f26ec98cSKris Buschelman PetscFunctionBegin; 22091ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 22101ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2211f26ec98cSKris Buschelman 2212f26ec98cSKris Buschelman { 2213f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2214f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 2215690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2216f26ec98cSKris Buschelman 2217f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2218f26ec98cSKris Buschelman idx = 0; 2219f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 2220f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 2221f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 2222f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 2223f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2224f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2225f26ec98cSKris Buschelman vi = aj + ai[i]; 2226f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2227f26ec98cSKris Buschelman idx += 4; 2228f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2229f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2230f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2231f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2232f26ec98cSKris Buschelman while (nz--) { 2233f26ec98cSKris Buschelman jdx = 4*(*vi++); 2234f26ec98cSKris Buschelman x1 = t[jdx]; 2235f26ec98cSKris Buschelman x2 = t[1+jdx]; 2236f26ec98cSKris Buschelman x3 = t[2+jdx]; 2237f26ec98cSKris Buschelman x4 = t[3+jdx]; 2238f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2239f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2240f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2241f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2242f26ec98cSKris Buschelman v += 16; 2243f26ec98cSKris Buschelman } 2244f26ec98cSKris Buschelman t[idx] = s1; 2245f26ec98cSKris Buschelman t[1+idx] = s2; 2246f26ec98cSKris Buschelman t[2+idx] = s3; 2247f26ec98cSKris Buschelman t[3+idx] = s4; 2248f26ec98cSKris Buschelman } 2249f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2250f26ec98cSKris Buschelman idt = 4*(n-1); 2251f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2252f26ec98cSKris Buschelman ai16 = 16*diag[i]; 2253f26ec98cSKris Buschelman v = aa + ai16 + 16; 2254f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2255f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2256f26ec98cSKris Buschelman s1 = t[idt]; 2257f26ec98cSKris Buschelman s2 = t[1+idt]; 2258f26ec98cSKris Buschelman s3 = t[2+idt]; 2259f26ec98cSKris Buschelman s4 = t[3+idt]; 2260f26ec98cSKris Buschelman while (nz--) { 2261f26ec98cSKris Buschelman idx = 4*(*vi++); 2262f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 2263f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 2264f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 2265f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 2266f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2267f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2268f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2269f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2270f26ec98cSKris Buschelman v += 16; 2271f26ec98cSKris Buschelman } 2272f26ec98cSKris Buschelman v = aa + ai16; 2273f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2274f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2275f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2276f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2277f26ec98cSKris Buschelman idt -= 4; 2278f26ec98cSKris Buschelman } 2279f26ec98cSKris Buschelman } 2280f26ec98cSKris Buschelman 22811ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 22821ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2283d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2284f26ec98cSKris Buschelman PetscFunctionReturn(0); 2285f26ec98cSKris Buschelman } 2286f26ec98cSKris Buschelman 22873660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 22883660e330SKris Buschelman 22893660e330SKris Buschelman #include PETSC_HAVE_SSE 22903660e330SKris Buschelman #undef __FUNCT__ 22917cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2292dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 22933660e330SKris Buschelman { 22943660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 22952aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 2296dfbe8321SBarry Smith PetscErrorCode ierr; 2297dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 22983660e330SKris Buschelman MatScalar *aa=a->a; 229987828ca2SBarry Smith PetscScalar *x,*b; 23003660e330SKris Buschelman 23013660e330SKris Buschelman PetscFunctionBegin; 23023660e330SKris Buschelman SSE_SCOPE_BEGIN; 23033660e330SKris Buschelman /* 23043660e330SKris Buschelman Note: This code currently uses demotion of double 23053660e330SKris Buschelman to float when performing the mixed-mode computation. 23063660e330SKris Buschelman This may not be numerically reasonable for all applications. 23073660e330SKris Buschelman */ 23083660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 23093660e330SKris Buschelman 23101ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 23111ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23123660e330SKris Buschelman { 2313eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 2314eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 23152aa5897fSKris Buschelman int nz,i,idt,ai16; 23162aa5897fSKris Buschelman unsigned int jdx,idx; 23172aa5897fSKris Buschelman unsigned short *vi; 2318eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 23193660e330SKris Buschelman 2320eb05f457SKris Buschelman /* First block is the identity. */ 23213660e330SKris Buschelman idx = 0; 2322eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 23232aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 23243660e330SKris Buschelman 23253660e330SKris Buschelman for (i=1; i<n;) { 23263660e330SKris Buschelman PREFETCH_NTA(&v[8]); 23273660e330SKris Buschelman vi = aj + ai[i]; 23283660e330SKris Buschelman nz = diag[i] - ai[i]; 23293660e330SKris Buschelman idx += 4; 23303660e330SKris Buschelman 2331eb05f457SKris Buschelman /* Demote RHS from double to float. */ 2332eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2333eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 23343660e330SKris Buschelman 23353660e330SKris Buschelman while (nz--) { 23363660e330SKris Buschelman PREFETCH_NTA(&v[16]); 23372aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 23383660e330SKris Buschelman 23393660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 2340eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 23413660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 23423660e330SKris Buschelman 23433660e330SKris Buschelman /* First Column */ 23443660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 23453660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 23463660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 23473660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 23483660e330SKris Buschelman 23493660e330SKris Buschelman /* Second Column */ 23503660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 23513660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 23523660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 23533660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 23543660e330SKris Buschelman 23553660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 23563660e330SKris Buschelman 23573660e330SKris Buschelman /* Third Column */ 23583660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 23593660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 23603660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 23613660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 23623660e330SKris Buschelman 23633660e330SKris Buschelman /* Fourth Column */ 23643660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 23653660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 23663660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 23673660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 23683660e330SKris Buschelman SSE_INLINE_END_2 23693660e330SKris Buschelman 23703660e330SKris Buschelman v += 16; 23713660e330SKris Buschelman } 23723660e330SKris Buschelman v = aa + 16*ai[++i]; 23733660e330SKris Buschelman PREFETCH_NTA(v); 2374eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 23753660e330SKris Buschelman } 2376eb05f457SKris Buschelman 2377eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 2378eb05f457SKris Buschelman 23793660e330SKris Buschelman idt = 4*(n-1); 23803660e330SKris Buschelman ai16 = 16*diag[n-1]; 23813660e330SKris Buschelman v = aa + ai16 + 16; 23823660e330SKris Buschelman for (i=n-1; i>=0;){ 23833660e330SKris Buschelman PREFETCH_NTA(&v[8]); 23843660e330SKris Buschelman vi = aj + diag[i] + 1; 23853660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 23863660e330SKris Buschelman 2387eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 23883660e330SKris Buschelman 23893660e330SKris Buschelman while (nz--) { 23903660e330SKris Buschelman PREFETCH_NTA(&v[16]); 23912aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 23923660e330SKris Buschelman 23933660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 2394eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 23953660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 23963660e330SKris Buschelman 23973660e330SKris Buschelman /* First Column */ 23983660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 23993660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 24003660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 24013660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 24023660e330SKris Buschelman 24033660e330SKris Buschelman /* Second Column */ 24043660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 24053660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 24063660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 24073660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 24083660e330SKris Buschelman 24093660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 24103660e330SKris Buschelman 24113660e330SKris Buschelman /* Third Column */ 24123660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 24133660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24143660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 24153660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 24163660e330SKris Buschelman 24173660e330SKris Buschelman /* Fourth Column */ 24183660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 24193660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 24203660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 24213660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 24223660e330SKris Buschelman SSE_INLINE_END_2 24233660e330SKris Buschelman v += 16; 24243660e330SKris Buschelman } 24253660e330SKris Buschelman v = aa + ai16; 24263660e330SKris Buschelman ai16 = 16*diag[--i]; 24273660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 24283660e330SKris Buschelman /* 24293660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 24303660e330SKris Buschelman which was inverted as part of the factorization 24313660e330SKris Buschelman */ 2432eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 24333660e330SKris Buschelman /* First Column */ 24343660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 24353660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 24363660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 24373660e330SKris Buschelman 24383660e330SKris Buschelman /* Second Column */ 24393660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 24403660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 24413660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 24423660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 24433660e330SKris Buschelman 24443660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 24453660e330SKris Buschelman 24463660e330SKris Buschelman /* Third Column */ 24473660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 24483660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24493660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 24503660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 24513660e330SKris Buschelman 24523660e330SKris Buschelman /* Fourth Column */ 24533660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 24543660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 24553660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 24563660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 24573660e330SKris Buschelman 24583660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 24593660e330SKris Buschelman SSE_INLINE_END_3 24603660e330SKris Buschelman 24613660e330SKris Buschelman v = aa + ai16 + 16; 24623660e330SKris Buschelman idt -= 4; 24633660e330SKris Buschelman } 2464eb05f457SKris Buschelman 2465eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 2466eb05f457SKris Buschelman idt = 4*(n-1); 2467eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 2468eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2469eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2470eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 2471eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 2472eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 2473eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 2474eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 2475eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 247654693613SKris Buschelman idt -= 4; 24773660e330SKris Buschelman } 2478eb05f457SKris Buschelman 2479eb05f457SKris Buschelman } /* End of artificial scope. */ 24801ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 24811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2482d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 24833660e330SKris Buschelman SSE_SCOPE_END; 24843660e330SKris Buschelman PetscFunctionReturn(0); 24853660e330SKris Buschelman } 24863660e330SKris Buschelman 24877cf1b8d3SKris Buschelman #undef __FUNCT__ 24887cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2489dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 24907cf1b8d3SKris Buschelman { 24917cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 24927cf1b8d3SKris Buschelman int *aj=a->j; 2493dfbe8321SBarry Smith PetscErrorCode ierr; 2494dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 24957cf1b8d3SKris Buschelman MatScalar *aa=a->a; 24967cf1b8d3SKris Buschelman PetscScalar *x,*b; 24977cf1b8d3SKris Buschelman 24987cf1b8d3SKris Buschelman PetscFunctionBegin; 24997cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 25007cf1b8d3SKris Buschelman /* 25017cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 25027cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 25037cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 25047cf1b8d3SKris Buschelman */ 25057cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 25067cf1b8d3SKris Buschelman 25071ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25081ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25097cf1b8d3SKris Buschelman { 25107cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 25117cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 25127cf1b8d3SKris Buschelman int nz,i,idt,ai16; 25137cf1b8d3SKris Buschelman int jdx,idx; 25147cf1b8d3SKris Buschelman int *vi; 25157cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 25167cf1b8d3SKris Buschelman 25177cf1b8d3SKris Buschelman /* First block is the identity. */ 25187cf1b8d3SKris Buschelman idx = 0; 25197cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 25207cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 25217cf1b8d3SKris Buschelman 25227cf1b8d3SKris Buschelman for (i=1; i<n;) { 25237cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 25247cf1b8d3SKris Buschelman vi = aj + ai[i]; 25257cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 25267cf1b8d3SKris Buschelman idx += 4; 25277cf1b8d3SKris Buschelman 25287cf1b8d3SKris Buschelman /* Demote RHS from double to float. */ 25297cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 25307cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 25317cf1b8d3SKris Buschelman 25327cf1b8d3SKris Buschelman while (nz--) { 25337cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 25347cf1b8d3SKris Buschelman jdx = 4*(*vi++); 25357cf1b8d3SKris Buschelman /* jdx = *vi++; */ 25367cf1b8d3SKris Buschelman 25377cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 25387cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 25397cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 25407cf1b8d3SKris Buschelman 25417cf1b8d3SKris Buschelman /* First Column */ 25427cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 25437cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25447cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 25457cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 25467cf1b8d3SKris Buschelman 25477cf1b8d3SKris Buschelman /* Second Column */ 25487cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 25497cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25507cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 25517cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 25527cf1b8d3SKris Buschelman 25537cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 25547cf1b8d3SKris Buschelman 25557cf1b8d3SKris Buschelman /* Third Column */ 25567cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 25577cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 25587cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 25597cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 25607cf1b8d3SKris Buschelman 25617cf1b8d3SKris Buschelman /* Fourth Column */ 25627cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 25637cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25647cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 25657cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 25667cf1b8d3SKris Buschelman SSE_INLINE_END_2 25677cf1b8d3SKris Buschelman 25687cf1b8d3SKris Buschelman v += 16; 25697cf1b8d3SKris Buschelman } 25707cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 25717cf1b8d3SKris Buschelman PREFETCH_NTA(v); 25727cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 25737cf1b8d3SKris Buschelman } 25747cf1b8d3SKris Buschelman 25757cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 25767cf1b8d3SKris Buschelman 25777cf1b8d3SKris Buschelman idt = 4*(n-1); 25787cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 25797cf1b8d3SKris Buschelman v = aa + ai16 + 16; 25807cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 25817cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 25827cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 25837cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 25847cf1b8d3SKris Buschelman 25857cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 25867cf1b8d3SKris Buschelman 25877cf1b8d3SKris Buschelman while (nz--) { 25887cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 25897cf1b8d3SKris Buschelman idx = 4*(*vi++); 25907cf1b8d3SKris Buschelman /* idx = *vi++; */ 25917cf1b8d3SKris Buschelman 25927cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 25937cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 25947cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 25957cf1b8d3SKris Buschelman 25967cf1b8d3SKris Buschelman /* First Column */ 25977cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 25987cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25997cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 26007cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 26017cf1b8d3SKris Buschelman 26027cf1b8d3SKris Buschelman /* Second Column */ 26037cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 26047cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26057cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 26067cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 26077cf1b8d3SKris Buschelman 26087cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 26097cf1b8d3SKris Buschelman 26107cf1b8d3SKris Buschelman /* Third Column */ 26117cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 26127cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26137cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 26147cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 26157cf1b8d3SKris Buschelman 26167cf1b8d3SKris Buschelman /* Fourth Column */ 26177cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 26187cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26197cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 26207cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 26217cf1b8d3SKris Buschelman SSE_INLINE_END_2 26227cf1b8d3SKris Buschelman v += 16; 26237cf1b8d3SKris Buschelman } 26247cf1b8d3SKris Buschelman v = aa + ai16; 26257cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 26267cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 26277cf1b8d3SKris Buschelman /* 26287cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 26297cf1b8d3SKris Buschelman which was inverted as part of the factorization 26307cf1b8d3SKris Buschelman */ 26317cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 26327cf1b8d3SKris Buschelman /* First Column */ 26337cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 26347cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 26357cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 26367cf1b8d3SKris Buschelman 26377cf1b8d3SKris Buschelman /* Second Column */ 26387cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 26397cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26407cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 26417cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 26427cf1b8d3SKris Buschelman 26437cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 26447cf1b8d3SKris Buschelman 26457cf1b8d3SKris Buschelman /* Third Column */ 26467cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 26477cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26487cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 26497cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 26507cf1b8d3SKris Buschelman 26517cf1b8d3SKris Buschelman /* Fourth Column */ 26527cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 26537cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26547cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 26557cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 26567cf1b8d3SKris Buschelman 26577cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 26587cf1b8d3SKris Buschelman SSE_INLINE_END_3 26597cf1b8d3SKris Buschelman 26607cf1b8d3SKris Buschelman v = aa + ai16 + 16; 26617cf1b8d3SKris Buschelman idt -= 4; 26627cf1b8d3SKris Buschelman } 26637cf1b8d3SKris Buschelman 26647cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 26657cf1b8d3SKris Buschelman idt = 4*(n-1); 26667cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 26677cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 26687cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 26697cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 26707cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 26717cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 26727cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 26737cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 26747cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 26757cf1b8d3SKris Buschelman idt -= 4; 26767cf1b8d3SKris Buschelman } 26777cf1b8d3SKris Buschelman 26787cf1b8d3SKris Buschelman } /* End of artificial scope. */ 26791ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 26801ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2681d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 26827cf1b8d3SKris Buschelman SSE_SCOPE_END; 26837cf1b8d3SKris Buschelman PetscFunctionReturn(0); 26847cf1b8d3SKris Buschelman } 26857cf1b8d3SKris Buschelman 26863660e330SKris Buschelman #endif 26874a2ae208SSatish Balay #undef __FUNCT__ 26884a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2689dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 26904e2b4712SSatish Balay { 26914e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 26924e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 26936849ba73SBarry Smith PetscErrorCode ierr; 2694690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2695690b6cddSBarry Smith PetscInt *diag = a->diag; 2696d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2697d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2698d9fead3dSBarry Smith const PetscScalar *b; 26994e2b4712SSatish Balay 27004e2b4712SSatish Balay PetscFunctionBegin; 2701d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27021ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2703f1af5d2fSBarry Smith t = a->solve_work; 27044e2b4712SSatish Balay 27054e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 27064e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 27074e2b4712SSatish Balay 27084e2b4712SSatish Balay /* forward solve the lower triangular */ 27094e2b4712SSatish Balay idx = 3*(*r++); 2710f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 27114e2b4712SSatish Balay for (i=1; i<n; i++) { 27124e2b4712SSatish Balay v = aa + 9*ai[i]; 27134e2b4712SSatish Balay vi = aj + ai[i]; 27144e2b4712SSatish Balay nz = diag[i] - ai[i]; 27154e2b4712SSatish Balay idx = 3*(*r++); 2716f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 27174e2b4712SSatish Balay while (nz--) { 27184e2b4712SSatish Balay idx = 3*(*vi++); 2719f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2720f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2721f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2722f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 27234e2b4712SSatish Balay v += 9; 27244e2b4712SSatish Balay } 27254e2b4712SSatish Balay idx = 3*i; 2726f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 27274e2b4712SSatish Balay } 27284e2b4712SSatish Balay /* backward solve the upper triangular */ 27294e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 27304e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 27314e2b4712SSatish Balay vi = aj + diag[i] + 1; 27324e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 27334e2b4712SSatish Balay idt = 3*i; 2734f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 27354e2b4712SSatish Balay while (nz--) { 27364e2b4712SSatish Balay idx = 3*(*vi++); 2737f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2738f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2739f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2740f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 27414e2b4712SSatish Balay v += 9; 27424e2b4712SSatish Balay } 27434e2b4712SSatish Balay idc = 3*(*c--); 27444e2b4712SSatish Balay v = aa + 9*diag[i]; 2745f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2746f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2747f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 27484e2b4712SSatish Balay } 27494e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 27504e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2751d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27521ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2753d0f46423SBarry Smith ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 27544e2b4712SSatish Balay PetscFunctionReturn(0); 27554e2b4712SSatish Balay } 27564e2b4712SSatish Balay 275715091d37SBarry Smith /* 275815091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 275915091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 276015091d37SBarry Smith */ 27614a2ae208SSatish Balay #undef __FUNCT__ 27624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2763dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 276415091d37SBarry Smith { 276515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2766690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2767dfbe8321SBarry Smith PetscErrorCode ierr; 2768690b6cddSBarry Smith PetscInt *diag = a->diag; 2769d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2770d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 2771d9fead3dSBarry Smith const PetscScalar *b; 2772690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 277315091d37SBarry Smith 277415091d37SBarry Smith PetscFunctionBegin; 2775d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27761ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 277715091d37SBarry Smith 277815091d37SBarry Smith /* forward solve the lower triangular */ 277915091d37SBarry Smith idx = 0; 278015091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 278115091d37SBarry Smith for (i=1; i<n; i++) { 278215091d37SBarry Smith v = aa + 9*ai[i]; 278315091d37SBarry Smith vi = aj + ai[i]; 278415091d37SBarry Smith nz = diag[i] - ai[i]; 278515091d37SBarry Smith idx += 3; 2786f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 278715091d37SBarry Smith while (nz--) { 278815091d37SBarry Smith jdx = 3*(*vi++); 278915091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2790f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2791f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2792f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 279315091d37SBarry Smith v += 9; 279415091d37SBarry Smith } 2795f1af5d2fSBarry Smith x[idx] = s1; 2796f1af5d2fSBarry Smith x[1+idx] = s2; 2797f1af5d2fSBarry Smith x[2+idx] = s3; 279815091d37SBarry Smith } 279915091d37SBarry Smith /* backward solve the upper triangular */ 280015091d37SBarry Smith for (i=n-1; i>=0; i--){ 280115091d37SBarry Smith v = aa + 9*diag[i] + 9; 280215091d37SBarry Smith vi = aj + diag[i] + 1; 280315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 280415091d37SBarry Smith idt = 3*i; 2805f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2806f1af5d2fSBarry Smith s3 = x[2+idt]; 280715091d37SBarry Smith while (nz--) { 280815091d37SBarry Smith idx = 3*(*vi++); 280915091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2810f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2811f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2812f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 281315091d37SBarry Smith v += 9; 281415091d37SBarry Smith } 281515091d37SBarry Smith v = aa + 9*diag[i]; 2816f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2817f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2818f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 281915091d37SBarry Smith } 282015091d37SBarry Smith 2821d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28221ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2823d0f46423SBarry Smith ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 282415091d37SBarry Smith PetscFunctionReturn(0); 282515091d37SBarry Smith } 282615091d37SBarry Smith 28274a2ae208SSatish Balay #undef __FUNCT__ 28284a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2829dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 28304e2b4712SSatish Balay { 28314e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 28324e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 28336849ba73SBarry Smith PetscErrorCode ierr; 2834690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2835690b6cddSBarry Smith PetscInt *diag = a->diag; 2836d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2837d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 2838d9fead3dSBarry Smith const PetscScalar *b; 28394e2b4712SSatish Balay 28404e2b4712SSatish Balay PetscFunctionBegin; 2841d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28421ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2843f1af5d2fSBarry Smith t = a->solve_work; 28444e2b4712SSatish Balay 28454e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 28464e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 28474e2b4712SSatish Balay 28484e2b4712SSatish Balay /* forward solve the lower triangular */ 28494e2b4712SSatish Balay idx = 2*(*r++); 2850f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 28514e2b4712SSatish Balay for (i=1; i<n; i++) { 28524e2b4712SSatish Balay v = aa + 4*ai[i]; 28534e2b4712SSatish Balay vi = aj + ai[i]; 28544e2b4712SSatish Balay nz = diag[i] - ai[i]; 28554e2b4712SSatish Balay idx = 2*(*r++); 2856f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 28574e2b4712SSatish Balay while (nz--) { 28584e2b4712SSatish Balay idx = 2*(*vi++); 2859f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2860f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2861f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 28624e2b4712SSatish Balay v += 4; 28634e2b4712SSatish Balay } 28644e2b4712SSatish Balay idx = 2*i; 2865f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 28664e2b4712SSatish Balay } 28674e2b4712SSatish Balay /* backward solve the upper triangular */ 28684e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 28694e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 28704e2b4712SSatish Balay vi = aj + diag[i] + 1; 28714e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 28724e2b4712SSatish Balay idt = 2*i; 2873f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 28744e2b4712SSatish Balay while (nz--) { 28754e2b4712SSatish Balay idx = 2*(*vi++); 2876f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2877f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2878f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 28794e2b4712SSatish Balay v += 4; 28804e2b4712SSatish Balay } 28814e2b4712SSatish Balay idc = 2*(*c--); 28824e2b4712SSatish Balay v = aa + 4*diag[i]; 2883f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2884f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 28854e2b4712SSatish Balay } 28864e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28874e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2888d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28891ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2890d0f46423SBarry Smith ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 28914e2b4712SSatish Balay PetscFunctionReturn(0); 28924e2b4712SSatish Balay } 28934e2b4712SSatish Balay 289415091d37SBarry Smith /* 289515091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 289615091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 289715091d37SBarry Smith */ 28984a2ae208SSatish Balay #undef __FUNCT__ 28994a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2900dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 290115091d37SBarry Smith { 290215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2903690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2904dfbe8321SBarry Smith PetscErrorCode ierr; 2905690b6cddSBarry Smith PetscInt *diag = a->diag; 2906d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2907d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 2908d9fead3dSBarry Smith const PetscScalar *b; 2909690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 291015091d37SBarry Smith 291115091d37SBarry Smith PetscFunctionBegin; 2912d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 291415091d37SBarry Smith 291515091d37SBarry Smith /* forward solve the lower triangular */ 291615091d37SBarry Smith idx = 0; 291715091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 291815091d37SBarry Smith for (i=1; i<n; i++) { 291915091d37SBarry Smith v = aa + 4*ai[i]; 292015091d37SBarry Smith vi = aj + ai[i]; 292115091d37SBarry Smith nz = diag[i] - ai[i]; 292215091d37SBarry Smith idx += 2; 2923f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 292415091d37SBarry Smith while (nz--) { 292515091d37SBarry Smith jdx = 2*(*vi++); 292615091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 2927f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2928f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 292915091d37SBarry Smith v += 4; 293015091d37SBarry Smith } 2931f1af5d2fSBarry Smith x[idx] = s1; 2932f1af5d2fSBarry Smith x[1+idx] = s2; 293315091d37SBarry Smith } 293415091d37SBarry Smith /* backward solve the upper triangular */ 293515091d37SBarry Smith for (i=n-1; i>=0; i--){ 293615091d37SBarry Smith v = aa + 4*diag[i] + 4; 293715091d37SBarry Smith vi = aj + diag[i] + 1; 293815091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 293915091d37SBarry Smith idt = 2*i; 2940f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 294115091d37SBarry Smith while (nz--) { 294215091d37SBarry Smith idx = 2*(*vi++); 294315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 2944f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2945f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 294615091d37SBarry Smith v += 4; 294715091d37SBarry Smith } 294815091d37SBarry Smith v = aa + 4*diag[i]; 2949f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 2950f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 295115091d37SBarry Smith } 295215091d37SBarry Smith 2953d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29541ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2955d0f46423SBarry Smith ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 295615091d37SBarry Smith PetscFunctionReturn(0); 295715091d37SBarry Smith } 295815091d37SBarry Smith 29594a2ae208SSatish Balay #undef __FUNCT__ 29604a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2961dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 29624e2b4712SSatish Balay { 29634e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 29644e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 29656849ba73SBarry Smith PetscErrorCode ierr; 2966690b6cddSBarry Smith PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2967690b6cddSBarry Smith PetscInt *diag = a->diag; 29683f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 296987828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 29704e2b4712SSatish Balay 29714e2b4712SSatish Balay PetscFunctionBegin; 29724e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 29734e2b4712SSatish Balay 29741ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 29751ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2976f1af5d2fSBarry Smith t = a->solve_work; 29774e2b4712SSatish Balay 29784e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 29794e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 29804e2b4712SSatish Balay 29814e2b4712SSatish Balay /* forward solve the lower triangular */ 2982f1af5d2fSBarry Smith t[0] = b[*r++]; 29834e2b4712SSatish Balay for (i=1; i<n; i++) { 29844e2b4712SSatish Balay v = aa + ai[i]; 29854e2b4712SSatish Balay vi = aj + ai[i]; 29864e2b4712SSatish Balay nz = diag[i] - ai[i]; 2987f1af5d2fSBarry Smith s1 = b[*r++]; 29884e2b4712SSatish Balay while (nz--) { 2989f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 29904e2b4712SSatish Balay } 2991f1af5d2fSBarry Smith t[i] = s1; 29924e2b4712SSatish Balay } 29934e2b4712SSatish Balay /* backward solve the upper triangular */ 29944e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 29954e2b4712SSatish Balay v = aa + diag[i] + 1; 29964e2b4712SSatish Balay vi = aj + diag[i] + 1; 29974e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2998f1af5d2fSBarry Smith s1 = t[i]; 29994e2b4712SSatish Balay while (nz--) { 3000f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 30014e2b4712SSatish Balay } 3002f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 30034e2b4712SSatish Balay } 30044e2b4712SSatish Balay 30054e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 30064e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 30071ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 30081ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3009d0f46423SBarry Smith ierr = PetscLogFlops(2*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 30104e2b4712SSatish Balay PetscFunctionReturn(0); 30114e2b4712SSatish Balay } 301215091d37SBarry Smith /* 301315091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 301415091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 301515091d37SBarry Smith */ 30164a2ae208SSatish Balay #undef __FUNCT__ 30174a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3018dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 301915091d37SBarry Smith { 302015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3021690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3022dfbe8321SBarry Smith PetscErrorCode ierr; 3023690b6cddSBarry Smith PetscInt *diag = a->diag; 302415091d37SBarry Smith MatScalar *aa=a->a; 302587828ca2SBarry Smith PetscScalar *x,*b; 302687828ca2SBarry Smith PetscScalar s1,x1; 302715091d37SBarry Smith MatScalar *v; 3028690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 302915091d37SBarry Smith 303015091d37SBarry Smith PetscFunctionBegin; 30311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 30321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 303315091d37SBarry Smith 303415091d37SBarry Smith /* forward solve the lower triangular */ 303515091d37SBarry Smith idx = 0; 303615091d37SBarry Smith x[0] = b[0]; 303715091d37SBarry Smith for (i=1; i<n; i++) { 303815091d37SBarry Smith v = aa + ai[i]; 303915091d37SBarry Smith vi = aj + ai[i]; 304015091d37SBarry Smith nz = diag[i] - ai[i]; 304115091d37SBarry Smith idx += 1; 3042f1af5d2fSBarry Smith s1 = b[idx]; 304315091d37SBarry Smith while (nz--) { 304415091d37SBarry Smith jdx = *vi++; 304515091d37SBarry Smith x1 = x[jdx]; 3046f1af5d2fSBarry Smith s1 -= v[0]*x1; 304715091d37SBarry Smith v += 1; 304815091d37SBarry Smith } 3049f1af5d2fSBarry Smith x[idx] = s1; 305015091d37SBarry Smith } 305115091d37SBarry Smith /* backward solve the upper triangular */ 305215091d37SBarry Smith for (i=n-1; i>=0; i--){ 305315091d37SBarry Smith v = aa + diag[i] + 1; 305415091d37SBarry Smith vi = aj + diag[i] + 1; 305515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 305615091d37SBarry Smith idt = i; 3057f1af5d2fSBarry Smith s1 = x[idt]; 305815091d37SBarry Smith while (nz--) { 305915091d37SBarry Smith idx = *vi++; 306015091d37SBarry Smith x1 = x[idx]; 3061f1af5d2fSBarry Smith s1 -= v[0]*x1; 306215091d37SBarry Smith v += 1; 306315091d37SBarry Smith } 306415091d37SBarry Smith v = aa + diag[i]; 3065f1af5d2fSBarry Smith x[idt] = v[0]*s1; 306615091d37SBarry Smith } 30671ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 30681ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3069d0f46423SBarry Smith ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 307015091d37SBarry Smith PetscFunctionReturn(0); 307115091d37SBarry Smith } 30724e2b4712SSatish Balay 30734e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 30744e2b4712SSatish Balay /* 30754e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 30764e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 30774e2b4712SSatish Balay Not a good example of code reuse. 30784e2b4712SSatish Balay */ 3079*719d5645SBarry Smith EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption); 3080db4efbfdSBarry Smith EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 3081435faa5fSBarry Smith 30824a2ae208SSatish Balay #undef __FUNCT__ 30834a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3084*719d5645SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,MatFactorInfo *info) 30854e2b4712SSatish Balay { 30864e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 30874e2b4712SSatish Balay IS isicol; 30886849ba73SBarry Smith PetscErrorCode ierr; 3089690b6cddSBarry Smith PetscInt *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j; 3090690b6cddSBarry Smith PetscInt *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3091a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3092d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 30932af78befSBarry Smith PetscTruth col_identity,row_identity,flg; 3094329f5518SBarry Smith PetscReal f; 30954e2b4712SSatish Balay 30964e2b4712SSatish Balay PetscFunctionBegin; 3097435faa5fSBarry Smith f = info->fill; 3098690b6cddSBarry Smith levels = (PetscInt)info->levels; 3099690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 31004c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3101667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3102667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3103309c388cSBarry Smith 3104309c388cSBarry Smith if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3105*719d5645SBarry Smith ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3106*719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 3107*719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 3108*719d5645SBarry Smith ierr = MatMissingDiagonal_SeqBAIJ(fact,&flg,&dd);CHKERRQ(ierr); 31092af78befSBarry Smith if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd); 3110bb3d539aSBarry Smith b->row = isrow; 3111bb3d539aSBarry Smith b->col = iscol; 3112bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3113bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3114bb3d539aSBarry Smith b->icol = isicol; 3115bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3116*719d5645SBarry Smith ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3117309c388cSBarry Smith } else { /* general case perform the symbolic factorization */ 31184e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 31194e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 31204e2b4712SSatish Balay 31214e2b4712SSatish Balay /* get new row pointers */ 3122690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 31234e2b4712SSatish Balay ainew[0] = 0; 31244e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 3125690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 3126690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 31274e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 3128690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 31294e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 3130690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 31314e2b4712SSatish Balay /* im is level for each filled value */ 3132690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 31334e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 3134690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 31354e2b4712SSatish Balay dloc[0] = 0; 31364e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 3137435faa5fSBarry Smith 3138435faa5fSBarry Smith /* copy prow into linked list */ 31394e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 314029bbc08cSBarry Smith if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 31414e2b4712SSatish Balay xi = aj + ai[r[prow]]; 31424e2b4712SSatish Balay fill[n] = n; 3143435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 31444e2b4712SSatish Balay while (nz--) { 31454e2b4712SSatish Balay fm = n; 31464e2b4712SSatish Balay idx = ic[*xi++]; 31474e2b4712SSatish Balay do { 31484e2b4712SSatish Balay m = fm; 31494e2b4712SSatish Balay fm = fill[m]; 31504e2b4712SSatish Balay } while (fm < idx); 31514e2b4712SSatish Balay fill[m] = idx; 31524e2b4712SSatish Balay fill[idx] = fm; 31534e2b4712SSatish Balay im[idx] = 0; 31544e2b4712SSatish Balay } 3155435faa5fSBarry Smith 3156435faa5fSBarry Smith /* make sure diagonal entry is included */ 3157435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 3158435faa5fSBarry Smith fm = n; 3159435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 3160435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3161435faa5fSBarry Smith fill[fm] = prow; 3162435faa5fSBarry Smith im[prow] = 0; 3163435faa5fSBarry Smith nzf++; 3164335d9088SBarry Smith dcount++; 3165435faa5fSBarry Smith } 3166435faa5fSBarry Smith 31674e2b4712SSatish Balay nzi = 0; 31684e2b4712SSatish Balay row = fill[n]; 31694e2b4712SSatish Balay while (row < prow) { 31704e2b4712SSatish Balay incrlev = im[row] + 1; 31714e2b4712SSatish Balay nz = dloc[row]; 3172435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 31734e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 31744e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 31754e2b4712SSatish Balay fm = row; 31764e2b4712SSatish Balay while (nnz-- > 0) { 31774e2b4712SSatish Balay idx = *xi++; 31784e2b4712SSatish Balay if (*flev + incrlev > levels) { 31794e2b4712SSatish Balay flev++; 31804e2b4712SSatish Balay continue; 31814e2b4712SSatish Balay } 31824e2b4712SSatish Balay do { 31834e2b4712SSatish Balay m = fm; 31844e2b4712SSatish Balay fm = fill[m]; 31854e2b4712SSatish Balay } while (fm < idx); 31864e2b4712SSatish Balay if (fm != idx) { 31874e2b4712SSatish Balay im[idx] = *flev + incrlev; 31884e2b4712SSatish Balay fill[m] = idx; 31894e2b4712SSatish Balay fill[idx] = fm; 31904e2b4712SSatish Balay fm = idx; 31914e2b4712SSatish Balay nzf++; 3192ecf371e4SBarry Smith } else { 31934e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 31944e2b4712SSatish Balay } 31954e2b4712SSatish Balay flev++; 31964e2b4712SSatish Balay } 31974e2b4712SSatish Balay row = fill[row]; 31984e2b4712SSatish Balay nzi++; 31994e2b4712SSatish Balay } 32004e2b4712SSatish Balay /* copy new filled row into permanent storage */ 32014e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 32024e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 3203ecf371e4SBarry Smith 3204ecf371e4SBarry Smith /* estimate how much additional space we will need */ 3205ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3206ecf371e4SBarry Smith /* just double the memory each time */ 3207690b6cddSBarry Smith PetscInt maxadd = jmax; 3208ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 32094e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 32104e2b4712SSatish Balay jmax += maxadd; 3211ecf371e4SBarry Smith 3212ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 3213690b6cddSBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr); 3214690b6cddSBarry Smith ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3215606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 32164e2b4712SSatish Balay ajnew = xi; 3217690b6cddSBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr); 3218690b6cddSBarry Smith ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3219606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 32204e2b4712SSatish Balay ajfill = xi; 3221eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 32224e2b4712SSatish Balay } 32234e2b4712SSatish Balay xi = ajnew + ainew[prow]; 32244e2b4712SSatish Balay flev = ajfill + ainew[prow]; 32254e2b4712SSatish Balay dloc[prow] = nzi; 32264e2b4712SSatish Balay fm = fill[n]; 32274e2b4712SSatish Balay while (nzf--) { 32284e2b4712SSatish Balay *xi++ = fm; 32294e2b4712SSatish Balay *flev++ = im[fm]; 32304e2b4712SSatish Balay fm = fill[fm]; 32314e2b4712SSatish Balay } 3232435faa5fSBarry Smith /* make sure row has diagonal entry */ 3233435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 323477431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 32352401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3236435faa5fSBarry Smith } 32374e2b4712SSatish Balay } 3238606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 32394e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 32404e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3241606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 3242606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 32434e2b4712SSatish Balay 32446cf91177SBarry Smith #if defined(PETSC_USE_INFO) 32454e2b4712SSatish Balay { 3246329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3247ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3248ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3249ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3250ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3251335d9088SBarry Smith if (diagonal_fill) { 3252ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3253335d9088SBarry Smith } 32544e2b4712SSatish Balay } 325563ba0a88SBarry Smith #endif 32564e2b4712SSatish Balay 32574e2b4712SSatish Balay /* put together the new matrix */ 3258*719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3259*719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 3260*719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 3261e6b907acSBarry Smith b->free_a = PETSC_TRUE; 3262e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 32637c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 3264a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 32654e2b4712SSatish Balay b->j = ajnew; 32664e2b4712SSatish Balay b->i = ainew; 32674e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 32684e2b4712SSatish Balay b->diag = dloc; 32694e2b4712SSatish Balay b->ilen = 0; 32704e2b4712SSatish Balay b->imax = 0; 32714e2b4712SSatish Balay b->row = isrow; 32724e2b4712SSatish Balay b->col = iscol; 3273bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3274c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3275c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3276e51c0b9cSSatish Balay b->icol = isicol; 327787828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 32784e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 32794e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 3280*719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 32814e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 32824e2b4712SSatish Balay 3283*719d5645SBarry Smith (fact)->info.factor_mallocs = reallocate; 3284*719d5645SBarry Smith (fact)->info.fill_ratio_given = f; 3285*719d5645SBarry Smith (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3286309c388cSBarry Smith } 3287*719d5645SBarry Smith ierr = MatSeqBAIJSetNumericFactorization(fact,row_identity && col_identity);CHKERRQ(ierr); 32888661488fSKris Buschelman PetscFunctionReturn(0); 32898661488fSKris Buschelman } 32908661488fSKris Buschelman 3291732ee342SKris Buschelman #undef __FUNCT__ 32927e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3293dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 32947e7071cdSKris Buschelman { 329512272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 329612272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 32975a9542e3SKris Buschelman PetscFunctionBegin; 32987cf1b8d3SKris Buschelman /* Undo Column scaling */ 32997cf1b8d3SKris Buschelman /* while (nz--) { */ 33007cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 33017cf1b8d3SKris Buschelman /* } */ 3302c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. */ 3303c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 33047cf1b8d3SKris Buschelman PetscFunctionReturn(0); 33057cf1b8d3SKris Buschelman } 33067cf1b8d3SKris Buschelman 33077cf1b8d3SKris Buschelman #undef __FUNCT__ 33087cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3309dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 33107cf1b8d3SKris Buschelman { 33117cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3312b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 33132aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 33145a9542e3SKris Buschelman PetscFunctionBegin; 33150b9da03eSKris Buschelman /* Is this really necessary? */ 331620235379SKris Buschelman while (nz--) { 33170b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 33187e7071cdSKris Buschelman } 3319c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 33207e7071cdSKris Buschelman PetscFunctionReturn(0); 33217e7071cdSKris Buschelman } 33227e7071cdSKris Buschelman 3323732ee342SKris Buschelman 3324