1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 34e2b4712SSatish Balay /* 44e2b4712SSatish Balay Factorization code for BAIJ format. 54e2b4712SSatish Balay */ 64e2b4712SSatish Balay 74e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h" 84e2b4712SSatish Balay #include "src/inline/ilu.h" 974c49faeSBarry Smith #include "src/inline/dot.h" 104e2b4712SSatish Balay 114a2ae208SSatish Balay #undef __FUNCT__ 124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14f1af5d2fSBarry Smith { 15f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16dfbe8321SBarry Smith PetscErrorCode ierr; 17690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18690b6cddSBarry Smith PetscInt *diag = a->diag; 19f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2087828ca2SBarry Smith PetscScalar s1,*x,*b; 21f1af5d2fSBarry Smith 22f1af5d2fSBarry Smith PetscFunctionBegin; 23ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 241ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 251ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26f1af5d2fSBarry Smith 27f1af5d2fSBarry Smith /* forward solve the U^T */ 28f1af5d2fSBarry Smith for (i=0; i<n; i++) { 29f1af5d2fSBarry Smith 30f1af5d2fSBarry Smith v = aa + diag[i]; 31f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 32ef66eb69SBarry Smith s1 = (*v++)*x[i]; 33f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 34f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 35f1af5d2fSBarry Smith while (nz--) { 36f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 37f1af5d2fSBarry Smith } 38f1af5d2fSBarry Smith x[i] = s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith /* backward solve the L^T */ 41f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 42f1af5d2fSBarry Smith v = aa + diag[i] - 1; 43f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 44f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 45f1af5d2fSBarry Smith s1 = x[i]; 46f1af5d2fSBarry Smith while (nz--) { 47f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 48f1af5d2fSBarry Smith } 49f1af5d2fSBarry Smith } 501ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 511ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52d0f46423SBarry Smith ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 53f1af5d2fSBarry Smith PetscFunctionReturn(0); 54f1af5d2fSBarry Smith } 55f1af5d2fSBarry Smith 564a2ae208SSatish Balay #undef __FUNCT__ 574a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 58dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 59f1af5d2fSBarry Smith { 60f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 61dfbe8321SBarry Smith PetscErrorCode ierr; 62690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 63690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 64f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6587828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6687828ca2SBarry Smith PetscScalar *x,*b; 67f1af5d2fSBarry Smith 68f1af5d2fSBarry Smith PetscFunctionBegin; 69ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 701ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 711ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 72f1af5d2fSBarry Smith 73f1af5d2fSBarry Smith /* forward solve the U^T */ 74f1af5d2fSBarry Smith idx = 0; 75f1af5d2fSBarry Smith for (i=0; i<n; i++) { 76f1af5d2fSBarry Smith 77f1af5d2fSBarry Smith v = aa + 4*diag[i]; 78f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 79ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 80f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 81f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 82f1af5d2fSBarry Smith v += 4; 83f1af5d2fSBarry Smith 84f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 85f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 86f1af5d2fSBarry Smith while (nz--) { 87f1af5d2fSBarry Smith oidx = 2*(*vi++); 88f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 89f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 90f1af5d2fSBarry Smith v += 4; 91f1af5d2fSBarry Smith } 92f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 93f1af5d2fSBarry Smith idx += 2; 94f1af5d2fSBarry Smith } 95f1af5d2fSBarry Smith /* backward solve the L^T */ 96f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 97f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 98f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 99f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 100f1af5d2fSBarry Smith idt = 2*i; 101f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 102f1af5d2fSBarry Smith while (nz--) { 103f1af5d2fSBarry Smith idx = 2*(*vi--); 104f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 105f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 106f1af5d2fSBarry Smith v -= 4; 107f1af5d2fSBarry Smith } 108f1af5d2fSBarry Smith } 1091ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1101ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 111d0f46423SBarry Smith ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 112f1af5d2fSBarry Smith PetscFunctionReturn(0); 113f1af5d2fSBarry Smith } 114f1af5d2fSBarry Smith 1154a2ae208SSatish Balay #undef __FUNCT__ 1164a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 117dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 118f1af5d2fSBarry Smith { 119f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 120dfbe8321SBarry Smith PetscErrorCode ierr; 121690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 122690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 123f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12487828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12587828ca2SBarry Smith PetscScalar *x,*b; 126f1af5d2fSBarry Smith 127f1af5d2fSBarry Smith PetscFunctionBegin; 128ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1291ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1301ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 131f1af5d2fSBarry Smith 132f1af5d2fSBarry Smith /* forward solve the U^T */ 133f1af5d2fSBarry Smith idx = 0; 134f1af5d2fSBarry Smith for (i=0; i<n; i++) { 135f1af5d2fSBarry Smith 136f1af5d2fSBarry Smith v = aa + 9*diag[i]; 137f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 138ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 139f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 140f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 141f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 142f1af5d2fSBarry Smith v += 9; 143f1af5d2fSBarry Smith 144f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 145f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 146f1af5d2fSBarry Smith while (nz--) { 147f1af5d2fSBarry Smith oidx = 3*(*vi++); 148f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 149f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 150f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 151f1af5d2fSBarry Smith v += 9; 152f1af5d2fSBarry Smith } 153f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 154f1af5d2fSBarry Smith idx += 3; 155f1af5d2fSBarry Smith } 156f1af5d2fSBarry Smith /* backward solve the L^T */ 157f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 158f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 159f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 160f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 161f1af5d2fSBarry Smith idt = 3*i; 162f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 163f1af5d2fSBarry Smith while (nz--) { 164f1af5d2fSBarry Smith idx = 3*(*vi--); 165f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 166f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 167f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 168f1af5d2fSBarry Smith v -= 9; 169f1af5d2fSBarry Smith } 170f1af5d2fSBarry Smith } 1711ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1721ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 173d0f46423SBarry Smith ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 174f1af5d2fSBarry Smith PetscFunctionReturn(0); 175f1af5d2fSBarry Smith } 176f1af5d2fSBarry Smith 1774a2ae208SSatish Balay #undef __FUNCT__ 1784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 179dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 180f1af5d2fSBarry Smith { 181f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182dfbe8321SBarry Smith PetscErrorCode ierr; 183690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 184690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 185f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18787828ca2SBarry Smith PetscScalar *x,*b; 188f1af5d2fSBarry Smith 189f1af5d2fSBarry Smith PetscFunctionBegin; 190ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1911ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1921ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 193f1af5d2fSBarry Smith 194f1af5d2fSBarry Smith /* forward solve the U^T */ 195f1af5d2fSBarry Smith idx = 0; 196f1af5d2fSBarry Smith for (i=0; i<n; i++) { 197f1af5d2fSBarry Smith 198f1af5d2fSBarry Smith v = aa + 16*diag[i]; 199f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 200ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 201f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 202f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 203f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 204f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 205f1af5d2fSBarry Smith v += 16; 206f1af5d2fSBarry Smith 207f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 208f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 209f1af5d2fSBarry Smith while (nz--) { 210f1af5d2fSBarry Smith oidx = 4*(*vi++); 211f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 212f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 213f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 214f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 215f1af5d2fSBarry Smith v += 16; 216f1af5d2fSBarry Smith } 217f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 218f1af5d2fSBarry Smith idx += 4; 219f1af5d2fSBarry Smith } 220f1af5d2fSBarry Smith /* backward solve the L^T */ 221f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 222f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 223f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 224f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 225f1af5d2fSBarry Smith idt = 4*i; 226f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 227f1af5d2fSBarry Smith while (nz--) { 228f1af5d2fSBarry Smith idx = 4*(*vi--); 229f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 230f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 231f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 232f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 233f1af5d2fSBarry Smith v -= 16; 234f1af5d2fSBarry Smith } 235f1af5d2fSBarry Smith } 2361ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2371ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 238d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 239f1af5d2fSBarry Smith PetscFunctionReturn(0); 240f1af5d2fSBarry Smith } 241f1af5d2fSBarry Smith 2424a2ae208SSatish Balay #undef __FUNCT__ 2434a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 244dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 245f1af5d2fSBarry Smith { 246f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 247dfbe8321SBarry Smith PetscErrorCode ierr; 248690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 249690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 250f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 25187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25287828ca2SBarry Smith PetscScalar *x,*b; 253f1af5d2fSBarry Smith 254f1af5d2fSBarry Smith PetscFunctionBegin; 255ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2561ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2571ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 258f1af5d2fSBarry Smith 259f1af5d2fSBarry Smith /* forward solve the U^T */ 260f1af5d2fSBarry Smith idx = 0; 261f1af5d2fSBarry Smith for (i=0; i<n; i++) { 262f1af5d2fSBarry Smith 263f1af5d2fSBarry Smith v = aa + 25*diag[i]; 264f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 265ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 266f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 267f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 268f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 269f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 270f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 271f1af5d2fSBarry Smith v += 25; 272f1af5d2fSBarry Smith 273f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 274f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 275f1af5d2fSBarry Smith while (nz--) { 276f1af5d2fSBarry Smith oidx = 5*(*vi++); 277f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 278f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 279f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 280f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 281f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 282f1af5d2fSBarry Smith v += 25; 283f1af5d2fSBarry Smith } 284f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 285f1af5d2fSBarry Smith idx += 5; 286f1af5d2fSBarry Smith } 287f1af5d2fSBarry Smith /* backward solve the L^T */ 288f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 289f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 290f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 291f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 292f1af5d2fSBarry Smith idt = 5*i; 293f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 294f1af5d2fSBarry Smith while (nz--) { 295f1af5d2fSBarry Smith idx = 5*(*vi--); 296f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 297f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 298f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 299f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 300f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 301f1af5d2fSBarry Smith v -= 25; 302f1af5d2fSBarry Smith } 303f1af5d2fSBarry Smith } 3041ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3051ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 306d0f46423SBarry Smith ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 307f1af5d2fSBarry Smith PetscFunctionReturn(0); 308f1af5d2fSBarry Smith } 309f1af5d2fSBarry Smith 3104a2ae208SSatish Balay #undef __FUNCT__ 3114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 312dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 313f1af5d2fSBarry Smith { 314f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 315dfbe8321SBarry Smith PetscErrorCode ierr; 316690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 317690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 318f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 31987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 32087828ca2SBarry Smith PetscScalar *x,*b; 321f1af5d2fSBarry Smith 322f1af5d2fSBarry Smith PetscFunctionBegin; 323ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3241ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3251ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 326f1af5d2fSBarry Smith 327f1af5d2fSBarry Smith /* forward solve the U^T */ 328f1af5d2fSBarry Smith idx = 0; 329f1af5d2fSBarry Smith for (i=0; i<n; i++) { 330f1af5d2fSBarry Smith 331f1af5d2fSBarry Smith v = aa + 36*diag[i]; 332f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 333ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 334ef66eb69SBarry Smith x6 = x[5+idx]; 335f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 336f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 337f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 338f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 339f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 340f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 341f1af5d2fSBarry Smith v += 36; 342f1af5d2fSBarry Smith 343f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 344f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 345f1af5d2fSBarry Smith while (nz--) { 346f1af5d2fSBarry Smith oidx = 6*(*vi++); 347f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 348f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 349f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 350f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 351f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 352f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 353f1af5d2fSBarry Smith v += 36; 354f1af5d2fSBarry Smith } 355f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 356f1af5d2fSBarry Smith x[5+idx] = s6; 357f1af5d2fSBarry Smith idx += 6; 358f1af5d2fSBarry Smith } 359f1af5d2fSBarry Smith /* backward solve the L^T */ 360f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 361f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 362f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 363f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 364f1af5d2fSBarry Smith idt = 6*i; 365f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 366f1af5d2fSBarry Smith s6 = x[5+idt]; 367f1af5d2fSBarry Smith while (nz--) { 368f1af5d2fSBarry Smith idx = 6*(*vi--); 369f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 370f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 371f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 372f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 373f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 374f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 375f1af5d2fSBarry Smith v -= 36; 376f1af5d2fSBarry Smith } 377f1af5d2fSBarry Smith } 3781ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3791ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 380d0f46423SBarry Smith ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 381f1af5d2fSBarry Smith PetscFunctionReturn(0); 382f1af5d2fSBarry Smith } 383f1af5d2fSBarry Smith 3844a2ae208SSatish Balay #undef __FUNCT__ 3854a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 386dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 387f1af5d2fSBarry Smith { 388f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 389dfbe8321SBarry Smith PetscErrorCode ierr; 390690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 391690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 392f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 39387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 39487828ca2SBarry Smith PetscScalar *x,*b; 395f1af5d2fSBarry Smith 396f1af5d2fSBarry Smith PetscFunctionBegin; 397ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3981ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3991ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 400f1af5d2fSBarry Smith 401f1af5d2fSBarry Smith /* forward solve the U^T */ 402f1af5d2fSBarry Smith idx = 0; 403f1af5d2fSBarry Smith for (i=0; i<n; i++) { 404f1af5d2fSBarry Smith 405f1af5d2fSBarry Smith v = aa + 49*diag[i]; 406f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 407ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 408ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 409f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 410f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 411f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 412f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 413f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 414f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 415f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 416f1af5d2fSBarry Smith v += 49; 417f1af5d2fSBarry Smith 418f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 419f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 420f1af5d2fSBarry Smith while (nz--) { 421f1af5d2fSBarry Smith oidx = 7*(*vi++); 422f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 423f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 424f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 425f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 426f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 427f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 428f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 429f1af5d2fSBarry Smith v += 49; 430f1af5d2fSBarry Smith } 431f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 432f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 433f1af5d2fSBarry Smith idx += 7; 434f1af5d2fSBarry Smith } 435f1af5d2fSBarry Smith /* backward solve the L^T */ 436f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 437f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 438f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 439f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 440f1af5d2fSBarry Smith idt = 7*i; 441f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 442f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 443f1af5d2fSBarry Smith while (nz--) { 444f1af5d2fSBarry Smith idx = 7*(*vi--); 445f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 446f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 447f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 448f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 449f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 450f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 451f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 452f1af5d2fSBarry Smith v -= 49; 453f1af5d2fSBarry Smith } 454f1af5d2fSBarry Smith } 4551ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4561ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457d0f46423SBarry Smith ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 458f1af5d2fSBarry Smith PetscFunctionReturn(0); 459f1af5d2fSBarry Smith } 460f1af5d2fSBarry Smith 461f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4624a2ae208SSatish Balay #undef __FUNCT__ 4634a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 464dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 465f1af5d2fSBarry Smith { 466f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 467f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 4686849ba73SBarry Smith PetscErrorCode ierr; 469*5d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 470*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 471690b6cddSBarry Smith PetscInt *diag = a->diag; 472f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 47387828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 474f1af5d2fSBarry Smith 475f1af5d2fSBarry Smith PetscFunctionBegin; 4761ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4771ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 478f1af5d2fSBarry Smith t = a->solve_work; 479f1af5d2fSBarry Smith 480f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 481f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 482f1af5d2fSBarry Smith 483f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 484f1af5d2fSBarry Smith for (i=0; i<n; i++) { 485f1af5d2fSBarry Smith t[i] = b[c[i]]; 486f1af5d2fSBarry Smith } 487f1af5d2fSBarry Smith 488f1af5d2fSBarry Smith /* forward solve the U^T */ 489f1af5d2fSBarry Smith for (i=0; i<n; i++) { 490f1af5d2fSBarry Smith 491f1af5d2fSBarry Smith v = aa + diag[i]; 492f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 493f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 494f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 495f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 496f1af5d2fSBarry Smith while (nz--) { 497f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 498f1af5d2fSBarry Smith } 499f1af5d2fSBarry Smith t[i] = s1; 500f1af5d2fSBarry Smith } 501f1af5d2fSBarry Smith /* backward solve the L^T */ 502f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 503f1af5d2fSBarry Smith v = aa + diag[i] - 1; 504f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 505f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 506f1af5d2fSBarry Smith s1 = t[i]; 507f1af5d2fSBarry Smith while (nz--) { 508f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 509f1af5d2fSBarry Smith } 510f1af5d2fSBarry Smith } 511f1af5d2fSBarry Smith 512f1af5d2fSBarry Smith /* copy t into x according to permutation */ 513f1af5d2fSBarry Smith for (i=0; i<n; i++) { 514f1af5d2fSBarry Smith x[r[i]] = t[i]; 515f1af5d2fSBarry Smith } 516f1af5d2fSBarry Smith 517f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 518f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5191ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5201ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 521d0f46423SBarry Smith ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 522f1af5d2fSBarry Smith PetscFunctionReturn(0); 523f1af5d2fSBarry Smith } 524f1af5d2fSBarry Smith 5254a2ae208SSatish Balay #undef __FUNCT__ 5264a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 527dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 528f1af5d2fSBarry Smith { 529f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 530f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5316849ba73SBarry Smith PetscErrorCode ierr; 532*5d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 533*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 534690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 535f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53687828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 53787828ca2SBarry Smith PetscScalar *x,*b,*t; 538f1af5d2fSBarry Smith 539f1af5d2fSBarry Smith PetscFunctionBegin; 5401ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5411ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 542f1af5d2fSBarry Smith t = a->solve_work; 543f1af5d2fSBarry Smith 544f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 545f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 546f1af5d2fSBarry Smith 547f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 548f1af5d2fSBarry Smith ii = 0; 549f1af5d2fSBarry Smith for (i=0; i<n; i++) { 550f1af5d2fSBarry Smith ic = 2*c[i]; 551f1af5d2fSBarry Smith t[ii] = b[ic]; 552f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 553f1af5d2fSBarry Smith ii += 2; 554f1af5d2fSBarry Smith } 555f1af5d2fSBarry Smith 556f1af5d2fSBarry Smith /* forward solve the U^T */ 557f1af5d2fSBarry Smith idx = 0; 558f1af5d2fSBarry Smith for (i=0; i<n; i++) { 559f1af5d2fSBarry Smith 560f1af5d2fSBarry Smith v = aa + 4*diag[i]; 561f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 562f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 563f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 564f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 565f1af5d2fSBarry Smith v += 4; 566f1af5d2fSBarry Smith 567f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 568f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 569f1af5d2fSBarry Smith while (nz--) { 570f1af5d2fSBarry Smith oidx = 2*(*vi++); 571f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 572f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 573f1af5d2fSBarry Smith v += 4; 574f1af5d2fSBarry Smith } 575f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 576f1af5d2fSBarry Smith idx += 2; 577f1af5d2fSBarry Smith } 578f1af5d2fSBarry Smith /* backward solve the L^T */ 579f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 580f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 581f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 582f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 583f1af5d2fSBarry Smith idt = 2*i; 584f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 585f1af5d2fSBarry Smith while (nz--) { 586f1af5d2fSBarry Smith idx = 2*(*vi--); 587f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 588f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 589f1af5d2fSBarry Smith v -= 4; 590f1af5d2fSBarry Smith } 591f1af5d2fSBarry Smith } 592f1af5d2fSBarry Smith 593f1af5d2fSBarry Smith /* copy t into x according to permutation */ 594f1af5d2fSBarry Smith ii = 0; 595f1af5d2fSBarry Smith for (i=0; i<n; i++) { 596f1af5d2fSBarry Smith ir = 2*r[i]; 597f1af5d2fSBarry Smith x[ir] = t[ii]; 598f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 599f1af5d2fSBarry Smith ii += 2; 600f1af5d2fSBarry Smith } 601f1af5d2fSBarry Smith 602f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 603f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6041ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6051ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 606d0f46423SBarry Smith ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 607f1af5d2fSBarry Smith PetscFunctionReturn(0); 608f1af5d2fSBarry Smith } 609f1af5d2fSBarry Smith 6104a2ae208SSatish Balay #undef __FUNCT__ 6114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 612dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 613f1af5d2fSBarry Smith { 614f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 615f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6166849ba73SBarry Smith PetscErrorCode ierr; 617*5d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 618*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 619690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 620f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 62187828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 62287828ca2SBarry Smith PetscScalar *x,*b,*t; 623f1af5d2fSBarry Smith 624f1af5d2fSBarry Smith PetscFunctionBegin; 6251ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6261ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 627f1af5d2fSBarry Smith t = a->solve_work; 628f1af5d2fSBarry Smith 629f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 630f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 631f1af5d2fSBarry Smith 632f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 633f1af5d2fSBarry Smith ii = 0; 634f1af5d2fSBarry Smith for (i=0; i<n; i++) { 635f1af5d2fSBarry Smith ic = 3*c[i]; 636f1af5d2fSBarry Smith t[ii] = b[ic]; 637f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 638f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 639f1af5d2fSBarry Smith ii += 3; 640f1af5d2fSBarry Smith } 641f1af5d2fSBarry Smith 642f1af5d2fSBarry Smith /* forward solve the U^T */ 643f1af5d2fSBarry Smith idx = 0; 644f1af5d2fSBarry Smith for (i=0; i<n; i++) { 645f1af5d2fSBarry Smith 646f1af5d2fSBarry Smith v = aa + 9*diag[i]; 647f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 648f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 649f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 650f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 651f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 652f1af5d2fSBarry Smith v += 9; 653f1af5d2fSBarry Smith 654f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 655f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 656f1af5d2fSBarry Smith while (nz--) { 657f1af5d2fSBarry Smith oidx = 3*(*vi++); 658f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 659f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 660f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 661f1af5d2fSBarry Smith v += 9; 662f1af5d2fSBarry Smith } 663f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 664f1af5d2fSBarry Smith idx += 3; 665f1af5d2fSBarry Smith } 666f1af5d2fSBarry Smith /* backward solve the L^T */ 667f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 668f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 669f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 670f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 671f1af5d2fSBarry Smith idt = 3*i; 672f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 673f1af5d2fSBarry Smith while (nz--) { 674f1af5d2fSBarry Smith idx = 3*(*vi--); 675f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 676f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 677f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 678f1af5d2fSBarry Smith v -= 9; 679f1af5d2fSBarry Smith } 680f1af5d2fSBarry Smith } 681f1af5d2fSBarry Smith 682f1af5d2fSBarry Smith /* copy t into x according to permutation */ 683f1af5d2fSBarry Smith ii = 0; 684f1af5d2fSBarry Smith for (i=0; i<n; i++) { 685f1af5d2fSBarry Smith ir = 3*r[i]; 686f1af5d2fSBarry Smith x[ir] = t[ii]; 687f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 688f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 689f1af5d2fSBarry Smith ii += 3; 690f1af5d2fSBarry Smith } 691f1af5d2fSBarry Smith 692f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 693f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6941ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6951ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 696d0f46423SBarry Smith ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 697f1af5d2fSBarry Smith PetscFunctionReturn(0); 698f1af5d2fSBarry Smith } 699f1af5d2fSBarry Smith 7004a2ae208SSatish Balay #undef __FUNCT__ 7014a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 702dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 703f1af5d2fSBarry Smith { 704f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 705f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7066849ba73SBarry Smith PetscErrorCode ierr; 707*5d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 708*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 709690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 710f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 71187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 71287828ca2SBarry Smith PetscScalar *x,*b,*t; 713f1af5d2fSBarry Smith 714f1af5d2fSBarry Smith PetscFunctionBegin; 7151ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7161ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 717f1af5d2fSBarry Smith t = a->solve_work; 718f1af5d2fSBarry Smith 719f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 720f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 721f1af5d2fSBarry Smith 722f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 723f1af5d2fSBarry Smith ii = 0; 724f1af5d2fSBarry Smith for (i=0; i<n; i++) { 725f1af5d2fSBarry Smith ic = 4*c[i]; 726f1af5d2fSBarry Smith t[ii] = b[ic]; 727f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 728f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 729f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 730f1af5d2fSBarry Smith ii += 4; 731f1af5d2fSBarry Smith } 732f1af5d2fSBarry Smith 733f1af5d2fSBarry Smith /* forward solve the U^T */ 734f1af5d2fSBarry Smith idx = 0; 735f1af5d2fSBarry Smith for (i=0; i<n; i++) { 736f1af5d2fSBarry Smith 737f1af5d2fSBarry Smith v = aa + 16*diag[i]; 738f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 739f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 740f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 741f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 742f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 743f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 744f1af5d2fSBarry Smith v += 16; 745f1af5d2fSBarry Smith 746f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 747f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 748f1af5d2fSBarry Smith while (nz--) { 749f1af5d2fSBarry Smith oidx = 4*(*vi++); 750f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 751f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 752f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 753f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 754f1af5d2fSBarry Smith v += 16; 755f1af5d2fSBarry Smith } 756f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 757f1af5d2fSBarry Smith idx += 4; 758f1af5d2fSBarry Smith } 759f1af5d2fSBarry Smith /* backward solve the L^T */ 760f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 761f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 762f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 763f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 764f1af5d2fSBarry Smith idt = 4*i; 765f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 766f1af5d2fSBarry Smith while (nz--) { 767f1af5d2fSBarry Smith idx = 4*(*vi--); 768f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 769f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 770f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 771f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 772f1af5d2fSBarry Smith v -= 16; 773f1af5d2fSBarry Smith } 774f1af5d2fSBarry Smith } 775f1af5d2fSBarry Smith 776f1af5d2fSBarry Smith /* copy t into x according to permutation */ 777f1af5d2fSBarry Smith ii = 0; 778f1af5d2fSBarry Smith for (i=0; i<n; i++) { 779f1af5d2fSBarry Smith ir = 4*r[i]; 780f1af5d2fSBarry Smith x[ir] = t[ii]; 781f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 782f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 783f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 784f1af5d2fSBarry Smith ii += 4; 785f1af5d2fSBarry Smith } 786f1af5d2fSBarry Smith 787f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 788f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7891ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7901ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 791d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 792f1af5d2fSBarry Smith PetscFunctionReturn(0); 793f1af5d2fSBarry Smith } 794f1af5d2fSBarry Smith 7954a2ae208SSatish Balay #undef __FUNCT__ 7964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 797dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 798f1af5d2fSBarry Smith { 799f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 800f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 8016849ba73SBarry Smith PetscErrorCode ierr; 802*5d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 803*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 804690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 805f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 80687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 80787828ca2SBarry Smith PetscScalar *x,*b,*t; 808f1af5d2fSBarry Smith 809f1af5d2fSBarry Smith PetscFunctionBegin; 8101ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8111ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 812f1af5d2fSBarry Smith t = a->solve_work; 813f1af5d2fSBarry Smith 814f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 815f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 816f1af5d2fSBarry Smith 817f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 818f1af5d2fSBarry Smith ii = 0; 819f1af5d2fSBarry Smith for (i=0; i<n; i++) { 820f1af5d2fSBarry Smith ic = 5*c[i]; 821f1af5d2fSBarry Smith t[ii] = b[ic]; 822f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 823f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 824f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 825f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 826f1af5d2fSBarry Smith ii += 5; 827f1af5d2fSBarry Smith } 828f1af5d2fSBarry Smith 829f1af5d2fSBarry Smith /* forward solve the U^T */ 830f1af5d2fSBarry Smith idx = 0; 831f1af5d2fSBarry Smith for (i=0; i<n; i++) { 832f1af5d2fSBarry Smith 833f1af5d2fSBarry Smith v = aa + 25*diag[i]; 834f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 835f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 836f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 837f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 838f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 839f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 840f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 841f1af5d2fSBarry Smith v += 25; 842f1af5d2fSBarry Smith 843f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 844f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 845f1af5d2fSBarry Smith while (nz--) { 846f1af5d2fSBarry Smith oidx = 5*(*vi++); 847f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 848f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 849f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 850f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 851f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 852f1af5d2fSBarry Smith v += 25; 853f1af5d2fSBarry Smith } 854f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 855f1af5d2fSBarry Smith idx += 5; 856f1af5d2fSBarry Smith } 857f1af5d2fSBarry Smith /* backward solve the L^T */ 858f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 859f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 860f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 861f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 862f1af5d2fSBarry Smith idt = 5*i; 863f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 864f1af5d2fSBarry Smith while (nz--) { 865f1af5d2fSBarry Smith idx = 5*(*vi--); 866f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 867f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 868f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 869f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 870f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 871f1af5d2fSBarry Smith v -= 25; 872f1af5d2fSBarry Smith } 873f1af5d2fSBarry Smith } 874f1af5d2fSBarry Smith 875f1af5d2fSBarry Smith /* copy t into x according to permutation */ 876f1af5d2fSBarry Smith ii = 0; 877f1af5d2fSBarry Smith for (i=0; i<n; i++) { 878f1af5d2fSBarry Smith ir = 5*r[i]; 879f1af5d2fSBarry Smith x[ir] = t[ii]; 880f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 881f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 882f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 883f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 884f1af5d2fSBarry Smith ii += 5; 885f1af5d2fSBarry Smith } 886f1af5d2fSBarry Smith 887f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 888f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8891ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8901ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 891d0f46423SBarry Smith ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 892f1af5d2fSBarry Smith PetscFunctionReturn(0); 893f1af5d2fSBarry Smith } 894f1af5d2fSBarry Smith 8954a2ae208SSatish Balay #undef __FUNCT__ 8964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 897dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 898f1af5d2fSBarry Smith { 899f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 900f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9016849ba73SBarry Smith PetscErrorCode ierr; 902*5d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 903*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 904690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 905f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 90687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 90787828ca2SBarry Smith PetscScalar *x,*b,*t; 908f1af5d2fSBarry Smith 909f1af5d2fSBarry Smith PetscFunctionBegin; 9101ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9111ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 912f1af5d2fSBarry Smith t = a->solve_work; 913f1af5d2fSBarry Smith 914f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 915f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 916f1af5d2fSBarry Smith 917f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 918f1af5d2fSBarry Smith ii = 0; 919f1af5d2fSBarry Smith for (i=0; i<n; i++) { 920f1af5d2fSBarry Smith ic = 6*c[i]; 921f1af5d2fSBarry Smith t[ii] = b[ic]; 922f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 923f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 924f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 925f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 926f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 927f1af5d2fSBarry Smith ii += 6; 928f1af5d2fSBarry Smith } 929f1af5d2fSBarry Smith 930f1af5d2fSBarry Smith /* forward solve the U^T */ 931f1af5d2fSBarry Smith idx = 0; 932f1af5d2fSBarry Smith for (i=0; i<n; i++) { 933f1af5d2fSBarry Smith 934f1af5d2fSBarry Smith v = aa + 36*diag[i]; 935f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 936f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 937f1af5d2fSBarry Smith x6 = t[5+idx]; 938f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 939f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 940f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 941f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 942f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 943f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 944f1af5d2fSBarry Smith v += 36; 945f1af5d2fSBarry Smith 946f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 947f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 948f1af5d2fSBarry Smith while (nz--) { 949f1af5d2fSBarry Smith oidx = 6*(*vi++); 950f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 951f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 952f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 953f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 954f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 955f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 956f1af5d2fSBarry Smith v += 36; 957f1af5d2fSBarry Smith } 958f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 959f1af5d2fSBarry Smith t[5+idx] = s6; 960f1af5d2fSBarry Smith idx += 6; 961f1af5d2fSBarry Smith } 962f1af5d2fSBarry Smith /* backward solve the L^T */ 963f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 964f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 965f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 966f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 967f1af5d2fSBarry Smith idt = 6*i; 968f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 969f1af5d2fSBarry Smith s6 = t[5+idt]; 970f1af5d2fSBarry Smith while (nz--) { 971f1af5d2fSBarry Smith idx = 6*(*vi--); 972f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 973f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 974f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 975f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 976f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 977f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 978f1af5d2fSBarry Smith v -= 36; 979f1af5d2fSBarry Smith } 980f1af5d2fSBarry Smith } 981f1af5d2fSBarry Smith 982f1af5d2fSBarry Smith /* copy t into x according to permutation */ 983f1af5d2fSBarry Smith ii = 0; 984f1af5d2fSBarry Smith for (i=0; i<n; i++) { 985f1af5d2fSBarry Smith ir = 6*r[i]; 986f1af5d2fSBarry Smith x[ir] = t[ii]; 987f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 988f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 989f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 990f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 991f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 992f1af5d2fSBarry Smith ii += 6; 993f1af5d2fSBarry Smith } 994f1af5d2fSBarry Smith 995f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 996f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9971ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 9981ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 999d0f46423SBarry Smith ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 1000f1af5d2fSBarry Smith PetscFunctionReturn(0); 1001f1af5d2fSBarry Smith } 1002f1af5d2fSBarry Smith 10034a2ae208SSatish Balay #undef __FUNCT__ 10044a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1005dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1006f1af5d2fSBarry Smith { 1007f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1008f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10096849ba73SBarry Smith PetscErrorCode ierr; 1010*5d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1011*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1012690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1013f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 101487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 101587828ca2SBarry Smith PetscScalar *x,*b,*t; 1016f1af5d2fSBarry Smith 1017f1af5d2fSBarry Smith PetscFunctionBegin; 10181ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10191ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1020f1af5d2fSBarry Smith t = a->solve_work; 1021f1af5d2fSBarry Smith 1022f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1023f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1024f1af5d2fSBarry Smith 1025f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1026f1af5d2fSBarry Smith ii = 0; 1027f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1028f1af5d2fSBarry Smith ic = 7*c[i]; 1029f1af5d2fSBarry Smith t[ii] = b[ic]; 1030f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1031f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1032f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1033f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1034f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1035f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1036f1af5d2fSBarry Smith ii += 7; 1037f1af5d2fSBarry Smith } 1038f1af5d2fSBarry Smith 1039f1af5d2fSBarry Smith /* forward solve the U^T */ 1040f1af5d2fSBarry Smith idx = 0; 1041f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1042f1af5d2fSBarry Smith 1043f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1044f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1045f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1046f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1047f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1048f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1049f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1050f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1051f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1052f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1053f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1054f1af5d2fSBarry Smith v += 49; 1055f1af5d2fSBarry Smith 1056f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1057f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1058f1af5d2fSBarry Smith while (nz--) { 1059f1af5d2fSBarry Smith oidx = 7*(*vi++); 1060f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1061f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1062f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1063f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1064f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1065f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1066f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1067f1af5d2fSBarry Smith v += 49; 1068f1af5d2fSBarry Smith } 1069f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1070f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1071f1af5d2fSBarry Smith idx += 7; 1072f1af5d2fSBarry Smith } 1073f1af5d2fSBarry Smith /* backward solve the L^T */ 1074f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1075f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1076f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1077f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1078f1af5d2fSBarry Smith idt = 7*i; 1079f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1080f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1081f1af5d2fSBarry Smith while (nz--) { 1082f1af5d2fSBarry Smith idx = 7*(*vi--); 1083f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1084f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1085f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1086f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1087f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1088f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1089f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1090f1af5d2fSBarry Smith v -= 49; 1091f1af5d2fSBarry Smith } 1092f1af5d2fSBarry Smith } 1093f1af5d2fSBarry Smith 1094f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1095f1af5d2fSBarry Smith ii = 0; 1096f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1097f1af5d2fSBarry Smith ir = 7*r[i]; 1098f1af5d2fSBarry Smith x[ir] = t[ii]; 1099f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1100f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1101f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1102f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1103f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1104f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1105f1af5d2fSBarry Smith ii += 7; 1106f1af5d2fSBarry Smith } 1107f1af5d2fSBarry Smith 1108f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1109f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11101ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11111ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1112d0f46423SBarry Smith ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 1113f1af5d2fSBarry Smith PetscFunctionReturn(0); 1114f1af5d2fSBarry Smith } 1115f1af5d2fSBarry Smith 11164e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11174a2ae208SSatish Balay #undef __FUNCT__ 11184a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1119dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11204e2b4712SSatish Balay { 11214e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11224e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11236849ba73SBarry Smith PetscErrorCode ierr; 1124*5d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1125*5d0c19d7SBarry Smith PetscInt i,n=a->mbs; 1126*5d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11273f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 112887828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11294e2b4712SSatish Balay 11304e2b4712SSatish Balay PetscFunctionBegin; 11311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1133f1af5d2fSBarry Smith t = a->solve_work; 11344e2b4712SSatish Balay 11354e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11364e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11374e2b4712SSatish Balay 11384e2b4712SSatish Balay /* forward solve the lower triangular */ 113987828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11404e2b4712SSatish Balay for (i=1; i<n; i++) { 11414e2b4712SSatish Balay v = aa + bs2*ai[i]; 11424e2b4712SSatish Balay vi = aj + ai[i]; 11434e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1144f1af5d2fSBarry Smith s = t + bs*i; 114587828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11464e2b4712SSatish Balay while (nz--) { 1147f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11484e2b4712SSatish Balay v += bs2; 11494e2b4712SSatish Balay } 11504e2b4712SSatish Balay } 11514e2b4712SSatish Balay /* backward solve the upper triangular */ 1152d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 11534e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 11544e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11554e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11564e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 115787828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11584e2b4712SSatish Balay while (nz--) { 1159f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11604e2b4712SSatish Balay v += bs2; 11614e2b4712SSatish Balay } 1162f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 116387828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11644e2b4712SSatish Balay } 11654e2b4712SSatish Balay 11664e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11674e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11681ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11691ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1170d0f46423SBarry Smith ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 11714e2b4712SSatish Balay PetscFunctionReturn(0); 11724e2b4712SSatish Balay } 11734e2b4712SSatish Balay 11744a2ae208SSatish Balay #undef __FUNCT__ 11754a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1176dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11774e2b4712SSatish Balay { 11784e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11794e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11806849ba73SBarry Smith PetscErrorCode ierr; 1181*5d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1182*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 11833f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 118487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 118587828ca2SBarry Smith PetscScalar *x,*b,*t; 11864e2b4712SSatish Balay 11874e2b4712SSatish Balay PetscFunctionBegin; 11881ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11891ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1190f1af5d2fSBarry Smith t = a->solve_work; 11914e2b4712SSatish Balay 11924e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11934e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11944e2b4712SSatish Balay 11954e2b4712SSatish Balay /* forward solve the lower triangular */ 11964e2b4712SSatish Balay idx = 7*(*r++); 1197f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1198f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1199f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 12004e2b4712SSatish Balay 12014e2b4712SSatish Balay for (i=1; i<n; i++) { 12024e2b4712SSatish Balay v = aa + 49*ai[i]; 12034e2b4712SSatish Balay vi = aj + ai[i]; 12044e2b4712SSatish Balay nz = diag[i] - ai[i]; 12054e2b4712SSatish Balay idx = 7*(*r++); 1206f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1207f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 12084e2b4712SSatish Balay while (nz--) { 12094e2b4712SSatish Balay idx = 7*(*vi++); 1210f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1211f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1212f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1213f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1214f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1215f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1216f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1217f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1218f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1219f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12204e2b4712SSatish Balay v += 49; 12214e2b4712SSatish Balay } 12224e2b4712SSatish Balay idx = 7*i; 1223f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1224f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1225f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12264e2b4712SSatish Balay } 12274e2b4712SSatish Balay /* backward solve the upper triangular */ 12284e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12294e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12304e2b4712SSatish Balay vi = aj + diag[i] + 1; 12314e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12324e2b4712SSatish Balay idt = 7*i; 1233f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1234f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1235f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12364e2b4712SSatish Balay while (nz--) { 12374e2b4712SSatish Balay idx = 7*(*vi++); 1238f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1239f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1240f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1241f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1242f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1243f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1244f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1245f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1246f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1247f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12484e2b4712SSatish Balay v += 49; 12494e2b4712SSatish Balay } 12504e2b4712SSatish Balay idc = 7*(*c--); 12514e2b4712SSatish Balay v = aa + 49*diag[i]; 1252f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1253f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1254f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1255f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1256f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1257f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1258f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1259f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1260f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1261f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1262f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1263f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1264f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1265f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12664e2b4712SSatish Balay } 12674e2b4712SSatish Balay 12684e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12694e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12701ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1272d0f46423SBarry Smith ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 12734e2b4712SSatish Balay PetscFunctionReturn(0); 12744e2b4712SSatish Balay } 12754e2b4712SSatish Balay 12764a2ae208SSatish Balay #undef __FUNCT__ 12774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1278dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 127915091d37SBarry Smith { 128015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1281690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1282dfbe8321SBarry Smith PetscErrorCode ierr; 1283690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1284d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1285d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1286d9fead3dSBarry Smith const PetscScalar *b; 128715091d37SBarry Smith 128815091d37SBarry Smith PetscFunctionBegin; 1289d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 12901ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 129115091d37SBarry Smith /* forward solve the lower triangular */ 129215091d37SBarry Smith idx = 0; 129315091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 129415091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 129515091d37SBarry Smith x[6] = b[6+idx]; 129615091d37SBarry Smith for (i=1; i<n; i++) { 129715091d37SBarry Smith v = aa + 49*ai[i]; 129815091d37SBarry Smith vi = aj + ai[i]; 129915091d37SBarry Smith nz = diag[i] - ai[i]; 130015091d37SBarry Smith idx = 7*i; 1301f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1302f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1303f1af5d2fSBarry Smith s7 = b[6+idx]; 130415091d37SBarry Smith while (nz--) { 130515091d37SBarry Smith jdx = 7*(*vi++); 130615091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 130715091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 130815091d37SBarry Smith x7 = x[6+jdx]; 1309f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1310f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1311f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1312f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1313f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1314f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1315f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 131615091d37SBarry Smith v += 49; 131715091d37SBarry Smith } 1318f1af5d2fSBarry Smith x[idx] = s1; 1319f1af5d2fSBarry Smith x[1+idx] = s2; 1320f1af5d2fSBarry Smith x[2+idx] = s3; 1321f1af5d2fSBarry Smith x[3+idx] = s4; 1322f1af5d2fSBarry Smith x[4+idx] = s5; 1323f1af5d2fSBarry Smith x[5+idx] = s6; 1324f1af5d2fSBarry Smith x[6+idx] = s7; 132515091d37SBarry Smith } 132615091d37SBarry Smith /* backward solve the upper triangular */ 132715091d37SBarry Smith for (i=n-1; i>=0; i--){ 132815091d37SBarry Smith v = aa + 49*diag[i] + 49; 132915091d37SBarry Smith vi = aj + diag[i] + 1; 133015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 133115091d37SBarry Smith idt = 7*i; 1332f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1333f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1334f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1335f1af5d2fSBarry Smith s7 = x[6+idt]; 133615091d37SBarry Smith while (nz--) { 133715091d37SBarry Smith idx = 7*(*vi++); 133815091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 133915091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 134015091d37SBarry Smith x7 = x[6+idx]; 1341f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1342f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1343f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1344f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1345f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1346f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1347f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 134815091d37SBarry Smith v += 49; 134915091d37SBarry Smith } 135015091d37SBarry Smith v = aa + 49*diag[i]; 1351f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1352f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1353f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1354f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1355f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1356f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1357f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1358f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1359f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1360f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1361f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1362f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1363f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1364f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 136515091d37SBarry Smith } 136615091d37SBarry Smith 1367d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 13681ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1369d0f46423SBarry Smith ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 137015091d37SBarry Smith PetscFunctionReturn(0); 137115091d37SBarry Smith } 137215091d37SBarry Smith 13734a2ae208SSatish Balay #undef __FUNCT__ 13744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1375dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 137615091d37SBarry Smith { 137715091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 137815091d37SBarry Smith IS iscol=a->col,isrow=a->row; 13796849ba73SBarry Smith PetscErrorCode ierr; 1380*5d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1381*5d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1382d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1383d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1384d9fead3dSBarry Smith const PetscScalar *b; 138515091d37SBarry Smith PetscFunctionBegin; 1386d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 13871ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1388f1af5d2fSBarry Smith t = a->solve_work; 138915091d37SBarry Smith 139015091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 139115091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 139215091d37SBarry Smith 139315091d37SBarry Smith /* forward solve the lower triangular */ 139415091d37SBarry Smith idx = 6*(*r++); 1395f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1396f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1397f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 139815091d37SBarry Smith for (i=1; i<n; i++) { 139915091d37SBarry Smith v = aa + 36*ai[i]; 140015091d37SBarry Smith vi = aj + ai[i]; 140115091d37SBarry Smith nz = diag[i] - ai[i]; 140215091d37SBarry Smith idx = 6*(*r++); 1403f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1404f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 140515091d37SBarry Smith while (nz--) { 140615091d37SBarry Smith idx = 6*(*vi++); 1407f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1408f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1409f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1410f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1411f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1412f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1413f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1414f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 141515091d37SBarry Smith v += 36; 141615091d37SBarry Smith } 141715091d37SBarry Smith idx = 6*i; 1418f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1419f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1420f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 142115091d37SBarry Smith } 142215091d37SBarry Smith /* backward solve the upper triangular */ 142315091d37SBarry Smith for (i=n-1; i>=0; i--){ 142415091d37SBarry Smith v = aa + 36*diag[i] + 36; 142515091d37SBarry Smith vi = aj + diag[i] + 1; 142615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 142715091d37SBarry Smith idt = 6*i; 1428f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1429f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1430f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 143115091d37SBarry Smith while (nz--) { 143215091d37SBarry Smith idx = 6*(*vi++); 1433f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1434f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1435f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1436f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1437f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1438f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1439f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1440f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1441f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 144215091d37SBarry Smith v += 36; 144315091d37SBarry Smith } 144415091d37SBarry Smith idc = 6*(*c--); 144515091d37SBarry Smith v = aa + 36*diag[i]; 1446f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1447f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1448f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1449f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1450f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1451f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1452f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1453f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1454f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1455f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1456f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1457f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 145815091d37SBarry Smith } 145915091d37SBarry Smith 146015091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 146115091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1462d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14631ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1464d0f46423SBarry Smith ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 146515091d37SBarry Smith PetscFunctionReturn(0); 146615091d37SBarry Smith } 146715091d37SBarry Smith 14684a2ae208SSatish Balay #undef __FUNCT__ 14694a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1470dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 147115091d37SBarry Smith { 147215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1473690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1474dfbe8321SBarry Smith PetscErrorCode ierr; 1475690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1476d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1477d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1478d9fead3dSBarry Smith const PetscScalar *b; 147915091d37SBarry Smith 148015091d37SBarry Smith PetscFunctionBegin; 1481d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 148315091d37SBarry Smith /* forward solve the lower triangular */ 148415091d37SBarry Smith idx = 0; 148515091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 148615091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 148715091d37SBarry Smith for (i=1; i<n; i++) { 148815091d37SBarry Smith v = aa + 36*ai[i]; 148915091d37SBarry Smith vi = aj + ai[i]; 149015091d37SBarry Smith nz = diag[i] - ai[i]; 149115091d37SBarry Smith idx = 6*i; 1492f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1493f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 149415091d37SBarry Smith while (nz--) { 149515091d37SBarry Smith jdx = 6*(*vi++); 149615091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 149715091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1498f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1499f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1500f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1501f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1502f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1503f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 150415091d37SBarry Smith v += 36; 150515091d37SBarry Smith } 1506f1af5d2fSBarry Smith x[idx] = s1; 1507f1af5d2fSBarry Smith x[1+idx] = s2; 1508f1af5d2fSBarry Smith x[2+idx] = s3; 1509f1af5d2fSBarry Smith x[3+idx] = s4; 1510f1af5d2fSBarry Smith x[4+idx] = s5; 1511f1af5d2fSBarry Smith x[5+idx] = s6; 151215091d37SBarry Smith } 151315091d37SBarry Smith /* backward solve the upper triangular */ 151415091d37SBarry Smith for (i=n-1; i>=0; i--){ 151515091d37SBarry Smith v = aa + 36*diag[i] + 36; 151615091d37SBarry Smith vi = aj + diag[i] + 1; 151715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 151815091d37SBarry Smith idt = 6*i; 1519f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1520f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1521f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 152215091d37SBarry Smith while (nz--) { 152315091d37SBarry Smith idx = 6*(*vi++); 152415091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 152515091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1526f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1527f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1528f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1529f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1530f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1531f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 153215091d37SBarry Smith v += 36; 153315091d37SBarry Smith } 153415091d37SBarry Smith v = aa + 36*diag[i]; 1535f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1536f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1537f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1538f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1539f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1540f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 154115091d37SBarry Smith } 154215091d37SBarry Smith 1543d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15441ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1545d0f46423SBarry Smith ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 154615091d37SBarry Smith PetscFunctionReturn(0); 154715091d37SBarry Smith } 154815091d37SBarry Smith 15494a2ae208SSatish Balay #undef __FUNCT__ 15504a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1551dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 15524e2b4712SSatish Balay { 15534e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15544e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 15556849ba73SBarry Smith PetscErrorCode ierr; 1556*5d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 1557*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1558d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1559d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1560d9fead3dSBarry Smith const PetscScalar *b; 15614e2b4712SSatish Balay 15624e2b4712SSatish Balay PetscFunctionBegin; 1563d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15641ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1565f1af5d2fSBarry Smith t = a->solve_work; 15664e2b4712SSatish Balay 15674e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 15684e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 15694e2b4712SSatish Balay 15704e2b4712SSatish Balay /* forward solve the lower triangular */ 15714e2b4712SSatish Balay idx = 5*(*r++); 1572f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1573f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 15744e2b4712SSatish Balay for (i=1; i<n; i++) { 15754e2b4712SSatish Balay v = aa + 25*ai[i]; 15764e2b4712SSatish Balay vi = aj + ai[i]; 15774e2b4712SSatish Balay nz = diag[i] - ai[i]; 15784e2b4712SSatish Balay idx = 5*(*r++); 1579f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1580f1af5d2fSBarry Smith s5 = b[4+idx]; 15814e2b4712SSatish Balay while (nz--) { 15824e2b4712SSatish Balay idx = 5*(*vi++); 1583f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1584f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1585f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1586f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1587f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1588f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1589f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 15904e2b4712SSatish Balay v += 25; 15914e2b4712SSatish Balay } 15924e2b4712SSatish Balay idx = 5*i; 1593f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1594f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 15954e2b4712SSatish Balay } 15964e2b4712SSatish Balay /* backward solve the upper triangular */ 15974e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 15984e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 15994e2b4712SSatish Balay vi = aj + diag[i] + 1; 16004e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 16014e2b4712SSatish Balay idt = 5*i; 1602f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1603f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 16044e2b4712SSatish Balay while (nz--) { 16054e2b4712SSatish Balay idx = 5*(*vi++); 1606f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1607f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1608f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1609f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1610f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1611f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1612f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 16134e2b4712SSatish Balay v += 25; 16144e2b4712SSatish Balay } 16154e2b4712SSatish Balay idc = 5*(*c--); 16164e2b4712SSatish Balay v = aa + 25*diag[i]; 1617f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1618f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 1619f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1620f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 1621f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1622f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 1623f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1624f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 1625f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1626f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 16274e2b4712SSatish Balay } 16284e2b4712SSatish Balay 16294e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 16304e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1631d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16321ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1633d0f46423SBarry Smith ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 16344e2b4712SSatish Balay PetscFunctionReturn(0); 16354e2b4712SSatish Balay } 16364e2b4712SSatish Balay 16374a2ae208SSatish Balay #undef __FUNCT__ 16384a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1639dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 164015091d37SBarry Smith { 164115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1642690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1643dfbe8321SBarry Smith PetscErrorCode ierr; 1644690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1645d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1646d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1647d9fead3dSBarry Smith const PetscScalar *b; 164815091d37SBarry Smith 164915091d37SBarry Smith PetscFunctionBegin; 1650d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16511ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 165215091d37SBarry Smith /* forward solve the lower triangular */ 165315091d37SBarry Smith idx = 0; 165415091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 165515091d37SBarry Smith for (i=1; i<n; i++) { 165615091d37SBarry Smith v = aa + 25*ai[i]; 165715091d37SBarry Smith vi = aj + ai[i]; 165815091d37SBarry Smith nz = diag[i] - ai[i]; 165915091d37SBarry Smith idx = 5*i; 1660f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 166115091d37SBarry Smith while (nz--) { 166215091d37SBarry Smith jdx = 5*(*vi++); 166315091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1664f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1665f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1666f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1667f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1668f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 166915091d37SBarry Smith v += 25; 167015091d37SBarry Smith } 1671f1af5d2fSBarry Smith x[idx] = s1; 1672f1af5d2fSBarry Smith x[1+idx] = s2; 1673f1af5d2fSBarry Smith x[2+idx] = s3; 1674f1af5d2fSBarry Smith x[3+idx] = s4; 1675f1af5d2fSBarry Smith x[4+idx] = s5; 167615091d37SBarry Smith } 167715091d37SBarry Smith /* backward solve the upper triangular */ 167815091d37SBarry Smith for (i=n-1; i>=0; i--){ 167915091d37SBarry Smith v = aa + 25*diag[i] + 25; 168015091d37SBarry Smith vi = aj + diag[i] + 1; 168115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 168215091d37SBarry Smith idt = 5*i; 1683f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1684f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 168515091d37SBarry Smith while (nz--) { 168615091d37SBarry Smith idx = 5*(*vi++); 168715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1688f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1689f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1690f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1691f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1692f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 169315091d37SBarry Smith v += 25; 169415091d37SBarry Smith } 169515091d37SBarry Smith v = aa + 25*diag[i]; 1696f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1697f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1698f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1699f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1700f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 170115091d37SBarry Smith } 170215091d37SBarry Smith 1703d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17041ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1705d0f46423SBarry Smith ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 170615091d37SBarry Smith PetscFunctionReturn(0); 170715091d37SBarry Smith } 170815091d37SBarry Smith 17094a2ae208SSatish Balay #undef __FUNCT__ 17104a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1711dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 17124e2b4712SSatish Balay { 17134e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 17144e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 17156849ba73SBarry Smith PetscErrorCode ierr; 1716*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1717*5d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1718d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1719d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1720d9fead3dSBarry Smith const PetscScalar *b; 17214e2b4712SSatish Balay 17224e2b4712SSatish Balay PetscFunctionBegin; 1723d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17241ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1725f1af5d2fSBarry Smith t = a->solve_work; 17264e2b4712SSatish Balay 17274e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 17284e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 17294e2b4712SSatish Balay 17304e2b4712SSatish Balay /* forward solve the lower triangular */ 17314e2b4712SSatish Balay idx = 4*(*r++); 1732f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1733f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 17344e2b4712SSatish Balay for (i=1; i<n; i++) { 17354e2b4712SSatish Balay v = aa + 16*ai[i]; 17364e2b4712SSatish Balay vi = aj + ai[i]; 17374e2b4712SSatish Balay nz = diag[i] - ai[i]; 17384e2b4712SSatish Balay idx = 4*(*r++); 1739f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 17404e2b4712SSatish Balay while (nz--) { 17414e2b4712SSatish Balay idx = 4*(*vi++); 1742f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1743f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1744f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1745f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1746f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17474e2b4712SSatish Balay v += 16; 17484e2b4712SSatish Balay } 17494e2b4712SSatish Balay idx = 4*i; 1750f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1751f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 17524e2b4712SSatish Balay } 17534e2b4712SSatish Balay /* backward solve the upper triangular */ 17544e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 17554e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 17564e2b4712SSatish Balay vi = aj + diag[i] + 1; 17574e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 17584e2b4712SSatish Balay idt = 4*i; 1759f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1760f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 17614e2b4712SSatish Balay while (nz--) { 17624e2b4712SSatish Balay idx = 4*(*vi++); 1763f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1764f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1765f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1766f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1767f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1768f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17694e2b4712SSatish Balay v += 16; 17704e2b4712SSatish Balay } 17714e2b4712SSatish Balay idc = 4*(*c--); 17724e2b4712SSatish Balay v = aa + 16*diag[i]; 1773f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1774f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1775f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1776f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 17774e2b4712SSatish Balay } 17784e2b4712SSatish Balay 17794e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 17804e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1781d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17821ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1783d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 17844e2b4712SSatish Balay PetscFunctionReturn(0); 17854e2b4712SSatish Balay } 1786f26ec98cSKris Buschelman 1787f26ec98cSKris Buschelman #undef __FUNCT__ 1788f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1789dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1790f26ec98cSKris Buschelman { 1791f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1792f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 17936849ba73SBarry Smith PetscErrorCode ierr; 1794*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1795*5d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1796d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1797d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1798d9fead3dSBarry Smith PetscScalar *x; 1799d9fead3dSBarry Smith const PetscScalar *b; 1800f26ec98cSKris Buschelman 1801f26ec98cSKris Buschelman PetscFunctionBegin; 1802d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18031ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1804f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 1805f26ec98cSKris Buschelman 1806f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1807f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1808f26ec98cSKris Buschelman 1809f26ec98cSKris Buschelman /* forward solve the lower triangular */ 1810f26ec98cSKris Buschelman idx = 4*(*r++); 1811f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 1812f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 1813f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 1814f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 1815f26ec98cSKris Buschelman for (i=1; i<n; i++) { 1816f26ec98cSKris Buschelman v = aa + 16*ai[i]; 1817f26ec98cSKris Buschelman vi = aj + ai[i]; 1818f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 1819f26ec98cSKris Buschelman idx = 4*(*r++); 1820f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 1821f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 1822f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 1823f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 1824f26ec98cSKris Buschelman while (nz--) { 1825f26ec98cSKris Buschelman idx = 4*(*vi++); 1826f26ec98cSKris Buschelman x1 = t[idx]; 1827f26ec98cSKris Buschelman x2 = t[1+idx]; 1828f26ec98cSKris Buschelman x3 = t[2+idx]; 1829f26ec98cSKris Buschelman x4 = t[3+idx]; 1830f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1831f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1832f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1833f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1834f26ec98cSKris Buschelman v += 16; 1835f26ec98cSKris Buschelman } 1836f26ec98cSKris Buschelman idx = 4*i; 1837f26ec98cSKris Buschelman t[idx] = s1; 1838f26ec98cSKris Buschelman t[1+idx] = s2; 1839f26ec98cSKris Buschelman t[2+idx] = s3; 1840f26ec98cSKris Buschelman t[3+idx] = s4; 1841f26ec98cSKris Buschelman } 1842f26ec98cSKris Buschelman /* backward solve the upper triangular */ 1843f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 1844f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 1845f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 1846f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 1847f26ec98cSKris Buschelman idt = 4*i; 1848f26ec98cSKris Buschelman s1 = t[idt]; 1849f26ec98cSKris Buschelman s2 = t[1+idt]; 1850f26ec98cSKris Buschelman s3 = t[2+idt]; 1851f26ec98cSKris Buschelman s4 = t[3+idt]; 1852f26ec98cSKris Buschelman while (nz--) { 1853f26ec98cSKris Buschelman idx = 4*(*vi++); 1854f26ec98cSKris Buschelman x1 = t[idx]; 1855f26ec98cSKris Buschelman x2 = t[1+idx]; 1856f26ec98cSKris Buschelman x3 = t[2+idx]; 1857f26ec98cSKris Buschelman x4 = t[3+idx]; 1858f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1859f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1860f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1861f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1862f26ec98cSKris Buschelman v += 16; 1863f26ec98cSKris Buschelman } 1864f26ec98cSKris Buschelman idc = 4*(*c--); 1865f26ec98cSKris Buschelman v = aa + 16*diag[i]; 1866f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1867f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1868f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1869f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1870f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 1871f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 1872f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 1873f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 1874f26ec98cSKris Buschelman } 1875f26ec98cSKris Buschelman 1876f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1877f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1878d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18791ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1880d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 1881f26ec98cSKris Buschelman PetscFunctionReturn(0); 1882f26ec98cSKris Buschelman } 1883f26ec98cSKris Buschelman 188424c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 188524c233c2SKris Buschelman 188624c233c2SKris Buschelman #include PETSC_HAVE_SSE 188724c233c2SKris Buschelman 188824c233c2SKris Buschelman #undef __FUNCT__ 188924c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1890dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 189124c233c2SKris Buschelman { 189224c233c2SKris Buschelman /* 189324c233c2SKris Buschelman Note: This code uses demotion of double 189424c233c2SKris Buschelman to float when performing the mixed-mode computation. 189524c233c2SKris Buschelman This may not be numerically reasonable for all applications. 189624c233c2SKris Buschelman */ 189724c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 189824c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 18996849ba73SBarry Smith PetscErrorCode ierr; 1900*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 1901*5d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 190224c233c2SKris Buschelman MatScalar *aa=a->a,*v; 190387828ca2SBarry Smith PetscScalar *x,*b,*t; 190424c233c2SKris Buschelman 190524c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 190624c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 190724c233c2SKris Buschelman unsigned long offset; 190824c233c2SKris Buschelman 190924c233c2SKris Buschelman PetscFunctionBegin; 191024c233c2SKris Buschelman SSE_SCOPE_BEGIN; 191124c233c2SKris Buschelman 191224c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 191324c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 191424c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 191524c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 191624c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 191724c233c2SKris Buschelman 19181ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 19191ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192024c233c2SKris Buschelman t = a->solve_work; 192124c233c2SKris Buschelman 192224c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 192324c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 192424c233c2SKris Buschelman 192524c233c2SKris Buschelman /* forward solve the lower triangular */ 192624c233c2SKris Buschelman idx = 4*(*r++); 192724c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 192824c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 192924c233c2SKris Buschelman v = aa + 16*ai[1]; 193024c233c2SKris Buschelman 193124c233c2SKris Buschelman for (i=1; i<n;) { 193224c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 193324c233c2SKris Buschelman vi = aj + ai[i]; 193424c233c2SKris Buschelman nz = diag[i] - ai[i]; 193524c233c2SKris Buschelman idx = 4*(*r++); 193624c233c2SKris Buschelman 193724c233c2SKris Buschelman /* Demote sum from double to float */ 193824c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 193924c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 194024c233c2SKris Buschelman 194124c233c2SKris Buschelman while (nz--) { 194224c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 194324c233c2SKris Buschelman idx = 4*(*vi++); 194424c233c2SKris Buschelman 194524c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 194624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 194724c233c2SKris Buschelman 194824c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 194924c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 195024c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 195124c233c2SKris Buschelman 195224c233c2SKris Buschelman /* First Column */ 195324c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 195424c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 195524c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 195624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 195724c233c2SKris Buschelman 195824c233c2SKris Buschelman /* Second Column */ 195924c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 196024c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 196124c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 196224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 196324c233c2SKris Buschelman 196424c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 196524c233c2SKris Buschelman 196624c233c2SKris Buschelman /* Third Column */ 196724c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 196824c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 196924c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 197024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 197124c233c2SKris Buschelman 197224c233c2SKris Buschelman /* Fourth Column */ 197324c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 197424c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 197524c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 197624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 197724c233c2SKris Buschelman SSE_INLINE_END_2 197824c233c2SKris Buschelman 197924c233c2SKris Buschelman v += 16; 198024c233c2SKris Buschelman } 198124c233c2SKris Buschelman idx = 4*i; 198224c233c2SKris Buschelman v = aa + 16*ai[++i]; 198324c233c2SKris Buschelman PREFETCH_NTA(v); 198424c233c2SKris Buschelman STORE_PS(tmps,XMM7); 198524c233c2SKris Buschelman 198624c233c2SKris Buschelman /* Promote result from float to double */ 198724c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 198824c233c2SKris Buschelman } 198924c233c2SKris Buschelman /* backward solve the upper triangular */ 199024c233c2SKris Buschelman idt = 4*(n-1); 199124c233c2SKris Buschelman ai16 = 16*diag[n-1]; 199224c233c2SKris Buschelman v = aa + ai16 + 16; 199324c233c2SKris Buschelman for (i=n-1; i>=0;){ 199424c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 199524c233c2SKris Buschelman vi = aj + diag[i] + 1; 199624c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 199724c233c2SKris Buschelman 199824c233c2SKris Buschelman /* Demote accumulator from double to float */ 199924c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 200024c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 200124c233c2SKris Buschelman 200224c233c2SKris Buschelman while (nz--) { 200324c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 200424c233c2SKris Buschelman idx = 4*(*vi++); 200524c233c2SKris Buschelman 200624c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 200724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 200824c233c2SKris Buschelman 200924c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 201024c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 201124c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 201224c233c2SKris Buschelman 201324c233c2SKris Buschelman /* First Column */ 201424c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 201524c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 201624c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 201724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 201824c233c2SKris Buschelman 201924c233c2SKris Buschelman /* Second Column */ 202024c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 202124c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 202224c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 202324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 202424c233c2SKris Buschelman 202524c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 202624c233c2SKris Buschelman 202724c233c2SKris Buschelman /* Third Column */ 202824c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 202924c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 203024c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 203124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 203224c233c2SKris Buschelman 203324c233c2SKris Buschelman /* Fourth Column */ 203424c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 203524c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 203624c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 203724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 203824c233c2SKris Buschelman SSE_INLINE_END_2 203924c233c2SKris Buschelman v += 16; 204024c233c2SKris Buschelman } 204124c233c2SKris Buschelman v = aa + ai16; 204224c233c2SKris Buschelman ai16 = 16*diag[--i]; 204324c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 204424c233c2SKris Buschelman /* 204524c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 204624c233c2SKris Buschelman which was inverted as part of the factorization 204724c233c2SKris Buschelman */ 204824c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 204924c233c2SKris Buschelman /* First Column */ 205024c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 205124c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 205224c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 205324c233c2SKris Buschelman 205424c233c2SKris Buschelman /* Second Column */ 205524c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 205624c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 205724c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 205824c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 205924c233c2SKris Buschelman 206024c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 206124c233c2SKris Buschelman 206224c233c2SKris Buschelman /* Third Column */ 206324c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 206424c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 206524c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 206624c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 206724c233c2SKris Buschelman 206824c233c2SKris Buschelman /* Fourth Column */ 206924c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 207024c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 207124c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 207224c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 207324c233c2SKris Buschelman 207424c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 207524c233c2SKris Buschelman SSE_INLINE_END_3 207624c233c2SKris Buschelman 207724c233c2SKris Buschelman /* Promote solution from float to double */ 207824c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 207924c233c2SKris Buschelman 208024c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 208124c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 208224c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 208324c233c2SKris Buschelman idc = 4*(*c--); 208424c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 208524c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 208624c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 208724c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 208824c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 208924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 209024c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 209124c233c2SKris Buschelman SSE_INLINE_END_2 209224c233c2SKris Buschelman v = aa + ai16 + 16; 209324c233c2SKris Buschelman idt -= 4; 209424c233c2SKris Buschelman } 209524c233c2SKris Buschelman 209624c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 209724c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 20981ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 20991ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2100d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 210124c233c2SKris Buschelman SSE_SCOPE_END; 210224c233c2SKris Buschelman PetscFunctionReturn(0); 210324c233c2SKris Buschelman } 210424c233c2SKris Buschelman 210524c233c2SKris Buschelman #endif 21060ef38995SBarry Smith 21070ef38995SBarry Smith 21084e2b4712SSatish Balay /* 21094e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 21104e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 21114e2b4712SSatish Balay */ 21124a2ae208SSatish Balay #undef __FUNCT__ 21134a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2114dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 21154e2b4712SSatish Balay { 21164e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2117356650c2SBarry Smith PetscInt n=a->mbs; 2118356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 2119dfbe8321SBarry Smith PetscErrorCode ierr; 2120356650c2SBarry Smith const PetscInt *diag = a->diag; 2121d9fead3dSBarry Smith const MatScalar *aa=a->a; 2122d9fead3dSBarry Smith PetscScalar *x; 2123d9fead3dSBarry Smith const PetscScalar *b; 21244e2b4712SSatish Balay 21254e2b4712SSatish Balay PetscFunctionBegin; 2126d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21284e2b4712SSatish Balay 2129aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 21302853dc0eSBarry Smith { 213187828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 21322853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 21332853dc0eSBarry Smith } 2134aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 21352853dc0eSBarry Smith { 213687828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 21372853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 21382853dc0eSBarry Smith } 2139aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 21402853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2141e1293385SBarry Smith #else 214230d4dcafSBarry Smith { 214387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2144d9fead3dSBarry Smith const MatScalar *v; 2145356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 2146356650c2SBarry Smith const PetscInt *vi; 2147e1293385SBarry Smith 21484e2b4712SSatish Balay /* forward solve the lower triangular */ 21494e2b4712SSatish Balay idx = 0; 2150e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 21514e2b4712SSatish Balay for (i=1; i<n; i++) { 21524e2b4712SSatish Balay v = aa + 16*ai[i]; 21534e2b4712SSatish Balay vi = aj + ai[i]; 21544e2b4712SSatish Balay nz = diag[i] - ai[i]; 2155e1293385SBarry Smith idx += 4; 2156f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 21574e2b4712SSatish Balay while (nz--) { 21584e2b4712SSatish Balay jdx = 4*(*vi++); 21594e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2160f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2161f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2162f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2163f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 21644e2b4712SSatish Balay v += 16; 21654e2b4712SSatish Balay } 2166f1af5d2fSBarry Smith x[idx] = s1; 2167f1af5d2fSBarry Smith x[1+idx] = s2; 2168f1af5d2fSBarry Smith x[2+idx] = s3; 2169f1af5d2fSBarry Smith x[3+idx] = s4; 21704e2b4712SSatish Balay } 21714e2b4712SSatish Balay /* backward solve the upper triangular */ 21724e555682SBarry Smith idt = 4*(n-1); 21734e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 21744e555682SBarry Smith ai16 = 16*diag[i]; 21754e555682SBarry Smith v = aa + ai16 + 16; 21764e2b4712SSatish Balay vi = aj + diag[i] + 1; 21774e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2178f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2179f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 21804e2b4712SSatish Balay while (nz--) { 21814e2b4712SSatish Balay idx = 4*(*vi++); 21824e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2183f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2184f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2185f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2186f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 21874e2b4712SSatish Balay v += 16; 21884e2b4712SSatish Balay } 21894e555682SBarry Smith v = aa + ai16; 2190f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2191f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2192f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2193f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2194329f5518SBarry Smith idt -= 4; 21954e2b4712SSatish Balay } 219630d4dcafSBarry Smith } 2197e1293385SBarry Smith #endif 21984e2b4712SSatish Balay 2199d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2201d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 22024e2b4712SSatish Balay PetscFunctionReturn(0); 22034e2b4712SSatish Balay } 22044e2b4712SSatish Balay 2205f26ec98cSKris Buschelman #undef __FUNCT__ 2206f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2207dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2208f26ec98cSKris Buschelman { 2209f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2210690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2211dfbe8321SBarry Smith PetscErrorCode ierr; 2212690b6cddSBarry Smith PetscInt *diag = a->diag; 2213f26ec98cSKris Buschelman MatScalar *aa=a->a; 2214f26ec98cSKris Buschelman PetscScalar *x,*b; 2215f26ec98cSKris Buschelman 2216f26ec98cSKris Buschelman PetscFunctionBegin; 22171ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 22181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2219f26ec98cSKris Buschelman 2220f26ec98cSKris Buschelman { 2221f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2222f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 2223690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2224f26ec98cSKris Buschelman 2225f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2226f26ec98cSKris Buschelman idx = 0; 2227f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 2228f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 2229f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 2230f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 2231f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2232f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2233f26ec98cSKris Buschelman vi = aj + ai[i]; 2234f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2235f26ec98cSKris Buschelman idx += 4; 2236f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2237f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2238f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2239f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2240f26ec98cSKris Buschelman while (nz--) { 2241f26ec98cSKris Buschelman jdx = 4*(*vi++); 2242f26ec98cSKris Buschelman x1 = t[jdx]; 2243f26ec98cSKris Buschelman x2 = t[1+jdx]; 2244f26ec98cSKris Buschelman x3 = t[2+jdx]; 2245f26ec98cSKris Buschelman x4 = t[3+jdx]; 2246f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2247f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2248f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2249f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2250f26ec98cSKris Buschelman v += 16; 2251f26ec98cSKris Buschelman } 2252f26ec98cSKris Buschelman t[idx] = s1; 2253f26ec98cSKris Buschelman t[1+idx] = s2; 2254f26ec98cSKris Buschelman t[2+idx] = s3; 2255f26ec98cSKris Buschelman t[3+idx] = s4; 2256f26ec98cSKris Buschelman } 2257f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2258f26ec98cSKris Buschelman idt = 4*(n-1); 2259f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2260f26ec98cSKris Buschelman ai16 = 16*diag[i]; 2261f26ec98cSKris Buschelman v = aa + ai16 + 16; 2262f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2263f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2264f26ec98cSKris Buschelman s1 = t[idt]; 2265f26ec98cSKris Buschelman s2 = t[1+idt]; 2266f26ec98cSKris Buschelman s3 = t[2+idt]; 2267f26ec98cSKris Buschelman s4 = t[3+idt]; 2268f26ec98cSKris Buschelman while (nz--) { 2269f26ec98cSKris Buschelman idx = 4*(*vi++); 2270f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 2271f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 2272f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 2273f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 2274f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2275f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2276f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2277f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2278f26ec98cSKris Buschelman v += 16; 2279f26ec98cSKris Buschelman } 2280f26ec98cSKris Buschelman v = aa + ai16; 2281f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2282f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2283f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2284f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2285f26ec98cSKris Buschelman idt -= 4; 2286f26ec98cSKris Buschelman } 2287f26ec98cSKris Buschelman } 2288f26ec98cSKris Buschelman 22891ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 22901ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2291d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2292f26ec98cSKris Buschelman PetscFunctionReturn(0); 2293f26ec98cSKris Buschelman } 2294f26ec98cSKris Buschelman 22953660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 22963660e330SKris Buschelman 22973660e330SKris Buschelman #include PETSC_HAVE_SSE 22983660e330SKris Buschelman #undef __FUNCT__ 22997cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2300dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 23013660e330SKris Buschelman { 23023660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 23032aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 2304dfbe8321SBarry Smith PetscErrorCode ierr; 2305dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 23063660e330SKris Buschelman MatScalar *aa=a->a; 230787828ca2SBarry Smith PetscScalar *x,*b; 23083660e330SKris Buschelman 23093660e330SKris Buschelman PetscFunctionBegin; 23103660e330SKris Buschelman SSE_SCOPE_BEGIN; 23113660e330SKris Buschelman /* 23123660e330SKris Buschelman Note: This code currently uses demotion of double 23133660e330SKris Buschelman to float when performing the mixed-mode computation. 23143660e330SKris Buschelman This may not be numerically reasonable for all applications. 23153660e330SKris Buschelman */ 23163660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 23173660e330SKris Buschelman 23181ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 23191ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23203660e330SKris Buschelman { 2321eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 2322eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 23232aa5897fSKris Buschelman int nz,i,idt,ai16; 23242aa5897fSKris Buschelman unsigned int jdx,idx; 23252aa5897fSKris Buschelman unsigned short *vi; 2326eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 23273660e330SKris Buschelman 2328eb05f457SKris Buschelman /* First block is the identity. */ 23293660e330SKris Buschelman idx = 0; 2330eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 23312aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 23323660e330SKris Buschelman 23333660e330SKris Buschelman for (i=1; i<n;) { 23343660e330SKris Buschelman PREFETCH_NTA(&v[8]); 23353660e330SKris Buschelman vi = aj + ai[i]; 23363660e330SKris Buschelman nz = diag[i] - ai[i]; 23373660e330SKris Buschelman idx += 4; 23383660e330SKris Buschelman 2339eb05f457SKris Buschelman /* Demote RHS from double to float. */ 2340eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2341eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 23423660e330SKris Buschelman 23433660e330SKris Buschelman while (nz--) { 23443660e330SKris Buschelman PREFETCH_NTA(&v[16]); 23452aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 23463660e330SKris Buschelman 23473660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 2348eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 23493660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 23503660e330SKris Buschelman 23513660e330SKris Buschelman /* First Column */ 23523660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 23533660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 23543660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 23553660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 23563660e330SKris Buschelman 23573660e330SKris Buschelman /* Second Column */ 23583660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 23593660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 23603660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 23613660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 23623660e330SKris Buschelman 23633660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 23643660e330SKris Buschelman 23653660e330SKris Buschelman /* Third Column */ 23663660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 23673660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 23683660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 23693660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 23703660e330SKris Buschelman 23713660e330SKris Buschelman /* Fourth Column */ 23723660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 23733660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 23743660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 23753660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 23763660e330SKris Buschelman SSE_INLINE_END_2 23773660e330SKris Buschelman 23783660e330SKris Buschelman v += 16; 23793660e330SKris Buschelman } 23803660e330SKris Buschelman v = aa + 16*ai[++i]; 23813660e330SKris Buschelman PREFETCH_NTA(v); 2382eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 23833660e330SKris Buschelman } 2384eb05f457SKris Buschelman 2385eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 2386eb05f457SKris Buschelman 23873660e330SKris Buschelman idt = 4*(n-1); 23883660e330SKris Buschelman ai16 = 16*diag[n-1]; 23893660e330SKris Buschelman v = aa + ai16 + 16; 23903660e330SKris Buschelman for (i=n-1; i>=0;){ 23913660e330SKris Buschelman PREFETCH_NTA(&v[8]); 23923660e330SKris Buschelman vi = aj + diag[i] + 1; 23933660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 23943660e330SKris Buschelman 2395eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 23963660e330SKris Buschelman 23973660e330SKris Buschelman while (nz--) { 23983660e330SKris Buschelman PREFETCH_NTA(&v[16]); 23992aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 24003660e330SKris Buschelman 24013660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 2402eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 24033660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 24043660e330SKris Buschelman 24053660e330SKris Buschelman /* First Column */ 24063660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 24073660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 24083660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 24093660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 24103660e330SKris Buschelman 24113660e330SKris Buschelman /* Second Column */ 24123660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 24133660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 24143660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 24153660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 24163660e330SKris Buschelman 24173660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 24183660e330SKris Buschelman 24193660e330SKris Buschelman /* Third Column */ 24203660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 24213660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24223660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 24233660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 24243660e330SKris Buschelman 24253660e330SKris Buschelman /* Fourth Column */ 24263660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 24273660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 24283660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 24293660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 24303660e330SKris Buschelman SSE_INLINE_END_2 24313660e330SKris Buschelman v += 16; 24323660e330SKris Buschelman } 24333660e330SKris Buschelman v = aa + ai16; 24343660e330SKris Buschelman ai16 = 16*diag[--i]; 24353660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 24363660e330SKris Buschelman /* 24373660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 24383660e330SKris Buschelman which was inverted as part of the factorization 24393660e330SKris Buschelman */ 2440eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 24413660e330SKris Buschelman /* First Column */ 24423660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 24433660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 24443660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 24453660e330SKris Buschelman 24463660e330SKris Buschelman /* Second Column */ 24473660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 24483660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 24493660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 24503660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 24513660e330SKris Buschelman 24523660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 24533660e330SKris Buschelman 24543660e330SKris Buschelman /* Third Column */ 24553660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 24563660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24573660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 24583660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 24593660e330SKris Buschelman 24603660e330SKris Buschelman /* Fourth Column */ 24613660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 24623660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 24633660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 24643660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 24653660e330SKris Buschelman 24663660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 24673660e330SKris Buschelman SSE_INLINE_END_3 24683660e330SKris Buschelman 24693660e330SKris Buschelman v = aa + ai16 + 16; 24703660e330SKris Buschelman idt -= 4; 24713660e330SKris Buschelman } 2472eb05f457SKris Buschelman 2473eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 2474eb05f457SKris Buschelman idt = 4*(n-1); 2475eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 2476eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2477eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2478eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 2479eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 2480eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 2481eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 2482eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 2483eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 248454693613SKris Buschelman idt -= 4; 24853660e330SKris Buschelman } 2486eb05f457SKris Buschelman 2487eb05f457SKris Buschelman } /* End of artificial scope. */ 24881ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 24891ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2490d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 24913660e330SKris Buschelman SSE_SCOPE_END; 24923660e330SKris Buschelman PetscFunctionReturn(0); 24933660e330SKris Buschelman } 24943660e330SKris Buschelman 24957cf1b8d3SKris Buschelman #undef __FUNCT__ 24967cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2497dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 24987cf1b8d3SKris Buschelman { 24997cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 25007cf1b8d3SKris Buschelman int *aj=a->j; 2501dfbe8321SBarry Smith PetscErrorCode ierr; 2502dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 25037cf1b8d3SKris Buschelman MatScalar *aa=a->a; 25047cf1b8d3SKris Buschelman PetscScalar *x,*b; 25057cf1b8d3SKris Buschelman 25067cf1b8d3SKris Buschelman PetscFunctionBegin; 25077cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 25087cf1b8d3SKris Buschelman /* 25097cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 25107cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 25117cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 25127cf1b8d3SKris Buschelman */ 25137cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 25147cf1b8d3SKris Buschelman 25151ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25161ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25177cf1b8d3SKris Buschelman { 25187cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 25197cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 25207cf1b8d3SKris Buschelman int nz,i,idt,ai16; 25217cf1b8d3SKris Buschelman int jdx,idx; 25227cf1b8d3SKris Buschelman int *vi; 25237cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 25247cf1b8d3SKris Buschelman 25257cf1b8d3SKris Buschelman /* First block is the identity. */ 25267cf1b8d3SKris Buschelman idx = 0; 25277cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 25287cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 25297cf1b8d3SKris Buschelman 25307cf1b8d3SKris Buschelman for (i=1; i<n;) { 25317cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 25327cf1b8d3SKris Buschelman vi = aj + ai[i]; 25337cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 25347cf1b8d3SKris Buschelman idx += 4; 25357cf1b8d3SKris Buschelman 25367cf1b8d3SKris Buschelman /* Demote RHS from double to float. */ 25377cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 25387cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 25397cf1b8d3SKris Buschelman 25407cf1b8d3SKris Buschelman while (nz--) { 25417cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 25427cf1b8d3SKris Buschelman jdx = 4*(*vi++); 25437cf1b8d3SKris Buschelman /* jdx = *vi++; */ 25447cf1b8d3SKris Buschelman 25457cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 25467cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 25477cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 25487cf1b8d3SKris Buschelman 25497cf1b8d3SKris Buschelman /* First Column */ 25507cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 25517cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25527cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 25537cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 25547cf1b8d3SKris Buschelman 25557cf1b8d3SKris Buschelman /* Second Column */ 25567cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 25577cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25587cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 25597cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 25607cf1b8d3SKris Buschelman 25617cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 25627cf1b8d3SKris Buschelman 25637cf1b8d3SKris Buschelman /* Third Column */ 25647cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 25657cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 25667cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 25677cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 25687cf1b8d3SKris Buschelman 25697cf1b8d3SKris Buschelman /* Fourth Column */ 25707cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 25717cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25727cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 25737cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 25747cf1b8d3SKris Buschelman SSE_INLINE_END_2 25757cf1b8d3SKris Buschelman 25767cf1b8d3SKris Buschelman v += 16; 25777cf1b8d3SKris Buschelman } 25787cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 25797cf1b8d3SKris Buschelman PREFETCH_NTA(v); 25807cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 25817cf1b8d3SKris Buschelman } 25827cf1b8d3SKris Buschelman 25837cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 25847cf1b8d3SKris Buschelman 25857cf1b8d3SKris Buschelman idt = 4*(n-1); 25867cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 25877cf1b8d3SKris Buschelman v = aa + ai16 + 16; 25887cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 25897cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 25907cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 25917cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 25927cf1b8d3SKris Buschelman 25937cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 25947cf1b8d3SKris Buschelman 25957cf1b8d3SKris Buschelman while (nz--) { 25967cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 25977cf1b8d3SKris Buschelman idx = 4*(*vi++); 25987cf1b8d3SKris Buschelman /* idx = *vi++; */ 25997cf1b8d3SKris Buschelman 26007cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 26017cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 26027cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 26037cf1b8d3SKris Buschelman 26047cf1b8d3SKris Buschelman /* First Column */ 26057cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 26067cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 26077cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 26087cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 26097cf1b8d3SKris Buschelman 26107cf1b8d3SKris Buschelman /* Second Column */ 26117cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 26127cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26137cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 26147cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 26157cf1b8d3SKris Buschelman 26167cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 26177cf1b8d3SKris Buschelman 26187cf1b8d3SKris Buschelman /* Third Column */ 26197cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 26207cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26217cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 26227cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 26237cf1b8d3SKris Buschelman 26247cf1b8d3SKris Buschelman /* Fourth Column */ 26257cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 26267cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26277cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 26287cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 26297cf1b8d3SKris Buschelman SSE_INLINE_END_2 26307cf1b8d3SKris Buschelman v += 16; 26317cf1b8d3SKris Buschelman } 26327cf1b8d3SKris Buschelman v = aa + ai16; 26337cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 26347cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 26357cf1b8d3SKris Buschelman /* 26367cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 26377cf1b8d3SKris Buschelman which was inverted as part of the factorization 26387cf1b8d3SKris Buschelman */ 26397cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 26407cf1b8d3SKris Buschelman /* First Column */ 26417cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 26427cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 26437cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 26447cf1b8d3SKris Buschelman 26457cf1b8d3SKris Buschelman /* Second Column */ 26467cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 26477cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26487cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 26497cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 26507cf1b8d3SKris Buschelman 26517cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 26527cf1b8d3SKris Buschelman 26537cf1b8d3SKris Buschelman /* Third Column */ 26547cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 26557cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26567cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 26577cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 26587cf1b8d3SKris Buschelman 26597cf1b8d3SKris Buschelman /* Fourth Column */ 26607cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 26617cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26627cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 26637cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 26647cf1b8d3SKris Buschelman 26657cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 26667cf1b8d3SKris Buschelman SSE_INLINE_END_3 26677cf1b8d3SKris Buschelman 26687cf1b8d3SKris Buschelman v = aa + ai16 + 16; 26697cf1b8d3SKris Buschelman idt -= 4; 26707cf1b8d3SKris Buschelman } 26717cf1b8d3SKris Buschelman 26727cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 26737cf1b8d3SKris Buschelman idt = 4*(n-1); 26747cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 26757cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 26767cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 26777cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 26787cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 26797cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 26807cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 26817cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 26827cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 26837cf1b8d3SKris Buschelman idt -= 4; 26847cf1b8d3SKris Buschelman } 26857cf1b8d3SKris Buschelman 26867cf1b8d3SKris Buschelman } /* End of artificial scope. */ 26871ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 26881ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2689d0f46423SBarry Smith ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 26907cf1b8d3SKris Buschelman SSE_SCOPE_END; 26917cf1b8d3SKris Buschelman PetscFunctionReturn(0); 26927cf1b8d3SKris Buschelman } 26937cf1b8d3SKris Buschelman 26943660e330SKris Buschelman #endif 26954a2ae208SSatish Balay #undef __FUNCT__ 26964a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2697dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 26984e2b4712SSatish Balay { 26994e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 27004e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 27016849ba73SBarry Smith PetscErrorCode ierr; 2702*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2703*5d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2704d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2705d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2706d9fead3dSBarry Smith const PetscScalar *b; 27074e2b4712SSatish Balay 27084e2b4712SSatish Balay PetscFunctionBegin; 2709d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27101ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2711f1af5d2fSBarry Smith t = a->solve_work; 27124e2b4712SSatish Balay 27134e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 27144e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 27154e2b4712SSatish Balay 27164e2b4712SSatish Balay /* forward solve the lower triangular */ 27174e2b4712SSatish Balay idx = 3*(*r++); 2718f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 27194e2b4712SSatish Balay for (i=1; i<n; i++) { 27204e2b4712SSatish Balay v = aa + 9*ai[i]; 27214e2b4712SSatish Balay vi = aj + ai[i]; 27224e2b4712SSatish Balay nz = diag[i] - ai[i]; 27234e2b4712SSatish Balay idx = 3*(*r++); 2724f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 27254e2b4712SSatish Balay while (nz--) { 27264e2b4712SSatish Balay idx = 3*(*vi++); 2727f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2728f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2729f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2730f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 27314e2b4712SSatish Balay v += 9; 27324e2b4712SSatish Balay } 27334e2b4712SSatish Balay idx = 3*i; 2734f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 27354e2b4712SSatish Balay } 27364e2b4712SSatish Balay /* backward solve the upper triangular */ 27374e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 27384e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 27394e2b4712SSatish Balay vi = aj + diag[i] + 1; 27404e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 27414e2b4712SSatish Balay idt = 3*i; 2742f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 27434e2b4712SSatish Balay while (nz--) { 27444e2b4712SSatish Balay idx = 3*(*vi++); 2745f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2746f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2747f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2748f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 27494e2b4712SSatish Balay v += 9; 27504e2b4712SSatish Balay } 27514e2b4712SSatish Balay idc = 3*(*c--); 27524e2b4712SSatish Balay v = aa + 9*diag[i]; 2753f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2754f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2755f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 27564e2b4712SSatish Balay } 27574e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 27584e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2759d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27601ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2761d0f46423SBarry Smith ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 27624e2b4712SSatish Balay PetscFunctionReturn(0); 27634e2b4712SSatish Balay } 27644e2b4712SSatish Balay 276515091d37SBarry Smith /* 276615091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 276715091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 276815091d37SBarry Smith */ 27694a2ae208SSatish Balay #undef __FUNCT__ 27704a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2771dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 277215091d37SBarry Smith { 277315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2774690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2775dfbe8321SBarry Smith PetscErrorCode ierr; 2776690b6cddSBarry Smith PetscInt *diag = a->diag; 2777d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2778d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 2779d9fead3dSBarry Smith const PetscScalar *b; 2780690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 278115091d37SBarry Smith 278215091d37SBarry Smith PetscFunctionBegin; 2783d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27841ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 278515091d37SBarry Smith 278615091d37SBarry Smith /* forward solve the lower triangular */ 278715091d37SBarry Smith idx = 0; 278815091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 278915091d37SBarry Smith for (i=1; i<n; i++) { 279015091d37SBarry Smith v = aa + 9*ai[i]; 279115091d37SBarry Smith vi = aj + ai[i]; 279215091d37SBarry Smith nz = diag[i] - ai[i]; 279315091d37SBarry Smith idx += 3; 2794f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 279515091d37SBarry Smith while (nz--) { 279615091d37SBarry Smith jdx = 3*(*vi++); 279715091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2798f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2799f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2800f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 280115091d37SBarry Smith v += 9; 280215091d37SBarry Smith } 2803f1af5d2fSBarry Smith x[idx] = s1; 2804f1af5d2fSBarry Smith x[1+idx] = s2; 2805f1af5d2fSBarry Smith x[2+idx] = s3; 280615091d37SBarry Smith } 280715091d37SBarry Smith /* backward solve the upper triangular */ 280815091d37SBarry Smith for (i=n-1; i>=0; i--){ 280915091d37SBarry Smith v = aa + 9*diag[i] + 9; 281015091d37SBarry Smith vi = aj + diag[i] + 1; 281115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 281215091d37SBarry Smith idt = 3*i; 2813f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2814f1af5d2fSBarry Smith s3 = x[2+idt]; 281515091d37SBarry Smith while (nz--) { 281615091d37SBarry Smith idx = 3*(*vi++); 281715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2818f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2819f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2820f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 282115091d37SBarry Smith v += 9; 282215091d37SBarry Smith } 282315091d37SBarry Smith v = aa + 9*diag[i]; 2824f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2825f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2826f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 282715091d37SBarry Smith } 282815091d37SBarry Smith 2829d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28301ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2831d0f46423SBarry Smith ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 283215091d37SBarry Smith PetscFunctionReturn(0); 283315091d37SBarry Smith } 283415091d37SBarry Smith 28354a2ae208SSatish Balay #undef __FUNCT__ 28364a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2837dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 28384e2b4712SSatish Balay { 28394e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 28404e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 28416849ba73SBarry Smith PetscErrorCode ierr; 2842*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2843*5d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2844d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2845d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 2846d9fead3dSBarry Smith const PetscScalar *b; 28474e2b4712SSatish Balay 28484e2b4712SSatish Balay PetscFunctionBegin; 2849d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28501ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2851f1af5d2fSBarry Smith t = a->solve_work; 28524e2b4712SSatish Balay 28534e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 28544e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 28554e2b4712SSatish Balay 28564e2b4712SSatish Balay /* forward solve the lower triangular */ 28574e2b4712SSatish Balay idx = 2*(*r++); 2858f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 28594e2b4712SSatish Balay for (i=1; i<n; i++) { 28604e2b4712SSatish Balay v = aa + 4*ai[i]; 28614e2b4712SSatish Balay vi = aj + ai[i]; 28624e2b4712SSatish Balay nz = diag[i] - ai[i]; 28634e2b4712SSatish Balay idx = 2*(*r++); 2864f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 28654e2b4712SSatish Balay while (nz--) { 28664e2b4712SSatish Balay idx = 2*(*vi++); 2867f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2868f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2869f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 28704e2b4712SSatish Balay v += 4; 28714e2b4712SSatish Balay } 28724e2b4712SSatish Balay idx = 2*i; 2873f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 28744e2b4712SSatish Balay } 28754e2b4712SSatish Balay /* backward solve the upper triangular */ 28764e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 28774e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 28784e2b4712SSatish Balay vi = aj + diag[i] + 1; 28794e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 28804e2b4712SSatish Balay idt = 2*i; 2881f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 28824e2b4712SSatish Balay while (nz--) { 28834e2b4712SSatish Balay idx = 2*(*vi++); 2884f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2885f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2886f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 28874e2b4712SSatish Balay v += 4; 28884e2b4712SSatish Balay } 28894e2b4712SSatish Balay idc = 2*(*c--); 28904e2b4712SSatish Balay v = aa + 4*diag[i]; 2891f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2892f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 28934e2b4712SSatish Balay } 28944e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28954e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2896d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2898d0f46423SBarry Smith ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 28994e2b4712SSatish Balay PetscFunctionReturn(0); 29004e2b4712SSatish Balay } 29014e2b4712SSatish Balay 290215091d37SBarry Smith /* 290315091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 290415091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 290515091d37SBarry Smith */ 29064a2ae208SSatish Balay #undef __FUNCT__ 29074a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2908dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 290915091d37SBarry Smith { 291015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2911690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2912dfbe8321SBarry Smith PetscErrorCode ierr; 2913690b6cddSBarry Smith PetscInt *diag = a->diag; 2914d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2915d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 2916d9fead3dSBarry Smith const PetscScalar *b; 2917690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 291815091d37SBarry Smith 291915091d37SBarry Smith PetscFunctionBegin; 2920d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29211ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 292215091d37SBarry Smith 292315091d37SBarry Smith /* forward solve the lower triangular */ 292415091d37SBarry Smith idx = 0; 292515091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 292615091d37SBarry Smith for (i=1; i<n; i++) { 292715091d37SBarry Smith v = aa + 4*ai[i]; 292815091d37SBarry Smith vi = aj + ai[i]; 292915091d37SBarry Smith nz = diag[i] - ai[i]; 293015091d37SBarry Smith idx += 2; 2931f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 293215091d37SBarry Smith while (nz--) { 293315091d37SBarry Smith jdx = 2*(*vi++); 293415091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 2935f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2936f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 293715091d37SBarry Smith v += 4; 293815091d37SBarry Smith } 2939f1af5d2fSBarry Smith x[idx] = s1; 2940f1af5d2fSBarry Smith x[1+idx] = s2; 294115091d37SBarry Smith } 294215091d37SBarry Smith /* backward solve the upper triangular */ 294315091d37SBarry Smith for (i=n-1; i>=0; i--){ 294415091d37SBarry Smith v = aa + 4*diag[i] + 4; 294515091d37SBarry Smith vi = aj + diag[i] + 1; 294615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 294715091d37SBarry Smith idt = 2*i; 2948f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 294915091d37SBarry Smith while (nz--) { 295015091d37SBarry Smith idx = 2*(*vi++); 295115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 2952f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2953f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 295415091d37SBarry Smith v += 4; 295515091d37SBarry Smith } 295615091d37SBarry Smith v = aa + 4*diag[i]; 2957f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 2958f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 295915091d37SBarry Smith } 296015091d37SBarry Smith 2961d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29621ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2963d0f46423SBarry Smith ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 296415091d37SBarry Smith PetscFunctionReturn(0); 296515091d37SBarry Smith } 296615091d37SBarry Smith 29674a2ae208SSatish Balay #undef __FUNCT__ 29684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2969dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 29704e2b4712SSatish Balay { 29714e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 29724e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 29736849ba73SBarry Smith PetscErrorCode ierr; 2974*5d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2975*5d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 29763f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 297787828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 29784e2b4712SSatish Balay 29794e2b4712SSatish Balay PetscFunctionBegin; 29804e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 29814e2b4712SSatish Balay 29821ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 29831ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2984f1af5d2fSBarry Smith t = a->solve_work; 29854e2b4712SSatish Balay 29864e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 29874e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 29884e2b4712SSatish Balay 29894e2b4712SSatish Balay /* forward solve the lower triangular */ 2990f1af5d2fSBarry Smith t[0] = b[*r++]; 29914e2b4712SSatish Balay for (i=1; i<n; i++) { 29924e2b4712SSatish Balay v = aa + ai[i]; 29934e2b4712SSatish Balay vi = aj + ai[i]; 29944e2b4712SSatish Balay nz = diag[i] - ai[i]; 2995f1af5d2fSBarry Smith s1 = b[*r++]; 29964e2b4712SSatish Balay while (nz--) { 2997f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 29984e2b4712SSatish Balay } 2999f1af5d2fSBarry Smith t[i] = s1; 30004e2b4712SSatish Balay } 30014e2b4712SSatish Balay /* backward solve the upper triangular */ 30024e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 30034e2b4712SSatish Balay v = aa + diag[i] + 1; 30044e2b4712SSatish Balay vi = aj + diag[i] + 1; 30054e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3006f1af5d2fSBarry Smith s1 = t[i]; 30074e2b4712SSatish Balay while (nz--) { 3008f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 30094e2b4712SSatish Balay } 3010f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 30114e2b4712SSatish Balay } 30124e2b4712SSatish Balay 30134e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 30144e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 30151ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 30161ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3017d0f46423SBarry Smith ierr = PetscLogFlops(2*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 30184e2b4712SSatish Balay PetscFunctionReturn(0); 30194e2b4712SSatish Balay } 302015091d37SBarry Smith /* 302115091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 302215091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 302315091d37SBarry Smith */ 30244a2ae208SSatish Balay #undef __FUNCT__ 30254a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3026dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 302715091d37SBarry Smith { 302815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3029690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3030dfbe8321SBarry Smith PetscErrorCode ierr; 3031690b6cddSBarry Smith PetscInt *diag = a->diag; 303215091d37SBarry Smith MatScalar *aa=a->a; 303387828ca2SBarry Smith PetscScalar *x,*b; 303487828ca2SBarry Smith PetscScalar s1,x1; 303515091d37SBarry Smith MatScalar *v; 3036690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 303715091d37SBarry Smith 303815091d37SBarry Smith PetscFunctionBegin; 30391ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 30401ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 304115091d37SBarry Smith 304215091d37SBarry Smith /* forward solve the lower triangular */ 304315091d37SBarry Smith idx = 0; 304415091d37SBarry Smith x[0] = b[0]; 304515091d37SBarry Smith for (i=1; i<n; i++) { 304615091d37SBarry Smith v = aa + ai[i]; 304715091d37SBarry Smith vi = aj + ai[i]; 304815091d37SBarry Smith nz = diag[i] - ai[i]; 304915091d37SBarry Smith idx += 1; 3050f1af5d2fSBarry Smith s1 = b[idx]; 305115091d37SBarry Smith while (nz--) { 305215091d37SBarry Smith jdx = *vi++; 305315091d37SBarry Smith x1 = x[jdx]; 3054f1af5d2fSBarry Smith s1 -= v[0]*x1; 305515091d37SBarry Smith v += 1; 305615091d37SBarry Smith } 3057f1af5d2fSBarry Smith x[idx] = s1; 305815091d37SBarry Smith } 305915091d37SBarry Smith /* backward solve the upper triangular */ 306015091d37SBarry Smith for (i=n-1; i>=0; i--){ 306115091d37SBarry Smith v = aa + diag[i] + 1; 306215091d37SBarry Smith vi = aj + diag[i] + 1; 306315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 306415091d37SBarry Smith idt = i; 3065f1af5d2fSBarry Smith s1 = x[idt]; 306615091d37SBarry Smith while (nz--) { 306715091d37SBarry Smith idx = *vi++; 306815091d37SBarry Smith x1 = x[idx]; 3069f1af5d2fSBarry Smith s1 -= v[0]*x1; 307015091d37SBarry Smith v += 1; 307115091d37SBarry Smith } 307215091d37SBarry Smith v = aa + diag[i]; 3073f1af5d2fSBarry Smith x[idt] = v[0]*s1; 307415091d37SBarry Smith } 30751ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 30761ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3077d0f46423SBarry Smith ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 307815091d37SBarry Smith PetscFunctionReturn(0); 307915091d37SBarry Smith } 30804e2b4712SSatish Balay 30814e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 30824e2b4712SSatish Balay /* 30834e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 30844e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 30854e2b4712SSatish Balay Not a good example of code reuse. 30864e2b4712SSatish Balay */ 3087719d5645SBarry Smith EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption); 3088db4efbfdSBarry Smith EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 3089435faa5fSBarry Smith 30904a2ae208SSatish Balay #undef __FUNCT__ 30914a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3092719d5645SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,MatFactorInfo *info) 30934e2b4712SSatish Balay { 30944e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 30954e2b4712SSatish Balay IS isicol; 30966849ba73SBarry Smith PetscErrorCode ierr; 3097*5d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 3098*5d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 3099a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3100d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 310141df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 3102329f5518SBarry Smith PetscReal f; 31034e2b4712SSatish Balay 31044e2b4712SSatish Balay PetscFunctionBegin; 3105435faa5fSBarry Smith f = info->fill; 3106690b6cddSBarry Smith levels = (PetscInt)info->levels; 3107690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 31084c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3109667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3110667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 311141df41f0SMatthew Knepley both_identity = row_identity && col_identity; 3112309c388cSBarry Smith 311341df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 3114719d5645SBarry Smith ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3115719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 3116719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 3117719d5645SBarry Smith ierr = MatMissingDiagonal_SeqBAIJ(fact,&flg,&dd);CHKERRQ(ierr); 31182af78befSBarry Smith if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd); 3119bb3d539aSBarry Smith b->row = isrow; 3120bb3d539aSBarry Smith b->col = iscol; 3121bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3122bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3123bb3d539aSBarry Smith b->icol = isicol; 3124bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3125719d5645SBarry Smith ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3126309c388cSBarry Smith } else { /* general case perform the symbolic factorization */ 31274e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 31284e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 31294e2b4712SSatish Balay 31304e2b4712SSatish Balay /* get new row pointers */ 3131690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 31324e2b4712SSatish Balay ainew[0] = 0; 31334e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 3134690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 3135690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 31364e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 3137690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 31384e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 3139690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 31404e2b4712SSatish Balay /* im is level for each filled value */ 3141690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 31424e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 3143690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 31444e2b4712SSatish Balay dloc[0] = 0; 31454e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 3146435faa5fSBarry Smith 3147435faa5fSBarry Smith /* copy prow into linked list */ 31484e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 314929bbc08cSBarry Smith if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 31504e2b4712SSatish Balay xi = aj + ai[r[prow]]; 31514e2b4712SSatish Balay fill[n] = n; 3152435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 31534e2b4712SSatish Balay while (nz--) { 31544e2b4712SSatish Balay fm = n; 31554e2b4712SSatish Balay idx = ic[*xi++]; 31564e2b4712SSatish Balay do { 31574e2b4712SSatish Balay m = fm; 31584e2b4712SSatish Balay fm = fill[m]; 31594e2b4712SSatish Balay } while (fm < idx); 31604e2b4712SSatish Balay fill[m] = idx; 31614e2b4712SSatish Balay fill[idx] = fm; 31624e2b4712SSatish Balay im[idx] = 0; 31634e2b4712SSatish Balay } 3164435faa5fSBarry Smith 3165435faa5fSBarry Smith /* make sure diagonal entry is included */ 3166435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 3167435faa5fSBarry Smith fm = n; 3168435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 3169435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3170435faa5fSBarry Smith fill[fm] = prow; 3171435faa5fSBarry Smith im[prow] = 0; 3172435faa5fSBarry Smith nzf++; 3173335d9088SBarry Smith dcount++; 3174435faa5fSBarry Smith } 3175435faa5fSBarry Smith 31764e2b4712SSatish Balay nzi = 0; 31774e2b4712SSatish Balay row = fill[n]; 31784e2b4712SSatish Balay while (row < prow) { 31794e2b4712SSatish Balay incrlev = im[row] + 1; 31804e2b4712SSatish Balay nz = dloc[row]; 3181435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 31824e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 31834e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 31844e2b4712SSatish Balay fm = row; 31854e2b4712SSatish Balay while (nnz-- > 0) { 31864e2b4712SSatish Balay idx = *xi++; 31874e2b4712SSatish Balay if (*flev + incrlev > levels) { 31884e2b4712SSatish Balay flev++; 31894e2b4712SSatish Balay continue; 31904e2b4712SSatish Balay } 31914e2b4712SSatish Balay do { 31924e2b4712SSatish Balay m = fm; 31934e2b4712SSatish Balay fm = fill[m]; 31944e2b4712SSatish Balay } while (fm < idx); 31954e2b4712SSatish Balay if (fm != idx) { 31964e2b4712SSatish Balay im[idx] = *flev + incrlev; 31974e2b4712SSatish Balay fill[m] = idx; 31984e2b4712SSatish Balay fill[idx] = fm; 31994e2b4712SSatish Balay fm = idx; 32004e2b4712SSatish Balay nzf++; 3201ecf371e4SBarry Smith } else { 32024e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 32034e2b4712SSatish Balay } 32044e2b4712SSatish Balay flev++; 32054e2b4712SSatish Balay } 32064e2b4712SSatish Balay row = fill[row]; 32074e2b4712SSatish Balay nzi++; 32084e2b4712SSatish Balay } 32094e2b4712SSatish Balay /* copy new filled row into permanent storage */ 32104e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 32114e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 3212ecf371e4SBarry Smith 3213ecf371e4SBarry Smith /* estimate how much additional space we will need */ 3214ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3215ecf371e4SBarry Smith /* just double the memory each time */ 3216690b6cddSBarry Smith PetscInt maxadd = jmax; 3217ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 32184e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 32194e2b4712SSatish Balay jmax += maxadd; 3220ecf371e4SBarry Smith 3221ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 3222*5d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3223*5d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3224606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 3225*5d0c19d7SBarry Smith ajnew = xitmp; 3226*5d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3227*5d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3228606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 3229*5d0c19d7SBarry Smith ajfill = xitmp; 3230eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 32314e2b4712SSatish Balay } 3232*5d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 32334e2b4712SSatish Balay flev = ajfill + ainew[prow]; 32344e2b4712SSatish Balay dloc[prow] = nzi; 32354e2b4712SSatish Balay fm = fill[n]; 32364e2b4712SSatish Balay while (nzf--) { 3237*5d0c19d7SBarry Smith *xitmp++ = fm; 32384e2b4712SSatish Balay *flev++ = im[fm]; 32394e2b4712SSatish Balay fm = fill[fm]; 32404e2b4712SSatish Balay } 3241435faa5fSBarry Smith /* make sure row has diagonal entry */ 3242435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 324377431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 32442401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3245435faa5fSBarry Smith } 32464e2b4712SSatish Balay } 3247606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 32484e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 32494e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3250606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 3251606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 32524e2b4712SSatish Balay 32536cf91177SBarry Smith #if defined(PETSC_USE_INFO) 32544e2b4712SSatish Balay { 3255329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3256ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3257ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3258ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3259ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3260335d9088SBarry Smith if (diagonal_fill) { 3261ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3262335d9088SBarry Smith } 32634e2b4712SSatish Balay } 326463ba0a88SBarry Smith #endif 32654e2b4712SSatish Balay 32664e2b4712SSatish Balay /* put together the new matrix */ 3267719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3268719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 3269719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 3270e6b907acSBarry Smith b->free_a = PETSC_TRUE; 3271e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 32727c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 3273a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 32744e2b4712SSatish Balay b->j = ajnew; 32754e2b4712SSatish Balay b->i = ainew; 32764e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 32774e2b4712SSatish Balay b->diag = dloc; 32784e2b4712SSatish Balay b->ilen = 0; 32794e2b4712SSatish Balay b->imax = 0; 32804e2b4712SSatish Balay b->row = isrow; 32814e2b4712SSatish Balay b->col = iscol; 3282bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3283c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3284c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3285e51c0b9cSSatish Balay b->icol = isicol; 328687828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 32874e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 32884e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 3289719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 32904e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 32914e2b4712SSatish Balay 3292719d5645SBarry Smith (fact)->info.factor_mallocs = reallocate; 3293719d5645SBarry Smith (fact)->info.fill_ratio_given = f; 3294719d5645SBarry Smith (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3295309c388cSBarry Smith } 329641df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 32978661488fSKris Buschelman PetscFunctionReturn(0); 32988661488fSKris Buschelman } 32998661488fSKris Buschelman 3300732ee342SKris Buschelman #undef __FUNCT__ 33017e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3302dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 33037e7071cdSKris Buschelman { 330412272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 330512272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 33065a9542e3SKris Buschelman PetscFunctionBegin; 33077cf1b8d3SKris Buschelman /* Undo Column scaling */ 33087cf1b8d3SKris Buschelman /* while (nz--) { */ 33097cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 33107cf1b8d3SKris Buschelman /* } */ 3311c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. */ 3312c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 33137cf1b8d3SKris Buschelman PetscFunctionReturn(0); 33147cf1b8d3SKris Buschelman } 33157cf1b8d3SKris Buschelman 33167cf1b8d3SKris Buschelman #undef __FUNCT__ 33177cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3318dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 33197cf1b8d3SKris Buschelman { 33207cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3321b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 33222aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 33235a9542e3SKris Buschelman PetscFunctionBegin; 33240b9da03eSKris Buschelman /* Is this really necessary? */ 332520235379SKris Buschelman while (nz--) { 33260b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 33277e7071cdSKris Buschelman } 3328c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 33297e7071cdSKris Buschelman PetscFunctionReturn(0); 33307e7071cdSKris Buschelman } 33317e7071cdSKris Buschelman 3332732ee342SKris Buschelman 3333