173f4d377SMatthew Knepley /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/ 24e2b4712SSatish Balay /* 34e2b4712SSatish Balay Factorization code for BAIJ format. 44e2b4712SSatish Balay */ 54e2b4712SSatish Balay 64e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h" 74e2b4712SSatish Balay #include "src/vec/vecimpl.h" 84e2b4712SSatish Balay #include "src/inline/ilu.h" 974c49faeSBarry Smith #include "src/inline/dot.h" 104e2b4712SSatish Balay 114a2ae208SSatish Balay #undef __FUNCT__ 124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 137c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14f1af5d2fSBarry Smith { 15f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 17f1af5d2fSBarry Smith int *diag = a->diag; 18f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 1987828ca2SBarry Smith PetscScalar s1,*x,*b; 20f1af5d2fSBarry Smith 21f1af5d2fSBarry Smith PetscFunctionBegin; 22ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 23f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 24f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25f1af5d2fSBarry Smith 26f1af5d2fSBarry Smith /* forward solve the U^T */ 27f1af5d2fSBarry Smith for (i=0; i<n; i++) { 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith v = aa + diag[i]; 30f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 31ef66eb69SBarry Smith s1 = (*v++)*x[i]; 32f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 33f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 34f1af5d2fSBarry Smith while (nz--) { 35f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 36f1af5d2fSBarry Smith } 37f1af5d2fSBarry Smith x[i] = s1; 38f1af5d2fSBarry Smith } 39f1af5d2fSBarry Smith /* backward solve the L^T */ 40f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 41f1af5d2fSBarry Smith v = aa + diag[i] - 1; 42f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 43f1af5d2fSBarry 
Smith nz = diag[i] - ai[i]; 44f1af5d2fSBarry Smith s1 = x[i]; 45f1af5d2fSBarry Smith while (nz--) { 46f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 47f1af5d2fSBarry Smith } 48f1af5d2fSBarry Smith } 49f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 52f1af5d2fSBarry Smith PetscFunctionReturn(0); 53f1af5d2fSBarry Smith } 54f1af5d2fSBarry Smith 554a2ae208SSatish Balay #undef __FUNCT__ 564a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 577c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 58f1af5d2fSBarry Smith { 59f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 60f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 61f1af5d2fSBarry Smith int *diag = a->diag,oidx; 62f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6387828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6487828ca2SBarry Smith PetscScalar *x,*b; 65f1af5d2fSBarry Smith 66f1af5d2fSBarry Smith PetscFunctionBegin; 67ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 68f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 69f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 70f1af5d2fSBarry Smith 71f1af5d2fSBarry Smith /* forward solve the U^T */ 72f1af5d2fSBarry Smith idx = 0; 73f1af5d2fSBarry Smith for (i=0; i<n; i++) { 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith v = aa + 4*diag[i]; 76f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 77ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 78f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 79f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 80f1af5d2fSBarry Smith v += 4; 81f1af5d2fSBarry Smith 82f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 83f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 84f1af5d2fSBarry Smith while (nz--) { 85f1af5d2fSBarry Smith oidx = 2*(*vi++); 86f1af5d2fSBarry Smith 
x[oidx] -= v[0]*s1 + v[1]*s2; 87f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 88f1af5d2fSBarry Smith v += 4; 89f1af5d2fSBarry Smith } 90f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 91f1af5d2fSBarry Smith idx += 2; 92f1af5d2fSBarry Smith } 93f1af5d2fSBarry Smith /* backward solve the L^T */ 94f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 95f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 96f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 97f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 98f1af5d2fSBarry Smith idt = 2*i; 99f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 100f1af5d2fSBarry Smith while (nz--) { 101f1af5d2fSBarry Smith idx = 2*(*vi--); 102f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 103f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 104f1af5d2fSBarry Smith v -= 4; 105f1af5d2fSBarry Smith } 106f1af5d2fSBarry Smith } 107f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 108f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 110f1af5d2fSBarry Smith PetscFunctionReturn(0); 111f1af5d2fSBarry Smith } 112f1af5d2fSBarry Smith 1134a2ae208SSatish Balay #undef __FUNCT__ 1144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 1157c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 116f1af5d2fSBarry Smith { 117f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 118f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 119f1af5d2fSBarry Smith int *diag = a->diag,oidx; 120f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12187828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12287828ca2SBarry Smith PetscScalar *x,*b; 123f1af5d2fSBarry Smith 124f1af5d2fSBarry Smith PetscFunctionBegin; 125ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 126f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 127f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
128f1af5d2fSBarry Smith 129f1af5d2fSBarry Smith /* forward solve the U^T */ 130f1af5d2fSBarry Smith idx = 0; 131f1af5d2fSBarry Smith for (i=0; i<n; i++) { 132f1af5d2fSBarry Smith 133f1af5d2fSBarry Smith v = aa + 9*diag[i]; 134f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 135ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 136f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 137f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 138f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 139f1af5d2fSBarry Smith v += 9; 140f1af5d2fSBarry Smith 141f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 142f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 143f1af5d2fSBarry Smith while (nz--) { 144f1af5d2fSBarry Smith oidx = 3*(*vi++); 145f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 146f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 147f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 148f1af5d2fSBarry Smith v += 9; 149f1af5d2fSBarry Smith } 150f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 151f1af5d2fSBarry Smith idx += 3; 152f1af5d2fSBarry Smith } 153f1af5d2fSBarry Smith /* backward solve the L^T */ 154f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 155f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 156f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 157f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 158f1af5d2fSBarry Smith idt = 3*i; 159f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 160f1af5d2fSBarry Smith while (nz--) { 161f1af5d2fSBarry Smith idx = 3*(*vi--); 162f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 163f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 164f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 165f1af5d2fSBarry Smith v -= 9; 166f1af5d2fSBarry Smith } 167f1af5d2fSBarry Smith } 168f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 169f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 
170b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 171f1af5d2fSBarry Smith PetscFunctionReturn(0); 172f1af5d2fSBarry Smith } 173f1af5d2fSBarry Smith 1744a2ae208SSatish Balay #undef __FUNCT__ 1754a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 1767c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 177f1af5d2fSBarry Smith { 178f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 179f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 180f1af5d2fSBarry Smith int *diag = a->diag,oidx; 181f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18387828ca2SBarry Smith PetscScalar *x,*b; 184f1af5d2fSBarry Smith 185f1af5d2fSBarry Smith PetscFunctionBegin; 186ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 187f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 188f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 189f1af5d2fSBarry Smith 190f1af5d2fSBarry Smith /* forward solve the U^T */ 191f1af5d2fSBarry Smith idx = 0; 192f1af5d2fSBarry Smith for (i=0; i<n; i++) { 193f1af5d2fSBarry Smith 194f1af5d2fSBarry Smith v = aa + 16*diag[i]; 195f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 196ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 197f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 198f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 199f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 200f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 201f1af5d2fSBarry Smith v += 16; 202f1af5d2fSBarry Smith 203f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 204f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 205f1af5d2fSBarry Smith while (nz--) { 206f1af5d2fSBarry Smith oidx = 4*(*vi++); 207f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 208f1af5d2fSBarry 
Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 209f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 210f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 211f1af5d2fSBarry Smith v += 16; 212f1af5d2fSBarry Smith } 213f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 214f1af5d2fSBarry Smith idx += 4; 215f1af5d2fSBarry Smith } 216f1af5d2fSBarry Smith /* backward solve the L^T */ 217f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 218f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 219f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 220f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 221f1af5d2fSBarry Smith idt = 4*i; 222f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 223f1af5d2fSBarry Smith while (nz--) { 224f1af5d2fSBarry Smith idx = 4*(*vi--); 225f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 226f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 227f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 228f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 229f1af5d2fSBarry Smith v -= 16; 230f1af5d2fSBarry Smith } 231f1af5d2fSBarry Smith } 232f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 233f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 235f1af5d2fSBarry Smith PetscFunctionReturn(0); 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith 2384a2ae208SSatish Balay #undef __FUNCT__ 2394a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 2407c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 241f1af5d2fSBarry Smith { 242f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 244f1af5d2fSBarry Smith int *diag = a->diag,oidx; 
245f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 24687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 24787828ca2SBarry Smith PetscScalar *x,*b; 248f1af5d2fSBarry Smith 249f1af5d2fSBarry Smith PetscFunctionBegin; 250ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 251f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 252f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 253f1af5d2fSBarry Smith 254f1af5d2fSBarry Smith /* forward solve the U^T */ 255f1af5d2fSBarry Smith idx = 0; 256f1af5d2fSBarry Smith for (i=0; i<n; i++) { 257f1af5d2fSBarry Smith 258f1af5d2fSBarry Smith v = aa + 25*diag[i]; 259f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 260ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 261f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 262f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 263f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 264f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 265f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 266f1af5d2fSBarry Smith v += 25; 267f1af5d2fSBarry Smith 268f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 269f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 270f1af5d2fSBarry Smith while (nz--) { 271f1af5d2fSBarry Smith oidx = 5*(*vi++); 272f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 273f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 274f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 275f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 276f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 277f1af5d2fSBarry Smith v += 25; 278f1af5d2fSBarry Smith } 279f1af5d2fSBarry Smith x[idx] = 
s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 280f1af5d2fSBarry Smith idx += 5; 281f1af5d2fSBarry Smith } 282f1af5d2fSBarry Smith /* backward solve the L^T */ 283f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 284f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 285f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 286f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 287f1af5d2fSBarry Smith idt = 5*i; 288f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 289f1af5d2fSBarry Smith while (nz--) { 290f1af5d2fSBarry Smith idx = 5*(*vi--); 291f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 292f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 293f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 294f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 295f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 296f1af5d2fSBarry Smith v -= 25; 297f1af5d2fSBarry Smith } 298f1af5d2fSBarry Smith } 299f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 300f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 301b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 302f1af5d2fSBarry Smith PetscFunctionReturn(0); 303f1af5d2fSBarry Smith } 304f1af5d2fSBarry Smith 3054a2ae208SSatish Balay #undef __FUNCT__ 3064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 3077c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 308f1af5d2fSBarry Smith { 309f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 310f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 311f1af5d2fSBarry Smith int *diag = a->diag,oidx; 312f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 31387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 31487828ca2SBarry Smith PetscScalar 
*x,*b; 315f1af5d2fSBarry Smith 316f1af5d2fSBarry Smith PetscFunctionBegin; 317ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 318f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 319f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 320f1af5d2fSBarry Smith 321f1af5d2fSBarry Smith /* forward solve the U^T */ 322f1af5d2fSBarry Smith idx = 0; 323f1af5d2fSBarry Smith for (i=0; i<n; i++) { 324f1af5d2fSBarry Smith 325f1af5d2fSBarry Smith v = aa + 36*diag[i]; 326f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 327ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 328ef66eb69SBarry Smith x6 = x[5+idx]; 329f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 330f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 331f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 332f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 333f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 334f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 335f1af5d2fSBarry Smith v += 36; 336f1af5d2fSBarry Smith 337f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 338f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 339f1af5d2fSBarry Smith while (nz--) { 340f1af5d2fSBarry Smith oidx = 6*(*vi++); 341f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 342f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 343f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 344f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 345f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 
346f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 347f1af5d2fSBarry Smith v += 36; 348f1af5d2fSBarry Smith } 349f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 350f1af5d2fSBarry Smith x[5+idx] = s6; 351f1af5d2fSBarry Smith idx += 6; 352f1af5d2fSBarry Smith } 353f1af5d2fSBarry Smith /* backward solve the L^T */ 354f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 355f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 356f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 357f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 358f1af5d2fSBarry Smith idt = 6*i; 359f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 360f1af5d2fSBarry Smith s6 = x[5+idt]; 361f1af5d2fSBarry Smith while (nz--) { 362f1af5d2fSBarry Smith idx = 6*(*vi--); 363f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 364f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 365f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 366f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 367f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 368f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 369f1af5d2fSBarry Smith v -= 36; 370f1af5d2fSBarry Smith } 371f1af5d2fSBarry Smith } 372f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 373f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 374b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 375f1af5d2fSBarry Smith PetscFunctionReturn(0); 376f1af5d2fSBarry Smith } 377f1af5d2fSBarry Smith 3784a2ae208SSatish Balay #undef __FUNCT__ 3794a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 3807c922b88SBarry Smith int 
MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 381f1af5d2fSBarry Smith { 382f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 383f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 384f1af5d2fSBarry Smith int *diag = a->diag,oidx; 385f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 38687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 38787828ca2SBarry Smith PetscScalar *x,*b; 388f1af5d2fSBarry Smith 389f1af5d2fSBarry Smith PetscFunctionBegin; 390ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 391f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 392f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 393f1af5d2fSBarry Smith 394f1af5d2fSBarry Smith /* forward solve the U^T */ 395f1af5d2fSBarry Smith idx = 0; 396f1af5d2fSBarry Smith for (i=0; i<n; i++) { 397f1af5d2fSBarry Smith 398f1af5d2fSBarry Smith v = aa + 49*diag[i]; 399f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 400ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 401ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 402f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 403f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 404f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 405f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 406f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 407f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 408f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 409f1af5d2fSBarry Smith v += 49; 410f1af5d2fSBarry Smith 411f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 
412f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 413f1af5d2fSBarry Smith while (nz--) { 414f1af5d2fSBarry Smith oidx = 7*(*vi++); 415f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 416f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 417f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 418f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 419f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 420f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 421f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 422f1af5d2fSBarry Smith v += 49; 423f1af5d2fSBarry Smith } 424f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 425f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 426f1af5d2fSBarry Smith idx += 7; 427f1af5d2fSBarry Smith } 428f1af5d2fSBarry Smith /* backward solve the L^T */ 429f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 430f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 431f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 432f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 433f1af5d2fSBarry Smith idt = 7*i; 434f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 435f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 436f1af5d2fSBarry Smith while (nz--) { 437f1af5d2fSBarry Smith idx = 7*(*vi--); 438f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 439f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 440f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 
+ v[20]*s7; 441f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 442f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 443f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 444f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 445f1af5d2fSBarry Smith v -= 49; 446f1af5d2fSBarry Smith } 447f1af5d2fSBarry Smith } 448f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 449f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 450b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 451f1af5d2fSBarry Smith PetscFunctionReturn(0); 452f1af5d2fSBarry Smith } 453f1af5d2fSBarry Smith 454f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4554a2ae208SSatish Balay #undef __FUNCT__ 4564a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 4577c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 458f1af5d2fSBarry Smith { 459f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 460f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 461f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 462f1af5d2fSBarry Smith int *diag = a->diag; 463f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 46487828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 465f1af5d2fSBarry Smith 466f1af5d2fSBarry Smith PetscFunctionBegin; 467f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 468f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 469f1af5d2fSBarry Smith t = a->solve_work; 470f1af5d2fSBarry Smith 471f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 472f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 473f1af5d2fSBarry Smith 474f1af5d2fSBarry 
Smith /* copy the b into temp work space according to permutation */ 475f1af5d2fSBarry Smith for (i=0; i<n; i++) { 476f1af5d2fSBarry Smith t[i] = b[c[i]]; 477f1af5d2fSBarry Smith } 478f1af5d2fSBarry Smith 479f1af5d2fSBarry Smith /* forward solve the U^T */ 480f1af5d2fSBarry Smith for (i=0; i<n; i++) { 481f1af5d2fSBarry Smith 482f1af5d2fSBarry Smith v = aa + diag[i]; 483f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 484f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 485f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 486f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 487f1af5d2fSBarry Smith while (nz--) { 488f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 489f1af5d2fSBarry Smith } 490f1af5d2fSBarry Smith t[i] = s1; 491f1af5d2fSBarry Smith } 492f1af5d2fSBarry Smith /* backward solve the L^T */ 493f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 494f1af5d2fSBarry Smith v = aa + diag[i] - 1; 495f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 496f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 497f1af5d2fSBarry Smith s1 = t[i]; 498f1af5d2fSBarry Smith while (nz--) { 499f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 500f1af5d2fSBarry Smith } 501f1af5d2fSBarry Smith } 502f1af5d2fSBarry Smith 503f1af5d2fSBarry Smith /* copy t into x according to permutation */ 504f1af5d2fSBarry Smith for (i=0; i<n; i++) { 505f1af5d2fSBarry Smith x[r[i]] = t[i]; 506f1af5d2fSBarry Smith } 507f1af5d2fSBarry Smith 508f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 509f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 510f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 511f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 512b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 513f1af5d2fSBarry Smith PetscFunctionReturn(0); 514f1af5d2fSBarry Smith } 515f1af5d2fSBarry Smith 5164a2ae208SSatish Balay #undef __FUNCT__ 5174a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 5187c922b88SBarry Smith int 
MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 519f1af5d2fSBarry Smith { 520f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 521f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 522f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 523f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 524f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 52587828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 52687828ca2SBarry Smith PetscScalar *x,*b,*t; 527f1af5d2fSBarry Smith 528f1af5d2fSBarry Smith PetscFunctionBegin; 529f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 530f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 531f1af5d2fSBarry Smith t = a->solve_work; 532f1af5d2fSBarry Smith 533f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 534f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 535f1af5d2fSBarry Smith 536f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 537f1af5d2fSBarry Smith ii = 0; 538f1af5d2fSBarry Smith for (i=0; i<n; i++) { 539f1af5d2fSBarry Smith ic = 2*c[i]; 540f1af5d2fSBarry Smith t[ii] = b[ic]; 541f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 542f1af5d2fSBarry Smith ii += 2; 543f1af5d2fSBarry Smith } 544f1af5d2fSBarry Smith 545f1af5d2fSBarry Smith /* forward solve the U^T */ 546f1af5d2fSBarry Smith idx = 0; 547f1af5d2fSBarry Smith for (i=0; i<n; i++) { 548f1af5d2fSBarry Smith 549f1af5d2fSBarry Smith v = aa + 4*diag[i]; 550f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 551f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 552f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 553f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 554f1af5d2fSBarry Smith v += 4; 555f1af5d2fSBarry Smith 556f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 557f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 558f1af5d2fSBarry Smith while (nz--) { 559f1af5d2fSBarry Smith oidx = 2*(*vi++); 560f1af5d2fSBarry Smith t[oidx] 
-= v[0]*s1 + v[1]*s2; 561f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 562f1af5d2fSBarry Smith v += 4; 563f1af5d2fSBarry Smith } 564f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 565f1af5d2fSBarry Smith idx += 2; 566f1af5d2fSBarry Smith } 567f1af5d2fSBarry Smith /* backward solve the L^T */ 568f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 569f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 570f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 571f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 572f1af5d2fSBarry Smith idt = 2*i; 573f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 574f1af5d2fSBarry Smith while (nz--) { 575f1af5d2fSBarry Smith idx = 2*(*vi--); 576f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 577f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 578f1af5d2fSBarry Smith v -= 4; 579f1af5d2fSBarry Smith } 580f1af5d2fSBarry Smith } 581f1af5d2fSBarry Smith 582f1af5d2fSBarry Smith /* copy t into x according to permutation */ 583f1af5d2fSBarry Smith ii = 0; 584f1af5d2fSBarry Smith for (i=0; i<n; i++) { 585f1af5d2fSBarry Smith ir = 2*r[i]; 586f1af5d2fSBarry Smith x[ir] = t[ii]; 587f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 588f1af5d2fSBarry Smith ii += 2; 589f1af5d2fSBarry Smith } 590f1af5d2fSBarry Smith 591f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 592f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 593f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 594f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 595b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 596f1af5d2fSBarry Smith PetscFunctionReturn(0); 597f1af5d2fSBarry Smith } 598f1af5d2fSBarry Smith 5994a2ae208SSatish Balay #undef __FUNCT__ 6004a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 6017c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 602f1af5d2fSBarry Smith { 603f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 604f1af5d2fSBarry Smith IS 
iscol=a->col,isrow=a->row; 605f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 606f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 607f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 60887828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 60987828ca2SBarry Smith PetscScalar *x,*b,*t; 610f1af5d2fSBarry Smith 611f1af5d2fSBarry Smith PetscFunctionBegin; 612f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 613f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 614f1af5d2fSBarry Smith t = a->solve_work; 615f1af5d2fSBarry Smith 616f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 617f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 618f1af5d2fSBarry Smith 619f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 620f1af5d2fSBarry Smith ii = 0; 621f1af5d2fSBarry Smith for (i=0; i<n; i++) { 622f1af5d2fSBarry Smith ic = 3*c[i]; 623f1af5d2fSBarry Smith t[ii] = b[ic]; 624f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 625f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 626f1af5d2fSBarry Smith ii += 3; 627f1af5d2fSBarry Smith } 628f1af5d2fSBarry Smith 629f1af5d2fSBarry Smith /* forward solve the U^T */ 630f1af5d2fSBarry Smith idx = 0; 631f1af5d2fSBarry Smith for (i=0; i<n; i++) { 632f1af5d2fSBarry Smith 633f1af5d2fSBarry Smith v = aa + 9*diag[i]; 634f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 635f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 636f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 637f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 638f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 639f1af5d2fSBarry Smith v += 9; 640f1af5d2fSBarry Smith 641f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 642f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 643f1af5d2fSBarry Smith while (nz--) { 644f1af5d2fSBarry Smith oidx = 3*(*vi++); 645f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + 
v[2]*s3; 646f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 647f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 648f1af5d2fSBarry Smith v += 9; 649f1af5d2fSBarry Smith } 650f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 651f1af5d2fSBarry Smith idx += 3; 652f1af5d2fSBarry Smith } 653f1af5d2fSBarry Smith /* backward solve the L^T */ 654f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 655f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 656f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 657f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 658f1af5d2fSBarry Smith idt = 3*i; 659f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 660f1af5d2fSBarry Smith while (nz--) { 661f1af5d2fSBarry Smith idx = 3*(*vi--); 662f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 663f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 664f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 665f1af5d2fSBarry Smith v -= 9; 666f1af5d2fSBarry Smith } 667f1af5d2fSBarry Smith } 668f1af5d2fSBarry Smith 669f1af5d2fSBarry Smith /* copy t into x according to permutation */ 670f1af5d2fSBarry Smith ii = 0; 671f1af5d2fSBarry Smith for (i=0; i<n; i++) { 672f1af5d2fSBarry Smith ir = 3*r[i]; 673f1af5d2fSBarry Smith x[ir] = t[ii]; 674f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 675f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 676f1af5d2fSBarry Smith ii += 3; 677f1af5d2fSBarry Smith } 678f1af5d2fSBarry Smith 679f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 680f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 681f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 682f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 683b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 684f1af5d2fSBarry Smith PetscFunctionReturn(0); 685f1af5d2fSBarry Smith } 686f1af5d2fSBarry Smith 6874a2ae208SSatish Balay #undef __FUNCT__ 6884a2ae208SSatish Balay #define __FUNCT__ 
"MatSolveTranspose_SeqBAIJ_4" 6897c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 690f1af5d2fSBarry Smith { 691f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 692f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 693f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 694f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 695f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 69687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 69787828ca2SBarry Smith PetscScalar *x,*b,*t; 698f1af5d2fSBarry Smith 699f1af5d2fSBarry Smith PetscFunctionBegin; 700f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 701f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 702f1af5d2fSBarry Smith t = a->solve_work; 703f1af5d2fSBarry Smith 704f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 705f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 706f1af5d2fSBarry Smith 707f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 708f1af5d2fSBarry Smith ii = 0; 709f1af5d2fSBarry Smith for (i=0; i<n; i++) { 710f1af5d2fSBarry Smith ic = 4*c[i]; 711f1af5d2fSBarry Smith t[ii] = b[ic]; 712f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 713f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 714f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 715f1af5d2fSBarry Smith ii += 4; 716f1af5d2fSBarry Smith } 717f1af5d2fSBarry Smith 718f1af5d2fSBarry Smith /* forward solve the U^T */ 719f1af5d2fSBarry Smith idx = 0; 720f1af5d2fSBarry Smith for (i=0; i<n; i++) { 721f1af5d2fSBarry Smith 722f1af5d2fSBarry Smith v = aa + 16*diag[i]; 723f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 724f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 725f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 726f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 727f1af5d2fSBarry Smith s3 = v[8]*x1 + 
v[9]*x2 + v[10]*x3 + v[11]*x4; 728f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 729f1af5d2fSBarry Smith v += 16; 730f1af5d2fSBarry Smith 731f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 732f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 733f1af5d2fSBarry Smith while (nz--) { 734f1af5d2fSBarry Smith oidx = 4*(*vi++); 735f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 736f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 737f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 738f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 739f1af5d2fSBarry Smith v += 16; 740f1af5d2fSBarry Smith } 741f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 742f1af5d2fSBarry Smith idx += 4; 743f1af5d2fSBarry Smith } 744f1af5d2fSBarry Smith /* backward solve the L^T */ 745f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 746f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 747f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 748f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 749f1af5d2fSBarry Smith idt = 4*i; 750f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 751f1af5d2fSBarry Smith while (nz--) { 752f1af5d2fSBarry Smith idx = 4*(*vi--); 753f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 754f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 755f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 756f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 757f1af5d2fSBarry Smith v -= 16; 758f1af5d2fSBarry Smith } 759f1af5d2fSBarry Smith } 760f1af5d2fSBarry Smith 761f1af5d2fSBarry Smith /* copy t into x according to permutation */ 762f1af5d2fSBarry Smith ii = 0; 763f1af5d2fSBarry Smith for (i=0; i<n; i++) { 764f1af5d2fSBarry Smith ir = 4*r[i]; 765f1af5d2fSBarry Smith x[ir] = t[ii]; 766f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 767f1af5d2fSBarry Smith x[ir+2] = 
t[ii+2]; 768f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 769f1af5d2fSBarry Smith ii += 4; 770f1af5d2fSBarry Smith } 771f1af5d2fSBarry Smith 772f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 773f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 774f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 775f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 776b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 777f1af5d2fSBarry Smith PetscFunctionReturn(0); 778f1af5d2fSBarry Smith } 779f1af5d2fSBarry Smith 7804a2ae208SSatish Balay #undef __FUNCT__ 7814a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 7827c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 783f1af5d2fSBarry Smith { 784f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 785f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 786f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 787f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 788f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 78987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 79087828ca2SBarry Smith PetscScalar *x,*b,*t; 791f1af5d2fSBarry Smith 792f1af5d2fSBarry Smith PetscFunctionBegin; 793f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 794f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 795f1af5d2fSBarry Smith t = a->solve_work; 796f1af5d2fSBarry Smith 797f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 798f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 799f1af5d2fSBarry Smith 800f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 801f1af5d2fSBarry Smith ii = 0; 802f1af5d2fSBarry Smith for (i=0; i<n; i++) { 803f1af5d2fSBarry Smith ic = 5*c[i]; 804f1af5d2fSBarry Smith t[ii] = b[ic]; 805f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 806f1af5d2fSBarry Smith 
t[ii+2] = b[ic+2]; 807f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 808f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 809f1af5d2fSBarry Smith ii += 5; 810f1af5d2fSBarry Smith } 811f1af5d2fSBarry Smith 812f1af5d2fSBarry Smith /* forward solve the U^T */ 813f1af5d2fSBarry Smith idx = 0; 814f1af5d2fSBarry Smith for (i=0; i<n; i++) { 815f1af5d2fSBarry Smith 816f1af5d2fSBarry Smith v = aa + 25*diag[i]; 817f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 818f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 819f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 820f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 821f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 822f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 823f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 824f1af5d2fSBarry Smith v += 25; 825f1af5d2fSBarry Smith 826f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 827f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 828f1af5d2fSBarry Smith while (nz--) { 829f1af5d2fSBarry Smith oidx = 5*(*vi++); 830f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 831f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 832f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 833f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 834f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 835f1af5d2fSBarry Smith v += 25; 836f1af5d2fSBarry Smith } 837f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 838f1af5d2fSBarry Smith idx += 5; 839f1af5d2fSBarry Smith } 840f1af5d2fSBarry Smith /* backward solve the L^T */ 841f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 842f1af5d2fSBarry Smith v = aa + 
25*diag[i] - 25; 843f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 844f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 845f1af5d2fSBarry Smith idt = 5*i; 846f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 847f1af5d2fSBarry Smith while (nz--) { 848f1af5d2fSBarry Smith idx = 5*(*vi--); 849f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854f1af5d2fSBarry Smith v -= 25; 855f1af5d2fSBarry Smith } 856f1af5d2fSBarry Smith } 857f1af5d2fSBarry Smith 858f1af5d2fSBarry Smith /* copy t into x according to permutation */ 859f1af5d2fSBarry Smith ii = 0; 860f1af5d2fSBarry Smith for (i=0; i<n; i++) { 861f1af5d2fSBarry Smith ir = 5*r[i]; 862f1af5d2fSBarry Smith x[ir] = t[ii]; 863f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 864f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 865f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 866f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 867f1af5d2fSBarry Smith ii += 5; 868f1af5d2fSBarry Smith } 869f1af5d2fSBarry Smith 870f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 871f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 872f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 873f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 874b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 875f1af5d2fSBarry Smith PetscFunctionReturn(0); 876f1af5d2fSBarry Smith } 877f1af5d2fSBarry Smith 8784a2ae208SSatish Balay #undef __FUNCT__ 8794a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 8807c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 
881f1af5d2fSBarry Smith { 882f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 883f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 884f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 885f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 886f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 88787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 88887828ca2SBarry Smith PetscScalar *x,*b,*t; 889f1af5d2fSBarry Smith 890f1af5d2fSBarry Smith PetscFunctionBegin; 891f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 892f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 893f1af5d2fSBarry Smith t = a->solve_work; 894f1af5d2fSBarry Smith 895f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 896f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 897f1af5d2fSBarry Smith 898f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 899f1af5d2fSBarry Smith ii = 0; 900f1af5d2fSBarry Smith for (i=0; i<n; i++) { 901f1af5d2fSBarry Smith ic = 6*c[i]; 902f1af5d2fSBarry Smith t[ii] = b[ic]; 903f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 904f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 905f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 906f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 907f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 908f1af5d2fSBarry Smith ii += 6; 909f1af5d2fSBarry Smith } 910f1af5d2fSBarry Smith 911f1af5d2fSBarry Smith /* forward solve the U^T */ 912f1af5d2fSBarry Smith idx = 0; 913f1af5d2fSBarry Smith for (i=0; i<n; i++) { 914f1af5d2fSBarry Smith 915f1af5d2fSBarry Smith v = aa + 36*diag[i]; 916f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 917f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 918f1af5d2fSBarry Smith x6 = t[5+idx]; 919f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 920f1af5d2fSBarry Smith s2 = v[6]*x1 + 
v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 921f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 922f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 923f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 924f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 925f1af5d2fSBarry Smith v += 36; 926f1af5d2fSBarry Smith 927f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 928f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 929f1af5d2fSBarry Smith while (nz--) { 930f1af5d2fSBarry Smith oidx = 6*(*vi++); 931f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 932f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 933f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 934f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 935f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 936f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 937f1af5d2fSBarry Smith v += 36; 938f1af5d2fSBarry Smith } 939f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 940f1af5d2fSBarry Smith t[5+idx] = s6; 941f1af5d2fSBarry Smith idx += 6; 942f1af5d2fSBarry Smith } 943f1af5d2fSBarry Smith /* backward solve the L^T */ 944f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 945f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 946f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 947f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 948f1af5d2fSBarry Smith idt = 6*i; 949f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 950f1af5d2fSBarry Smith s6 = t[5+idt]; 951f1af5d2fSBarry Smith while (nz--) { 952f1af5d2fSBarry Smith idx = 
6*(*vi--); 953f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 954f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 955f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 956f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 957f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 958f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 959f1af5d2fSBarry Smith v -= 36; 960f1af5d2fSBarry Smith } 961f1af5d2fSBarry Smith } 962f1af5d2fSBarry Smith 963f1af5d2fSBarry Smith /* copy t into x according to permutation */ 964f1af5d2fSBarry Smith ii = 0; 965f1af5d2fSBarry Smith for (i=0; i<n; i++) { 966f1af5d2fSBarry Smith ir = 6*r[i]; 967f1af5d2fSBarry Smith x[ir] = t[ii]; 968f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 969f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 970f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 971f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 972f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 973f1af5d2fSBarry Smith ii += 6; 974f1af5d2fSBarry Smith } 975f1af5d2fSBarry Smith 976f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 977f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 978f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 979f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 980b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 981f1af5d2fSBarry Smith PetscFunctionReturn(0); 982f1af5d2fSBarry Smith } 983f1af5d2fSBarry Smith 9844a2ae208SSatish Balay #undef __FUNCT__ 9854a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 9867c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 987f1af5d2fSBarry Smith { 988f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 989f1af5d2fSBarry Smith IS 
iscol=a->col,isrow=a->row; 990f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 991f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 992f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 99387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 99487828ca2SBarry Smith PetscScalar *x,*b,*t; 995f1af5d2fSBarry Smith 996f1af5d2fSBarry Smith PetscFunctionBegin; 997f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 998f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 999f1af5d2fSBarry Smith t = a->solve_work; 1000f1af5d2fSBarry Smith 1001f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1002f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1003f1af5d2fSBarry Smith 1004f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1005f1af5d2fSBarry Smith ii = 0; 1006f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1007f1af5d2fSBarry Smith ic = 7*c[i]; 1008f1af5d2fSBarry Smith t[ii] = b[ic]; 1009f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1010f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1011f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1012f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1013f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1014f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1015f1af5d2fSBarry Smith ii += 7; 1016f1af5d2fSBarry Smith } 1017f1af5d2fSBarry Smith 1018f1af5d2fSBarry Smith /* forward solve the U^T */ 1019f1af5d2fSBarry Smith idx = 0; 1020f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1021f1af5d2fSBarry Smith 1022f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1023f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1024f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1025f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1026f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1027f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 
+ v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1028f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1029f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1030f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1031f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1032f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1033f1af5d2fSBarry Smith v += 49; 1034f1af5d2fSBarry Smith 1035f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1036f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1037f1af5d2fSBarry Smith while (nz--) { 1038f1af5d2fSBarry Smith oidx = 7*(*vi++); 1039f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1040f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1041f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1042f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1043f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1044f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1045f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1046f1af5d2fSBarry Smith v += 49; 1047f1af5d2fSBarry Smith } 1048f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1049f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1050f1af5d2fSBarry Smith idx += 7; 1051f1af5d2fSBarry Smith } 1052f1af5d2fSBarry Smith /* backward solve the L^T */ 1053f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1054f1af5d2fSBarry 
Smith v = aa + 49*diag[i] - 49; 1055f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1056f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1057f1af5d2fSBarry Smith idt = 7*i; 1058f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1059f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1060f1af5d2fSBarry Smith while (nz--) { 1061f1af5d2fSBarry Smith idx = 7*(*vi--); 1062f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069f1af5d2fSBarry Smith v -= 49; 1070f1af5d2fSBarry Smith } 1071f1af5d2fSBarry Smith } 1072f1af5d2fSBarry Smith 1073f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1074f1af5d2fSBarry Smith ii = 0; 1075f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1076f1af5d2fSBarry Smith ir = 7*r[i]; 1077f1af5d2fSBarry Smith x[ir] = t[ii]; 1078f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1079f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1080f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1081f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1082f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1083f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1084f1af5d2fSBarry Smith ii += 7; 1085f1af5d2fSBarry Smith } 1086f1af5d2fSBarry Smith 1087f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1088f1af5d2fSBarry Smith ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1089f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1090f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1091b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 1092f1af5d2fSBarry Smith PetscFunctionReturn(0); 1093f1af5d2fSBarry Smith } 1094f1af5d2fSBarry Smith 10954e2b4712SSatish Balay /* ----------------------------------------------------------- */ 10964a2ae208SSatish Balay #undef __FUNCT__ 10974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 10984e2b4712SSatish Balay int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 10994e2b4712SSatish Balay { 11004e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11014e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11024e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 11034e2b4712SSatish Balay int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 11043f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 110587828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11064e2b4712SSatish Balay 11074e2b4712SSatish Balay PetscFunctionBegin; 1108e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1109e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1110f1af5d2fSBarry Smith t = a->solve_work; 11114e2b4712SSatish Balay 11124e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11134e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11144e2b4712SSatish Balay 11154e2b4712SSatish Balay /* forward solve the lower triangular */ 111687828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11174e2b4712SSatish Balay for (i=1; i<n; i++) { 11184e2b4712SSatish Balay v = aa + bs2*ai[i]; 11194e2b4712SSatish Balay vi = aj + ai[i]; 11204e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1121f1af5d2fSBarry Smith s = t + bs*i; 112287828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11234e2b4712SSatish 
Balay while (nz--) { 1124f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11254e2b4712SSatish Balay v += bs2; 11264e2b4712SSatish Balay } 11274e2b4712SSatish Balay } 11284e2b4712SSatish Balay /* backward solve the upper triangular */ 1129273d9f13SBarry Smith ls = a->solve_work + A->n; 11304e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 11314e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11324e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11334e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 113487828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11354e2b4712SSatish Balay while (nz--) { 1136f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11374e2b4712SSatish Balay v += bs2; 11384e2b4712SSatish Balay } 1139f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 114087828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11414e2b4712SSatish Balay } 11424e2b4712SSatish Balay 11434e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11444e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1145e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1146e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1147b0a32e0cSBarry Smith PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 11484e2b4712SSatish Balay PetscFunctionReturn(0); 11494e2b4712SSatish Balay } 11504e2b4712SSatish Balay 11514a2ae208SSatish Balay #undef __FUNCT__ 11524a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 11534e2b4712SSatish Balay int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11544e2b4712SSatish Balay { 11554e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11564e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11574e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 11584e2b4712SSatish Balay int *diag = a->diag; 
11593f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 116087828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 116187828ca2SBarry Smith PetscScalar *x,*b,*t; 11624e2b4712SSatish Balay 11634e2b4712SSatish Balay PetscFunctionBegin; 1164e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1165e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1166f1af5d2fSBarry Smith t = a->solve_work; 11674e2b4712SSatish Balay 11684e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11694e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11704e2b4712SSatish Balay 11714e2b4712SSatish Balay /* forward solve the lower triangular */ 11724e2b4712SSatish Balay idx = 7*(*r++); 1173f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1174f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1175f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 11764e2b4712SSatish Balay 11774e2b4712SSatish Balay for (i=1; i<n; i++) { 11784e2b4712SSatish Balay v = aa + 49*ai[i]; 11794e2b4712SSatish Balay vi = aj + ai[i]; 11804e2b4712SSatish Balay nz = diag[i] - ai[i]; 11814e2b4712SSatish Balay idx = 7*(*r++); 1182f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1183f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 11844e2b4712SSatish Balay while (nz--) { 11854e2b4712SSatish Balay idx = 7*(*vi++); 1186f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1187f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1188f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1189f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1190f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1191f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1192f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + 
v[31]*x5 + v[38]*x6 + v[45]*x7; 1193f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1194f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1195f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 11964e2b4712SSatish Balay v += 49; 11974e2b4712SSatish Balay } 11984e2b4712SSatish Balay idx = 7*i; 1199f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1200f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1201f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12024e2b4712SSatish Balay } 12034e2b4712SSatish Balay /* backward solve the upper triangular */ 12044e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12054e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12064e2b4712SSatish Balay vi = aj + diag[i] + 1; 12074e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12084e2b4712SSatish Balay idt = 7*i; 1209f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1210f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1211f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12124e2b4712SSatish Balay while (nz--) { 12134e2b4712SSatish Balay idx = 7*(*vi++); 1214f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1215f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1216f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1217f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1218f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1219f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1220f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1221f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1222f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 
+ v[33]*x5 + v[40]*x6 + v[47]*x7; 1223f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12244e2b4712SSatish Balay v += 49; 12254e2b4712SSatish Balay } 12264e2b4712SSatish Balay idc = 7*(*c--); 12274e2b4712SSatish Balay v = aa + 49*diag[i]; 1228f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1229f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1230f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1231f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1232f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1233f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1234f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1235f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1236f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1237f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1238f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1239f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1240f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1241f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12424e2b4712SSatish Balay } 12434e2b4712SSatish Balay 12444e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12454e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1246e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1247e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1248b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 12494e2b4712SSatish Balay PetscFunctionReturn(0); 12504e2b4712SSatish Balay }

/*
   Notes shared by the MatSolve_SeqBAIJ_<bs>[_NaturalOrdering] routines below.

   - Block row i owns the blocks ai[i] .. ai[i+1]-1; aj[] holds their block
     column indices and aa[] holds bs*bs scalars per block.
   - Inside one block, entry (row r, column c) is read as v[r + bs*c].
   - Blocks ai[i] .. diag[i]-1 are used by the forward (lower triangular)
     sweep, blocks diag[i]+1 .. ai[i+1]-1 by the backward (upper triangular)
     sweep.
   - The block at diag[i] is applied by multiplication at the end of the
     backward sweep, so it is presumably stored already inverted by the
     factorization -- TODO confirm against the factorization routines.
   - NOTE(review): block row 0 is copied unconditionally before the forward
     loop, so these routines assume at least one block row (a->mbs >= 1).
   - Each routine ends with PetscFunctionReturn(0); every Vec/IS accessor call
     is followed by CHKERRQ(ierr).
*/

/*
   MatSolve_SeqBAIJ_7_NaturalOrdering - Forward and backward block-triangular
   sweeps with 7x7 blocks and no row/column index sets.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a, i, j, diag, mbs) is read
-  bb - right-hand side

   Output Parameter:
.  xx - solution; it also holds the intermediate result of the forward sweep
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
  int         i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  int         ierr,*diag = a->diag,jdx;
  MatScalar   *aa=a->a,*v;
  PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  /* block row 0 is a plain copy of the right-hand side */
  idx  = 0;
  x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
  x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
  x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 7*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
    s4  = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    s7  = b[6+idx];
    while (nz--) {
      /* subtract (block) * (already computed x at block column *vi) */
      jdx = 7*(*vi++);
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
      x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
      x7  = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 49*diag[i] + 49;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 7*i;
    s1  = x[idt];   s2 = x[1+idt];
    s3  = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    s7  = x[6+idt];
    while (nz--) {
      idx = 7*(*vi++);
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
      x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
      x7  = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    /* apply the diagonal block of row i to the accumulated right-hand side */
    v        = aa + 49*diag[i];
    x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
                                  + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
                                  + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
                                  + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
                                  + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
                                  + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
                                  + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
                                  + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* bs = 7, so the logged count uses 49 = 7*7 and 7, exactly as
     MatSolve_SeqBAIJ_7 does (not the 36 and 6 of the 6x6 routines) */
  PetscLogFlops(2*49*(a->nz) - 7*A->n);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6 - Forward and backward block-triangular sweeps with 6x6
   blocks, using the index sets a->row and a->col.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a, i, j, diag, mbs, row, col,
        solve_work) is read
-  bb - right-hand side; block i of the work vector is read from block
        r[i] of bb, r being the indices of a->row

   Output Parameter:
.  xx - solution; block i of the result is written to block c[i] of xx,
        c being the indices of a->col

   The sweeps run in the work array a->solve_work.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6"
int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
  IS          iscol=a->col,isrow=a->row;
  int         *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
  int         *diag = a->diag;
  MatScalar   *aa=a->a,*v;
  PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  /* r walks the row indices forward; c walks the column indices backward */
  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  idx  = 6*(*r++);
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx];
  t[4] = b[4+idx]; t[5] = b[5+idx];
  for (i=1; i<n; i++) {
    v   = aa + 36*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 6*(*r++);
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx]; s6 = b[5+idx];
    while (nz--) {
      idx = 6*(*vi++);
      x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
      x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
      s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    idx      = 6*i;
    t[idx]   = s1;t[1+idx] = s2;
    t[2+idx] = s3;t[3+idx] = s4;
    t[4+idx] = s5;t[5+idx] = s6;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 36*diag[i] + 36;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 6*i;
    s1  = t[idt];  s2 = t[1+idt];
    s3  = t[2+idt];s4 = t[3+idt];
    s5  = t[4+idt];s6 = t[5+idt];
    while (nz--) {
      idx = 6*(*vi++);
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx];
      x5  = t[4+idx]; x6 = t[5+idx];
      s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    /* apply the diagonal block; store in t for later rows and in x at the
       position given by the column index set */
    idc      = 6*(*c--);
    v        = aa + 36*diag[i];
    x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
                                 v[18]*s4+v[24]*s5+v[30]*s6;
    x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
                                 v[19]*s4+v[25]*s5+v[31]*s6;
    x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
                                 v[20]*s4+v[26]*s5+v[32]*s6;
    x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
                                 v[21]*s4+v[27]*s5+v[33]*s6;
    x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
                                 v[22]*s4+v[28]*s5+v[34]*s6;
    x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
                                 v[23]*s4+v[29]*s5+v[35]*s6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  PetscLogFlops(2*36*(a->nz) - 6*A->n);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6_NaturalOrdering - Forward and backward block-triangular
   sweeps with 6x6 blocks and no row/column index sets.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a, i, j, diag, mbs) is read
-  bb - right-hand side

   Output Parameter:
.  xx - solution; it also holds the intermediate result of the forward sweep
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
  int         i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  int         ierr,*diag = a->diag,jdx;
  MatScalar   *aa=a->a,*v;
  PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
  x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
  for (i=1; i<n; i++) {
    v   = aa + 36*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 6*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
    s4  = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    while (nz--) {
      jdx = 6*(*vi++);
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
      x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
      s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 36*diag[i] + 36;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 6*i;
    s1  = x[idt];   s2 = x[1+idt];
    s3  = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    while (nz--) {
      idx = 6*(*vi++);
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
      x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
      s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    /* apply the diagonal block of row i to the accumulated right-hand side */
    v        = aa + 36*diag[i];
    x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
    x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
    x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
    x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
  }

  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  PetscLogFlops(2*36*(a->nz) - 6*A->n);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_5 - Forward and backward block-triangular sweeps with 5x5
   blocks, using the index sets a->row and a->col.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a, i, j, diag, mbs, row, col,
        solve_work) is read
-  bb - right-hand side; block i of the work vector is read from block
        r[i] of bb, r being the indices of a->row

   Output Parameter:
.  xx - solution; block i of the result is written to block c[i] of xx,
        c being the indices of a->col

   The sweeps run in the work array a->solve_work.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_5"
int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
  IS          iscol=a->col,isrow=a->row;
  int         *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
  int         *diag = a->diag;
  MatScalar   *aa=a->a,*v;
  PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  /* r walks the row indices forward; c walks the column indices backward */
  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  idx  = 5*(*r++);
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
  for (i=1; i<n; i++) {
    v   = aa + 25*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 5*(*r++);
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx];
    while (nz--) {
      idx = 5*(*vi++);
      x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
      x4  = t[3+idx];x5 = t[4+idx];
      s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
      s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
      s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
      s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
      s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
      v  += 25;
    }
    idx      = 5*i;
    t[idx]   = s1;t[1+idx] = s2;
    t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 25*diag[i] + 25;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 5*i;
    s1  = t[idt];  s2 = t[1+idt];
    s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
    while (nz--) {
      idx = 5*(*vi++);
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
      s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
      s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
      s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
      s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
      s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
      v  += 25;
    }
    /* apply the diagonal block; store in t for later rows and in x at the
       position given by the column index set */
    idc      = 5*(*c--);
    v        = aa + 25*diag[i];
    x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
                                 v[15]*s4+v[20]*s5;
    x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
                                 v[16]*s4+v[21]*s5;
    x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
                                 v[17]*s4+v[22]*s5;
    x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
                                 v[18]*s4+v[23]*s5;
    x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
                                 v[19]*s4+v[24]*s5;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  PetscLogFlops(2*25*(a->nz) - 5*A->n);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_5_NaturalOrdering - Forward and backward block-triangular
   sweeps with 5x5 blocks and no row/column index sets.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a, i, j, diag, mbs) is read
-  bb - right-hand side

   Output Parameter:
.  xx - solution; it also holds the intermediate result of the forward sweep
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
  int         i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  int         ierr,*diag = a->diag,jdx;
  MatScalar   *aa=a->a,*v;
  PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
  for (i=1; i<n; i++) {
    v   = aa + 25*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 5*i;
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
    while (nz--) {
      jdx = 5*(*vi++);
      x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
      s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
      s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
      s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
      s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
      s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
      v  += 25;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 25*diag[i] + 25;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 5*i;
    s1  = x[idt];  s2 = x[1+idt];
    s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
    while (nz--) {
      idx = 5*(*vi++);
      x1  = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
      s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
      s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
      s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
      s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
      s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
      v  += 25;
    }
    /* apply the diagonal block of row i to the accumulated right-hand side */
    v        = aa + 25*diag[i];
    x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
    x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
    x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
    x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
    x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
  }

  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  PetscLogFlops(2*25*(a->nz) - 5*A->n);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_4 - Forward and backward block-triangular sweeps with 4x4
   blocks, using the index sets a->row and a->col.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a, i, j, diag, mbs, row, col,
        solve_work) is read
-  bb - right-hand side; block i of the work vector is read from block
        r[i] of bb, r being the indices of a->row

   Output Parameter:
.  xx - solution; block i of the result is written to block c[i] of xx,
        c being the indices of a->col

   The sweeps run in the work array a->solve_work.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4"
int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
  IS          iscol=a->col,isrow=a->row;
  int         *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
  int         *diag = a->diag;
  MatScalar   *aa=a->a,*v;
  PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  /* r walks the row indices forward; c walks the column indices backward */
  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  idx  = 4*(*r++);
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + 16*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 4*(*r++);
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    while (nz--) {
      idx = 4*(*vi++);
      x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    idx      = 4*i;
    t[idx]   = s1;t[1+idx] = s2;
    t[2+idx] = s3;t[3+idx] = s4;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 16*diag[i] + 16;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 4*i;
    s1  = t[idt];  s2 = t[1+idt];
    s3  = t[2+idt];s4 = t[3+idt];
    while (nz--) {
      idx = 4*(*vi++);
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    /* apply the diagonal block; store in t for later rows and in x at the
       position given by the column index set */
    idc      = 4*(*c--);
    v        = aa + 16*diag[i];
    x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
    x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
    x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
    x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  PetscLogFlops(2*16*(a->nz) - 4*A->n);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_4_Demotion - Same sweeps as MatSolve_SeqBAIJ_4 (4x4 blocks,
   index sets a->row and a->col), but every accumulator and the work array
   are MatScalar instead of PetscScalar.

   Right-hand-side entries are cast to MatScalar when read and results are
   cast back to PetscScalar when written to xx. a->solve_work is used through
   a MatScalar pointer.
   NOTE(review): presumably intended for builds where MatScalar is narrower
   than PetscScalar, in which case the solve runs in reduced precision --
   confirm against the MatScalar definition.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a, i, j, diag, mbs, row, col,
        solve_work) is read
-  bb - right-hand side

   Output Parameter:
.  xx - solution
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
  IS          iscol=a->col,isrow=a->row;
  int         *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
  int         *diag = a->diag;
  MatScalar   *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
  PetscScalar *x,*b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = (MatScalar *)a->solve_work;

  /* r walks the row indices forward; c walks the column indices backward */
  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  idx  = 4*(*r++);
  t[0] = (MatScalar)b[idx];
  t[1] = (MatScalar)b[1+idx];
  t[2] = (MatScalar)b[2+idx];
  t[3] = (MatScalar)b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + 16*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 4*(*r++);
    s1  = (MatScalar)b[idx];
    s2  = (MatScalar)b[1+idx];
    s3  = (MatScalar)b[2+idx];
    s4  = (MatScalar)b[3+idx];
    while (nz--) {
      idx = 4*(*vi++);
      x1  = t[idx];
      x2  = t[1+idx];
      x3  = t[2+idx];
      x4  = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    idx      = 4*i;
    t[idx]   = s1;
    t[1+idx] = s2;
    t[2+idx] = s3;
    t[3+idx] = s4;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 16*diag[i] + 16;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 4*i;
    s1  = t[idt];
    s2  = t[1+idt];
    s3  = t[2+idt];
    s4  = t[3+idt];
    while (nz--) {
      idx = 4*(*vi++);
      x1  = t[idx];
      x2  = t[1+idx];
      x3  = t[2+idx];
      x4  = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    /* apply the diagonal block in MatScalar, then widen into x */
    idc      = 4*(*c--);
    v        = aa + 16*diag[i];
    t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
    t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
    t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
    t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
    x[idc]   = (PetscScalar)t[idt];
    x[1+idc] = (PetscScalar)t[1+idt];
    x[2+idc] = (PetscScalar)t[2+idt];
    x[3+idc] = (PetscScalar)t[3+idt];
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  PetscLogFlops(2*16*(a->nz) - 4*A->n);
  PetscFunctionReturn(0);
}

184624c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 184724c233c2SKris Buschelman 184824c233c2SKris Buschelman #include PETSC_HAVE_SSE 184924c233c2SKris Buschelman 185024c233c2SKris Buschelman #undef __FUNCT__ 185124c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 185224c233c2SKris Buschelman int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 185324c233c2SKris Buschelman { 185424c233c2SKris Buschelman /* 185524c233c2SKris Buschelman Note: This code uses demotion of double 185624c233c2SKris Buschelman to float when performing the mixed-mode computation. 185724c233c2SKris Buschelman This may not be numerically reasonable for all applications. 185824c233c2SKris Buschelman */ 185924c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 186024c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 186124c233c2SKris Buschelman int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 186224c233c2SKris Buschelman int *diag = a->diag,ai16; 186324c233c2SKris Buschelman MatScalar *aa=a->a,*v; 186487828ca2SBarry Smith PetscScalar *x,*b,*t; 186524c233c2SKris Buschelman 186624c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 186724c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 186824c233c2SKris Buschelman unsigned long offset; 186924c233c2SKris Buschelman 187024c233c2SKris Buschelman PetscFunctionBegin; 187124c233c2SKris Buschelman SSE_SCOPE_BEGIN; 187224c233c2SKris Buschelman 187324c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 187424c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 187524c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 187624c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 187724c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 187824c233c2SKris Buschelman 187924c233c2SKris Buschelman ierr = 
VecGetArray(bb,&b);CHKERRQ(ierr); 188024c233c2SKris Buschelman ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 188124c233c2SKris Buschelman t = a->solve_work; 188224c233c2SKris Buschelman 188324c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 188424c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 188524c233c2SKris Buschelman 188624c233c2SKris Buschelman /* forward solve the lower triangular */ 188724c233c2SKris Buschelman idx = 4*(*r++); 188824c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 188924c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 189024c233c2SKris Buschelman v = aa + 16*ai[1]; 189124c233c2SKris Buschelman 189224c233c2SKris Buschelman for (i=1; i<n;) { 189324c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 189424c233c2SKris Buschelman vi = aj + ai[i]; 189524c233c2SKris Buschelman nz = diag[i] - ai[i]; 189624c233c2SKris Buschelman idx = 4*(*r++); 189724c233c2SKris Buschelman 189824c233c2SKris Buschelman /* Demote sum from double to float */ 189924c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 190024c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 190124c233c2SKris Buschelman 190224c233c2SKris Buschelman while (nz--) { 190324c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 190424c233c2SKris Buschelman idx = 4*(*vi++); 190524c233c2SKris Buschelman 190624c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 190724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 190824c233c2SKris Buschelman 190924c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 191024c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 191124c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 191224c233c2SKris Buschelman 191324c233c2SKris Buschelman /* First Column */ 191424c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 191524c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 191624c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 
191724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 191824c233c2SKris Buschelman 191924c233c2SKris Buschelman /* Second Column */ 192024c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 192124c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 192224c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 192324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 192424c233c2SKris Buschelman 192524c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 192624c233c2SKris Buschelman 192724c233c2SKris Buschelman /* Third Column */ 192824c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 192924c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 193024c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 193124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 193224c233c2SKris Buschelman 193324c233c2SKris Buschelman /* Fourth Column */ 193424c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 193524c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 193624c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 193724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 193824c233c2SKris Buschelman SSE_INLINE_END_2 193924c233c2SKris Buschelman 194024c233c2SKris Buschelman v += 16; 194124c233c2SKris Buschelman } 194224c233c2SKris Buschelman idx = 4*i; 194324c233c2SKris Buschelman v = aa + 16*ai[++i]; 194424c233c2SKris Buschelman PREFETCH_NTA(v); 194524c233c2SKris Buschelman STORE_PS(tmps,XMM7); 194624c233c2SKris Buschelman 194724c233c2SKris Buschelman /* Promote result from float to double */ 194824c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 194924c233c2SKris Buschelman } 195024c233c2SKris Buschelman /* backward solve the upper triangular */ 195124c233c2SKris Buschelman idt = 4*(n-1); 195224c233c2SKris Buschelman ai16 = 16*diag[n-1]; 195324c233c2SKris Buschelman v = aa + ai16 + 16; 195424c233c2SKris Buschelman for (i=n-1; i>=0;){ 195524c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 195624c233c2SKris Buschelman vi = aj + diag[i] + 1; 195724c233c2SKris Buschelman nz = ai[i+1] - 
diag[i] - 1; 195824c233c2SKris Buschelman 195924c233c2SKris Buschelman /* Demote accumulator from double to float */ 196024c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 196124c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 196224c233c2SKris Buschelman 196324c233c2SKris Buschelman while (nz--) { 196424c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 196524c233c2SKris Buschelman idx = 4*(*vi++); 196624c233c2SKris Buschelman 196724c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 196824c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 196924c233c2SKris Buschelman 197024c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 197124c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 197224c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 197324c233c2SKris Buschelman 197424c233c2SKris Buschelman /* First Column */ 197524c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 197624c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 197724c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 197824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 197924c233c2SKris Buschelman 198024c233c2SKris Buschelman /* Second Column */ 198124c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 198224c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 198324c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 198424c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 198524c233c2SKris Buschelman 198624c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 198724c233c2SKris Buschelman 198824c233c2SKris Buschelman /* Third Column */ 198924c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 199024c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 199124c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 199224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 199324c233c2SKris Buschelman 199424c233c2SKris Buschelman /* Fourth Column */ 199524c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 199624c233c2SKris Buschelman 
SSE_SHUFFLE(XMM3,XMM3,0xFF) 199724c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 199824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 199924c233c2SKris Buschelman SSE_INLINE_END_2 200024c233c2SKris Buschelman v += 16; 200124c233c2SKris Buschelman } 200224c233c2SKris Buschelman v = aa + ai16; 200324c233c2SKris Buschelman ai16 = 16*diag[--i]; 200424c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 200524c233c2SKris Buschelman /* 200624c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 200724c233c2SKris Buschelman which was inverted as part of the factorization 200824c233c2SKris Buschelman */ 200924c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 201024c233c2SKris Buschelman /* First Column */ 201124c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 201224c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 201324c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 201424c233c2SKris Buschelman 201524c233c2SKris Buschelman /* Second Column */ 201624c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 201724c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 201824c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 201924c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 202024c233c2SKris Buschelman 202124c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 202224c233c2SKris Buschelman 202324c233c2SKris Buschelman /* Third Column */ 202424c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 202524c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 202624c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 202724c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 202824c233c2SKris Buschelman 202924c233c2SKris Buschelman /* Fourth Column */ 203024c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 203124c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 203224c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 203324c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 203424c233c2SKris Buschelman 203524c233c2SKris Buschelman 
SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 203624c233c2SKris Buschelman SSE_INLINE_END_3 203724c233c2SKris Buschelman 203824c233c2SKris Buschelman /* Promote solution from float to double */ 203924c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 204024c233c2SKris Buschelman 204124c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 204224c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 204324c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 204424c233c2SKris Buschelman idc = 4*(*c--); 204524c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 204624c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 204724c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 204824c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 204924c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 205024c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 205124c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 205224c233c2SKris Buschelman SSE_INLINE_END_2 205324c233c2SKris Buschelman v = aa + ai16 + 16; 205424c233c2SKris Buschelman idt -= 4; 205524c233c2SKris Buschelman } 205624c233c2SKris Buschelman 205724c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 205824c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 205924c233c2SKris Buschelman ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 206024c233c2SKris Buschelman ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 206124c233c2SKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 206224c233c2SKris Buschelman SSE_SCOPE_END; 206324c233c2SKris Buschelman PetscFunctionReturn(0); 206424c233c2SKris Buschelman } 206524c233c2SKris Buschelman 206624c233c2SKris Buschelman #endif 20670ef38995SBarry Smith 20680ef38995SBarry Smith 20694e2b4712SSatish Balay /* 20704e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 
20714e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 20724e2b4712SSatish Balay */ 20734a2ae208SSatish Balay #undef __FUNCT__ 20744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 20754e2b4712SSatish Balay int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 20764e2b4712SSatish Balay { 20774e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 207830d4dcafSBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 207930d4dcafSBarry Smith int ierr,*diag = a->diag; 20803f1db9ecSBarry Smith MatScalar *aa=a->a; 208187828ca2SBarry Smith PetscScalar *x,*b; 20824e2b4712SSatish Balay 20834e2b4712SSatish Balay PetscFunctionBegin; 2084e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2085e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 20864e2b4712SSatish Balay 2087aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 20882853dc0eSBarry Smith { 208987828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 20902853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 20912853dc0eSBarry Smith } 2092aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 20932853dc0eSBarry Smith { 209487828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 20952853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 20962853dc0eSBarry Smith } 2097aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 20982853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2099e1293385SBarry Smith #else 210030d4dcafSBarry Smith { 210187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 21023f1db9ecSBarry Smith MatScalar *v; 21034e555682SBarry Smith int jdx,idt,idx,nz,*vi,i,ai16; 2104e1293385SBarry Smith 21054e2b4712SSatish Balay /* forward solve the lower triangular */ 21064e2b4712SSatish Balay idx = 0; 2107e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 
21084e2b4712SSatish Balay for (i=1; i<n; i++) { 21094e2b4712SSatish Balay v = aa + 16*ai[i]; 21104e2b4712SSatish Balay vi = aj + ai[i]; 21114e2b4712SSatish Balay nz = diag[i] - ai[i]; 2112e1293385SBarry Smith idx += 4; 2113f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 21144e2b4712SSatish Balay while (nz--) { 21154e2b4712SSatish Balay jdx = 4*(*vi++); 21164e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2117f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2118f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2119f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2120f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 21214e2b4712SSatish Balay v += 16; 21224e2b4712SSatish Balay } 2123f1af5d2fSBarry Smith x[idx] = s1; 2124f1af5d2fSBarry Smith x[1+idx] = s2; 2125f1af5d2fSBarry Smith x[2+idx] = s3; 2126f1af5d2fSBarry Smith x[3+idx] = s4; 21274e2b4712SSatish Balay } 21284e2b4712SSatish Balay /* backward solve the upper triangular */ 21294e555682SBarry Smith idt = 4*(n-1); 21304e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 21314e555682SBarry Smith ai16 = 16*diag[i]; 21324e555682SBarry Smith v = aa + ai16 + 16; 21334e2b4712SSatish Balay vi = aj + diag[i] + 1; 21344e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2135f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2136f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 21374e2b4712SSatish Balay while (nz--) { 21384e2b4712SSatish Balay idx = 4*(*vi++); 21394e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2140f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2141f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2142f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2143f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 21444e2b4712SSatish Balay v += 16; 21454e2b4712SSatish Balay } 21464e555682SBarry Smith v 
= aa + ai16; 2147f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2148f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2149f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2150f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2151329f5518SBarry Smith idt -= 4; 21524e2b4712SSatish Balay } 215330d4dcafSBarry Smith } 2154e1293385SBarry Smith #endif 21554e2b4712SSatish Balay 2156e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2157e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2158b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 21594e2b4712SSatish Balay PetscFunctionReturn(0); 21604e2b4712SSatish Balay } 21614e2b4712SSatish Balay 2162f26ec98cSKris Buschelman #undef __FUNCT__ 2163f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2164f26ec98cSKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2165f26ec98cSKris Buschelman { 2166f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2167f26ec98cSKris Buschelman int n=a->mbs,*ai=a->i,*aj=a->j; 2168f26ec98cSKris Buschelman int ierr,*diag = a->diag; 2169f26ec98cSKris Buschelman MatScalar *aa=a->a; 2170f26ec98cSKris Buschelman PetscScalar *x,*b; 2171f26ec98cSKris Buschelman 2172f26ec98cSKris Buschelman PetscFunctionBegin; 2173f26ec98cSKris Buschelman ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2174f26ec98cSKris Buschelman ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2175f26ec98cSKris Buschelman 2176f26ec98cSKris Buschelman { 2177f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2178f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 2179f26ec98cSKris Buschelman int jdx,idt,idx,nz,*vi,i,ai16; 2180f26ec98cSKris Buschelman 2181f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2182f26ec98cSKris Buschelman idx = 0; 2183f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 2184f26ec98cSKris 
Buschelman t[1] = (MatScalar)b[1]; 2185f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 2186f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 2187f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2188f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2189f26ec98cSKris Buschelman vi = aj + ai[i]; 2190f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2191f26ec98cSKris Buschelman idx += 4; 2192f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2193f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2194f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2195f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2196f26ec98cSKris Buschelman while (nz--) { 2197f26ec98cSKris Buschelman jdx = 4*(*vi++); 2198f26ec98cSKris Buschelman x1 = t[jdx]; 2199f26ec98cSKris Buschelman x2 = t[1+jdx]; 2200f26ec98cSKris Buschelman x3 = t[2+jdx]; 2201f26ec98cSKris Buschelman x4 = t[3+jdx]; 2202f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2203f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2204f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2205f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2206f26ec98cSKris Buschelman v += 16; 2207f26ec98cSKris Buschelman } 2208f26ec98cSKris Buschelman t[idx] = s1; 2209f26ec98cSKris Buschelman t[1+idx] = s2; 2210f26ec98cSKris Buschelman t[2+idx] = s3; 2211f26ec98cSKris Buschelman t[3+idx] = s4; 2212f26ec98cSKris Buschelman } 2213f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2214f26ec98cSKris Buschelman idt = 4*(n-1); 2215f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2216f26ec98cSKris Buschelman ai16 = 16*diag[i]; 2217f26ec98cSKris Buschelman v = aa + ai16 + 16; 2218f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2219f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2220f26ec98cSKris Buschelman s1 = t[idt]; 2221f26ec98cSKris Buschelman s2 = t[1+idt]; 2222f26ec98cSKris Buschelman s3 = t[2+idt]; 2223f26ec98cSKris Buschelman s4 = t[3+idt]; 2224f26ec98cSKris 
Buschelman while (nz--) { 2225f26ec98cSKris Buschelman idx = 4*(*vi++); 2226f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 2227f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 2228f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 2229f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 2230f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2231f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2232f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2233f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2234f26ec98cSKris Buschelman v += 16; 2235f26ec98cSKris Buschelman } 2236f26ec98cSKris Buschelman v = aa + ai16; 2237f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2238f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2239f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2240f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2241f26ec98cSKris Buschelman idt -= 4; 2242f26ec98cSKris Buschelman } 2243f26ec98cSKris Buschelman } 2244f26ec98cSKris Buschelman 2245f26ec98cSKris Buschelman ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2246f26ec98cSKris Buschelman ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2247f26ec98cSKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 2248f26ec98cSKris Buschelman PetscFunctionReturn(0); 2249f26ec98cSKris Buschelman } 2250f26ec98cSKris Buschelman 22513660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 22523660e330SKris Buschelman 22533660e330SKris Buschelman #include PETSC_HAVE_SSE 22546f6a888dSBarry Smith #include "src/vec/vecimpl.h" /* to allow VecGetArrayFast() */ 22553660e330SKris Buschelman #undef __FUNCT__ 22567cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 22577cf1b8d3SKris Buschelman int 
MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 22583660e330SKris Buschelman { 22593660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 22602aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 22612aa5897fSKris Buschelman int ierr,*ai=a->i,n=a->mbs,*diag = a->diag; 22623660e330SKris Buschelman MatScalar *aa=a->a; 226387828ca2SBarry Smith PetscScalar *x,*b; 22643660e330SKris Buschelman 22653660e330SKris Buschelman PetscFunctionBegin; 22663660e330SKris Buschelman SSE_SCOPE_BEGIN; 22673660e330SKris Buschelman /* 22683660e330SKris Buschelman Note: This code currently uses demotion of double 22693660e330SKris Buschelman to float when performing the mixed-mode computation. 22703660e330SKris Buschelman This may not be numerically reasonable for all applications. 22713660e330SKris Buschelman */ 22723660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 22733660e330SKris Buschelman 22746f6a888dSBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 22756f6a888dSBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 22763660e330SKris Buschelman { 2277eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 2278eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 22792aa5897fSKris Buschelman int nz,i,idt,ai16; 22802aa5897fSKris Buschelman unsigned int jdx,idx; 22812aa5897fSKris Buschelman unsigned short *vi; 2282eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 22833660e330SKris Buschelman 2284eb05f457SKris Buschelman /* First block is the identity. 
*/ 22853660e330SKris Buschelman idx = 0; 2286eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 22872aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 22883660e330SKris Buschelman 22893660e330SKris Buschelman for (i=1; i<n;) { 22903660e330SKris Buschelman PREFETCH_NTA(&v[8]); 22913660e330SKris Buschelman vi = aj + ai[i]; 22923660e330SKris Buschelman nz = diag[i] - ai[i]; 22933660e330SKris Buschelman idx += 4; 22943660e330SKris Buschelman 2295eb05f457SKris Buschelman /* Demote RHS from double to float. */ 2296eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2297eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 22983660e330SKris Buschelman 22993660e330SKris Buschelman while (nz--) { 23003660e330SKris Buschelman PREFETCH_NTA(&v[16]); 23012aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 23023660e330SKris Buschelman 23033660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 2304eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 23053660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 23063660e330SKris Buschelman 23073660e330SKris Buschelman /* First Column */ 23083660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 23093660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 23103660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 23113660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 23123660e330SKris Buschelman 23133660e330SKris Buschelman /* Second Column */ 23143660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 23153660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 23163660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 23173660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 23183660e330SKris Buschelman 23193660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 23203660e330SKris Buschelman 23213660e330SKris Buschelman /* Third Column */ 23223660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 23233660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 23243660e330SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 23253660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 23263660e330SKris Buschelman 23273660e330SKris Buschelman /* Fourth Column */ 23283660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 23293660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 23303660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 23313660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 23323660e330SKris Buschelman SSE_INLINE_END_2 23333660e330SKris Buschelman 23343660e330SKris Buschelman v += 16; 23353660e330SKris Buschelman } 23363660e330SKris Buschelman v = aa + 16*ai[++i]; 23373660e330SKris Buschelman PREFETCH_NTA(v); 2338eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 23393660e330SKris Buschelman } 2340eb05f457SKris Buschelman 2341eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 2342eb05f457SKris Buschelman 23433660e330SKris Buschelman idt = 4*(n-1); 23443660e330SKris Buschelman ai16 = 16*diag[n-1]; 23453660e330SKris Buschelman v = aa + ai16 + 16; 23463660e330SKris Buschelman for (i=n-1; i>=0;){ 23473660e330SKris Buschelman PREFETCH_NTA(&v[8]); 23483660e330SKris Buschelman vi = aj + diag[i] + 1; 23493660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 23503660e330SKris Buschelman 2351eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 23523660e330SKris Buschelman 23533660e330SKris Buschelman while (nz--) { 23543660e330SKris Buschelman PREFETCH_NTA(&v[16]); 23552aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 23563660e330SKris Buschelman 23573660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 2358eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 23593660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 23603660e330SKris Buschelman 23613660e330SKris Buschelman /* First Column */ 23623660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 23633660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 23643660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 23653660e330SKris Buschelman 
SSE_SUB_PS(XMM7,XMM0) 23663660e330SKris Buschelman 23673660e330SKris Buschelman /* Second Column */ 23683660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 23693660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 23703660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 23713660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 23723660e330SKris Buschelman 23733660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 23743660e330SKris Buschelman 23753660e330SKris Buschelman /* Third Column */ 23763660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 23773660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 23783660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 23793660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 23803660e330SKris Buschelman 23813660e330SKris Buschelman /* Fourth Column */ 23823660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 23833660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 23843660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 23853660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 23863660e330SKris Buschelman SSE_INLINE_END_2 23873660e330SKris Buschelman v += 16; 23883660e330SKris Buschelman } 23893660e330SKris Buschelman v = aa + ai16; 23903660e330SKris Buschelman ai16 = 16*diag[--i]; 23913660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 23923660e330SKris Buschelman /* 23933660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 23943660e330SKris Buschelman which was inverted as part of the factorization 23953660e330SKris Buschelman */ 2396eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 23973660e330SKris Buschelman /* First Column */ 23983660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 23993660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 24003660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 24013660e330SKris Buschelman 24023660e330SKris Buschelman /* Second Column */ 24033660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 24043660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 
24053660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 24063660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 24073660e330SKris Buschelman 24083660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 24093660e330SKris Buschelman 24103660e330SKris Buschelman /* Third Column */ 24113660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 24123660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24133660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 24143660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 24153660e330SKris Buschelman 24163660e330SKris Buschelman /* Fourth Column */ 24173660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 24183660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 24193660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 24203660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 24213660e330SKris Buschelman 24223660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 24233660e330SKris Buschelman SSE_INLINE_END_3 24243660e330SKris Buschelman 24253660e330SKris Buschelman v = aa + ai16 + 16; 24263660e330SKris Buschelman idt -= 4; 24273660e330SKris Buschelman } 2428eb05f457SKris Buschelman 2429eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 2430eb05f457SKris Buschelman idt = 4*(n-1); 2431eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 2432eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2433eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ 2434eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 2435eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 2436eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 2437eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 2438eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 2439eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 244054693613SKris Buschelman idt -= 4; 24413660e330SKris Buschelman } 2442eb05f457SKris Buschelman 2443eb05f457SKris Buschelman } /* End of artificial scope. */ 24446f6a888dSBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 24456f6a888dSBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 24463660e330SKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 24473660e330SKris Buschelman SSE_SCOPE_END; 24483660e330SKris Buschelman PetscFunctionReturn(0); 24493660e330SKris Buschelman } 24503660e330SKris Buschelman 24517cf1b8d3SKris Buschelman #undef __FUNCT__ 24527cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 24537cf1b8d3SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 24547cf1b8d3SKris Buschelman { 24557cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 24567cf1b8d3SKris Buschelman int *aj=a->j; 24577cf1b8d3SKris Buschelman int ierr,*ai=a->i,n=a->mbs,*diag = a->diag; 24587cf1b8d3SKris Buschelman MatScalar *aa=a->a; 24597cf1b8d3SKris Buschelman PetscScalar *x,*b; 24607cf1b8d3SKris Buschelman 24617cf1b8d3SKris Buschelman PetscFunctionBegin; 24627cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 24637cf1b8d3SKris Buschelman /* 24647cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 24657cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 24667cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
24677cf1b8d3SKris Buschelman */ 24687cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 24697cf1b8d3SKris Buschelman 24707cf1b8d3SKris Buschelman ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 24717cf1b8d3SKris Buschelman ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 24727cf1b8d3SKris Buschelman { 24737cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 24747cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 24757cf1b8d3SKris Buschelman int nz,i,idt,ai16; 24767cf1b8d3SKris Buschelman int jdx,idx; 24777cf1b8d3SKris Buschelman int *vi; 24787cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 24797cf1b8d3SKris Buschelman 24807cf1b8d3SKris Buschelman /* First block is the identity. */ 24817cf1b8d3SKris Buschelman idx = 0; 24827cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 24837cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 24847cf1b8d3SKris Buschelman 24857cf1b8d3SKris Buschelman for (i=1; i<n;) { 24867cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 24877cf1b8d3SKris Buschelman vi = aj + ai[i]; 24887cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 24897cf1b8d3SKris Buschelman idx += 4; 24907cf1b8d3SKris Buschelman 24917cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 24927cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 24937cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 24947cf1b8d3SKris Buschelman 24957cf1b8d3SKris Buschelman while (nz--) { 24967cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 24977cf1b8d3SKris Buschelman jdx = 4*(*vi++); 24987cf1b8d3SKris Buschelman /* jdx = *vi++; */ 24997cf1b8d3SKris Buschelman 25007cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 25017cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 25027cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 25037cf1b8d3SKris Buschelman 25047cf1b8d3SKris Buschelman /* First Column */ 25057cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 25067cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25077cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 25087cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 25097cf1b8d3SKris Buschelman 25107cf1b8d3SKris Buschelman /* Second Column */ 25117cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 25127cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25137cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 25147cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 25157cf1b8d3SKris Buschelman 25167cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 25177cf1b8d3SKris Buschelman 25187cf1b8d3SKris Buschelman /* Third Column */ 25197cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 25207cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 25217cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 25227cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 25237cf1b8d3SKris Buschelman 25247cf1b8d3SKris Buschelman /* Fourth Column */ 25257cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 25267cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25277cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 25287cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 25297cf1b8d3SKris Buschelman SSE_INLINE_END_2 25307cf1b8d3SKris Buschelman 25317cf1b8d3SKris 
Buschelman v += 16; 25327cf1b8d3SKris Buschelman } 25337cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 25347cf1b8d3SKris Buschelman PREFETCH_NTA(v); 25357cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 25367cf1b8d3SKris Buschelman } 25377cf1b8d3SKris Buschelman 25387cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 25397cf1b8d3SKris Buschelman 25407cf1b8d3SKris Buschelman idt = 4*(n-1); 25417cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 25427cf1b8d3SKris Buschelman v = aa + ai16 + 16; 25437cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 25447cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 25457cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 25467cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 25477cf1b8d3SKris Buschelman 25487cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 25497cf1b8d3SKris Buschelman 25507cf1b8d3SKris Buschelman while (nz--) { 25517cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 25527cf1b8d3SKris Buschelman idx = 4*(*vi++); 25537cf1b8d3SKris Buschelman /* idx = *vi++; */ 25547cf1b8d3SKris Buschelman 25557cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 25567cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 25577cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 25587cf1b8d3SKris Buschelman 25597cf1b8d3SKris Buschelman /* First Column */ 25607cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 25617cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25627cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 25637cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 25647cf1b8d3SKris Buschelman 25657cf1b8d3SKris Buschelman /* Second Column */ 25667cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 25677cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25687cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 25697cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 25707cf1b8d3SKris Buschelman 25717cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 25727cf1b8d3SKris Buschelman 
25737cf1b8d3SKris Buschelman /* Third Column */ 25747cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 25757cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 25767cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 25777cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 25787cf1b8d3SKris Buschelman 25797cf1b8d3SKris Buschelman /* Fourth Column */ 25807cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 25817cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25827cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 25837cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 25847cf1b8d3SKris Buschelman SSE_INLINE_END_2 25857cf1b8d3SKris Buschelman v += 16; 25867cf1b8d3SKris Buschelman } 25877cf1b8d3SKris Buschelman v = aa + ai16; 25887cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 25897cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 25907cf1b8d3SKris Buschelman /* 25917cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 25927cf1b8d3SKris Buschelman which was inverted as part of the factorization 25937cf1b8d3SKris Buschelman */ 25947cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 25957cf1b8d3SKris Buschelman /* First Column */ 25967cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 25977cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25987cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 25997cf1b8d3SKris Buschelman 26007cf1b8d3SKris Buschelman /* Second Column */ 26017cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 26027cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26037cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 26047cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 26057cf1b8d3SKris Buschelman 26067cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 26077cf1b8d3SKris Buschelman 26087cf1b8d3SKris Buschelman /* Third Column */ 26097cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 26107cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26117cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 26127cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 26137cf1b8d3SKris Buschelman 26147cf1b8d3SKris Buschelman /* Fourth Column */ 26157cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 26167cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26177cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 26187cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 26197cf1b8d3SKris Buschelman 26207cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 26217cf1b8d3SKris Buschelman SSE_INLINE_END_3 26227cf1b8d3SKris Buschelman 26237cf1b8d3SKris Buschelman v = aa + ai16 + 16; 26247cf1b8d3SKris Buschelman idt -= 4; 26257cf1b8d3SKris Buschelman } 26267cf1b8d3SKris Buschelman 26277cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 26287cf1b8d3SKris Buschelman idt = 4*(n-1); 26297cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 26307cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 26317cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 26327cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 26337cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 26347cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 26357cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 26367cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 26377cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 26387cf1b8d3SKris Buschelman idt -= 4; 26397cf1b8d3SKris Buschelman } 26407cf1b8d3SKris Buschelman 26417cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 26427cf1b8d3SKris Buschelman ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 26437cf1b8d3SKris Buschelman ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 26447cf1b8d3SKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 26457cf1b8d3SKris Buschelman SSE_SCOPE_END; 26467cf1b8d3SKris Buschelman PetscFunctionReturn(0); 26477cf1b8d3SKris Buschelman } 26487cf1b8d3SKris Buschelman 26493660e330SKris Buschelman #endif 26504a2ae208SSatish Balay #undef __FUNCT__ 26514a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 26524e2b4712SSatish Balay int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 26534e2b4712SSatish Balay { 26544e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 26554e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 26564e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 26574e2b4712SSatish Balay int *diag = a->diag; 26583f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 265987828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 26604e2b4712SSatish Balay 26614e2b4712SSatish Balay PetscFunctionBegin; 2662e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2663e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2664f1af5d2fSBarry Smith t = a->solve_work; 26654e2b4712SSatish Balay 26664e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 26674e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 26684e2b4712SSatish Balay 26694e2b4712SSatish Balay /* forward solve the lower triangular */ 26704e2b4712SSatish Balay idx = 3*(*r++); 2671f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 26724e2b4712SSatish Balay for (i=1; i<n; i++) { 26734e2b4712SSatish Balay v = aa + 9*ai[i]; 26744e2b4712SSatish Balay vi = aj + ai[i]; 26754e2b4712SSatish Balay nz = diag[i] - ai[i]; 26764e2b4712SSatish Balay idx = 3*(*r++); 2677f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 26784e2b4712SSatish Balay while (nz--) 
{ 26794e2b4712SSatish Balay idx = 3*(*vi++); 2680f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2681f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2682f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2683f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 26844e2b4712SSatish Balay v += 9; 26854e2b4712SSatish Balay } 26864e2b4712SSatish Balay idx = 3*i; 2687f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 26884e2b4712SSatish Balay } 26894e2b4712SSatish Balay /* backward solve the upper triangular */ 26904e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 26914e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 26924e2b4712SSatish Balay vi = aj + diag[i] + 1; 26934e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 26944e2b4712SSatish Balay idt = 3*i; 2695f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 26964e2b4712SSatish Balay while (nz--) { 26974e2b4712SSatish Balay idx = 3*(*vi++); 2698f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2699f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2700f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2701f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 27024e2b4712SSatish Balay v += 9; 27034e2b4712SSatish Balay } 27044e2b4712SSatish Balay idc = 3*(*c--); 27054e2b4712SSatish Balay v = aa + 9*diag[i]; 2706f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2707f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2708f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 27094e2b4712SSatish Balay } 27104e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 27114e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2712e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2713e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2714b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 27154e2b4712SSatish Balay 
PetscFunctionReturn(0); 27164e2b4712SSatish Balay } 27174e2b4712SSatish Balay 271815091d37SBarry Smith /* 271915091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 272015091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 272115091d37SBarry Smith */ 27224a2ae208SSatish Balay #undef __FUNCT__ 27234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 272415091d37SBarry Smith int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 272515091d37SBarry Smith { 272615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 272715091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 272815091d37SBarry Smith int ierr,*diag = a->diag; 272915091d37SBarry Smith MatScalar *aa=a->a,*v; 273087828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 273115091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 273215091d37SBarry Smith 273315091d37SBarry Smith PetscFunctionBegin; 273415091d37SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 273515091d37SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 273615091d37SBarry Smith 273715091d37SBarry Smith 273815091d37SBarry Smith /* forward solve the lower triangular */ 273915091d37SBarry Smith idx = 0; 274015091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 274115091d37SBarry Smith for (i=1; i<n; i++) { 274215091d37SBarry Smith v = aa + 9*ai[i]; 274315091d37SBarry Smith vi = aj + ai[i]; 274415091d37SBarry Smith nz = diag[i] - ai[i]; 274515091d37SBarry Smith idx += 3; 2746f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 274715091d37SBarry Smith while (nz--) { 274815091d37SBarry Smith jdx = 3*(*vi++); 274915091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2750f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2751f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2752f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 275315091d37SBarry Smith v += 9; 275415091d37SBarry Smith } 2755f1af5d2fSBarry Smith 
x[idx] = s1; 2756f1af5d2fSBarry Smith x[1+idx] = s2; 2757f1af5d2fSBarry Smith x[2+idx] = s3; 275815091d37SBarry Smith } 275915091d37SBarry Smith /* backward solve the upper triangular */ 276015091d37SBarry Smith for (i=n-1; i>=0; i--){ 276115091d37SBarry Smith v = aa + 9*diag[i] + 9; 276215091d37SBarry Smith vi = aj + diag[i] + 1; 276315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 276415091d37SBarry Smith idt = 3*i; 2765f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2766f1af5d2fSBarry Smith s3 = x[2+idt]; 276715091d37SBarry Smith while (nz--) { 276815091d37SBarry Smith idx = 3*(*vi++); 276915091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2770f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2771f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2772f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 277315091d37SBarry Smith v += 9; 277415091d37SBarry Smith } 277515091d37SBarry Smith v = aa + 9*diag[i]; 2776f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2777f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2778f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 277915091d37SBarry Smith } 278015091d37SBarry Smith 278115091d37SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 278215091d37SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2783b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 278415091d37SBarry Smith PetscFunctionReturn(0); 278515091d37SBarry Smith } 278615091d37SBarry Smith 27874a2ae208SSatish Balay #undef __FUNCT__ 27884a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 27894e2b4712SSatish Balay int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 27904e2b4712SSatish Balay { 27914e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 27924e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 27934e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 27944e2b4712SSatish Balay int *diag = a->diag; 27953f1db9ecSBarry Smith 
MatScalar *aa=a->a,*v; 279687828ca2SBarry Smith PetscScalar *x,*b,s1,s2,x1,x2,*t; 27974e2b4712SSatish Balay 27984e2b4712SSatish Balay PetscFunctionBegin; 2799e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2800e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2801f1af5d2fSBarry Smith t = a->solve_work; 28024e2b4712SSatish Balay 28034e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 28044e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 28054e2b4712SSatish Balay 28064e2b4712SSatish Balay /* forward solve the lower triangular */ 28074e2b4712SSatish Balay idx = 2*(*r++); 2808f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 28094e2b4712SSatish Balay for (i=1; i<n; i++) { 28104e2b4712SSatish Balay v = aa + 4*ai[i]; 28114e2b4712SSatish Balay vi = aj + ai[i]; 28124e2b4712SSatish Balay nz = diag[i] - ai[i]; 28134e2b4712SSatish Balay idx = 2*(*r++); 2814f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 28154e2b4712SSatish Balay while (nz--) { 28164e2b4712SSatish Balay idx = 2*(*vi++); 2817f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2818f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2819f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 28204e2b4712SSatish Balay v += 4; 28214e2b4712SSatish Balay } 28224e2b4712SSatish Balay idx = 2*i; 2823f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 28244e2b4712SSatish Balay } 28254e2b4712SSatish Balay /* backward solve the upper triangular */ 28264e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 28274e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 28284e2b4712SSatish Balay vi = aj + diag[i] + 1; 28294e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 28304e2b4712SSatish Balay idt = 2*i; 2831f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 28324e2b4712SSatish Balay while (nz--) { 28334e2b4712SSatish Balay idx = 2*(*vi++); 2834f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2835f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2836f1af5d2fSBarry Smith s2 -= 
v[1]*x1 + v[3]*x2; 28374e2b4712SSatish Balay v += 4; 28384e2b4712SSatish Balay } 28394e2b4712SSatish Balay idc = 2*(*c--); 28404e2b4712SSatish Balay v = aa + 4*diag[i]; 2841f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2842f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 28434e2b4712SSatish Balay } 28444e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28454e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2846e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2847e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2848b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 28494e2b4712SSatish Balay PetscFunctionReturn(0); 28504e2b4712SSatish Balay } 28514e2b4712SSatish Balay 285215091d37SBarry Smith /* 285315091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 285415091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 285515091d37SBarry Smith */ 28564a2ae208SSatish Balay #undef __FUNCT__ 28574a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 285815091d37SBarry Smith int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 285915091d37SBarry Smith { 286015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 286115091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 286215091d37SBarry Smith int ierr,*diag = a->diag; 286315091d37SBarry Smith MatScalar *aa=a->a,*v; 286487828ca2SBarry Smith PetscScalar *x,*b,s1,s2,x1,x2; 286515091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 286615091d37SBarry Smith 286715091d37SBarry Smith PetscFunctionBegin; 286815091d37SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 286915091d37SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 287015091d37SBarry Smith 287115091d37SBarry Smith /* forward solve the lower triangular */ 287215091d37SBarry Smith idx = 0; 287315091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 287415091d37SBarry Smith 
for (i=1; i<n; i++) { 287515091d37SBarry Smith v = aa + 4*ai[i]; 287615091d37SBarry Smith vi = aj + ai[i]; 287715091d37SBarry Smith nz = diag[i] - ai[i]; 287815091d37SBarry Smith idx += 2; 2879f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 288015091d37SBarry Smith while (nz--) { 288115091d37SBarry Smith jdx = 2*(*vi++); 288215091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 2883f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2884f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 288515091d37SBarry Smith v += 4; 288615091d37SBarry Smith } 2887f1af5d2fSBarry Smith x[idx] = s1; 2888f1af5d2fSBarry Smith x[1+idx] = s2; 288915091d37SBarry Smith } 289015091d37SBarry Smith /* backward solve the upper triangular */ 289115091d37SBarry Smith for (i=n-1; i>=0; i--){ 289215091d37SBarry Smith v = aa + 4*diag[i] + 4; 289315091d37SBarry Smith vi = aj + diag[i] + 1; 289415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 289515091d37SBarry Smith idt = 2*i; 2896f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 289715091d37SBarry Smith while (nz--) { 289815091d37SBarry Smith idx = 2*(*vi++); 289915091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 2900f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2901f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 290215091d37SBarry Smith v += 4; 290315091d37SBarry Smith } 290415091d37SBarry Smith v = aa + 4*diag[i]; 2905f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 2906f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 290715091d37SBarry Smith } 290815091d37SBarry Smith 290915091d37SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 291015091d37SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2911b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 291215091d37SBarry Smith PetscFunctionReturn(0); 291315091d37SBarry Smith } 291415091d37SBarry Smith 29154a2ae208SSatish Balay #undef __FUNCT__ 29164a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 29174e2b4712SSatish Balay int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 29184e2b4712SSatish 
Balay { 29194e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 29204e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 29214e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 29224e2b4712SSatish Balay int *diag = a->diag; 29233f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 292487828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 29254e2b4712SSatish Balay 29264e2b4712SSatish Balay PetscFunctionBegin; 29274e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 29284e2b4712SSatish Balay 2929e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2930e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2931f1af5d2fSBarry Smith t = a->solve_work; 29324e2b4712SSatish Balay 29334e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 29344e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 29354e2b4712SSatish Balay 29364e2b4712SSatish Balay /* forward solve the lower triangular */ 2937f1af5d2fSBarry Smith t[0] = b[*r++]; 29384e2b4712SSatish Balay for (i=1; i<n; i++) { 29394e2b4712SSatish Balay v = aa + ai[i]; 29404e2b4712SSatish Balay vi = aj + ai[i]; 29414e2b4712SSatish Balay nz = diag[i] - ai[i]; 2942f1af5d2fSBarry Smith s1 = b[*r++]; 29434e2b4712SSatish Balay while (nz--) { 2944f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 29454e2b4712SSatish Balay } 2946f1af5d2fSBarry Smith t[i] = s1; 29474e2b4712SSatish Balay } 29484e2b4712SSatish Balay /* backward solve the upper triangular */ 29494e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 29504e2b4712SSatish Balay v = aa + diag[i] + 1; 29514e2b4712SSatish Balay vi = aj + diag[i] + 1; 29524e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2953f1af5d2fSBarry Smith s1 = t[i]; 29544e2b4712SSatish Balay while (nz--) { 2955f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 29564e2b4712SSatish Balay } 2957f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 29584e2b4712SSatish Balay } 29594e2b4712SSatish Balay 29604e2b4712SSatish Balay ierr = 
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 29614e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2962e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2963e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2964b0a32e0cSBarry Smith PetscLogFlops(2*1*(a->nz) - A->n); 29654e2b4712SSatish Balay PetscFunctionReturn(0); 29664e2b4712SSatish Balay } 296715091d37SBarry Smith /* 296815091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 296915091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 297015091d37SBarry Smith */ 29714a2ae208SSatish Balay #undef __FUNCT__ 29724a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 297315091d37SBarry Smith int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 297415091d37SBarry Smith { 297515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 297615091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 297715091d37SBarry Smith int ierr,*diag = a->diag; 297815091d37SBarry Smith MatScalar *aa=a->a; 297987828ca2SBarry Smith PetscScalar *x,*b; 298087828ca2SBarry Smith PetscScalar s1,x1; 298115091d37SBarry Smith MatScalar *v; 298215091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 298315091d37SBarry Smith 298415091d37SBarry Smith PetscFunctionBegin; 298515091d37SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 298615091d37SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 298715091d37SBarry Smith 298815091d37SBarry Smith /* forward solve the lower triangular */ 298915091d37SBarry Smith idx = 0; 299015091d37SBarry Smith x[0] = b[0]; 299115091d37SBarry Smith for (i=1; i<n; i++) { 299215091d37SBarry Smith v = aa + ai[i]; 299315091d37SBarry Smith vi = aj + ai[i]; 299415091d37SBarry Smith nz = diag[i] - ai[i]; 299515091d37SBarry Smith idx += 1; 2996f1af5d2fSBarry Smith s1 = b[idx]; 299715091d37SBarry Smith while (nz--) { 299815091d37SBarry Smith jdx = *vi++; 299915091d37SBarry Smith x1 = 
x[jdx]; 3000f1af5d2fSBarry Smith s1 -= v[0]*x1; 300115091d37SBarry Smith v += 1; 300215091d37SBarry Smith } 3003f1af5d2fSBarry Smith x[idx] = s1; 300415091d37SBarry Smith } 300515091d37SBarry Smith /* backward solve the upper triangular */ 300615091d37SBarry Smith for (i=n-1; i>=0; i--){ 300715091d37SBarry Smith v = aa + diag[i] + 1; 300815091d37SBarry Smith vi = aj + diag[i] + 1; 300915091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 301015091d37SBarry Smith idt = i; 3011f1af5d2fSBarry Smith s1 = x[idt]; 301215091d37SBarry Smith while (nz--) { 301315091d37SBarry Smith idx = *vi++; 301415091d37SBarry Smith x1 = x[idx]; 3015f1af5d2fSBarry Smith s1 -= v[0]*x1; 301615091d37SBarry Smith v += 1; 301715091d37SBarry Smith } 301815091d37SBarry Smith v = aa + diag[i]; 3019f1af5d2fSBarry Smith x[idt] = v[0]*s1; 302015091d37SBarry Smith } 302115091d37SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 302215091d37SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3023b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 302415091d37SBarry Smith PetscFunctionReturn(0); 302515091d37SBarry Smith } 30264e2b4712SSatish Balay 30274e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 30284e2b4712SSatish Balay /* 30294e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 30304e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 30314e2b4712SSatish Balay Not a good example of code reuse. 
30324e2b4712SSatish Balay */ 3033ca44d042SBarry Smith EXTERN int MatMissingDiagonal_SeqBAIJ(Mat); 3034435faa5fSBarry Smith 30354a2ae208SSatish Balay #undef __FUNCT__ 30364a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3037435faa5fSBarry Smith int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact) 30384e2b4712SSatish Balay { 30394e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 30404e2b4712SSatish Balay IS isicol; 30414e2b4712SSatish Balay int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 30424e2b4712SSatish Balay int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3043eb150c5cSKris Buschelman int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0; 3044435faa5fSBarry Smith int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 30454533b203SBarry Smith PetscTruth col_identity,row_identity; 3046329f5518SBarry Smith PetscReal f; 30474e2b4712SSatish Balay 30484e2b4712SSatish Balay PetscFunctionBegin; 3049435faa5fSBarry Smith f = info->fill; 3050335d9088SBarry Smith levels = (int)info->levels; 3051335d9088SBarry Smith diagonal_fill = (int)info->diagonal_fill; 30524c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3053667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3054667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3055309c388cSBarry Smith 3056309c388cSBarry Smith if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3057bb3d539aSBarry Smith ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3058bb3d539aSBarry Smith (*fact)->factor = FACTOR_LU; 3059bb3d539aSBarry Smith b = (Mat_SeqBAIJ*)(*fact)->data; 3060bb3d539aSBarry Smith if (!b->diag) { 3061bb3d539aSBarry Smith ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3062bb3d539aSBarry Smith } 3063bb3d539aSBarry Smith ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 
3064bb3d539aSBarry Smith b->row = isrow; 3065bb3d539aSBarry Smith b->col = iscol; 3066bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3067bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3068bb3d539aSBarry Smith b->icol = isicol; 3069bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 307087828ca2SBarry Smith ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3071309c388cSBarry Smith } else { /* general case perform the symbolic factorization */ 30724e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 30734e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 30744e2b4712SSatish Balay 30754e2b4712SSatish Balay /* get new row pointers */ 3076b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 30774e2b4712SSatish Balay ainew[0] = 0; 30784e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 30794e2b4712SSatish Balay jmax = (int)(f*ai[n] + 1); 308082502324SSatish Balay ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 30814e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 308282502324SSatish Balay ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 30834e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 3084b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 30854e2b4712SSatish Balay /* im is level for each filled value */ 3086b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 30874e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 3088b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 30894e2b4712SSatish Balay dloc[0] = 0; 30904e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 3091435faa5fSBarry Smith 3092435faa5fSBarry Smith /* copy prow into linked list */ 
30934e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 309429bbc08cSBarry Smith if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 30954e2b4712SSatish Balay xi = aj + ai[r[prow]]; 30964e2b4712SSatish Balay fill[n] = n; 3097435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 30984e2b4712SSatish Balay while (nz--) { 30994e2b4712SSatish Balay fm = n; 31004e2b4712SSatish Balay idx = ic[*xi++]; 31014e2b4712SSatish Balay do { 31024e2b4712SSatish Balay m = fm; 31034e2b4712SSatish Balay fm = fill[m]; 31044e2b4712SSatish Balay } while (fm < idx); 31054e2b4712SSatish Balay fill[m] = idx; 31064e2b4712SSatish Balay fill[idx] = fm; 31074e2b4712SSatish Balay im[idx] = 0; 31084e2b4712SSatish Balay } 3109435faa5fSBarry Smith 3110435faa5fSBarry Smith /* make sure diagonal entry is included */ 3111435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 3112435faa5fSBarry Smith fm = n; 3113435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 3114435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3115435faa5fSBarry Smith fill[fm] = prow; 3116435faa5fSBarry Smith im[prow] = 0; 3117435faa5fSBarry Smith nzf++; 3118335d9088SBarry Smith dcount++; 3119435faa5fSBarry Smith } 3120435faa5fSBarry Smith 31214e2b4712SSatish Balay nzi = 0; 31224e2b4712SSatish Balay row = fill[n]; 31234e2b4712SSatish Balay while (row < prow) { 31244e2b4712SSatish Balay incrlev = im[row] + 1; 31254e2b4712SSatish Balay nz = dloc[row]; 3126435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 31274e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 31284e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 31294e2b4712SSatish Balay fm = row; 31304e2b4712SSatish Balay while (nnz-- > 0) { 31314e2b4712SSatish Balay idx = *xi++; 31324e2b4712SSatish Balay if (*flev + incrlev > levels) { 31334e2b4712SSatish Balay flev++; 31344e2b4712SSatish Balay continue; 31354e2b4712SSatish Balay } 31364e2b4712SSatish Balay do { 
31374e2b4712SSatish Balay m = fm; 31384e2b4712SSatish Balay fm = fill[m]; 31394e2b4712SSatish Balay } while (fm < idx); 31404e2b4712SSatish Balay if (fm != idx) { 31414e2b4712SSatish Balay im[idx] = *flev + incrlev; 31424e2b4712SSatish Balay fill[m] = idx; 31434e2b4712SSatish Balay fill[idx] = fm; 31444e2b4712SSatish Balay fm = idx; 31454e2b4712SSatish Balay nzf++; 3146ecf371e4SBarry Smith } else { 31474e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 31484e2b4712SSatish Balay } 31494e2b4712SSatish Balay flev++; 31504e2b4712SSatish Balay } 31514e2b4712SSatish Balay row = fill[row]; 31524e2b4712SSatish Balay nzi++; 31534e2b4712SSatish Balay } 31544e2b4712SSatish Balay /* copy new filled row into permanent storage */ 31554e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 31564e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 3157ecf371e4SBarry Smith 3158ecf371e4SBarry Smith /* estimate how much additional space we will need */ 3159ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3160ecf371e4SBarry Smith /* just double the memory each time */ 3161ecf371e4SBarry Smith int maxadd = jmax; 3162ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 31634e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 31644e2b4712SSatish Balay jmax += maxadd; 3165ecf371e4SBarry Smith 3166ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 316782502324SSatish Balay ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3168549d3d68SSatish Balay ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3169606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 31704e2b4712SSatish Balay ajnew = xi; 317182502324SSatish Balay ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3172549d3d68SSatish Balay ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3173606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 31744e2b4712SSatish 
Balay ajfill = xi; 3175eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 31764e2b4712SSatish Balay } 31774e2b4712SSatish Balay xi = ajnew + ainew[prow]; 31784e2b4712SSatish Balay flev = ajfill + ainew[prow]; 31794e2b4712SSatish Balay dloc[prow] = nzi; 31804e2b4712SSatish Balay fm = fill[n]; 31814e2b4712SSatish Balay while (nzf--) { 31824e2b4712SSatish Balay *xi++ = fm; 31834e2b4712SSatish Balay *flev++ = im[fm]; 31844e2b4712SSatish Balay fm = fill[fm]; 31854e2b4712SSatish Balay } 3186435faa5fSBarry Smith /* make sure row has diagonal entry */ 3187435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 318829bbc08cSBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 3189435faa5fSBarry Smith try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 3190435faa5fSBarry Smith } 31914e2b4712SSatish Balay } 3192606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 31934e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 31944e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3195606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 3196606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 31974e2b4712SSatish Balay 31984e2b4712SSatish Balay { 3199329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3200eb150c5cSKris Buschelman PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af); 3201b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 3202b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 3203b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 3204335d9088SBarry Smith if (diagonal_fill) { 3205b1bcba4aSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing 
diagonals",dcount); 3206335d9088SBarry Smith } 32074e2b4712SSatish Balay } 32084e2b4712SSatish Balay 32094e2b4712SSatish Balay /* put together the new matrix */ 32104e2b4712SSatish Balay ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 3211b0a32e0cSBarry Smith PetscLogObjectParent(*fact,isicol); 32124e2b4712SSatish Balay b = (Mat_SeqBAIJ*)(*fact)->data; 3213606d414cSSatish Balay ierr = PetscFree(b->imax);CHKERRQ(ierr); 32147c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 32153f1db9ecSBarry Smith len = bs2*ainew[n]*sizeof(MatScalar); 32164e2b4712SSatish Balay /* the next line frees the default space generated by the Create() */ 3217606d414cSSatish Balay ierr = PetscFree(b->a);CHKERRQ(ierr); 3218606d414cSSatish Balay ierr = PetscFree(b->ilen);CHKERRQ(ierr); 321982502324SSatish Balay ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 32204e2b4712SSatish Balay b->j = ajnew; 32214e2b4712SSatish Balay b->i = ainew; 32224e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 32234e2b4712SSatish Balay b->diag = dloc; 32244e2b4712SSatish Balay b->ilen = 0; 32254e2b4712SSatish Balay b->imax = 0; 32264e2b4712SSatish Balay b->row = isrow; 32274e2b4712SSatish Balay b->col = iscol; 3228bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3229c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3230c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3231e51c0b9cSSatish Balay b->icol = isicol; 323287828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 32334e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 
32344e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 323587828ca2SBarry Smith PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 32364e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 32374e2b4712SSatish Balay (*fact)->factor = FACTOR_LU; 32384e2b4712SSatish Balay 3239eb150c5cSKris Buschelman (*fact)->info.factor_mallocs = reallocate; 32404e2b4712SSatish Balay (*fact)->info.fill_ratio_given = f; 3241329f5518SBarry Smith (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3242309c388cSBarry Smith } 32434e2b4712SSatish Balay 3244309c388cSBarry Smith if (row_identity && col_identity) { 3245732ee342SKris Buschelman ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 32468661488fSKris Buschelman } 32478661488fSKris Buschelman PetscFunctionReturn(0); 32488661488fSKris Buschelman } 32498661488fSKris Buschelman 3250732ee342SKris Buschelman #undef __FUNCT__ 32517e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 32527e7071cdSKris Buschelman int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 32537e7071cdSKris Buschelman { 32547e7071cdSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 32552aa5897fSKris Buschelman int i,*AJ=a->j,nz=a->nz; 3256*5a9542e3SKris Buschelman PetscFunctionBegin; 32577cf1b8d3SKris Buschelman /* Undo Column scaling */ 32587cf1b8d3SKris Buschelman /* while (nz--) { */ 32597cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 32607cf1b8d3SKris Buschelman /* } */ 32617cf1b8d3SKris Buschelman PetscFunctionReturn(0); 32627cf1b8d3SKris Buschelman } 32637cf1b8d3SKris Buschelman 32647cf1b8d3SKris Buschelman #undef __FUNCT__ 32657cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 32667cf1b8d3SKris Buschelman int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 32677cf1b8d3SKris Buschelman { 32687cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 
32697cf1b8d3SKris Buschelman int i,*AJ=a->j,nz=a->nz; 32702aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 3271*5a9542e3SKris Buschelman PetscFunctionBegin; 327220235379SKris Buschelman while (nz--) { 32732aa5897fSKris Buschelman AJ[i] = (int)((unsigned int)aj[i]); /* First extend, then convert to signed. */ 32747e7071cdSKris Buschelman } 32757e7071cdSKris Buschelman PetscFunctionReturn(0); 32767e7071cdSKris Buschelman } 32777e7071cdSKris Buschelman 32787e7071cdSKris Buschelman #undef __FUNCT__ 3279732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3280732ee342SKris Buschelman int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 32818661488fSKris Buschelman { 32828661488fSKris Buschelman /* 32838661488fSKris Buschelman Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 32848661488fSKris Buschelman with natural ordering 32858661488fSKris Buschelman */ 32868661488fSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 32878661488fSKris Buschelman 32888661488fSKris Buschelman PetscFunctionBegin; 3289a7ba9c3cSKris Buschelman inA->ops->solve = MatSolve_SeqBAIJ_Update; 3290a7ba9c3cSKris Buschelman inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 32918661488fSKris Buschelman switch (a->bs) { 32928661488fSKris Buschelman case 1: 32938661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3294732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 3295732ee342SKris Buschelman break; 3296309c388cSBarry Smith case 2: 32978661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3298732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 3299309c388cSBarry Smith break; 3300309c388cSBarry Smith case 3: 33018661488fSKris Buschelman inA->ops->lufactornumeric = 
MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3302732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 3303309c388cSBarry Smith break; 3304309c388cSBarry Smith case 4: 3305a7d8d0baSKris Buschelman #if defined(PETSC_USE_MAT_SINGLE) 3306a7d8d0baSKris Buschelman { 3307a7d8d0baSKris Buschelman PetscTruth sse_enabled_local; 330843b9cc93SKris Buschelman int ierr; 3309ccaa8a1bSKris Buschelman ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 33106b7cc795SKris Buschelman if (sse_enabled_local) { 3311b988c221SKris Buschelman # if defined(PETSC_HAVE_SSE) 33127cf1b8d3SKris Buschelman int i,*AJ=a->j,nz=a->nz,n=a->mbs; 33137cf1b8d3SKris Buschelman if (n==(unsigned short)n) { 33142aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 331513c7ffeeSKris Buschelman for (i=0;i<nz;i++) { 33162aa5897fSKris Buschelman aj[i] = (unsigned short)AJ[i]; 331713c7ffeeSKris Buschelman } 33187cf1b8d3SKris Buschelman inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 33197cf1b8d3SKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 332086b4ebfeSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"); 33217cf1b8d3SKris Buschelman } else { 33227cf1b8d3SKris Buschelman /* Scale the column indices for easier indexing in MatSolve. 
*/ 33237cf1b8d3SKris Buschelman /* for (i=0;i<nz;i++) { */ 33247cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]*4; */ 33257cf1b8d3SKris Buschelman /* } */ 33267e7071cdSKris Buschelman inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 33278661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 332886b4ebfeSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n"); 33297cf1b8d3SKris Buschelman } 3330b988c221SKris Buschelman # else 3331b988c221SKris Buschelman /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3332b988c221SKris Buschelman SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3333b988c221SKris Buschelman # endif 33343ba47ebaSKris Buschelman } else { 33358661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3336732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 33373ba47ebaSKris Buschelman } 3338a7d8d0baSKris Buschelman } 3339a7d8d0baSKris Buschelman #else 3340a7d8d0baSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3341a7d8d0baSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3342a7d8d0baSKris Buschelman #endif 3343309c388cSBarry Smith break; 3344309c388cSBarry Smith case 5: 33458661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3346732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 3347309c388cSBarry Smith break; 3348309c388cSBarry Smith case 6: 33498661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3350732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 
3351309c388cSBarry Smith break; 3352309c388cSBarry Smith case 7: 33538661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3354732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 3355309c388cSBarry Smith break; 3356309c388cSBarry Smith } 33574e2b4712SSatish Balay PetscFunctionReturn(0); 33584e2b4712SSatish Balay } 3359732ee342SKris Buschelman 3360732ee342SKris Buschelman #undef __FUNCT__ 3361732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3362732ee342SKris Buschelman int MatSeqBAIJ_UpdateSolvers(Mat A) 3363732ee342SKris Buschelman { 3364732ee342SKris Buschelman /* 3365732ee342SKris Buschelman Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3366732ee342SKris Buschelman with natural ordering 3367732ee342SKris Buschelman */ 3368732ee342SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3369732ee342SKris Buschelman IS row = a->row, col = a->col; 3370732ee342SKris Buschelman PetscTruth row_identity, col_identity; 337123c42b7cSKris Buschelman PetscTruth use_natural; 3372732ee342SKris Buschelman int ierr; 3373732ee342SKris Buschelman 3374732ee342SKris Buschelman PetscFunctionBegin; 3375cf242676SKris Buschelman 337694ee7fc8SKris Buschelman use_natural = PETSC_FALSE; 3377cf242676SKris Buschelman 3378732ee342SKris Buschelman ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3379732ee342SKris Buschelman ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3380732ee342SKris Buschelman 3381732ee342SKris Buschelman if (row_identity && col_identity) { 3382732ee342SKris Buschelman use_natural = PETSC_TRUE; 3383732ee342SKris Buschelman } else { 3384732ee342SKris Buschelman use_natural = PETSC_FALSE; 3385732ee342SKris Buschelman } 3386732ee342SKris Buschelman switch (a->bs) { 3387732ee342SKris Buschelman case 1: 3388732ee342SKris Buschelman if (use_natural) { 3389732ee342SKris Buschelman A->ops->solve = 
MatSolve_SeqBAIJ_1_NaturalOrdering; 3390732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3391732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 3392732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3393732ee342SKris Buschelman } else { 3394732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_1; 3395732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3396732ee342SKris Buschelman } 3397732ee342SKris Buschelman break; 3398732ee342SKris Buschelman case 2: 3399732ee342SKris Buschelman if (use_natural) { 3400732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3401732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3402732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 3403732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3404732ee342SKris Buschelman } else { 3405732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_2; 3406732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3407732ee342SKris Buschelman } 3408732ee342SKris Buschelman break; 3409732ee342SKris Buschelman case 3: 3410732ee342SKris Buschelman if (use_natural) { 3411732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3412732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3413732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 3414732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3415732ee342SKris Buschelman } else { 3416732ee342SKris Buschelman A->ops->solve = 
MatSolve_SeqBAIJ_3; 3417732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3418732ee342SKris Buschelman } 3419732ee342SKris Buschelman break; 3420732ee342SKris Buschelman case 4: 3421f26ec98cSKris Buschelman { 3422123145dfSKris Buschelman PetscTruth sse_enabled_local; 3423ccaa8a1bSKris Buschelman ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3424732ee342SKris Buschelman if (use_natural) { 34252859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE) 3426f26ec98cSKris Buschelman if (sse_enabled_local) { /* Natural + Single + SSE */ 3427eb150c5cSKris Buschelman # if defined(PETSC_HAVE_SSE) 3428995eb297SKris Buschelman int n=a->mbs; 3429995eb297SKris Buschelman if (n==(unsigned short)n) { 3430995eb297SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3431995eb297SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n"); 3432995eb297SKris Buschelman } else { 3433732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 343486b4ebfeSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n"); 3435995eb297SKris Buschelman } 3436eb150c5cSKris Buschelman # else 3437eb150c5cSKris Buschelman /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). 
*/ 3438eb150c5cSKris Buschelman SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3439eb150c5cSKris Buschelman # endif 3440f26ec98cSKris Buschelman } else { /* Natural + Single */ 3441f26ec98cSKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3442123145dfSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"); 3443f26ec98cSKris Buschelman } 34442859b196SKris Buschelman #else 34452859b196SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3446123145dfSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 34472859b196SKris Buschelman #endif 3448732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3449123145dfSKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3450f26ec98cSKris Buschelman } else { /* Arbitrary ordering */ 34512859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE) 3452f26ec98cSKris Buschelman if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3453eb150c5cSKris Buschelman # if defined(PETSC_HAVE_SSE) 3454732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3455732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3456eb150c5cSKris Buschelman # else 3457eb150c5cSKris Buschelman /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). 
*/ 3458eb150c5cSKris Buschelman SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3459eb150c5cSKris Buschelman # endif 3460f26ec98cSKris Buschelman } else { /* Arbitrary + Single */ 3461f26ec98cSKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3462f26ec98cSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"); 3463732ee342SKris Buschelman } 34642859b196SKris Buschelman #else 34652859b196SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4; 34662859b196SKris Buschelman #endif 3467732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3468732ee342SKris Buschelman } 3469f26ec98cSKris Buschelman } 3470732ee342SKris Buschelman break; 3471732ee342SKris Buschelman case 5: 3472732ee342SKris Buschelman if (use_natural) { 3473732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3474732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3475732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3476732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3477732ee342SKris Buschelman } else { 3478732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_5; 3479732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3480732ee342SKris Buschelman } 3481732ee342SKris Buschelman break; 3482732ee342SKris Buschelman case 6: 3483732ee342SKris Buschelman if (use_natural) { 3484732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3485732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3486732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3487732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 
3488732ee342SKris Buschelman } else { 3489732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_6; 3490732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3491732ee342SKris Buschelman } 3492732ee342SKris Buschelman break; 3493732ee342SKris Buschelman case 7: 3494732ee342SKris Buschelman if (use_natural) { 3495732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3496732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3497732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3498732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3499732ee342SKris Buschelman } else { 3500732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_7; 3501732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3502732ee342SKris Buschelman } 3503732ee342SKris Buschelman break; 350431801e53SKris Buschelman default: 350531801e53SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_N; 350631801e53SKris Buschelman break; 3507732ee342SKris Buschelman } 3508732ee342SKris Buschelman PetscFunctionReturn(0); 3509732ee342SKris Buschelman } 3510732ee342SKris Buschelman 3511732ee342SKris Buschelman #undef __FUNCT__ 3512732ee342SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3513732ee342SKris Buschelman int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3514732ee342SKris Buschelman int ierr; 3515732ee342SKris Buschelman 3516732ee342SKris Buschelman PetscFunctionBegin; 3517732ee342SKris Buschelman ierr = MatSeqBAIJ_UpdateSolvers(A); 3518cf242676SKris Buschelman if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3519732ee342SKris Buschelman ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3520cf242676SKris Buschelman } else { 3521cf242676SKris Buschelman SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3522cf242676SKris Buschelman } 
3523732ee342SKris Buschelman PetscFunctionReturn(0); 3524732ee342SKris Buschelman } 3525732ee342SKris Buschelman 3526732ee342SKris Buschelman #undef __FUNCT__ 3527732ee342SKris Buschelman #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3528732ee342SKris Buschelman int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3529732ee342SKris Buschelman int ierr; 3530732ee342SKris Buschelman 3531732ee342SKris Buschelman PetscFunctionBegin; 3532732ee342SKris Buschelman ierr = MatSeqBAIJ_UpdateSolvers(A); 3533732ee342SKris Buschelman ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3534732ee342SKris Buschelman PetscFunctionReturn(0); 3535732ee342SKris Buschelman } 3536732ee342SKris Buschelman 3537732ee342SKris Buschelman 3538