173f4d377SMatthew Knepley /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/ 24e2b4712SSatish Balay /* 34e2b4712SSatish Balay Factorization code for BAIJ format. 44e2b4712SSatish Balay */ 54e2b4712SSatish Balay 64e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h" 74e2b4712SSatish Balay #include "src/inline/ilu.h" 874c49faeSBarry Smith #include "src/inline/dot.h" 94e2b4712SSatish Balay 104a2ae208SSatish Balay #undef __FUNCT__ 114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 127c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 13f1af5d2fSBarry Smith { 14f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 16f1af5d2fSBarry Smith int *diag = a->diag; 17f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 1887828ca2SBarry Smith PetscScalar s1,*x,*b; 19f1af5d2fSBarry Smith 20f1af5d2fSBarry Smith PetscFunctionBegin; 21ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 22b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 23b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 24f1af5d2fSBarry Smith 25f1af5d2fSBarry Smith /* forward solve the U^T */ 26f1af5d2fSBarry Smith for (i=0; i<n; i++) { 27f1af5d2fSBarry Smith 28f1af5d2fSBarry Smith v = aa + diag[i]; 29f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 30ef66eb69SBarry Smith s1 = (*v++)*x[i]; 31f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 32f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 33f1af5d2fSBarry Smith while (nz--) { 34f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 35f1af5d2fSBarry Smith } 36f1af5d2fSBarry Smith x[i] = s1; 37f1af5d2fSBarry Smith } 38f1af5d2fSBarry Smith /* backward solve the L^T */ 39f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 40f1af5d2fSBarry Smith v = aa + diag[i] - 1; 41f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 42f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 43f1af5d2fSBarry 
Smith s1 = x[i]; 44f1af5d2fSBarry Smith while (nz--) { 45f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 46f1af5d2fSBarry Smith } 47f1af5d2fSBarry Smith } 48b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 49b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 50b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 51f1af5d2fSBarry Smith PetscFunctionReturn(0); 52f1af5d2fSBarry Smith } 53f1af5d2fSBarry Smith 544a2ae208SSatish Balay #undef __FUNCT__ 554a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 567c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 57f1af5d2fSBarry Smith { 58f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 59f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 60f1af5d2fSBarry Smith int *diag = a->diag,oidx; 61f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6287828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6387828ca2SBarry Smith PetscScalar *x,*b; 64f1af5d2fSBarry Smith 65f1af5d2fSBarry Smith PetscFunctionBegin; 66ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 67b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 68b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith /* forward solve the U^T */ 71f1af5d2fSBarry Smith idx = 0; 72f1af5d2fSBarry Smith for (i=0; i<n; i++) { 73f1af5d2fSBarry Smith 74f1af5d2fSBarry Smith v = aa + 4*diag[i]; 75f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 76ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 77f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 78f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 79f1af5d2fSBarry Smith v += 4; 80f1af5d2fSBarry Smith 81f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 82f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 83f1af5d2fSBarry Smith while (nz--) { 84f1af5d2fSBarry Smith oidx = 2*(*vi++); 85f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 
86f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 87f1af5d2fSBarry Smith v += 4; 88f1af5d2fSBarry Smith } 89f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 90f1af5d2fSBarry Smith idx += 2; 91f1af5d2fSBarry Smith } 92f1af5d2fSBarry Smith /* backward solve the L^T */ 93f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 94f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 95f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 96f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 97f1af5d2fSBarry Smith idt = 2*i; 98f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 99f1af5d2fSBarry Smith while (nz--) { 100f1af5d2fSBarry Smith idx = 2*(*vi--); 101f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 102f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 103f1af5d2fSBarry Smith v -= 4; 104f1af5d2fSBarry Smith } 105f1af5d2fSBarry Smith } 106b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 107b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 108b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 109f1af5d2fSBarry Smith PetscFunctionReturn(0); 110f1af5d2fSBarry Smith } 111f1af5d2fSBarry Smith 1124a2ae208SSatish Balay #undef __FUNCT__ 1134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 1147c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 115f1af5d2fSBarry Smith { 116f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 117f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 118f1af5d2fSBarry Smith int *diag = a->diag,oidx; 119f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12087828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12187828ca2SBarry Smith PetscScalar *x,*b; 122f1af5d2fSBarry Smith 123f1af5d2fSBarry Smith PetscFunctionBegin; 124ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 125b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 126b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 127f1af5d2fSBarry Smith 
128f1af5d2fSBarry Smith /* forward solve the U^T */ 129f1af5d2fSBarry Smith idx = 0; 130f1af5d2fSBarry Smith for (i=0; i<n; i++) { 131f1af5d2fSBarry Smith 132f1af5d2fSBarry Smith v = aa + 9*diag[i]; 133f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 134ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 135f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 136f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 137f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 138f1af5d2fSBarry Smith v += 9; 139f1af5d2fSBarry Smith 140f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 141f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 142f1af5d2fSBarry Smith while (nz--) { 143f1af5d2fSBarry Smith oidx = 3*(*vi++); 144f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 145f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 146f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 147f1af5d2fSBarry Smith v += 9; 148f1af5d2fSBarry Smith } 149f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 150f1af5d2fSBarry Smith idx += 3; 151f1af5d2fSBarry Smith } 152f1af5d2fSBarry Smith /* backward solve the L^T */ 153f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 154f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 155f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 156f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 157f1af5d2fSBarry Smith idt = 3*i; 158f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 159f1af5d2fSBarry Smith while (nz--) { 160f1af5d2fSBarry Smith idx = 3*(*vi--); 161f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 162f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 163f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 164f1af5d2fSBarry Smith v -= 9; 165f1af5d2fSBarry Smith } 166f1af5d2fSBarry Smith } 167b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 168b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 169b0a32e0cSBarry 
Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 170f1af5d2fSBarry Smith PetscFunctionReturn(0); 171f1af5d2fSBarry Smith } 172f1af5d2fSBarry Smith 1734a2ae208SSatish Balay #undef __FUNCT__ 1744a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 1757c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 176f1af5d2fSBarry Smith { 177f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 178f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 179f1af5d2fSBarry Smith int *diag = a->diag,oidx; 180f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18287828ca2SBarry Smith PetscScalar *x,*b; 183f1af5d2fSBarry Smith 184f1af5d2fSBarry Smith PetscFunctionBegin; 185ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 186b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 187b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 188f1af5d2fSBarry Smith 189f1af5d2fSBarry Smith /* forward solve the U^T */ 190f1af5d2fSBarry Smith idx = 0; 191f1af5d2fSBarry Smith for (i=0; i<n; i++) { 192f1af5d2fSBarry Smith 193f1af5d2fSBarry Smith v = aa + 16*diag[i]; 194f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 195ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 196f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 197f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 198f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 199f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 200f1af5d2fSBarry Smith v += 16; 201f1af5d2fSBarry Smith 202f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 203f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 204f1af5d2fSBarry Smith while (nz--) { 205f1af5d2fSBarry Smith oidx = 4*(*vi++); 206f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 207f1af5d2fSBarry Smith 
x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 208f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 209f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 210f1af5d2fSBarry Smith v += 16; 211f1af5d2fSBarry Smith } 212f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 213f1af5d2fSBarry Smith idx += 4; 214f1af5d2fSBarry Smith } 215f1af5d2fSBarry Smith /* backward solve the L^T */ 216f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 217f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 218f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 219f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 220f1af5d2fSBarry Smith idt = 4*i; 221f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 222f1af5d2fSBarry Smith while (nz--) { 223f1af5d2fSBarry Smith idx = 4*(*vi--); 224f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 225f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 226f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 227f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 228f1af5d2fSBarry Smith v -= 16; 229f1af5d2fSBarry Smith } 230f1af5d2fSBarry Smith } 231b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 232b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 233b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 234f1af5d2fSBarry Smith PetscFunctionReturn(0); 235f1af5d2fSBarry Smith } 236f1af5d2fSBarry Smith 2374a2ae208SSatish Balay #undef __FUNCT__ 2384a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 2397c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 240f1af5d2fSBarry Smith { 241f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 242f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 243f1af5d2fSBarry Smith int *diag = a->diag,oidx; 
244f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 24587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 24687828ca2SBarry Smith PetscScalar *x,*b; 247f1af5d2fSBarry Smith 248f1af5d2fSBarry Smith PetscFunctionBegin; 249ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 250b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 251b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 252f1af5d2fSBarry Smith 253f1af5d2fSBarry Smith /* forward solve the U^T */ 254f1af5d2fSBarry Smith idx = 0; 255f1af5d2fSBarry Smith for (i=0; i<n; i++) { 256f1af5d2fSBarry Smith 257f1af5d2fSBarry Smith v = aa + 25*diag[i]; 258f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 259ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 260f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 261f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 262f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 263f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 264f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 265f1af5d2fSBarry Smith v += 25; 266f1af5d2fSBarry Smith 267f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 268f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 269f1af5d2fSBarry Smith while (nz--) { 270f1af5d2fSBarry Smith oidx = 5*(*vi++); 271f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 272f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 273f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 274f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 275f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 276f1af5d2fSBarry Smith v += 25; 277f1af5d2fSBarry Smith } 278f1af5d2fSBarry Smith x[idx] = 
s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 279f1af5d2fSBarry Smith idx += 5; 280f1af5d2fSBarry Smith } 281f1af5d2fSBarry Smith /* backward solve the L^T */ 282f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 283f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 284f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 285f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 286f1af5d2fSBarry Smith idt = 5*i; 287f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 288f1af5d2fSBarry Smith while (nz--) { 289f1af5d2fSBarry Smith idx = 5*(*vi--); 290f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 291f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 292f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 293f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 294f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 295f1af5d2fSBarry Smith v -= 25; 296f1af5d2fSBarry Smith } 297f1af5d2fSBarry Smith } 298b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 299b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 300b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 301f1af5d2fSBarry Smith PetscFunctionReturn(0); 302f1af5d2fSBarry Smith } 303f1af5d2fSBarry Smith 3044a2ae208SSatish Balay #undef __FUNCT__ 3054a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 3067c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 307f1af5d2fSBarry Smith { 308f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 309f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 310f1af5d2fSBarry Smith int *diag = a->diag,oidx; 311f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 31287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 31387828ca2SBarry Smith 
PetscScalar *x,*b; 314f1af5d2fSBarry Smith 315f1af5d2fSBarry Smith PetscFunctionBegin; 316ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 317b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 318b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 319f1af5d2fSBarry Smith 320f1af5d2fSBarry Smith /* forward solve the U^T */ 321f1af5d2fSBarry Smith idx = 0; 322f1af5d2fSBarry Smith for (i=0; i<n; i++) { 323f1af5d2fSBarry Smith 324f1af5d2fSBarry Smith v = aa + 36*diag[i]; 325f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 326ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 327ef66eb69SBarry Smith x6 = x[5+idx]; 328f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 329f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 330f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 331f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 332f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 333f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 334f1af5d2fSBarry Smith v += 36; 335f1af5d2fSBarry Smith 336f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 337f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 338f1af5d2fSBarry Smith while (nz--) { 339f1af5d2fSBarry Smith oidx = 6*(*vi++); 340f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 341f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 342f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 343f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 344f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 
+ v[29]*s6; 345f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 346f1af5d2fSBarry Smith v += 36; 347f1af5d2fSBarry Smith } 348f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 349f1af5d2fSBarry Smith x[5+idx] = s6; 350f1af5d2fSBarry Smith idx += 6; 351f1af5d2fSBarry Smith } 352f1af5d2fSBarry Smith /* backward solve the L^T */ 353f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 354f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 355f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 356f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 357f1af5d2fSBarry Smith idt = 6*i; 358f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 359f1af5d2fSBarry Smith s6 = x[5+idt]; 360f1af5d2fSBarry Smith while (nz--) { 361f1af5d2fSBarry Smith idx = 6*(*vi--); 362f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 363f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 364f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 365f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 366f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 367f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 368f1af5d2fSBarry Smith v -= 36; 369f1af5d2fSBarry Smith } 370f1af5d2fSBarry Smith } 371b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 372b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 373b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 374f1af5d2fSBarry Smith PetscFunctionReturn(0); 375f1af5d2fSBarry Smith } 376f1af5d2fSBarry Smith 3774a2ae208SSatish Balay #undef __FUNCT__ 3784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 3797c922b88SBarry 
Smith int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 380f1af5d2fSBarry Smith { 381f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 382f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 383f1af5d2fSBarry Smith int *diag = a->diag,oidx; 384f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 38587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 38687828ca2SBarry Smith PetscScalar *x,*b; 387f1af5d2fSBarry Smith 388f1af5d2fSBarry Smith PetscFunctionBegin; 389ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 390b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 391b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 392f1af5d2fSBarry Smith 393f1af5d2fSBarry Smith /* forward solve the U^T */ 394f1af5d2fSBarry Smith idx = 0; 395f1af5d2fSBarry Smith for (i=0; i<n; i++) { 396f1af5d2fSBarry Smith 397f1af5d2fSBarry Smith v = aa + 49*diag[i]; 398f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 399ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 400ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 401f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 402f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 403f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 404f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 405f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 406f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 407f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 408f1af5d2fSBarry Smith v += 49; 409f1af5d2fSBarry Smith 410f1af5d2fSBarry Smith vi = aj + 
diag[i] + 1; 411f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 412f1af5d2fSBarry Smith while (nz--) { 413f1af5d2fSBarry Smith oidx = 7*(*vi++); 414f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 415f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 416f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 417f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 418f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 419f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 420f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 421f1af5d2fSBarry Smith v += 49; 422f1af5d2fSBarry Smith } 423f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 424f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 425f1af5d2fSBarry Smith idx += 7; 426f1af5d2fSBarry Smith } 427f1af5d2fSBarry Smith /* backward solve the L^T */ 428f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 429f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 430f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 431f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 432f1af5d2fSBarry Smith idt = 7*i; 433f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 434f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 435f1af5d2fSBarry Smith while (nz--) { 436f1af5d2fSBarry Smith idx = 7*(*vi--); 437f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 438f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 439f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + 
v[18]*s5 + v[19]*s6 + v[20]*s7; 440f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 441f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 442f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 443f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 444f1af5d2fSBarry Smith v -= 49; 445f1af5d2fSBarry Smith } 446f1af5d2fSBarry Smith } 447b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 448b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 449b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 450f1af5d2fSBarry Smith PetscFunctionReturn(0); 451f1af5d2fSBarry Smith } 452f1af5d2fSBarry Smith 453f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4544a2ae208SSatish Balay #undef __FUNCT__ 4554a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 4567c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 457f1af5d2fSBarry Smith { 458f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 459f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 460f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 461f1af5d2fSBarry Smith int *diag = a->diag; 462f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 46387828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 464f1af5d2fSBarry Smith 465f1af5d2fSBarry Smith PetscFunctionBegin; 466b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 467b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 468f1af5d2fSBarry Smith t = a->solve_work; 469f1af5d2fSBarry Smith 470f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 471f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 
472f1af5d2fSBarry Smith 473f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 474f1af5d2fSBarry Smith for (i=0; i<n; i++) { 475f1af5d2fSBarry Smith t[i] = b[c[i]]; 476f1af5d2fSBarry Smith } 477f1af5d2fSBarry Smith 478f1af5d2fSBarry Smith /* forward solve the U^T */ 479f1af5d2fSBarry Smith for (i=0; i<n; i++) { 480f1af5d2fSBarry Smith 481f1af5d2fSBarry Smith v = aa + diag[i]; 482f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 483f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 484f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 485f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 486f1af5d2fSBarry Smith while (nz--) { 487f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 488f1af5d2fSBarry Smith } 489f1af5d2fSBarry Smith t[i] = s1; 490f1af5d2fSBarry Smith } 491f1af5d2fSBarry Smith /* backward solve the L^T */ 492f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 493f1af5d2fSBarry Smith v = aa + diag[i] - 1; 494f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 495f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 496f1af5d2fSBarry Smith s1 = t[i]; 497f1af5d2fSBarry Smith while (nz--) { 498f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 499f1af5d2fSBarry Smith } 500f1af5d2fSBarry Smith } 501f1af5d2fSBarry Smith 502f1af5d2fSBarry Smith /* copy t into x according to permutation */ 503f1af5d2fSBarry Smith for (i=0; i<n; i++) { 504f1af5d2fSBarry Smith x[r[i]] = t[i]; 505f1af5d2fSBarry Smith } 506f1af5d2fSBarry Smith 507f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 508f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 509b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 510b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 511b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 512f1af5d2fSBarry Smith PetscFunctionReturn(0); 513f1af5d2fSBarry Smith } 514f1af5d2fSBarry Smith 5154a2ae208SSatish Balay #undef __FUNCT__ 5164a2ae208SSatish Balay #define __FUNCT__ 
"MatSolveTranspose_SeqBAIJ_2" 5177c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 518f1af5d2fSBarry Smith { 519f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 520f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 521f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 522f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 523f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 52487828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 52587828ca2SBarry Smith PetscScalar *x,*b,*t; 526f1af5d2fSBarry Smith 527f1af5d2fSBarry Smith PetscFunctionBegin; 528b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 529b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 530f1af5d2fSBarry Smith t = a->solve_work; 531f1af5d2fSBarry Smith 532f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 533f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 534f1af5d2fSBarry Smith 535f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 536f1af5d2fSBarry Smith ii = 0; 537f1af5d2fSBarry Smith for (i=0; i<n; i++) { 538f1af5d2fSBarry Smith ic = 2*c[i]; 539f1af5d2fSBarry Smith t[ii] = b[ic]; 540f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 541f1af5d2fSBarry Smith ii += 2; 542f1af5d2fSBarry Smith } 543f1af5d2fSBarry Smith 544f1af5d2fSBarry Smith /* forward solve the U^T */ 545f1af5d2fSBarry Smith idx = 0; 546f1af5d2fSBarry Smith for (i=0; i<n; i++) { 547f1af5d2fSBarry Smith 548f1af5d2fSBarry Smith v = aa + 4*diag[i]; 549f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 550f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 551f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 552f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 553f1af5d2fSBarry Smith v += 4; 554f1af5d2fSBarry Smith 555f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 556f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 557f1af5d2fSBarry Smith while (nz--) { 
558f1af5d2fSBarry Smith oidx = 2*(*vi++); 559f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 560f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 561f1af5d2fSBarry Smith v += 4; 562f1af5d2fSBarry Smith } 563f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 564f1af5d2fSBarry Smith idx += 2; 565f1af5d2fSBarry Smith } 566f1af5d2fSBarry Smith /* backward solve the L^T */ 567f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 568f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 569f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 570f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 571f1af5d2fSBarry Smith idt = 2*i; 572f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 573f1af5d2fSBarry Smith while (nz--) { 574f1af5d2fSBarry Smith idx = 2*(*vi--); 575f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 576f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 577f1af5d2fSBarry Smith v -= 4; 578f1af5d2fSBarry Smith } 579f1af5d2fSBarry Smith } 580f1af5d2fSBarry Smith 581f1af5d2fSBarry Smith /* copy t into x according to permutation */ 582f1af5d2fSBarry Smith ii = 0; 583f1af5d2fSBarry Smith for (i=0; i<n; i++) { 584f1af5d2fSBarry Smith ir = 2*r[i]; 585f1af5d2fSBarry Smith x[ir] = t[ii]; 586f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 587f1af5d2fSBarry Smith ii += 2; 588f1af5d2fSBarry Smith } 589f1af5d2fSBarry Smith 590f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 591f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 592b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 593b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 594b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 595f1af5d2fSBarry Smith PetscFunctionReturn(0); 596f1af5d2fSBarry Smith } 597f1af5d2fSBarry Smith 5984a2ae208SSatish Balay #undef __FUNCT__ 5994a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 6007c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 601f1af5d2fSBarry Smith { 602f1af5d2fSBarry Smith 
Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 603f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 604f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 605f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 606f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 60787828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 60887828ca2SBarry Smith PetscScalar *x,*b,*t; 609f1af5d2fSBarry Smith 610f1af5d2fSBarry Smith PetscFunctionBegin; 611b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 612b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 613f1af5d2fSBarry Smith t = a->solve_work; 614f1af5d2fSBarry Smith 615f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 616f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 617f1af5d2fSBarry Smith 618f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 619f1af5d2fSBarry Smith ii = 0; 620f1af5d2fSBarry Smith for (i=0; i<n; i++) { 621f1af5d2fSBarry Smith ic = 3*c[i]; 622f1af5d2fSBarry Smith t[ii] = b[ic]; 623f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 624f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 625f1af5d2fSBarry Smith ii += 3; 626f1af5d2fSBarry Smith } 627f1af5d2fSBarry Smith 628f1af5d2fSBarry Smith /* forward solve the U^T */ 629f1af5d2fSBarry Smith idx = 0; 630f1af5d2fSBarry Smith for (i=0; i<n; i++) { 631f1af5d2fSBarry Smith 632f1af5d2fSBarry Smith v = aa + 9*diag[i]; 633f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 634f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 635f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 636f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 637f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 638f1af5d2fSBarry Smith v += 9; 639f1af5d2fSBarry Smith 640f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 641f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 642f1af5d2fSBarry Smith while (nz--) { 643f1af5d2fSBarry 
Smith oidx = 3*(*vi++); 644f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 645f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 646f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 647f1af5d2fSBarry Smith v += 9; 648f1af5d2fSBarry Smith } 649f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 650f1af5d2fSBarry Smith idx += 3; 651f1af5d2fSBarry Smith } 652f1af5d2fSBarry Smith /* backward solve the L^T */ 653f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 654f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 655f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 656f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 657f1af5d2fSBarry Smith idt = 3*i; 658f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 659f1af5d2fSBarry Smith while (nz--) { 660f1af5d2fSBarry Smith idx = 3*(*vi--); 661f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 662f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 663f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 664f1af5d2fSBarry Smith v -= 9; 665f1af5d2fSBarry Smith } 666f1af5d2fSBarry Smith } 667f1af5d2fSBarry Smith 668f1af5d2fSBarry Smith /* copy t into x according to permutation */ 669f1af5d2fSBarry Smith ii = 0; 670f1af5d2fSBarry Smith for (i=0; i<n; i++) { 671f1af5d2fSBarry Smith ir = 3*r[i]; 672f1af5d2fSBarry Smith x[ir] = t[ii]; 673f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 674f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 675f1af5d2fSBarry Smith ii += 3; 676f1af5d2fSBarry Smith } 677f1af5d2fSBarry Smith 678f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 679f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 680b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 681b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 682b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 683f1af5d2fSBarry Smith PetscFunctionReturn(0); 684f1af5d2fSBarry Smith } 685f1af5d2fSBarry Smith 6864a2ae208SSatish 
Balay #undef __FUNCT__ 6874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 6887c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 689f1af5d2fSBarry Smith { 690f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 691f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 692f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 693f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 694f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 69587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 69687828ca2SBarry Smith PetscScalar *x,*b,*t; 697f1af5d2fSBarry Smith 698f1af5d2fSBarry Smith PetscFunctionBegin; 699b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 700b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 701f1af5d2fSBarry Smith t = a->solve_work; 702f1af5d2fSBarry Smith 703f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 704f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 705f1af5d2fSBarry Smith 706f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 707f1af5d2fSBarry Smith ii = 0; 708f1af5d2fSBarry Smith for (i=0; i<n; i++) { 709f1af5d2fSBarry Smith ic = 4*c[i]; 710f1af5d2fSBarry Smith t[ii] = b[ic]; 711f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 712f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 713f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 714f1af5d2fSBarry Smith ii += 4; 715f1af5d2fSBarry Smith } 716f1af5d2fSBarry Smith 717f1af5d2fSBarry Smith /* forward solve the U^T */ 718f1af5d2fSBarry Smith idx = 0; 719f1af5d2fSBarry Smith for (i=0; i<n; i++) { 720f1af5d2fSBarry Smith 721f1af5d2fSBarry Smith v = aa + 16*diag[i]; 722f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 723f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 724f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 725f1af5d2fSBarry Smith s2 = 
v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 726f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 727f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 728f1af5d2fSBarry Smith v += 16; 729f1af5d2fSBarry Smith 730f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 731f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 732f1af5d2fSBarry Smith while (nz--) { 733f1af5d2fSBarry Smith oidx = 4*(*vi++); 734f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 735f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 736f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 737f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 738f1af5d2fSBarry Smith v += 16; 739f1af5d2fSBarry Smith } 740f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 741f1af5d2fSBarry Smith idx += 4; 742f1af5d2fSBarry Smith } 743f1af5d2fSBarry Smith /* backward solve the L^T */ 744f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 745f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 746f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 747f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 748f1af5d2fSBarry Smith idt = 4*i; 749f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 750f1af5d2fSBarry Smith while (nz--) { 751f1af5d2fSBarry Smith idx = 4*(*vi--); 752f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756f1af5d2fSBarry Smith v -= 16; 757f1af5d2fSBarry Smith } 758f1af5d2fSBarry Smith } 759f1af5d2fSBarry Smith 760f1af5d2fSBarry Smith /* copy t into x according to permutation */ 761f1af5d2fSBarry Smith ii = 0; 762f1af5d2fSBarry Smith for (i=0; i<n; i++) { 763f1af5d2fSBarry Smith ir = 4*r[i]; 764f1af5d2fSBarry Smith x[ir] = 
t[ii]; 765f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 766f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 767f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 768f1af5d2fSBarry Smith ii += 4; 769f1af5d2fSBarry Smith } 770f1af5d2fSBarry Smith 771f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 772f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 773b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 774b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 775b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 776f1af5d2fSBarry Smith PetscFunctionReturn(0); 777f1af5d2fSBarry Smith } 778f1af5d2fSBarry Smith 7794a2ae208SSatish Balay #undef __FUNCT__ 7804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 7817c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 782f1af5d2fSBarry Smith { 783f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 784f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 785f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 786f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 787f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 78887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 78987828ca2SBarry Smith PetscScalar *x,*b,*t; 790f1af5d2fSBarry Smith 791f1af5d2fSBarry Smith PetscFunctionBegin; 792b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 793b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 794f1af5d2fSBarry Smith t = a->solve_work; 795f1af5d2fSBarry Smith 796f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 797f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 798f1af5d2fSBarry Smith 799f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 800f1af5d2fSBarry Smith ii = 0; 801f1af5d2fSBarry Smith for (i=0; i<n; i++) { 802f1af5d2fSBarry Smith ic = 5*c[i]; 
803f1af5d2fSBarry Smith t[ii] = b[ic]; 804f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 805f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 806f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 807f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 808f1af5d2fSBarry Smith ii += 5; 809f1af5d2fSBarry Smith } 810f1af5d2fSBarry Smith 811f1af5d2fSBarry Smith /* forward solve the U^T */ 812f1af5d2fSBarry Smith idx = 0; 813f1af5d2fSBarry Smith for (i=0; i<n; i++) { 814f1af5d2fSBarry Smith 815f1af5d2fSBarry Smith v = aa + 25*diag[i]; 816f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 817f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 818f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 819f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 820f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 821f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 822f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 823f1af5d2fSBarry Smith v += 25; 824f1af5d2fSBarry Smith 825f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 826f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 827f1af5d2fSBarry Smith while (nz--) { 828f1af5d2fSBarry Smith oidx = 5*(*vi++); 829f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 830f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 831f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 832f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 833f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 834f1af5d2fSBarry Smith v += 25; 835f1af5d2fSBarry Smith } 836f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 837f1af5d2fSBarry Smith idx += 5; 838f1af5d2fSBarry Smith } 839f1af5d2fSBarry Smith /* 
backward solve the L^T */ 840f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 841f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 842f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 843f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 844f1af5d2fSBarry Smith idt = 5*i; 845f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 846f1af5d2fSBarry Smith while (nz--) { 847f1af5d2fSBarry Smith idx = 5*(*vi--); 848f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 849f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 850f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 851f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 852f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 853f1af5d2fSBarry Smith v -= 25; 854f1af5d2fSBarry Smith } 855f1af5d2fSBarry Smith } 856f1af5d2fSBarry Smith 857f1af5d2fSBarry Smith /* copy t into x according to permutation */ 858f1af5d2fSBarry Smith ii = 0; 859f1af5d2fSBarry Smith for (i=0; i<n; i++) { 860f1af5d2fSBarry Smith ir = 5*r[i]; 861f1af5d2fSBarry Smith x[ir] = t[ii]; 862f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 863f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 864f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 865f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 866f1af5d2fSBarry Smith ii += 5; 867f1af5d2fSBarry Smith } 868f1af5d2fSBarry Smith 869f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 870f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 871b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 872b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 873b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 874f1af5d2fSBarry Smith PetscFunctionReturn(0); 875f1af5d2fSBarry Smith } 876f1af5d2fSBarry Smith 8774a2ae208SSatish Balay #undef __FUNCT__ 8784a2ae208SSatish Balay #define __FUNCT__ 
"MatSolveTranspose_SeqBAIJ_6" 8797c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 880f1af5d2fSBarry Smith { 881f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 882f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 883f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 884f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 885f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 88687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 88787828ca2SBarry Smith PetscScalar *x,*b,*t; 888f1af5d2fSBarry Smith 889f1af5d2fSBarry Smith PetscFunctionBegin; 890b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 891b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 892f1af5d2fSBarry Smith t = a->solve_work; 893f1af5d2fSBarry Smith 894f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 895f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 896f1af5d2fSBarry Smith 897f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 898f1af5d2fSBarry Smith ii = 0; 899f1af5d2fSBarry Smith for (i=0; i<n; i++) { 900f1af5d2fSBarry Smith ic = 6*c[i]; 901f1af5d2fSBarry Smith t[ii] = b[ic]; 902f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 903f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 904f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 905f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 906f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 907f1af5d2fSBarry Smith ii += 6; 908f1af5d2fSBarry Smith } 909f1af5d2fSBarry Smith 910f1af5d2fSBarry Smith /* forward solve the U^T */ 911f1af5d2fSBarry Smith idx = 0; 912f1af5d2fSBarry Smith for (i=0; i<n; i++) { 913f1af5d2fSBarry Smith 914f1af5d2fSBarry Smith v = aa + 36*diag[i]; 915f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 916f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 917f1af5d2fSBarry Smith x6 = t[5+idx]; 
918f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 919f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 920f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 921f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 922f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 923f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 924f1af5d2fSBarry Smith v += 36; 925f1af5d2fSBarry Smith 926f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 927f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 928f1af5d2fSBarry Smith while (nz--) { 929f1af5d2fSBarry Smith oidx = 6*(*vi++); 930f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 931f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 932f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 933f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 934f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 935f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 936f1af5d2fSBarry Smith v += 36; 937f1af5d2fSBarry Smith } 938f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 939f1af5d2fSBarry Smith t[5+idx] = s6; 940f1af5d2fSBarry Smith idx += 6; 941f1af5d2fSBarry Smith } 942f1af5d2fSBarry Smith /* backward solve the L^T */ 943f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 944f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 945f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 946f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 947f1af5d2fSBarry Smith idt = 6*i; 948f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = 
t[3+idt]; s5 = t[4+idt]; 949f1af5d2fSBarry Smith s6 = t[5+idt]; 950f1af5d2fSBarry Smith while (nz--) { 951f1af5d2fSBarry Smith idx = 6*(*vi--); 952f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958f1af5d2fSBarry Smith v -= 36; 959f1af5d2fSBarry Smith } 960f1af5d2fSBarry Smith } 961f1af5d2fSBarry Smith 962f1af5d2fSBarry Smith /* copy t into x according to permutation */ 963f1af5d2fSBarry Smith ii = 0; 964f1af5d2fSBarry Smith for (i=0; i<n; i++) { 965f1af5d2fSBarry Smith ir = 6*r[i]; 966f1af5d2fSBarry Smith x[ir] = t[ii]; 967f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 968f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 969f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 970f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 971f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 972f1af5d2fSBarry Smith ii += 6; 973f1af5d2fSBarry Smith } 974f1af5d2fSBarry Smith 975f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 976f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 977b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 978b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 979b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 980f1af5d2fSBarry Smith PetscFunctionReturn(0); 981f1af5d2fSBarry Smith } 982f1af5d2fSBarry Smith 9834a2ae208SSatish Balay #undef __FUNCT__ 9844a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 9857c922b88SBarry Smith int 
MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 986f1af5d2fSBarry Smith { 987f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 988f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 989f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 990f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 991f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 99287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 99387828ca2SBarry Smith PetscScalar *x,*b,*t; 994f1af5d2fSBarry Smith 995f1af5d2fSBarry Smith PetscFunctionBegin; 996b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 997b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 998f1af5d2fSBarry Smith t = a->solve_work; 999f1af5d2fSBarry Smith 1000f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1001f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1002f1af5d2fSBarry Smith 1003f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1004f1af5d2fSBarry Smith ii = 0; 1005f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1006f1af5d2fSBarry Smith ic = 7*c[i]; 1007f1af5d2fSBarry Smith t[ii] = b[ic]; 1008f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1009f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1010f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1011f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1012f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1013f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1014f1af5d2fSBarry Smith ii += 7; 1015f1af5d2fSBarry Smith } 1016f1af5d2fSBarry Smith 1017f1af5d2fSBarry Smith /* forward solve the U^T */ 1018f1af5d2fSBarry Smith idx = 0; 1019f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1020f1af5d2fSBarry Smith 1021f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1022f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1023f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1024f1af5d2fSBarry Smith x6 = t[5+idx]; 
x7 = t[6+idx]; 1025f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1026f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1027f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1028f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1029f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1030f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1031f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1032f1af5d2fSBarry Smith v += 49; 1033f1af5d2fSBarry Smith 1034f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1035f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1036f1af5d2fSBarry Smith while (nz--) { 1037f1af5d2fSBarry Smith oidx = 7*(*vi++); 1038f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1039f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1040f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1041f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1042f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1043f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1044f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1045f1af5d2fSBarry Smith v += 49; 1046f1af5d2fSBarry Smith } 1047f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1048f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 
1049f1af5d2fSBarry Smith idx += 7; 1050f1af5d2fSBarry Smith } 1051f1af5d2fSBarry Smith /* backward solve the L^T */ 1052f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1053f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1054f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1055f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1056f1af5d2fSBarry Smith idt = 7*i; 1057f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1058f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1059f1af5d2fSBarry Smith while (nz--) { 1060f1af5d2fSBarry Smith idx = 7*(*vi--); 1061f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1062f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1063f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1064f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1065f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1066f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1067f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1068f1af5d2fSBarry Smith v -= 49; 1069f1af5d2fSBarry Smith } 1070f1af5d2fSBarry Smith } 1071f1af5d2fSBarry Smith 1072f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1073f1af5d2fSBarry Smith ii = 0; 1074f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1075f1af5d2fSBarry Smith ir = 7*r[i]; 1076f1af5d2fSBarry Smith x[ir] = t[ii]; 1077f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1078f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1079f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1080f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1081f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1082f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1083f1af5d2fSBarry Smith ii += 7; 
1084f1af5d2fSBarry Smith } 1085f1af5d2fSBarry Smith 1086f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1087f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1088b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 1089b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1090b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 1091f1af5d2fSBarry Smith PetscFunctionReturn(0); 1092f1af5d2fSBarry Smith } 1093f1af5d2fSBarry Smith 10944e2b4712SSatish Balay /* ----------------------------------------------------------- */ 10954a2ae208SSatish Balay #undef __FUNCT__ 10964a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 10974e2b4712SSatish Balay int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 10984e2b4712SSatish Balay { 10994e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11004e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11014e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 11024e2b4712SSatish Balay int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 11033f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 110487828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11054e2b4712SSatish Balay 11064e2b4712SSatish Balay PetscFunctionBegin; 1107b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 1108b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 1109f1af5d2fSBarry Smith t = a->solve_work; 11104e2b4712SSatish Balay 11114e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11124e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11134e2b4712SSatish Balay 11144e2b4712SSatish Balay /* forward solve the lower triangular */ 111587828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11164e2b4712SSatish Balay for (i=1; i<n; i++) { 11174e2b4712SSatish Balay v = aa + bs2*ai[i]; 11184e2b4712SSatish Balay vi = aj + ai[i]; 11194e2b4712SSatish Balay nz = 
a->diag[i] - ai[i]; 1120f1af5d2fSBarry Smith s = t + bs*i; 112187828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11224e2b4712SSatish Balay while (nz--) { 1123f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11244e2b4712SSatish Balay v += bs2; 11254e2b4712SSatish Balay } 11264e2b4712SSatish Balay } 11274e2b4712SSatish Balay /* backward solve the upper triangular */ 1128273d9f13SBarry Smith ls = a->solve_work + A->n; 11294e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 11304e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11314e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11324e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 113387828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11344e2b4712SSatish Balay while (nz--) { 1135f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11364e2b4712SSatish Balay v += bs2; 11374e2b4712SSatish Balay } 1138f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 113987828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11404e2b4712SSatish Balay } 11414e2b4712SSatish Balay 11424e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11434e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1144b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 1145b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1146b0a32e0cSBarry Smith PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 11474e2b4712SSatish Balay PetscFunctionReturn(0); 11484e2b4712SSatish Balay } 11494e2b4712SSatish Balay 11504a2ae208SSatish Balay #undef __FUNCT__ 11514a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 11524e2b4712SSatish Balay int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11534e2b4712SSatish Balay { 11544e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11554e2b4712SSatish Balay 
IS iscol=a->col,isrow=a->row; 11564e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 11574e2b4712SSatish Balay int *diag = a->diag; 11583f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 115987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 116087828ca2SBarry Smith PetscScalar *x,*b,*t; 11614e2b4712SSatish Balay 11624e2b4712SSatish Balay PetscFunctionBegin; 1163b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 1164b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 1165f1af5d2fSBarry Smith t = a->solve_work; 11664e2b4712SSatish Balay 11674e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11684e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11694e2b4712SSatish Balay 11704e2b4712SSatish Balay /* forward solve the lower triangular */ 11714e2b4712SSatish Balay idx = 7*(*r++); 1172f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1173f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1174f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 11754e2b4712SSatish Balay 11764e2b4712SSatish Balay for (i=1; i<n; i++) { 11774e2b4712SSatish Balay v = aa + 49*ai[i]; 11784e2b4712SSatish Balay vi = aj + ai[i]; 11794e2b4712SSatish Balay nz = diag[i] - ai[i]; 11804e2b4712SSatish Balay idx = 7*(*r++); 1181f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1182f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 11834e2b4712SSatish Balay while (nz--) { 11844e2b4712SSatish Balay idx = 7*(*vi++); 1185f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1186f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1187f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1188f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1189f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + 
v[43]*x7; 1190f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1191f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1192f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1193f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1194f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 11954e2b4712SSatish Balay v += 49; 11964e2b4712SSatish Balay } 11974e2b4712SSatish Balay idx = 7*i; 1198f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1199f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1200f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12014e2b4712SSatish Balay } 12024e2b4712SSatish Balay /* backward solve the upper triangular */ 12034e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12044e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12054e2b4712SSatish Balay vi = aj + diag[i] + 1; 12064e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12074e2b4712SSatish Balay idt = 7*i; 1208f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1209f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1210f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12114e2b4712SSatish Balay while (nz--) { 12124e2b4712SSatish Balay idx = 7*(*vi++); 1213f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1214f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1215f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1216f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1217f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1218f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1219f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 
+ v[45]*x7; 1220f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1221f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1222f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12234e2b4712SSatish Balay v += 49; 12244e2b4712SSatish Balay } 12254e2b4712SSatish Balay idc = 7*(*c--); 12264e2b4712SSatish Balay v = aa + 49*diag[i]; 1227f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1228f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1229f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1230f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1231f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1232f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1233f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1234f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1235f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1236f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1237f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1238f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1239f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1240f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12414e2b4712SSatish Balay } 12424e2b4712SSatish Balay 12434e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12444e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1245b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 1246b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1247b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 12484e2b4712SSatish Balay PetscFunctionReturn(0); 12494e2b4712SSatish Balay } 12504e2b4712SSatish Balay 12514a2ae208SSatish Balay #undef __FUNCT__ 
12524a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 125315091d37SBarry Smith int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 125415091d37SBarry Smith { 125515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 125615091d37SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 125715091d37SBarry Smith int ierr,*diag = a->diag,jdx; 125815091d37SBarry Smith MatScalar *aa=a->a,*v; 125987828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 126015091d37SBarry Smith 126115091d37SBarry Smith PetscFunctionBegin; 1262b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 1263b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 126415091d37SBarry Smith /* forward solve the lower triangular */ 126515091d37SBarry Smith idx = 0; 126615091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 126715091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 126815091d37SBarry Smith x[6] = b[6+idx]; 126915091d37SBarry Smith for (i=1; i<n; i++) { 127015091d37SBarry Smith v = aa + 49*ai[i]; 127115091d37SBarry Smith vi = aj + ai[i]; 127215091d37SBarry Smith nz = diag[i] - ai[i]; 127315091d37SBarry Smith idx = 7*i; 1274f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1275f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1276f1af5d2fSBarry Smith s7 = b[6+idx]; 127715091d37SBarry Smith while (nz--) { 127815091d37SBarry Smith jdx = 7*(*vi++); 127915091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 128015091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 128115091d37SBarry Smith x7 = x[6+jdx]; 1282f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1283f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1284f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 
1285f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1286f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1287f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1288f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 128915091d37SBarry Smith v += 49; 129015091d37SBarry Smith } 1291f1af5d2fSBarry Smith x[idx] = s1; 1292f1af5d2fSBarry Smith x[1+idx] = s2; 1293f1af5d2fSBarry Smith x[2+idx] = s3; 1294f1af5d2fSBarry Smith x[3+idx] = s4; 1295f1af5d2fSBarry Smith x[4+idx] = s5; 1296f1af5d2fSBarry Smith x[5+idx] = s6; 1297f1af5d2fSBarry Smith x[6+idx] = s7; 129815091d37SBarry Smith } 129915091d37SBarry Smith /* backward solve the upper triangular */ 130015091d37SBarry Smith for (i=n-1; i>=0; i--){ 130115091d37SBarry Smith v = aa + 49*diag[i] + 49; 130215091d37SBarry Smith vi = aj + diag[i] + 1; 130315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 130415091d37SBarry Smith idt = 7*i; 1305f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1306f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1307f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1308f1af5d2fSBarry Smith s7 = x[6+idt]; 130915091d37SBarry Smith while (nz--) { 131015091d37SBarry Smith idx = 7*(*vi++); 131115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 131215091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 131315091d37SBarry Smith x7 = x[6+idx]; 1314f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1315f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1316f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1317f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1318f1af5d2fSBarry 
Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1319f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1320f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 132115091d37SBarry Smith v += 49; 132215091d37SBarry Smith } 132315091d37SBarry Smith v = aa + 49*diag[i]; 1324f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1325f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1326f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1327f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1328f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1329f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1330f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1331f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1332f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1333f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1334f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1335f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1336f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1337f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 133815091d37SBarry Smith } 133915091d37SBarry Smith 1340b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 1341b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1342b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 134315091d37SBarry Smith PetscFunctionReturn(0); 134415091d37SBarry Smith } 134515091d37SBarry Smith 13464a2ae208SSatish Balay #undef __FUNCT__ 13474a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 134815091d37SBarry Smith int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 134915091d37SBarry Smith { 135015091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 
135115091d37SBarry Smith IS iscol=a->col,isrow=a->row; 135215091d37SBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 135315091d37SBarry Smith int *diag = a->diag; 135415091d37SBarry Smith MatScalar *aa=a->a,*v; 135587828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 135615091d37SBarry Smith 135715091d37SBarry Smith PetscFunctionBegin; 1358b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 1359b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 1360f1af5d2fSBarry Smith t = a->solve_work; 136115091d37SBarry Smith 136215091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 136315091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 136415091d37SBarry Smith 136515091d37SBarry Smith /* forward solve the lower triangular */ 136615091d37SBarry Smith idx = 6*(*r++); 1367f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1368f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1369f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 137015091d37SBarry Smith for (i=1; i<n; i++) { 137115091d37SBarry Smith v = aa + 36*ai[i]; 137215091d37SBarry Smith vi = aj + ai[i]; 137315091d37SBarry Smith nz = diag[i] - ai[i]; 137415091d37SBarry Smith idx = 6*(*r++); 1375f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1376f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 137715091d37SBarry Smith while (nz--) { 137815091d37SBarry Smith idx = 6*(*vi++); 1379f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1380f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1381f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1382f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1383f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1384f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 
+ v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1385f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1386f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 138715091d37SBarry Smith v += 36; 138815091d37SBarry Smith } 138915091d37SBarry Smith idx = 6*i; 1390f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1391f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1392f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 139315091d37SBarry Smith } 139415091d37SBarry Smith /* backward solve the upper triangular */ 139515091d37SBarry Smith for (i=n-1; i>=0; i--){ 139615091d37SBarry Smith v = aa + 36*diag[i] + 36; 139715091d37SBarry Smith vi = aj + diag[i] + 1; 139815091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 139915091d37SBarry Smith idt = 6*i; 1400f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1401f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1402f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 140315091d37SBarry Smith while (nz--) { 140415091d37SBarry Smith idx = 6*(*vi++); 1405f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1406f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1407f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1408f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1409f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1410f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1411f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1412f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1413f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 141415091d37SBarry Smith v += 36; 141515091d37SBarry Smith } 141615091d37SBarry Smith idc = 6*(*c--); 141715091d37SBarry Smith v = aa + 36*diag[i]; 1418f1af5d2fSBarry Smith x[idc] = t[idt] = 
v[0]*s1+v[6]*s2+v[12]*s3+ 1419f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1420f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1421f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1422f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1423f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1424f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1425f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1426f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1427f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1428f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1429f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 143015091d37SBarry Smith } 143115091d37SBarry Smith 143215091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 143315091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1434b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 1435b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1436b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 143715091d37SBarry Smith PetscFunctionReturn(0); 143815091d37SBarry Smith } 143915091d37SBarry Smith 14404a2ae208SSatish Balay #undef __FUNCT__ 14414a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 144215091d37SBarry Smith int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 144315091d37SBarry Smith { 144415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 144515091d37SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 144615091d37SBarry Smith int ierr,*diag = a->diag,jdx; 144715091d37SBarry Smith MatScalar *aa=a->a,*v; 144887828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 144915091d37SBarry Smith 145015091d37SBarry Smith PetscFunctionBegin; 1451b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 1452b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 
145315091d37SBarry Smith /* forward solve the lower triangular */ 145415091d37SBarry Smith idx = 0; 145515091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 145615091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 145715091d37SBarry Smith for (i=1; i<n; i++) { 145815091d37SBarry Smith v = aa + 36*ai[i]; 145915091d37SBarry Smith vi = aj + ai[i]; 146015091d37SBarry Smith nz = diag[i] - ai[i]; 146115091d37SBarry Smith idx = 6*i; 1462f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1463f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 146415091d37SBarry Smith while (nz--) { 146515091d37SBarry Smith jdx = 6*(*vi++); 146615091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 146715091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1468f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1469f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1470f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1471f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1472f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1473f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 147415091d37SBarry Smith v += 36; 147515091d37SBarry Smith } 1476f1af5d2fSBarry Smith x[idx] = s1; 1477f1af5d2fSBarry Smith x[1+idx] = s2; 1478f1af5d2fSBarry Smith x[2+idx] = s3; 1479f1af5d2fSBarry Smith x[3+idx] = s4; 1480f1af5d2fSBarry Smith x[4+idx] = s5; 1481f1af5d2fSBarry Smith x[5+idx] = s6; 148215091d37SBarry Smith } 148315091d37SBarry Smith /* backward solve the upper triangular */ 148415091d37SBarry Smith for (i=n-1; i>=0; i--){ 148515091d37SBarry Smith v = aa + 36*diag[i] + 36; 148615091d37SBarry Smith vi = aj + diag[i] + 1; 148715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 
148815091d37SBarry Smith idt = 6*i; 1489f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1490f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1491f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 149215091d37SBarry Smith while (nz--) { 149315091d37SBarry Smith idx = 6*(*vi++); 149415091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 149515091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1496f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1497f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1498f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1499f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1500f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1501f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 150215091d37SBarry Smith v += 36; 150315091d37SBarry Smith } 150415091d37SBarry Smith v = aa + 36*diag[i]; 1505f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1506f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1507f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1508f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1509f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1510f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 151115091d37SBarry Smith } 151215091d37SBarry Smith 1513b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 1514b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1515b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 151615091d37SBarry Smith PetscFunctionReturn(0); 
151715091d37SBarry Smith } 151815091d37SBarry Smith 15194a2ae208SSatish Balay #undef __FUNCT__ 15204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 15214e2b4712SSatish Balay int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 15224e2b4712SSatish Balay { 15234e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15244e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 15254e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 15264e2b4712SSatish Balay int *diag = a->diag; 15273f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 152887828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 15294e2b4712SSatish Balay 15304e2b4712SSatish Balay PetscFunctionBegin; 1531b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 1532b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 1533f1af5d2fSBarry Smith t = a->solve_work; 15344e2b4712SSatish Balay 15354e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 15364e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 15374e2b4712SSatish Balay 15384e2b4712SSatish Balay /* forward solve the lower triangular */ 15394e2b4712SSatish Balay idx = 5*(*r++); 1540f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1541f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 15424e2b4712SSatish Balay for (i=1; i<n; i++) { 15434e2b4712SSatish Balay v = aa + 25*ai[i]; 15444e2b4712SSatish Balay vi = aj + ai[i]; 15454e2b4712SSatish Balay nz = diag[i] - ai[i]; 15464e2b4712SSatish Balay idx = 5*(*r++); 1547f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1548f1af5d2fSBarry Smith s5 = b[4+idx]; 15494e2b4712SSatish Balay while (nz--) { 15504e2b4712SSatish Balay idx = 5*(*vi++); 1551f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1552f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1553f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + 
v[15]*x4 + v[20]*x5; 1554f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1555f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1556f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1557f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 15584e2b4712SSatish Balay v += 25; 15594e2b4712SSatish Balay } 15604e2b4712SSatish Balay idx = 5*i; 1561f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1562f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 15634e2b4712SSatish Balay } 15644e2b4712SSatish Balay /* backward solve the upper triangular */ 15654e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 15664e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 15674e2b4712SSatish Balay vi = aj + diag[i] + 1; 15684e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 15694e2b4712SSatish Balay idt = 5*i; 1570f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1571f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 15724e2b4712SSatish Balay while (nz--) { 15734e2b4712SSatish Balay idx = 5*(*vi++); 1574f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1575f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1576f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1577f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1578f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1579f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1580f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 15814e2b4712SSatish Balay v += 25; 15824e2b4712SSatish Balay } 15834e2b4712SSatish Balay idc = 5*(*c--); 15844e2b4712SSatish Balay v = aa + 25*diag[i]; 1585f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1586f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 1587f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 
1588f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 1589f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1590f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 1591f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1592f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 1593f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1594f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 15954e2b4712SSatish Balay } 15964e2b4712SSatish Balay 15974e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 15984e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1599b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 1600b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1601b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 16024e2b4712SSatish Balay PetscFunctionReturn(0); 16034e2b4712SSatish Balay } 16044e2b4712SSatish Balay 16054a2ae208SSatish Balay #undef __FUNCT__ 16064a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 160715091d37SBarry Smith int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 160815091d37SBarry Smith { 160915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 161015091d37SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 161115091d37SBarry Smith int ierr,*diag = a->diag,jdx; 161215091d37SBarry Smith MatScalar *aa=a->a,*v; 161387828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 161415091d37SBarry Smith 161515091d37SBarry Smith PetscFunctionBegin; 1616b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 1617b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 161815091d37SBarry Smith /* forward solve the lower triangular */ 161915091d37SBarry Smith idx = 0; 162015091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 162115091d37SBarry Smith for (i=1; i<n; i++) { 162215091d37SBarry Smith v = aa + 25*ai[i]; 162315091d37SBarry 
Smith vi = aj + ai[i]; 162415091d37SBarry Smith nz = diag[i] - ai[i]; 162515091d37SBarry Smith idx = 5*i; 1626f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 162715091d37SBarry Smith while (nz--) { 162815091d37SBarry Smith jdx = 5*(*vi++); 162915091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1630f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1631f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1632f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1633f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1634f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 163515091d37SBarry Smith v += 25; 163615091d37SBarry Smith } 1637f1af5d2fSBarry Smith x[idx] = s1; 1638f1af5d2fSBarry Smith x[1+idx] = s2; 1639f1af5d2fSBarry Smith x[2+idx] = s3; 1640f1af5d2fSBarry Smith x[3+idx] = s4; 1641f1af5d2fSBarry Smith x[4+idx] = s5; 164215091d37SBarry Smith } 164315091d37SBarry Smith /* backward solve the upper triangular */ 164415091d37SBarry Smith for (i=n-1; i>=0; i--){ 164515091d37SBarry Smith v = aa + 25*diag[i] + 25; 164615091d37SBarry Smith vi = aj + diag[i] + 1; 164715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 164815091d37SBarry Smith idt = 5*i; 1649f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1650f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 165115091d37SBarry Smith while (nz--) { 165215091d37SBarry Smith idx = 5*(*vi++); 165315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1654f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1655f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1656f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1657f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + 
v[18]*x4 + v[23]*x5; 1658f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 165915091d37SBarry Smith v += 25; 166015091d37SBarry Smith } 166115091d37SBarry Smith v = aa + 25*diag[i]; 1662f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1663f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1664f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1665f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1666f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 166715091d37SBarry Smith } 166815091d37SBarry Smith 1669b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 1670b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1671b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 167215091d37SBarry Smith PetscFunctionReturn(0); 167315091d37SBarry Smith } 167415091d37SBarry Smith 16754a2ae208SSatish Balay #undef __FUNCT__ 16764a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 16774e2b4712SSatish Balay int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 16784e2b4712SSatish Balay { 16794e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 16804e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 16814e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 16824e2b4712SSatish Balay int *diag = a->diag; 16833f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 168487828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 16854e2b4712SSatish Balay 16864e2b4712SSatish Balay PetscFunctionBegin; 1687b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 1688b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 1689f1af5d2fSBarry Smith t = a->solve_work; 16904e2b4712SSatish Balay 16914e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 16924e2b4712SSatish 
Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 16934e2b4712SSatish Balay 16944e2b4712SSatish Balay /* forward solve the lower triangular */ 16954e2b4712SSatish Balay idx = 4*(*r++); 1696f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1697f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 16984e2b4712SSatish Balay for (i=1; i<n; i++) { 16994e2b4712SSatish Balay v = aa + 16*ai[i]; 17004e2b4712SSatish Balay vi = aj + ai[i]; 17014e2b4712SSatish Balay nz = diag[i] - ai[i]; 17024e2b4712SSatish Balay idx = 4*(*r++); 1703f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 17044e2b4712SSatish Balay while (nz--) { 17054e2b4712SSatish Balay idx = 4*(*vi++); 1706f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1707f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1708f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1709f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1710f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17114e2b4712SSatish Balay v += 16; 17124e2b4712SSatish Balay } 17134e2b4712SSatish Balay idx = 4*i; 1714f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1715f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 17164e2b4712SSatish Balay } 17174e2b4712SSatish Balay /* backward solve the upper triangular */ 17184e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 17194e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 17204e2b4712SSatish Balay vi = aj + diag[i] + 1; 17214e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 17224e2b4712SSatish Balay idt = 4*i; 1723f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1724f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 17254e2b4712SSatish Balay while (nz--) { 17264e2b4712SSatish Balay idx = 4*(*vi++); 1727f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1728f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1729f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 
1730f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1731f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1732f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17334e2b4712SSatish Balay v += 16; 17344e2b4712SSatish Balay } 17354e2b4712SSatish Balay idc = 4*(*c--); 17364e2b4712SSatish Balay v = aa + 16*diag[i]; 1737f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1738f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1739f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1740f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 17414e2b4712SSatish Balay } 17424e2b4712SSatish Balay 17434e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 17444e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1745b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 1746b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1747b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 17484e2b4712SSatish Balay PetscFunctionReturn(0); 17494e2b4712SSatish Balay } 1750f26ec98cSKris Buschelman 1751f26ec98cSKris Buschelman #undef __FUNCT__ 1752f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1753f26ec98cSKris Buschelman int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1754f26ec98cSKris Buschelman { 1755f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1756f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 1757f26ec98cSKris Buschelman int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1758f26ec98cSKris Buschelman int *diag = a->diag; 1759f26ec98cSKris Buschelman MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1760f26ec98cSKris Buschelman PetscScalar *x,*b; 1761f26ec98cSKris Buschelman 1762f26ec98cSKris Buschelman PetscFunctionBegin; 1763b1d4fb26SBarry Smith ierr = 
VecGetArrayFast(bb,&b);CHKERRQ(ierr); 1764b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 1765f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 1766f26ec98cSKris Buschelman 1767f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1768f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1769f26ec98cSKris Buschelman 1770f26ec98cSKris Buschelman /* forward solve the lower triangular */ 1771f26ec98cSKris Buschelman idx = 4*(*r++); 1772f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 1773f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 1774f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 1775f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 1776f26ec98cSKris Buschelman for (i=1; i<n; i++) { 1777f26ec98cSKris Buschelman v = aa + 16*ai[i]; 1778f26ec98cSKris Buschelman vi = aj + ai[i]; 1779f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 1780f26ec98cSKris Buschelman idx = 4*(*r++); 1781f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 1782f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 1783f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 1784f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 1785f26ec98cSKris Buschelman while (nz--) { 1786f26ec98cSKris Buschelman idx = 4*(*vi++); 1787f26ec98cSKris Buschelman x1 = t[idx]; 1788f26ec98cSKris Buschelman x2 = t[1+idx]; 1789f26ec98cSKris Buschelman x3 = t[2+idx]; 1790f26ec98cSKris Buschelman x4 = t[3+idx]; 1791f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1792f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1793f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1794f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1795f26ec98cSKris Buschelman v += 16; 1796f26ec98cSKris Buschelman } 1797f26ec98cSKris Buschelman idx = 4*i; 1798f26ec98cSKris Buschelman t[idx] = s1; 1799f26ec98cSKris Buschelman t[1+idx] = s2; 1800f26ec98cSKris Buschelman 
t[2+idx] = s3; 1801f26ec98cSKris Buschelman t[3+idx] = s4; 1802f26ec98cSKris Buschelman } 1803f26ec98cSKris Buschelman /* backward solve the upper triangular */ 1804f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 1805f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 1806f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 1807f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 1808f26ec98cSKris Buschelman idt = 4*i; 1809f26ec98cSKris Buschelman s1 = t[idt]; 1810f26ec98cSKris Buschelman s2 = t[1+idt]; 1811f26ec98cSKris Buschelman s3 = t[2+idt]; 1812f26ec98cSKris Buschelman s4 = t[3+idt]; 1813f26ec98cSKris Buschelman while (nz--) { 1814f26ec98cSKris Buschelman idx = 4*(*vi++); 1815f26ec98cSKris Buschelman x1 = t[idx]; 1816f26ec98cSKris Buschelman x2 = t[1+idx]; 1817f26ec98cSKris Buschelman x3 = t[2+idx]; 1818f26ec98cSKris Buschelman x4 = t[3+idx]; 1819f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1820f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1821f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1822f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1823f26ec98cSKris Buschelman v += 16; 1824f26ec98cSKris Buschelman } 1825f26ec98cSKris Buschelman idc = 4*(*c--); 1826f26ec98cSKris Buschelman v = aa + 16*diag[i]; 1827f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1828f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1829f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1830f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1831f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 1832f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 1833f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 1834f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 1835f26ec98cSKris Buschelman } 1836f26ec98cSKris Buschelman 1837f26ec98cSKris Buschelman ierr = 
#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - Triangular solves with a factored
   SeqBAIJ matrix made of 4x4 blocks, using the row permutation a->row and
   the column permutation a->col, with the block arithmetic done by SSE
   single precision kernels.

   Input Parameters:
+  A  - the factored matrix (A->data is a Mat_SeqBAIJ)
-  bb - the right hand side

   Output Parameter:
.  xx - the solution

   Returns 0 on success; PETSc error codes are propagated through CHKERRQ.

   Notes:
   The forward solve gathers the right hand side through the row
   permutation and stores its results in the work array a->solve_work (t).
   The backward solve multiplies by the stored (already inverted) diagonal
   block and scatters each finished block row into xx through the column
   permutation.  xx is therefore written ONLY by the backward solve.

   This code uses demotion of double to float when performing the
   mixed-mode computation.  This may not be numerically reasonable for
   all applications.
*/
int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
  IS          iscol=a->col,isrow=a->row;
  int         *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
  int         *diag = a->diag,ai16;
  MatScalar   *aa=a->a,*v;
  PetscScalar *x,*b,*t;

  /* Make space in temp stack for 16 Byte Aligned arrays */
  /* 11 floats = up to 3 floats of alignment padding + 4 (tmps) + 4 (tmpx) */
  float         ssealignedspace[11],*tmps,*tmpx;
  unsigned long offset;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  /* Number of floats to skip so that tmps and tmpx start on a 16 byte boundary */
  offset = (unsigned long)ssealignedspace % 16;
  if (offset) offset = (16 - offset)/4;
  tmps = &ssealignedspace[offset];
  tmpx = &ssealignedspace[offset+4];
  PREFETCH_NTA(aa+16*ai[1]);

  ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  /* Block row 0 has no off-diagonal lower blocks: copy the permuted rhs */
  idx  = 4*(*r++);
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx];
  v    =  aa + 16*ai[1];

  for (i=1; i<n;) {
    PREFETCH_NTA(&v[8]);
    vi   =  aj      + ai[i];
    nz   =  diag[i] - ai[i];
    idx  =  4*(*r++);

    /* Demote sum from double to float */
    CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
    LOAD_PS(tmps,XMM7);

    while (nz--) {
      PREFETCH_NTA(&v[16]);
      idx = 4*(*vi++);

      /*
         Demote solution (so far) from double to float.
         The forward solve keeps its results in t, not in x (x is only
         filled in by the backward solve), so the operand must come from t.
      */
      CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

      /* 4x4 Matrix-Vector product with negative accumulation: */
      SSE_INLINE_BEGIN_2(tmpx,v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
      SSE_INLINE_END_2

      v  += 16;
    }
    /* The loop counter is advanced here so the next row's blocks can be prefetched */
    idx = 4*i;
    v   = aa + 16*ai[++i];
    PREFETCH_NTA(v);
    STORE_PS(tmps,XMM7);

    /* Promote result from float to double */
    CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
  }
  /* backward solve the upper triangular */
  idt  = 4*(n-1);
  ai16 = 16*diag[n-1];
  v    = aa + ai16 + 16;
  for (i=n-1; i>=0;){
    PREFETCH_NTA(&v[8]);
    vi = aj + diag[i] + 1;
    nz = ai[i+1] - diag[i] - 1;

    /* Demote accumulator from double to float */
    CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
    LOAD_PS(tmps,XMM7);

    while (nz--) {
      PREFETCH_NTA(&v[16]);
      idx = 4*(*vi++);

      /* Demote solution (so far) from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

      /* 4x4 Matrix-Vector Product with negative accumulation: */
      SSE_INLINE_BEGIN_2(tmpx,v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
      SSE_INLINE_END_2
      v  += 16;
    }
    v    = aa + ai16;
    /*
       Step to the previous block row and prefetch its first upper block.
       On the last pass (i becomes -1) there is no previous row: keep the
       old ai16 rather than reading diag[-1]; it is only used as a
       prefetch hint from here on.
    */
    if (--i >= 0) {
      ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
    }
    /*
       Scale the result by the diagonal 4x4 block,
       which was inverted as part of the factorization
    */
    SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
    SSE_INLINE_END_3

    /* Promote solution from float to double */
    CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

    /* Apply reordering to t and stream into x.    */
    /* This way, x doesn't pollute the cache.      */
    /* Be careful with size: 2 doubles = 4 floats! */
    idc  = 4*(*c--);
    SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
      /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
      SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
      /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
      SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
    SSE_INLINE_END_2
    v    = aa + ai16 + 16;
    idt -= 4;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
  PetscLogFlops(2*16*(a->nz) - 4*A->n);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
20714e2b4712SSatish Balay */ 20724a2ae208SSatish Balay #undef __FUNCT__ 20734a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 20744e2b4712SSatish Balay int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 20754e2b4712SSatish Balay { 20764e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 207730d4dcafSBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 207830d4dcafSBarry Smith int ierr,*diag = a->diag; 20793f1db9ecSBarry Smith MatScalar *aa=a->a; 208087828ca2SBarry Smith PetscScalar *x,*b; 20814e2b4712SSatish Balay 20824e2b4712SSatish Balay PetscFunctionBegin; 2083b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2084b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 20854e2b4712SSatish Balay 2086aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 20872853dc0eSBarry Smith { 208887828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 20892853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 20902853dc0eSBarry Smith } 2091aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 20922853dc0eSBarry Smith { 209387828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 20942853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 20952853dc0eSBarry Smith } 2096aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 20972853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2098e1293385SBarry Smith #else 209930d4dcafSBarry Smith { 210087828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 21013f1db9ecSBarry Smith MatScalar *v; 21024e555682SBarry Smith int jdx,idt,idx,nz,*vi,i,ai16; 2103e1293385SBarry Smith 21044e2b4712SSatish Balay /* forward solve the lower triangular */ 21054e2b4712SSatish Balay idx = 0; 2106e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 21074e2b4712SSatish Balay for (i=1; i<n; i++) { 21084e2b4712SSatish Balay v = aa + 16*ai[i]; 
21094e2b4712SSatish Balay vi = aj + ai[i]; 21104e2b4712SSatish Balay nz = diag[i] - ai[i]; 2111e1293385SBarry Smith idx += 4; 2112f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 21134e2b4712SSatish Balay while (nz--) { 21144e2b4712SSatish Balay jdx = 4*(*vi++); 21154e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2116f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2117f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2118f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2119f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 21204e2b4712SSatish Balay v += 16; 21214e2b4712SSatish Balay } 2122f1af5d2fSBarry Smith x[idx] = s1; 2123f1af5d2fSBarry Smith x[1+idx] = s2; 2124f1af5d2fSBarry Smith x[2+idx] = s3; 2125f1af5d2fSBarry Smith x[3+idx] = s4; 21264e2b4712SSatish Balay } 21274e2b4712SSatish Balay /* backward solve the upper triangular */ 21284e555682SBarry Smith idt = 4*(n-1); 21294e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 21304e555682SBarry Smith ai16 = 16*diag[i]; 21314e555682SBarry Smith v = aa + ai16 + 16; 21324e2b4712SSatish Balay vi = aj + diag[i] + 1; 21334e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2134f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2135f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 21364e2b4712SSatish Balay while (nz--) { 21374e2b4712SSatish Balay idx = 4*(*vi++); 21384e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2139f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2140f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2141f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2142f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 21434e2b4712SSatish Balay v += 16; 21444e2b4712SSatish Balay } 21454e555682SBarry Smith v = aa + ai16; 2146f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 
2147f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2148f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2149f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2150329f5518SBarry Smith idt -= 4; 21514e2b4712SSatish Balay } 215230d4dcafSBarry Smith } 2153e1293385SBarry Smith #endif 21544e2b4712SSatish Balay 2155b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 2156b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 2157b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 21584e2b4712SSatish Balay PetscFunctionReturn(0); 21594e2b4712SSatish Balay } 21604e2b4712SSatish Balay 2161f26ec98cSKris Buschelman #undef __FUNCT__ 2162f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2163f26ec98cSKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2164f26ec98cSKris Buschelman { 2165f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2166f26ec98cSKris Buschelman int n=a->mbs,*ai=a->i,*aj=a->j; 2167f26ec98cSKris Buschelman int ierr,*diag = a->diag; 2168f26ec98cSKris Buschelman MatScalar *aa=a->a; 2169f26ec98cSKris Buschelman PetscScalar *x,*b; 2170f26ec98cSKris Buschelman 2171f26ec98cSKris Buschelman PetscFunctionBegin; 2172b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2173b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 2174f26ec98cSKris Buschelman 2175f26ec98cSKris Buschelman { 2176f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2177f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 2178f26ec98cSKris Buschelman int jdx,idt,idx,nz,*vi,i,ai16; 2179f26ec98cSKris Buschelman 2180f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2181f26ec98cSKris Buschelman idx = 0; 2182f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 2183f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 2184f26ec98cSKris Buschelman t[2] = 
(MatScalar)b[2]; 2185f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 2186f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2187f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2188f26ec98cSKris Buschelman vi = aj + ai[i]; 2189f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2190f26ec98cSKris Buschelman idx += 4; 2191f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2192f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2193f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2194f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2195f26ec98cSKris Buschelman while (nz--) { 2196f26ec98cSKris Buschelman jdx = 4*(*vi++); 2197f26ec98cSKris Buschelman x1 = t[jdx]; 2198f26ec98cSKris Buschelman x2 = t[1+jdx]; 2199f26ec98cSKris Buschelman x3 = t[2+jdx]; 2200f26ec98cSKris Buschelman x4 = t[3+jdx]; 2201f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2202f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2203f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2204f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2205f26ec98cSKris Buschelman v += 16; 2206f26ec98cSKris Buschelman } 2207f26ec98cSKris Buschelman t[idx] = s1; 2208f26ec98cSKris Buschelman t[1+idx] = s2; 2209f26ec98cSKris Buschelman t[2+idx] = s3; 2210f26ec98cSKris Buschelman t[3+idx] = s4; 2211f26ec98cSKris Buschelman } 2212f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2213f26ec98cSKris Buschelman idt = 4*(n-1); 2214f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2215f26ec98cSKris Buschelman ai16 = 16*diag[i]; 2216f26ec98cSKris Buschelman v = aa + ai16 + 16; 2217f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2218f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2219f26ec98cSKris Buschelman s1 = t[idt]; 2220f26ec98cSKris Buschelman s2 = t[1+idt]; 2221f26ec98cSKris Buschelman s3 = t[2+idt]; 2222f26ec98cSKris Buschelman s4 = t[3+idt]; 2223f26ec98cSKris Buschelman while (nz--) { 2224f26ec98cSKris Buschelman idx = 
4*(*vi++); 2225f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 2226f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 2227f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 2228f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 2229f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2230f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2231f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2232f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2233f26ec98cSKris Buschelman v += 16; 2234f26ec98cSKris Buschelman } 2235f26ec98cSKris Buschelman v = aa + ai16; 2236f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2237f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2238f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2239f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2240f26ec98cSKris Buschelman idt -= 4; 2241f26ec98cSKris Buschelman } 2242f26ec98cSKris Buschelman } 2243f26ec98cSKris Buschelman 2244b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 2245b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 2246f26ec98cSKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 2247f26ec98cSKris Buschelman PetscFunctionReturn(0); 2248f26ec98cSKris Buschelman } 2249f26ec98cSKris Buschelman 22503660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 22513660e330SKris Buschelman 22523660e330SKris Buschelman #include PETSC_HAVE_SSE 22533660e330SKris Buschelman #undef __FUNCT__ 22547cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 22557cf1b8d3SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 22563660e330SKris Buschelman { 22573660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 22582aa5897fSKris 
Buschelman unsigned short *aj=(unsigned short *)a->j; 22592aa5897fSKris Buschelman int ierr,*ai=a->i,n=a->mbs,*diag = a->diag; 22603660e330SKris Buschelman MatScalar *aa=a->a; 226187828ca2SBarry Smith PetscScalar *x,*b; 22623660e330SKris Buschelman 22633660e330SKris Buschelman PetscFunctionBegin; 22643660e330SKris Buschelman SSE_SCOPE_BEGIN; 22653660e330SKris Buschelman /* 22663660e330SKris Buschelman Note: This code currently uses demotion of double 22673660e330SKris Buschelman to float when performing the mixed-mode computation. 22683660e330SKris Buschelman This may not be numerically reasonable for all applications. 22693660e330SKris Buschelman */ 22703660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 22713660e330SKris Buschelman 22726f6a888dSBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 22736f6a888dSBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 22743660e330SKris Buschelman { 2275eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 2276eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 22772aa5897fSKris Buschelman int nz,i,idt,ai16; 22782aa5897fSKris Buschelman unsigned int jdx,idx; 22792aa5897fSKris Buschelman unsigned short *vi; 2280eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 22813660e330SKris Buschelman 2282eb05f457SKris Buschelman /* First block is the identity. */ 22833660e330SKris Buschelman idx = 0; 2284eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 22852aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 22863660e330SKris Buschelman 22873660e330SKris Buschelman for (i=1; i<n;) { 22883660e330SKris Buschelman PREFETCH_NTA(&v[8]); 22893660e330SKris Buschelman vi = aj + ai[i]; 22903660e330SKris Buschelman nz = diag[i] - ai[i]; 22913660e330SKris Buschelman idx += 4; 22923660e330SKris Buschelman 2293eb05f457SKris Buschelman /* Demote RHS from double to float. 
*/ 2294eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2295eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 22963660e330SKris Buschelman 22973660e330SKris Buschelman while (nz--) { 22983660e330SKris Buschelman PREFETCH_NTA(&v[16]); 22992aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 23003660e330SKris Buschelman 23013660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 2302eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 23033660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 23043660e330SKris Buschelman 23053660e330SKris Buschelman /* First Column */ 23063660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 23073660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 23083660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 23093660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 23103660e330SKris Buschelman 23113660e330SKris Buschelman /* Second Column */ 23123660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 23133660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 23143660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 23153660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 23163660e330SKris Buschelman 23173660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 23183660e330SKris Buschelman 23193660e330SKris Buschelman /* Third Column */ 23203660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 23213660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 23223660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 23233660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 23243660e330SKris Buschelman 23253660e330SKris Buschelman /* Fourth Column */ 23263660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 23273660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 23283660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 23293660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 23303660e330SKris Buschelman SSE_INLINE_END_2 23313660e330SKris Buschelman 23323660e330SKris Buschelman v += 16; 23333660e330SKris 
Buschelman } 23343660e330SKris Buschelman v = aa + 16*ai[++i]; 23353660e330SKris Buschelman PREFETCH_NTA(v); 2336eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 23373660e330SKris Buschelman } 2338eb05f457SKris Buschelman 2339eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 2340eb05f457SKris Buschelman 23413660e330SKris Buschelman idt = 4*(n-1); 23423660e330SKris Buschelman ai16 = 16*diag[n-1]; 23433660e330SKris Buschelman v = aa + ai16 + 16; 23443660e330SKris Buschelman for (i=n-1; i>=0;){ 23453660e330SKris Buschelman PREFETCH_NTA(&v[8]); 23463660e330SKris Buschelman vi = aj + diag[i] + 1; 23473660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 23483660e330SKris Buschelman 2349eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 23503660e330SKris Buschelman 23513660e330SKris Buschelman while (nz--) { 23523660e330SKris Buschelman PREFETCH_NTA(&v[16]); 23532aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 23543660e330SKris Buschelman 23553660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 2356eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 23573660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 23583660e330SKris Buschelman 23593660e330SKris Buschelman /* First Column */ 23603660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 23613660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 23623660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 23633660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 23643660e330SKris Buschelman 23653660e330SKris Buschelman /* Second Column */ 23663660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 23673660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 23683660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 23693660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 23703660e330SKris Buschelman 23713660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 23723660e330SKris Buschelman 23733660e330SKris Buschelman /* Third Column */ 23743660e330SKris Buschelman 
SSE_COPY_PS(XMM2,XMM6) 23753660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 23763660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 23773660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 23783660e330SKris Buschelman 23793660e330SKris Buschelman /* Fourth Column */ 23803660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 23813660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 23823660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 23833660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 23843660e330SKris Buschelman SSE_INLINE_END_2 23853660e330SKris Buschelman v += 16; 23863660e330SKris Buschelman } 23873660e330SKris Buschelman v = aa + ai16; 23883660e330SKris Buschelman ai16 = 16*diag[--i]; 23893660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 23903660e330SKris Buschelman /* 23913660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 23923660e330SKris Buschelman which was inverted as part of the factorization 23933660e330SKris Buschelman */ 2394eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 23953660e330SKris Buschelman /* First Column */ 23963660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 23973660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 23983660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 23993660e330SKris Buschelman 24003660e330SKris Buschelman /* Second Column */ 24013660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 24023660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 24033660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 24043660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 24053660e330SKris Buschelman 24063660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 24073660e330SKris Buschelman 24083660e330SKris Buschelman /* Third Column */ 24093660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 24103660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24113660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 24123660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 24133660e330SKris 
Buschelman 24143660e330SKris Buschelman /* Fourth Column */ 24153660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 24163660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 24173660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 24183660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 24193660e330SKris Buschelman 24203660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 24213660e330SKris Buschelman SSE_INLINE_END_3 24223660e330SKris Buschelman 24233660e330SKris Buschelman v = aa + ai16 + 16; 24243660e330SKris Buschelman idt -= 4; 24253660e330SKris Buschelman } 2426eb05f457SKris Buschelman 2427eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 2428eb05f457SKris Buschelman idt = 4*(n-1); 2429eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 2430eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2431eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2432eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 2433eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 2434eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 2435eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 2436eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 2437eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 243854693613SKris Buschelman idt -= 4; 24393660e330SKris Buschelman } 2440eb05f457SKris Buschelman 2441eb05f457SKris Buschelman } /* End of artificial scope. 
*/ 24426f6a888dSBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 24436f6a888dSBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 24443660e330SKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 24453660e330SKris Buschelman SSE_SCOPE_END; 24463660e330SKris Buschelman PetscFunctionReturn(0); 24473660e330SKris Buschelman } 24483660e330SKris Buschelman 24497cf1b8d3SKris Buschelman #undef __FUNCT__ 24507cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 24517cf1b8d3SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 24527cf1b8d3SKris Buschelman { 24537cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 24547cf1b8d3SKris Buschelman int *aj=a->j; 24557cf1b8d3SKris Buschelman int ierr,*ai=a->i,n=a->mbs,*diag = a->diag; 24567cf1b8d3SKris Buschelman MatScalar *aa=a->a; 24577cf1b8d3SKris Buschelman PetscScalar *x,*b; 24587cf1b8d3SKris Buschelman 24597cf1b8d3SKris Buschelman PetscFunctionBegin; 24607cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 24617cf1b8d3SKris Buschelman /* 24627cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 24637cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 24647cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
24657cf1b8d3SKris Buschelman */ 24667cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 24677cf1b8d3SKris Buschelman 24687cf1b8d3SKris Buschelman ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 24697cf1b8d3SKris Buschelman ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 24707cf1b8d3SKris Buschelman { 24717cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 24727cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 24737cf1b8d3SKris Buschelman int nz,i,idt,ai16; 24747cf1b8d3SKris Buschelman int jdx,idx; 24757cf1b8d3SKris Buschelman int *vi; 24767cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 24777cf1b8d3SKris Buschelman 24787cf1b8d3SKris Buschelman /* First block is the identity. */ 24797cf1b8d3SKris Buschelman idx = 0; 24807cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 24817cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 24827cf1b8d3SKris Buschelman 24837cf1b8d3SKris Buschelman for (i=1; i<n;) { 24847cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 24857cf1b8d3SKris Buschelman vi = aj + ai[i]; 24867cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 24877cf1b8d3SKris Buschelman idx += 4; 24887cf1b8d3SKris Buschelman 24897cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 24907cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 24917cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 24927cf1b8d3SKris Buschelman 24937cf1b8d3SKris Buschelman while (nz--) { 24947cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 24957cf1b8d3SKris Buschelman jdx = 4*(*vi++); 24967cf1b8d3SKris Buschelman /* jdx = *vi++; */ 24977cf1b8d3SKris Buschelman 24987cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 24997cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 25007cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 25017cf1b8d3SKris Buschelman 25027cf1b8d3SKris Buschelman /* First Column */ 25037cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 25047cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25057cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 25067cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 25077cf1b8d3SKris Buschelman 25087cf1b8d3SKris Buschelman /* Second Column */ 25097cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 25107cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25117cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 25127cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 25137cf1b8d3SKris Buschelman 25147cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 25157cf1b8d3SKris Buschelman 25167cf1b8d3SKris Buschelman /* Third Column */ 25177cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 25187cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 25197cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 25207cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 25217cf1b8d3SKris Buschelman 25227cf1b8d3SKris Buschelman /* Fourth Column */ 25237cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 25247cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25257cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 25267cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 25277cf1b8d3SKris Buschelman SSE_INLINE_END_2 25287cf1b8d3SKris Buschelman 25297cf1b8d3SKris 
Buschelman v += 16; 25307cf1b8d3SKris Buschelman } 25317cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 25327cf1b8d3SKris Buschelman PREFETCH_NTA(v); 25337cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 25347cf1b8d3SKris Buschelman } 25357cf1b8d3SKris Buschelman 25367cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 25377cf1b8d3SKris Buschelman 25387cf1b8d3SKris Buschelman idt = 4*(n-1); 25397cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 25407cf1b8d3SKris Buschelman v = aa + ai16 + 16; 25417cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 25427cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 25437cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 25447cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 25457cf1b8d3SKris Buschelman 25467cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 25477cf1b8d3SKris Buschelman 25487cf1b8d3SKris Buschelman while (nz--) { 25497cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 25507cf1b8d3SKris Buschelman idx = 4*(*vi++); 25517cf1b8d3SKris Buschelman /* idx = *vi++; */ 25527cf1b8d3SKris Buschelman 25537cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 25547cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 25557cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 25567cf1b8d3SKris Buschelman 25577cf1b8d3SKris Buschelman /* First Column */ 25587cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 25597cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25607cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 25617cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 25627cf1b8d3SKris Buschelman 25637cf1b8d3SKris Buschelman /* Second Column */ 25647cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 25657cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25667cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 25677cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 25687cf1b8d3SKris Buschelman 25697cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 25707cf1b8d3SKris Buschelman 
25717cf1b8d3SKris Buschelman /* Third Column */ 25727cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 25737cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 25747cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 25757cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 25767cf1b8d3SKris Buschelman 25777cf1b8d3SKris Buschelman /* Fourth Column */ 25787cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 25797cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25807cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 25817cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 25827cf1b8d3SKris Buschelman SSE_INLINE_END_2 25837cf1b8d3SKris Buschelman v += 16; 25847cf1b8d3SKris Buschelman } 25857cf1b8d3SKris Buschelman v = aa + ai16; 25867cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 25877cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 25887cf1b8d3SKris Buschelman /* 25897cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 25907cf1b8d3SKris Buschelman which was inverted as part of the factorization 25917cf1b8d3SKris Buschelman */ 25927cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 25937cf1b8d3SKris Buschelman /* First Column */ 25947cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 25957cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25967cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 25977cf1b8d3SKris Buschelman 25987cf1b8d3SKris Buschelman /* Second Column */ 25997cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 26007cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26017cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 26027cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 26037cf1b8d3SKris Buschelman 26047cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 26057cf1b8d3SKris Buschelman 26067cf1b8d3SKris Buschelman /* Third Column */ 26077cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 26087cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26097cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 26107cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 26117cf1b8d3SKris Buschelman 26127cf1b8d3SKris Buschelman /* Fourth Column */ 26137cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 26147cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26157cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 26167cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 26177cf1b8d3SKris Buschelman 26187cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 26197cf1b8d3SKris Buschelman SSE_INLINE_END_3 26207cf1b8d3SKris Buschelman 26217cf1b8d3SKris Buschelman v = aa + ai16 + 16; 26227cf1b8d3SKris Buschelman idt -= 4; 26237cf1b8d3SKris Buschelman } 26247cf1b8d3SKris Buschelman 26257cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 26267cf1b8d3SKris Buschelman idt = 4*(n-1); 26277cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 26287cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 26297cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 26307cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 26317cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 26327cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 26337cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 26347cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 26357cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 26367cf1b8d3SKris Buschelman idt -= 4; 26377cf1b8d3SKris Buschelman } 26387cf1b8d3SKris Buschelman 26397cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 26407cf1b8d3SKris Buschelman ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 26417cf1b8d3SKris Buschelman ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 26427cf1b8d3SKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 26437cf1b8d3SKris Buschelman SSE_SCOPE_END; 26447cf1b8d3SKris Buschelman PetscFunctionReturn(0); 26457cf1b8d3SKris Buschelman } 26467cf1b8d3SKris Buschelman 26473660e330SKris Buschelman #endif 26484a2ae208SSatish Balay #undef __FUNCT__ 26494a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 26504e2b4712SSatish Balay int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 26514e2b4712SSatish Balay { 26524e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 26534e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 26544e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 26554e2b4712SSatish Balay int *diag = a->diag; 26563f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 265787828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 26584e2b4712SSatish Balay 26594e2b4712SSatish Balay PetscFunctionBegin; 2660b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2661b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 2662f1af5d2fSBarry Smith t = a->solve_work; 26634e2b4712SSatish Balay 26644e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 26654e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 26664e2b4712SSatish Balay 26674e2b4712SSatish Balay /* forward solve the lower triangular */ 26684e2b4712SSatish Balay idx = 3*(*r++); 2669f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 26704e2b4712SSatish Balay for (i=1; i<n; i++) { 26714e2b4712SSatish Balay v = aa + 9*ai[i]; 26724e2b4712SSatish Balay vi = aj + ai[i]; 26734e2b4712SSatish Balay nz = diag[i] - ai[i]; 26744e2b4712SSatish Balay idx = 3*(*r++); 2675f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 26764e2b4712SSatish Balay while 
(nz--) { 26774e2b4712SSatish Balay idx = 3*(*vi++); 2678f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2679f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2680f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2681f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 26824e2b4712SSatish Balay v += 9; 26834e2b4712SSatish Balay } 26844e2b4712SSatish Balay idx = 3*i; 2685f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 26864e2b4712SSatish Balay } 26874e2b4712SSatish Balay /* backward solve the upper triangular */ 26884e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 26894e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 26904e2b4712SSatish Balay vi = aj + diag[i] + 1; 26914e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 26924e2b4712SSatish Balay idt = 3*i; 2693f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 26944e2b4712SSatish Balay while (nz--) { 26954e2b4712SSatish Balay idx = 3*(*vi++); 2696f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2697f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2698f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2699f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 27004e2b4712SSatish Balay v += 9; 27014e2b4712SSatish Balay } 27024e2b4712SSatish Balay idc = 3*(*c--); 27034e2b4712SSatish Balay v = aa + 9*diag[i]; 2704f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2705f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2706f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 27074e2b4712SSatish Balay } 27084e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 27094e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2710b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 2711b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 2712b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 27134e2b4712SSatish Balay 
PetscFunctionReturn(0); 27144e2b4712SSatish Balay } 27154e2b4712SSatish Balay 271615091d37SBarry Smith /* 271715091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 271815091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 271915091d37SBarry Smith */ 27204a2ae208SSatish Balay #undef __FUNCT__ 27214a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 272215091d37SBarry Smith int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 272315091d37SBarry Smith { 272415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 272515091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 272615091d37SBarry Smith int ierr,*diag = a->diag; 272715091d37SBarry Smith MatScalar *aa=a->a,*v; 272887828ca2SBarry Smith PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 272915091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 273015091d37SBarry Smith 273115091d37SBarry Smith PetscFunctionBegin; 2732b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2733b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 273415091d37SBarry Smith 273515091d37SBarry Smith 273615091d37SBarry Smith /* forward solve the lower triangular */ 273715091d37SBarry Smith idx = 0; 273815091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 273915091d37SBarry Smith for (i=1; i<n; i++) { 274015091d37SBarry Smith v = aa + 9*ai[i]; 274115091d37SBarry Smith vi = aj + ai[i]; 274215091d37SBarry Smith nz = diag[i] - ai[i]; 274315091d37SBarry Smith idx += 3; 2744f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 274515091d37SBarry Smith while (nz--) { 274615091d37SBarry Smith jdx = 3*(*vi++); 274715091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2748f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2749f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2750f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 275115091d37SBarry Smith v += 9; 275215091d37SBarry Smith } 
2753f1af5d2fSBarry Smith x[idx] = s1; 2754f1af5d2fSBarry Smith x[1+idx] = s2; 2755f1af5d2fSBarry Smith x[2+idx] = s3; 275615091d37SBarry Smith } 275715091d37SBarry Smith /* backward solve the upper triangular */ 275815091d37SBarry Smith for (i=n-1; i>=0; i--){ 275915091d37SBarry Smith v = aa + 9*diag[i] + 9; 276015091d37SBarry Smith vi = aj + diag[i] + 1; 276115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 276215091d37SBarry Smith idt = 3*i; 2763f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2764f1af5d2fSBarry Smith s3 = x[2+idt]; 276515091d37SBarry Smith while (nz--) { 276615091d37SBarry Smith idx = 3*(*vi++); 276715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2768f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2769f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2770f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 277115091d37SBarry Smith v += 9; 277215091d37SBarry Smith } 277315091d37SBarry Smith v = aa + 9*diag[i]; 2774f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2775f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2776f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 277715091d37SBarry Smith } 277815091d37SBarry Smith 2779b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 2780b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 2781b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 278215091d37SBarry Smith PetscFunctionReturn(0); 278315091d37SBarry Smith } 278415091d37SBarry Smith 27854a2ae208SSatish Balay #undef __FUNCT__ 27864a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 27874e2b4712SSatish Balay int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 27884e2b4712SSatish Balay { 27894e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 27904e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 27914e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 27924e2b4712SSatish Balay int *diag = 
a->diag; 27933f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 279487828ca2SBarry Smith PetscScalar *x,*b,s1,s2,x1,x2,*t; 27954e2b4712SSatish Balay 27964e2b4712SSatish Balay PetscFunctionBegin; 2797b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2798b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 2799f1af5d2fSBarry Smith t = a->solve_work; 28004e2b4712SSatish Balay 28014e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 28024e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 28034e2b4712SSatish Balay 28044e2b4712SSatish Balay /* forward solve the lower triangular */ 28054e2b4712SSatish Balay idx = 2*(*r++); 2806f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 28074e2b4712SSatish Balay for (i=1; i<n; i++) { 28084e2b4712SSatish Balay v = aa + 4*ai[i]; 28094e2b4712SSatish Balay vi = aj + ai[i]; 28104e2b4712SSatish Balay nz = diag[i] - ai[i]; 28114e2b4712SSatish Balay idx = 2*(*r++); 2812f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 28134e2b4712SSatish Balay while (nz--) { 28144e2b4712SSatish Balay idx = 2*(*vi++); 2815f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2816f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2817f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 28184e2b4712SSatish Balay v += 4; 28194e2b4712SSatish Balay } 28204e2b4712SSatish Balay idx = 2*i; 2821f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 28224e2b4712SSatish Balay } 28234e2b4712SSatish Balay /* backward solve the upper triangular */ 28244e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 28254e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 28264e2b4712SSatish Balay vi = aj + diag[i] + 1; 28274e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 28284e2b4712SSatish Balay idt = 2*i; 2829f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 28304e2b4712SSatish Balay while (nz--) { 28314e2b4712SSatish Balay idx = 2*(*vi++); 2832f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2833f1af5d2fSBarry Smith s1 -= v[0]*x1 + 
v[2]*x2; 2834f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 28354e2b4712SSatish Balay v += 4; 28364e2b4712SSatish Balay } 28374e2b4712SSatish Balay idc = 2*(*c--); 28384e2b4712SSatish Balay v = aa + 4*diag[i]; 2839f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2840f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 28414e2b4712SSatish Balay } 28424e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28434e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2844b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 2845b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 2846b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 28474e2b4712SSatish Balay PetscFunctionReturn(0); 28484e2b4712SSatish Balay } 28494e2b4712SSatish Balay 285015091d37SBarry Smith /* 285115091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 285215091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
285315091d37SBarry Smith */ 28544a2ae208SSatish Balay #undef __FUNCT__ 28554a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 285615091d37SBarry Smith int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 285715091d37SBarry Smith { 285815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 285915091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 286015091d37SBarry Smith int ierr,*diag = a->diag; 286115091d37SBarry Smith MatScalar *aa=a->a,*v; 286287828ca2SBarry Smith PetscScalar *x,*b,s1,s2,x1,x2; 286315091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 286415091d37SBarry Smith 286515091d37SBarry Smith PetscFunctionBegin; 2866b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2867b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 286815091d37SBarry Smith 286915091d37SBarry Smith /* forward solve the lower triangular */ 287015091d37SBarry Smith idx = 0; 287115091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 287215091d37SBarry Smith for (i=1; i<n; i++) { 287315091d37SBarry Smith v = aa + 4*ai[i]; 287415091d37SBarry Smith vi = aj + ai[i]; 287515091d37SBarry Smith nz = diag[i] - ai[i]; 287615091d37SBarry Smith idx += 2; 2877f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 287815091d37SBarry Smith while (nz--) { 287915091d37SBarry Smith jdx = 2*(*vi++); 288015091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 2881f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2882f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 288315091d37SBarry Smith v += 4; 288415091d37SBarry Smith } 2885f1af5d2fSBarry Smith x[idx] = s1; 2886f1af5d2fSBarry Smith x[1+idx] = s2; 288715091d37SBarry Smith } 288815091d37SBarry Smith /* backward solve the upper triangular */ 288915091d37SBarry Smith for (i=n-1; i>=0; i--){ 289015091d37SBarry Smith v = aa + 4*diag[i] + 4; 289115091d37SBarry Smith vi = aj + diag[i] + 1; 289215091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 289315091d37SBarry Smith idt = 2*i; 2894f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 
289515091d37SBarry Smith while (nz--) { 289615091d37SBarry Smith idx = 2*(*vi++); 289715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 2898f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2899f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 290015091d37SBarry Smith v += 4; 290115091d37SBarry Smith } 290215091d37SBarry Smith v = aa + 4*diag[i]; 2903f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 2904f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 290515091d37SBarry Smith } 290615091d37SBarry Smith 2907b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 2908b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 2909b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 291015091d37SBarry Smith PetscFunctionReturn(0); 291115091d37SBarry Smith } 291215091d37SBarry Smith 29134a2ae208SSatish Balay #undef __FUNCT__ 29144a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 29154e2b4712SSatish Balay int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 29164e2b4712SSatish Balay { 29174e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 29184e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 29194e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 29204e2b4712SSatish Balay int *diag = a->diag; 29213f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 292287828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 29234e2b4712SSatish Balay 29244e2b4712SSatish Balay PetscFunctionBegin; 29254e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 29264e2b4712SSatish Balay 2927b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2928b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 2929f1af5d2fSBarry Smith t = a->solve_work; 29304e2b4712SSatish Balay 29314e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 29324e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 29334e2b4712SSatish Balay 29344e2b4712SSatish Balay /* forward solve the lower 
triangular */ 2935f1af5d2fSBarry Smith t[0] = b[*r++]; 29364e2b4712SSatish Balay for (i=1; i<n; i++) { 29374e2b4712SSatish Balay v = aa + ai[i]; 29384e2b4712SSatish Balay vi = aj + ai[i]; 29394e2b4712SSatish Balay nz = diag[i] - ai[i]; 2940f1af5d2fSBarry Smith s1 = b[*r++]; 29414e2b4712SSatish Balay while (nz--) { 2942f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 29434e2b4712SSatish Balay } 2944f1af5d2fSBarry Smith t[i] = s1; 29454e2b4712SSatish Balay } 29464e2b4712SSatish Balay /* backward solve the upper triangular */ 29474e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 29484e2b4712SSatish Balay v = aa + diag[i] + 1; 29494e2b4712SSatish Balay vi = aj + diag[i] + 1; 29504e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2951f1af5d2fSBarry Smith s1 = t[i]; 29524e2b4712SSatish Balay while (nz--) { 2953f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 29544e2b4712SSatish Balay } 2955f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 29564e2b4712SSatish Balay } 29574e2b4712SSatish Balay 29584e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 29594e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2960b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 2961b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 2962b0a32e0cSBarry Smith PetscLogFlops(2*1*(a->nz) - A->n); 29634e2b4712SSatish Balay PetscFunctionReturn(0); 29644e2b4712SSatish Balay } 296515091d37SBarry Smith /* 296615091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 296715091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
296815091d37SBarry Smith */ 29694a2ae208SSatish Balay #undef __FUNCT__ 29704a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 297115091d37SBarry Smith int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 297215091d37SBarry Smith { 297315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 297415091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 297515091d37SBarry Smith int ierr,*diag = a->diag; 297615091d37SBarry Smith MatScalar *aa=a->a; 297787828ca2SBarry Smith PetscScalar *x,*b; 297887828ca2SBarry Smith PetscScalar s1,x1; 297915091d37SBarry Smith MatScalar *v; 298015091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 298115091d37SBarry Smith 298215091d37SBarry Smith PetscFunctionBegin; 2983b1d4fb26SBarry Smith ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2984b1d4fb26SBarry Smith ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 298515091d37SBarry Smith 298615091d37SBarry Smith /* forward solve the lower triangular */ 298715091d37SBarry Smith idx = 0; 298815091d37SBarry Smith x[0] = b[0]; 298915091d37SBarry Smith for (i=1; i<n; i++) { 299015091d37SBarry Smith v = aa + ai[i]; 299115091d37SBarry Smith vi = aj + ai[i]; 299215091d37SBarry Smith nz = diag[i] - ai[i]; 299315091d37SBarry Smith idx += 1; 2994f1af5d2fSBarry Smith s1 = b[idx]; 299515091d37SBarry Smith while (nz--) { 299615091d37SBarry Smith jdx = *vi++; 299715091d37SBarry Smith x1 = x[jdx]; 2998f1af5d2fSBarry Smith s1 -= v[0]*x1; 299915091d37SBarry Smith v += 1; 300015091d37SBarry Smith } 3001f1af5d2fSBarry Smith x[idx] = s1; 300215091d37SBarry Smith } 300315091d37SBarry Smith /* backward solve the upper triangular */ 300415091d37SBarry Smith for (i=n-1; i>=0; i--){ 300515091d37SBarry Smith v = aa + diag[i] + 1; 300615091d37SBarry Smith vi = aj + diag[i] + 1; 300715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 300815091d37SBarry Smith idt = i; 3009f1af5d2fSBarry Smith s1 = x[idt]; 301015091d37SBarry Smith while (nz--) { 301115091d37SBarry Smith idx = *vi++; 301215091d37SBarry 
Smith x1 = x[idx]; 3013f1af5d2fSBarry Smith s1 -= v[0]*x1; 301415091d37SBarry Smith v += 1; 301515091d37SBarry Smith } 301615091d37SBarry Smith v = aa + diag[i]; 3017f1af5d2fSBarry Smith x[idt] = v[0]*s1; 301815091d37SBarry Smith } 3019b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 3020b1d4fb26SBarry Smith ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 3021b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 302215091d37SBarry Smith PetscFunctionReturn(0); 302315091d37SBarry Smith } 30244e2b4712SSatish Balay 30254e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 30264e2b4712SSatish Balay /* 30274e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 30284e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 30294e2b4712SSatish Balay Not a good example of code reuse. 30304e2b4712SSatish Balay */ 3031ca44d042SBarry Smith EXTERN int MatMissingDiagonal_SeqBAIJ(Mat); 3032435faa5fSBarry Smith 30334a2ae208SSatish Balay #undef __FUNCT__ 30344a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3035b380c88cSHong Zhang int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 30364e2b4712SSatish Balay { 30374e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 30384e2b4712SSatish Balay IS isicol; 30394e2b4712SSatish Balay int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 30404e2b4712SSatish Balay int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3041eb150c5cSKris Buschelman int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0; 3042435faa5fSBarry Smith int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 30434533b203SBarry Smith PetscTruth col_identity,row_identity; 3044329f5518SBarry Smith PetscReal f; 30454e2b4712SSatish Balay 30464e2b4712SSatish Balay PetscFunctionBegin; 3047435faa5fSBarry Smith f = info->fill; 3048335d9088SBarry Smith levels = 
(int)info->levels; 3049335d9088SBarry Smith diagonal_fill = (int)info->diagonal_fill; 30504c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3051667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3052667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3053309c388cSBarry Smith 3054309c388cSBarry Smith if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3055bb3d539aSBarry Smith ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3056bb3d539aSBarry Smith (*fact)->factor = FACTOR_LU; 3057bb3d539aSBarry Smith b = (Mat_SeqBAIJ*)(*fact)->data; 3058bb3d539aSBarry Smith if (!b->diag) { 3059bb3d539aSBarry Smith ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3060bb3d539aSBarry Smith } 3061bb3d539aSBarry Smith ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3062bb3d539aSBarry Smith b->row = isrow; 3063bb3d539aSBarry Smith b->col = iscol; 3064bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3065bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3066bb3d539aSBarry Smith b->icol = isicol; 3067bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 306887828ca2SBarry Smith ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3069309c388cSBarry Smith } else { /* general case perform the symbolic factorization */ 30704e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 30714e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 30724e2b4712SSatish Balay 30734e2b4712SSatish Balay /* get new row pointers */ 3074b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 30754e2b4712SSatish Balay ainew[0] = 0; 30764e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 30774e2b4712SSatish Balay jmax = (int)(f*ai[n] + 1); 307882502324SSatish Balay ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 30794e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 308082502324SSatish Balay ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 30814e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 3082b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 30834e2b4712SSatish Balay /* im is level for each filled value */ 3084b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 30854e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 3086b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 30874e2b4712SSatish Balay dloc[0] = 0; 30884e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 3089435faa5fSBarry Smith 3090435faa5fSBarry Smith /* copy prow into linked list */ 30914e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 309229bbc08cSBarry Smith if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 30934e2b4712SSatish Balay xi = aj + ai[r[prow]]; 30944e2b4712SSatish Balay fill[n] = n; 3095435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 30964e2b4712SSatish Balay while (nz--) { 30974e2b4712SSatish 
Balay fm = n; 30984e2b4712SSatish Balay idx = ic[*xi++]; 30994e2b4712SSatish Balay do { 31004e2b4712SSatish Balay m = fm; 31014e2b4712SSatish Balay fm = fill[m]; 31024e2b4712SSatish Balay } while (fm < idx); 31034e2b4712SSatish Balay fill[m] = idx; 31044e2b4712SSatish Balay fill[idx] = fm; 31054e2b4712SSatish Balay im[idx] = 0; 31064e2b4712SSatish Balay } 3107435faa5fSBarry Smith 3108435faa5fSBarry Smith /* make sure diagonal entry is included */ 3109435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 3110435faa5fSBarry Smith fm = n; 3111435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 3112435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3113435faa5fSBarry Smith fill[fm] = prow; 3114435faa5fSBarry Smith im[prow] = 0; 3115435faa5fSBarry Smith nzf++; 3116335d9088SBarry Smith dcount++; 3117435faa5fSBarry Smith } 3118435faa5fSBarry Smith 31194e2b4712SSatish Balay nzi = 0; 31204e2b4712SSatish Balay row = fill[n]; 31214e2b4712SSatish Balay while (row < prow) { 31224e2b4712SSatish Balay incrlev = im[row] + 1; 31234e2b4712SSatish Balay nz = dloc[row]; 3124435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 31254e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 31264e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 31274e2b4712SSatish Balay fm = row; 31284e2b4712SSatish Balay while (nnz-- > 0) { 31294e2b4712SSatish Balay idx = *xi++; 31304e2b4712SSatish Balay if (*flev + incrlev > levels) { 31314e2b4712SSatish Balay flev++; 31324e2b4712SSatish Balay continue; 31334e2b4712SSatish Balay } 31344e2b4712SSatish Balay do { 31354e2b4712SSatish Balay m = fm; 31364e2b4712SSatish Balay fm = fill[m]; 31374e2b4712SSatish Balay } while (fm < idx); 31384e2b4712SSatish Balay if (fm != idx) { 31394e2b4712SSatish Balay im[idx] = *flev + incrlev; 31404e2b4712SSatish Balay fill[m] = idx; 31414e2b4712SSatish Balay fill[idx] = fm; 31424e2b4712SSatish Balay fm = idx; 31434e2b4712SSatish Balay nzf++; 3144ecf371e4SBarry 
Smith } else { 31454e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 31464e2b4712SSatish Balay } 31474e2b4712SSatish Balay flev++; 31484e2b4712SSatish Balay } 31494e2b4712SSatish Balay row = fill[row]; 31504e2b4712SSatish Balay nzi++; 31514e2b4712SSatish Balay } 31524e2b4712SSatish Balay /* copy new filled row into permanent storage */ 31534e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 31544e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 3155ecf371e4SBarry Smith 3156ecf371e4SBarry Smith /* estimate how much additional space we will need */ 3157ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3158ecf371e4SBarry Smith /* just double the memory each time */ 3159ecf371e4SBarry Smith int maxadd = jmax; 3160ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 31614e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 31624e2b4712SSatish Balay jmax += maxadd; 3163ecf371e4SBarry Smith 3164ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 316582502324SSatish Balay ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3166549d3d68SSatish Balay ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3167606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 31684e2b4712SSatish Balay ajnew = xi; 316982502324SSatish Balay ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3170549d3d68SSatish Balay ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3171606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 31724e2b4712SSatish Balay ajfill = xi; 3173eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 31744e2b4712SSatish Balay } 31754e2b4712SSatish Balay xi = ajnew + ainew[prow]; 31764e2b4712SSatish Balay flev = ajfill + ainew[prow]; 31774e2b4712SSatish Balay dloc[prow] = nzi; 31784e2b4712SSatish Balay fm = fill[n]; 31794e2b4712SSatish Balay while (nzf--) { 
31804e2b4712SSatish Balay *xi++ = fm; 31814e2b4712SSatish Balay *flev++ = im[fm]; 31824e2b4712SSatish Balay fm = fill[fm]; 31834e2b4712SSatish Balay } 3184435faa5fSBarry Smith /* make sure row has diagonal entry */ 3185435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 318629bbc08cSBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 3187435faa5fSBarry Smith try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 3188435faa5fSBarry Smith } 31894e2b4712SSatish Balay } 3190606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 31914e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 31924e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3193606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 3194606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 31954e2b4712SSatish Balay 31964e2b4712SSatish Balay { 3197329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3198eb150c5cSKris Buschelman PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af); 3199b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 3200b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 3201b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 3202335d9088SBarry Smith if (diagonal_fill) { 3203b1bcba4aSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount); 3204335d9088SBarry Smith } 32054e2b4712SSatish Balay } 32064e2b4712SSatish Balay 32074e2b4712SSatish Balay /* put together the new matrix */ 3208*f204ca49SKris Buschelman ierr = MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);CHKERRQ(ierr); 3209*f204ca49SKris Buschelman ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr); 3210*f204ca49SKris Buschelman 
ierr = MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);CHKERRQ(ierr); 3211b0a32e0cSBarry Smith PetscLogObjectParent(*fact,isicol); 32124e2b4712SSatish Balay b = (Mat_SeqBAIJ*)(*fact)->data; 3213606d414cSSatish Balay ierr = PetscFree(b->imax);CHKERRQ(ierr); 32147c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 32153f1db9ecSBarry Smith len = bs2*ainew[n]*sizeof(MatScalar); 32164e2b4712SSatish Balay /* the next line frees the default space generated by the Create() */ 3217606d414cSSatish Balay ierr = PetscFree(b->a);CHKERRQ(ierr); 3218606d414cSSatish Balay ierr = PetscFree(b->ilen);CHKERRQ(ierr); 321982502324SSatish Balay ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 32204e2b4712SSatish Balay b->j = ajnew; 32214e2b4712SSatish Balay b->i = ainew; 32224e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 32234e2b4712SSatish Balay b->diag = dloc; 32244e2b4712SSatish Balay b->ilen = 0; 32254e2b4712SSatish Balay b->imax = 0; 32264e2b4712SSatish Balay b->row = isrow; 32274e2b4712SSatish Balay b->col = iscol; 3228bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3229c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3230c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3231e51c0b9cSSatish Balay b->icol = isicol; 323287828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 32334e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 
32344e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 323587828ca2SBarry Smith PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 32364e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 32374e2b4712SSatish Balay (*fact)->factor = FACTOR_LU; 32384e2b4712SSatish Balay 3239eb150c5cSKris Buschelman (*fact)->info.factor_mallocs = reallocate; 32404e2b4712SSatish Balay (*fact)->info.fill_ratio_given = f; 3241329f5518SBarry Smith (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3242309c388cSBarry Smith } 32434e2b4712SSatish Balay 3244309c388cSBarry Smith if (row_identity && col_identity) { 3245732ee342SKris Buschelman ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 32468661488fSKris Buschelman } 32478661488fSKris Buschelman PetscFunctionReturn(0); 32488661488fSKris Buschelman } 32498661488fSKris Buschelman 3250732ee342SKris Buschelman #undef __FUNCT__ 32517e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 32527e7071cdSKris Buschelman int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 32537e7071cdSKris Buschelman { 325412272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 325512272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 32565a9542e3SKris Buschelman PetscFunctionBegin; 32577cf1b8d3SKris Buschelman /* Undo Column scaling */ 32587cf1b8d3SKris Buschelman /* while (nz--) { */ 32597cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 32607cf1b8d3SKris Buschelman /* } */ 3261c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 3262c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 32637cf1b8d3SKris Buschelman PetscFunctionReturn(0); 32647cf1b8d3SKris Buschelman } 32657cf1b8d3SKris Buschelman 32667cf1b8d3SKris Buschelman #undef __FUNCT__ 32677cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 32687cf1b8d3SKris Buschelman int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 32697cf1b8d3SKris Buschelman { 32707cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 32710b9da03eSKris Buschelman int *AJ=a->j,nz=a->nz; 32722aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 32735a9542e3SKris Buschelman PetscFunctionBegin; 32740b9da03eSKris Buschelman /* Is this really necessary? */ 327520235379SKris Buschelman while (nz--) { 32760b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 32777e7071cdSKris Buschelman } 3278c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 32797e7071cdSKris Buschelman PetscFunctionReturn(0); 32807e7071cdSKris Buschelman } 32817e7071cdSKris Buschelman 32827e7071cdSKris Buschelman #undef __FUNCT__ 3283732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3284732ee342SKris Buschelman int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 32858661488fSKris Buschelman { 32868661488fSKris Buschelman /* 32878661488fSKris Buschelman Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 32888661488fSKris Buschelman with natural ordering 32898661488fSKris Buschelman */ 32908661488fSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 32918661488fSKris Buschelman 32928661488fSKris Buschelman PetscFunctionBegin; 3293a7ba9c3cSKris Buschelman inA->ops->solve = MatSolve_SeqBAIJ_Update; 3294a7ba9c3cSKris Buschelman inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 32958661488fSKris Buschelman switch (a->bs) { 32968661488fSKris Buschelman case 1: 
32978661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3298732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 3299732ee342SKris Buschelman break; 3300309c388cSBarry Smith case 2: 33018661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3302732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 3303309c388cSBarry Smith break; 3304309c388cSBarry Smith case 3: 33058661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3306732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 3307309c388cSBarry Smith break; 3308309c388cSBarry Smith case 4: 3309a7d8d0baSKris Buschelman #if defined(PETSC_USE_MAT_SINGLE) 3310a7d8d0baSKris Buschelman { 3311a7d8d0baSKris Buschelman PetscTruth sse_enabled_local; 331243b9cc93SKris Buschelman int ierr; 3313ccaa8a1bSKris Buschelman ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 33146b7cc795SKris Buschelman if (sse_enabled_local) { 3315b988c221SKris Buschelman # if defined(PETSC_HAVE_SSE) 33167cf1b8d3SKris Buschelman int i,*AJ=a->j,nz=a->nz,n=a->mbs; 33177cf1b8d3SKris Buschelman if (n==(unsigned short)n) { 33182aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 331913c7ffeeSKris Buschelman for (i=0;i<nz;i++) { 33202aa5897fSKris Buschelman aj[i] = (unsigned short)AJ[i]; 332113c7ffeeSKris Buschelman } 33227cf1b8d3SKris Buschelman inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 33237cf1b8d3SKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 332486b4ebfeSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"); 33257cf1b8d3SKris 
Buschelman } else { 33267cf1b8d3SKris Buschelman /* Scale the column indices for easier indexing in MatSolve. */ 33277cf1b8d3SKris Buschelman /* for (i=0;i<nz;i++) { */ 33287cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]*4; */ 33297cf1b8d3SKris Buschelman /* } */ 33307e7071cdSKris Buschelman inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 33318661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 333286b4ebfeSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n"); 33337cf1b8d3SKris Buschelman } 3334b988c221SKris Buschelman # else 3335b988c221SKris Buschelman /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3336b988c221SKris Buschelman SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3337b988c221SKris Buschelman # endif 33383ba47ebaSKris Buschelman } else { 33398661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3340732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 33413ba47ebaSKris Buschelman } 3342a7d8d0baSKris Buschelman } 3343a7d8d0baSKris Buschelman #else 3344a7d8d0baSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3345a7d8d0baSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3346a7d8d0baSKris Buschelman #endif 3347309c388cSBarry Smith break; 3348309c388cSBarry Smith case 5: 33498661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3350732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 3351309c388cSBarry Smith break; 3352309c388cSBarry Smith case 6: 33538661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 
3354732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 3355309c388cSBarry Smith break; 3356309c388cSBarry Smith case 7: 33578661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3358732ee342SKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 3359309c388cSBarry Smith break; 3360309c388cSBarry Smith } 33614e2b4712SSatish Balay PetscFunctionReturn(0); 33624e2b4712SSatish Balay } 3363732ee342SKris Buschelman 3364732ee342SKris Buschelman #undef __FUNCT__ 3365732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3366732ee342SKris Buschelman int MatSeqBAIJ_UpdateSolvers(Mat A) 3367732ee342SKris Buschelman { 3368732ee342SKris Buschelman /* 3369732ee342SKris Buschelman Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3370732ee342SKris Buschelman with natural ordering 3371732ee342SKris Buschelman */ 3372732ee342SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3373732ee342SKris Buschelman IS row = a->row, col = a->col; 3374732ee342SKris Buschelman PetscTruth row_identity, col_identity; 337523c42b7cSKris Buschelman PetscTruth use_natural; 3376732ee342SKris Buschelman int ierr; 3377732ee342SKris Buschelman 3378732ee342SKris Buschelman PetscFunctionBegin; 3379cf242676SKris Buschelman 338094ee7fc8SKris Buschelman use_natural = PETSC_FALSE; 338121360622SBarry Smith if (row && col) { 3382732ee342SKris Buschelman ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3383732ee342SKris Buschelman ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3384732ee342SKris Buschelman 3385732ee342SKris Buschelman if (row_identity && col_identity) { 3386732ee342SKris Buschelman use_natural = PETSC_TRUE; 3387732ee342SKris Buschelman } 338821360622SBarry Smith } else { 338921360622SBarry Smith use_natural = PETSC_TRUE; 339021360622SBarry Smith } 339121360622SBarry Smith 
3392732ee342SKris Buschelman switch (a->bs) { 3393732ee342SKris Buschelman case 1: 3394732ee342SKris Buschelman if (use_natural) { 3395732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3396732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3397732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 3398732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3399732ee342SKris Buschelman } else { 3400732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_1; 3401732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3402732ee342SKris Buschelman } 3403732ee342SKris Buschelman break; 3404732ee342SKris Buschelman case 2: 3405732ee342SKris Buschelman if (use_natural) { 3406732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3407732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3408732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 3409732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3410732ee342SKris Buschelman } else { 3411732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_2; 3412732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3413732ee342SKris Buschelman } 3414732ee342SKris Buschelman break; 3415732ee342SKris Buschelman case 3: 3416732ee342SKris Buschelman if (use_natural) { 3417732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3418732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3419732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 3420732ee342SKris Buschelman 
PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3421732ee342SKris Buschelman } else { 3422732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_3; 3423732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3424732ee342SKris Buschelman } 3425732ee342SKris Buschelman break; 3426732ee342SKris Buschelman case 4: 3427f26ec98cSKris Buschelman { 3428123145dfSKris Buschelman PetscTruth sse_enabled_local; 3429ccaa8a1bSKris Buschelman ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3430732ee342SKris Buschelman if (use_natural) { 34312859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE) 3432f26ec98cSKris Buschelman if (sse_enabled_local) { /* Natural + Single + SSE */ 3433eb150c5cSKris Buschelman # if defined(PETSC_HAVE_SSE) 3434995eb297SKris Buschelman int n=a->mbs; 3435995eb297SKris Buschelman if (n==(unsigned short)n) { 3436995eb297SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3437995eb297SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n"); 3438995eb297SKris Buschelman } else { 3439732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 344086b4ebfeSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n"); 3441995eb297SKris Buschelman } 3442eb150c5cSKris Buschelman # else 3443eb150c5cSKris Buschelman /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). 
*/ 3444eb150c5cSKris Buschelman SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3445eb150c5cSKris Buschelman # endif 3446f26ec98cSKris Buschelman } else { /* Natural + Single */ 3447f26ec98cSKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3448123145dfSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"); 3449f26ec98cSKris Buschelman } 34502859b196SKris Buschelman #else 34512859b196SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3452123145dfSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 34532859b196SKris Buschelman #endif 3454732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3455123145dfSKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3456f26ec98cSKris Buschelman } else { /* Arbitrary ordering */ 34572859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE) 3458f26ec98cSKris Buschelman if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3459eb150c5cSKris Buschelman # if defined(PETSC_HAVE_SSE) 3460732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3461732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3462eb150c5cSKris Buschelman # else 3463eb150c5cSKris Buschelman /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). 
*/ 3464eb150c5cSKris Buschelman SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3465eb150c5cSKris Buschelman # endif 3466f26ec98cSKris Buschelman } else { /* Arbitrary + Single */ 3467f26ec98cSKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3468f26ec98cSKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"); 3469732ee342SKris Buschelman } 34702859b196SKris Buschelman #else 34712859b196SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_4; 34722859b196SKris Buschelman #endif 3473732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3474732ee342SKris Buschelman } 3475f26ec98cSKris Buschelman } 3476732ee342SKris Buschelman break; 3477732ee342SKris Buschelman case 5: 3478732ee342SKris Buschelman if (use_natural) { 3479732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3480732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3481732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3482732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3483732ee342SKris Buschelman } else { 3484732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_5; 3485732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3486732ee342SKris Buschelman } 3487732ee342SKris Buschelman break; 3488732ee342SKris Buschelman case 6: 3489732ee342SKris Buschelman if (use_natural) { 3490732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3491732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3492732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3493732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 
3494732ee342SKris Buschelman } else { 3495732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_6; 3496732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3497732ee342SKris Buschelman } 3498732ee342SKris Buschelman break; 3499732ee342SKris Buschelman case 7: 3500732ee342SKris Buschelman if (use_natural) { 3501732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3502732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3503732ee342SKris Buschelman PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3504732ee342SKris Buschelman PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3505732ee342SKris Buschelman } else { 3506732ee342SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_7; 3507732ee342SKris Buschelman A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3508732ee342SKris Buschelman } 3509732ee342SKris Buschelman break; 351031801e53SKris Buschelman default: 351131801e53SKris Buschelman A->ops->solve = MatSolve_SeqBAIJ_N; 351231801e53SKris Buschelman break; 3513732ee342SKris Buschelman } 3514732ee342SKris Buschelman PetscFunctionReturn(0); 3515732ee342SKris Buschelman } 3516732ee342SKris Buschelman 3517732ee342SKris Buschelman #undef __FUNCT__ 3518732ee342SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3519732ee342SKris Buschelman int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3520732ee342SKris Buschelman int ierr; 3521732ee342SKris Buschelman 3522732ee342SKris Buschelman PetscFunctionBegin; 3523732ee342SKris Buschelman ierr = MatSeqBAIJ_UpdateSolvers(A); 3524cf242676SKris Buschelman if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3525732ee342SKris Buschelman ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3526cf242676SKris Buschelman } else { 3527cf242676SKris Buschelman SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3528cf242676SKris Buschelman } 
3529732ee342SKris Buschelman PetscFunctionReturn(0); 3530732ee342SKris Buschelman } 3531732ee342SKris Buschelman 3532732ee342SKris Buschelman #undef __FUNCT__ 3533732ee342SKris Buschelman #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3534732ee342SKris Buschelman int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3535732ee342SKris Buschelman int ierr; 3536732ee342SKris Buschelman 3537732ee342SKris Buschelman PetscFunctionBegin; 3538732ee342SKris Buschelman ierr = MatSeqBAIJ_UpdateSolvers(A); 3539732ee342SKris Buschelman ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3540732ee342SKris Buschelman PetscFunctionReturn(0); 3541732ee342SKris Buschelman } 3542732ee342SKris Buschelman 3543732ee342SKris Buschelman 3544