1*8661488fSKris Buschelman /*$Id: baijfact2.c,v 1.57 2001/07/11 04:50:20 buschelm Exp buschelm $*/ 24e2b4712SSatish Balay /* 34e2b4712SSatish Balay Factorization code for BAIJ format. 44e2b4712SSatish Balay */ 54e2b4712SSatish Balay 64e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h" 74e2b4712SSatish Balay #include "src/vec/vecimpl.h" 84e2b4712SSatish Balay #include "src/inline/ilu.h" 974c49faeSBarry Smith #include "src/inline/dot.h" 104e2b4712SSatish Balay 114a2ae208SSatish Balay #undef __FUNCT__ 124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 137c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14f1af5d2fSBarry Smith { 15f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 17f1af5d2fSBarry Smith int *diag = a->diag; 18f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 19f1af5d2fSBarry Smith Scalar s1,*x,*b; 20f1af5d2fSBarry Smith 21f1af5d2fSBarry Smith PetscFunctionBegin; 22f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 23f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24f1af5d2fSBarry Smith 25f1af5d2fSBarry Smith /* forward solve the U^T */ 26f1af5d2fSBarry Smith for (i=0; i<n; i++) { 27f1af5d2fSBarry Smith 28f1af5d2fSBarry Smith v = aa + diag[i]; 29f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 30f1af5d2fSBarry Smith s1 = (*v++)*b[i]; 31f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 32f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 33f1af5d2fSBarry Smith while (nz--) { 34f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 35f1af5d2fSBarry Smith } 36f1af5d2fSBarry Smith x[i] = s1; 37f1af5d2fSBarry Smith } 38f1af5d2fSBarry Smith /* backward solve the L^T */ 39f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 40f1af5d2fSBarry Smith v = aa + diag[i] - 1; 41f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 42f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 43f1af5d2fSBarry Smith s1 = x[i]; 44f1af5d2fSBarry Smith while (nz--) { 45f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 46f1af5d2fSBarry Smith } 47f1af5d2fSBarry Smith } 48f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 49f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 50b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 51f1af5d2fSBarry Smith PetscFunctionReturn(0); 52f1af5d2fSBarry Smith } 53f1af5d2fSBarry Smith 544a2ae208SSatish Balay #undef __FUNCT__ 554a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 567c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 57f1af5d2fSBarry Smith { 58f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 59f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 60f1af5d2fSBarry Smith int *diag = a->diag,oidx; 61f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 62f1af5d2fSBarry Smith Scalar s1,s2,x1,x2; 63f1af5d2fSBarry Smith Scalar *x,*b; 64f1af5d2fSBarry Smith 65f1af5d2fSBarry Smith PetscFunctionBegin; 66f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 67f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 68f1af5d2fSBarry Smith 69f1af5d2fSBarry Smith /* forward solve the U^T */ 70f1af5d2fSBarry Smith idx = 0; 71f1af5d2fSBarry Smith for (i=0; i<n; i++) { 72f1af5d2fSBarry Smith 73f1af5d2fSBarry Smith v = aa + 4*diag[i]; 74f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 75f1af5d2fSBarry Smith x1 = b[idx]; x2 = b[1+idx]; 76f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 77f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 78f1af5d2fSBarry Smith v += 4; 79f1af5d2fSBarry Smith 80f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 81f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 82f1af5d2fSBarry Smith while (nz--) { 83f1af5d2fSBarry Smith oidx = 2*(*vi++); 84f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 85f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 86f1af5d2fSBarry Smith v += 4; 87f1af5d2fSBarry Smith } 88f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 89f1af5d2fSBarry Smith idx += 2; 90f1af5d2fSBarry Smith } 91f1af5d2fSBarry Smith /* backward solve the L^T */ 92f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 93f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 94f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 95f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 96f1af5d2fSBarry Smith idt = 2*i; 97f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 98f1af5d2fSBarry Smith while (nz--) { 99f1af5d2fSBarry Smith idx = 2*(*vi--); 100f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 101f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 102f1af5d2fSBarry Smith v -= 4; 103f1af5d2fSBarry Smith } 104f1af5d2fSBarry Smith } 105f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 106f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 107b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 108f1af5d2fSBarry Smith PetscFunctionReturn(0); 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith 1114a2ae208SSatish Balay #undef __FUNCT__ 1124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 1137c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 114f1af5d2fSBarry Smith { 115f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 116f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 117f1af5d2fSBarry Smith int *diag = a->diag,oidx; 118f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 119f1af5d2fSBarry Smith Scalar s1,s2,s3,x1,x2,x3; 120f1af5d2fSBarry Smith Scalar *x,*b; 121f1af5d2fSBarry Smith 122f1af5d2fSBarry Smith PetscFunctionBegin; 123f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 124f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 125f1af5d2fSBarry Smith 126f1af5d2fSBarry Smith /* forward solve the U^T */ 127f1af5d2fSBarry Smith idx = 0; 128f1af5d2fSBarry Smith for (i=0; i<n; i++) { 129f1af5d2fSBarry Smith 130f1af5d2fSBarry Smith v = aa + 9*diag[i]; 131f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 132f1af5d2fSBarry Smith x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; 133f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 134f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 135f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 136f1af5d2fSBarry Smith v += 9; 137f1af5d2fSBarry Smith 138f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 139f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 140f1af5d2fSBarry Smith while (nz--) { 141f1af5d2fSBarry Smith oidx = 3*(*vi++); 142f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 143f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 144f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 145f1af5d2fSBarry Smith v += 9; 146f1af5d2fSBarry Smith } 147f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 148f1af5d2fSBarry Smith idx += 3; 149f1af5d2fSBarry Smith } 150f1af5d2fSBarry Smith /* backward solve the L^T */ 151f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 152f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 153f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 154f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 155f1af5d2fSBarry Smith idt = 3*i; 156f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 157f1af5d2fSBarry Smith while (nz--) { 158f1af5d2fSBarry Smith idx = 3*(*vi--); 159f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 160f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 161f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 162f1af5d2fSBarry Smith v -= 9; 163f1af5d2fSBarry Smith } 164f1af5d2fSBarry Smith } 165f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 166f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 167b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 168f1af5d2fSBarry Smith PetscFunctionReturn(0); 169f1af5d2fSBarry Smith } 170f1af5d2fSBarry Smith 1714a2ae208SSatish Balay #undef __FUNCT__ 1724a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 1737c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 174f1af5d2fSBarry Smith { 175f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 176f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 177f1af5d2fSBarry Smith int *diag = a->diag,oidx; 178f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 179f1af5d2fSBarry Smith Scalar s1,s2,s3,s4,x1,x2,x3,x4; 180f1af5d2fSBarry Smith Scalar *x,*b; 181f1af5d2fSBarry Smith 182f1af5d2fSBarry Smith PetscFunctionBegin; 183f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 184f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 185f1af5d2fSBarry Smith 186f1af5d2fSBarry Smith /* forward solve the U^T */ 187f1af5d2fSBarry Smith idx = 0; 188f1af5d2fSBarry Smith for (i=0; i<n; i++) { 189f1af5d2fSBarry Smith 190f1af5d2fSBarry Smith v = aa + 16*diag[i]; 191f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 192f1af5d2fSBarry Smith x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; 193f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 194f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 195f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 196f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 197f1af5d2fSBarry Smith v += 16; 198f1af5d2fSBarry Smith 199f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 200f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 201f1af5d2fSBarry Smith while (nz--) { 202f1af5d2fSBarry Smith oidx = 4*(*vi++); 203f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 204f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 205f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 206f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 207f1af5d2fSBarry Smith v += 16; 208f1af5d2fSBarry Smith } 209f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 210f1af5d2fSBarry Smith idx += 4; 211f1af5d2fSBarry Smith } 212f1af5d2fSBarry Smith /* backward solve the L^T */ 213f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 214f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 215f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 216f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 217f1af5d2fSBarry Smith idt = 4*i; 218f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 219f1af5d2fSBarry Smith while (nz--) { 220f1af5d2fSBarry Smith idx = 4*(*vi--); 221f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 222f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 223f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 224f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 225f1af5d2fSBarry Smith v -= 16; 226f1af5d2fSBarry Smith } 227f1af5d2fSBarry Smith } 228f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 229f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 230b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 231f1af5d2fSBarry Smith PetscFunctionReturn(0); 232f1af5d2fSBarry Smith } 233f1af5d2fSBarry Smith 2344a2ae208SSatish Balay #undef __FUNCT__ 2354a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 2367c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 237f1af5d2fSBarry Smith { 238f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 239f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 240f1af5d2fSBarry Smith int *diag = a->diag,oidx; 241f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 242f1af5d2fSBarry Smith Scalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 243f1af5d2fSBarry Smith Scalar *x,*b; 244f1af5d2fSBarry Smith 245f1af5d2fSBarry Smith PetscFunctionBegin; 246f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 247f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 248f1af5d2fSBarry Smith 249f1af5d2fSBarry Smith /* forward solve the U^T */ 250f1af5d2fSBarry Smith idx = 0; 251f1af5d2fSBarry Smith for (i=0; i<n; i++) { 252f1af5d2fSBarry Smith 253f1af5d2fSBarry Smith v = aa + 25*diag[i]; 254f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 255f1af5d2fSBarry Smith x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 256f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 257f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 258f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 259f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 260f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 261f1af5d2fSBarry Smith v += 25; 262f1af5d2fSBarry Smith 263f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 264f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 265f1af5d2fSBarry Smith while (nz--) { 266f1af5d2fSBarry Smith oidx = 5*(*vi++); 267f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 268f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 269f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 270f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 271f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 272f1af5d2fSBarry Smith v += 25; 273f1af5d2fSBarry Smith } 274f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 275f1af5d2fSBarry Smith idx += 5; 276f1af5d2fSBarry Smith } 277f1af5d2fSBarry Smith /* backward solve the L^T */ 278f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 279f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 280f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 281f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 282f1af5d2fSBarry Smith idt = 5*i; 283f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 284f1af5d2fSBarry Smith while (nz--) { 285f1af5d2fSBarry Smith idx = 5*(*vi--); 286f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 287f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 288f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 289f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 290f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 291f1af5d2fSBarry Smith v -= 25; 292f1af5d2fSBarry Smith } 293f1af5d2fSBarry Smith } 294f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 295f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 296b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 297f1af5d2fSBarry Smith PetscFunctionReturn(0); 298f1af5d2fSBarry Smith } 299f1af5d2fSBarry Smith 3004a2ae208SSatish Balay #undef __FUNCT__ 3014a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 3027c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 303f1af5d2fSBarry Smith { 304f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 306f1af5d2fSBarry Smith int *diag = a->diag,oidx; 307f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 308f1af5d2fSBarry Smith Scalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 309f1af5d2fSBarry Smith Scalar *x,*b; 310f1af5d2fSBarry Smith 311f1af5d2fSBarry Smith PetscFunctionBegin; 312f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 313f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 314f1af5d2fSBarry Smith 315f1af5d2fSBarry Smith /* forward solve the U^T */ 316f1af5d2fSBarry Smith idx = 0; 317f1af5d2fSBarry Smith for (i=0; i<n; i++) { 318f1af5d2fSBarry Smith 319f1af5d2fSBarry Smith v = aa + 36*diag[i]; 320f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 321f1af5d2fSBarry Smith x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 322f1af5d2fSBarry Smith x6 = b[5+idx]; 323f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 324f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 325f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 326f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 327f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 328f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 329f1af5d2fSBarry Smith v += 36; 330f1af5d2fSBarry Smith 331f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 332f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 333f1af5d2fSBarry Smith while (nz--) { 334f1af5d2fSBarry Smith oidx = 6*(*vi++); 335f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 336f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 337f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 338f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 339f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 340f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 341f1af5d2fSBarry Smith v += 36; 342f1af5d2fSBarry Smith } 343f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 344f1af5d2fSBarry Smith x[5+idx] = s6; 345f1af5d2fSBarry Smith idx += 6; 346f1af5d2fSBarry Smith } 347f1af5d2fSBarry Smith /* backward solve the L^T */ 348f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 349f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 350f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 351f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 352f1af5d2fSBarry Smith idt = 6*i; 353f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 354f1af5d2fSBarry Smith s6 = x[5+idt]; 355f1af5d2fSBarry Smith while (nz--) { 356f1af5d2fSBarry Smith idx = 6*(*vi--); 357f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 358f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 359f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 360f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 361f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 362f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 363f1af5d2fSBarry Smith v -= 36; 364f1af5d2fSBarry Smith } 365f1af5d2fSBarry Smith } 366f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 367f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 368b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 369f1af5d2fSBarry Smith PetscFunctionReturn(0); 370f1af5d2fSBarry Smith } 371f1af5d2fSBarry Smith 3724a2ae208SSatish Balay #undef __FUNCT__ 3734a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 3747c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 375f1af5d2fSBarry Smith { 376f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 377f1af5d2fSBarry Smith int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 378f1af5d2fSBarry Smith int *diag = a->diag,oidx; 379f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 380f1af5d2fSBarry Smith Scalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 381f1af5d2fSBarry Smith Scalar *x,*b; 382f1af5d2fSBarry Smith 383f1af5d2fSBarry Smith PetscFunctionBegin; 384f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 385f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 386f1af5d2fSBarry Smith 387f1af5d2fSBarry Smith /* forward solve the U^T */ 388f1af5d2fSBarry Smith idx = 0; 389f1af5d2fSBarry Smith for (i=0; i<n; i++) { 390f1af5d2fSBarry Smith 391f1af5d2fSBarry Smith v = aa + 49*diag[i]; 392f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 393f1af5d2fSBarry Smith x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 394f1af5d2fSBarry Smith x6 = b[5+idx]; x7 = b[6+idx]; 395f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 396f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 397f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 398f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 399f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 400f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 401f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 402f1af5d2fSBarry Smith v += 49; 403f1af5d2fSBarry Smith 404f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 405f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 406f1af5d2fSBarry Smith while (nz--) { 407f1af5d2fSBarry Smith oidx = 7*(*vi++); 408f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 409f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 410f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 411f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 412f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 413f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 414f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 415f1af5d2fSBarry Smith v += 49; 416f1af5d2fSBarry Smith } 417f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 418f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 419f1af5d2fSBarry Smith idx += 7; 420f1af5d2fSBarry Smith } 421f1af5d2fSBarry Smith /* backward solve the L^T */ 422f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 423f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 424f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 425f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 426f1af5d2fSBarry Smith idt = 7*i; 427f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 428f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 429f1af5d2fSBarry Smith while (nz--) { 430f1af5d2fSBarry Smith idx = 7*(*vi--); 431f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 432f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 433f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 434f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 435f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 436f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 437f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 438f1af5d2fSBarry Smith v -= 49; 439f1af5d2fSBarry Smith } 440f1af5d2fSBarry Smith } 441f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 442f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 443b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 444f1af5d2fSBarry Smith PetscFunctionReturn(0); 445f1af5d2fSBarry Smith } 446f1af5d2fSBarry Smith 447f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4484a2ae208SSatish Balay #undef __FUNCT__ 4494a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 4507c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 451f1af5d2fSBarry Smith { 452f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 453f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 454f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 455f1af5d2fSBarry Smith int *diag = a->diag; 456f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 457f1af5d2fSBarry Smith Scalar s1,*x,*b,*t; 458f1af5d2fSBarry Smith 459f1af5d2fSBarry Smith PetscFunctionBegin; 460f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 461f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 462f1af5d2fSBarry Smith t = a->solve_work; 463f1af5d2fSBarry Smith 464f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 465f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 466f1af5d2fSBarry Smith 467f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 468f1af5d2fSBarry Smith for (i=0; i<n; i++) { 469f1af5d2fSBarry Smith t[i] = b[c[i]]; 470f1af5d2fSBarry Smith } 471f1af5d2fSBarry Smith 472f1af5d2fSBarry Smith /* forward solve the U^T */ 473f1af5d2fSBarry Smith for (i=0; i<n; i++) { 474f1af5d2fSBarry Smith 475f1af5d2fSBarry Smith v = aa + diag[i]; 476f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 477f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 478f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 479f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 480f1af5d2fSBarry Smith while (nz--) { 481f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 482f1af5d2fSBarry Smith } 483f1af5d2fSBarry Smith t[i] = s1; 484f1af5d2fSBarry Smith } 485f1af5d2fSBarry Smith /* backward solve the L^T */ 486f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 487f1af5d2fSBarry Smith v = aa + diag[i] - 1; 488f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 489f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 490f1af5d2fSBarry Smith s1 = t[i]; 491f1af5d2fSBarry Smith while (nz--) { 492f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 493f1af5d2fSBarry Smith } 494f1af5d2fSBarry Smith } 495f1af5d2fSBarry Smith 496f1af5d2fSBarry Smith /* copy t into x according to permutation */ 497f1af5d2fSBarry Smith for (i=0; i<n; i++) { 498f1af5d2fSBarry Smith x[r[i]] = t[i]; 499f1af5d2fSBarry Smith } 500f1af5d2fSBarry Smith 501f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 502f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 503f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 504f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 505b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 506f1af5d2fSBarry Smith PetscFunctionReturn(0); 507f1af5d2fSBarry Smith } 508f1af5d2fSBarry Smith 5094a2ae208SSatish Balay #undef __FUNCT__ 5104a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 5117c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 512f1af5d2fSBarry Smith { 513f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 514f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 515f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 516f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 517f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 518f1af5d2fSBarry Smith Scalar s1,s2,x1,x2; 519f1af5d2fSBarry Smith Scalar *x,*b,*t; 520f1af5d2fSBarry Smith 521f1af5d2fSBarry Smith PetscFunctionBegin; 522f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 523f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 524f1af5d2fSBarry Smith t = a->solve_work; 525f1af5d2fSBarry Smith 526f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 527f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 528f1af5d2fSBarry Smith 529f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 530f1af5d2fSBarry Smith ii = 0; 531f1af5d2fSBarry Smith for (i=0; i<n; i++) { 532f1af5d2fSBarry Smith ic = 2*c[i]; 533f1af5d2fSBarry Smith t[ii] = b[ic]; 534f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 535f1af5d2fSBarry Smith ii += 2; 536f1af5d2fSBarry Smith } 537f1af5d2fSBarry Smith 538f1af5d2fSBarry Smith /* forward solve the U^T */ 539f1af5d2fSBarry Smith idx = 0; 540f1af5d2fSBarry Smith for (i=0; i<n; i++) { 541f1af5d2fSBarry Smith 542f1af5d2fSBarry Smith v = aa + 4*diag[i]; 543f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 544f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 545f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 546f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 547f1af5d2fSBarry Smith v += 4; 548f1af5d2fSBarry Smith 549f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 550f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 551f1af5d2fSBarry Smith while (nz--) { 552f1af5d2fSBarry Smith oidx = 2*(*vi++); 553f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 554f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 555f1af5d2fSBarry Smith v += 4; 556f1af5d2fSBarry Smith } 557f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 558f1af5d2fSBarry Smith idx += 2; 559f1af5d2fSBarry Smith } 560f1af5d2fSBarry Smith /* backward solve the L^T */ 561f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 562f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 563f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 564f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 565f1af5d2fSBarry Smith idt = 2*i; 566f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 567f1af5d2fSBarry Smith while (nz--) { 568f1af5d2fSBarry Smith idx = 2*(*vi--); 569f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 570f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 571f1af5d2fSBarry Smith v -= 4; 572f1af5d2fSBarry Smith } 573f1af5d2fSBarry Smith } 574f1af5d2fSBarry Smith 575f1af5d2fSBarry Smith /* copy t into x according to permutation */ 576f1af5d2fSBarry Smith ii = 0; 577f1af5d2fSBarry Smith for (i=0; i<n; i++) { 578f1af5d2fSBarry Smith ir = 2*r[i]; 579f1af5d2fSBarry Smith x[ir] = t[ii]; 580f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 581f1af5d2fSBarry Smith ii += 2; 582f1af5d2fSBarry Smith } 583f1af5d2fSBarry Smith 584f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 585f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 586f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 587f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 588b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 589f1af5d2fSBarry Smith PetscFunctionReturn(0); 590f1af5d2fSBarry Smith } 591f1af5d2fSBarry Smith 5924a2ae208SSatish Balay #undef __FUNCT__ 5934a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 5947c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 595f1af5d2fSBarry Smith { 596f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 597f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 598f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 599f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 600f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 601f1af5d2fSBarry Smith Scalar s1,s2,s3,x1,x2,x3; 602f1af5d2fSBarry Smith Scalar *x,*b,*t; 603f1af5d2fSBarry Smith 604f1af5d2fSBarry Smith PetscFunctionBegin; 605f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 606f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 607f1af5d2fSBarry Smith t = a->solve_work; 608f1af5d2fSBarry Smith 609f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 610f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 611f1af5d2fSBarry Smith 612f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 613f1af5d2fSBarry Smith ii = 0; 614f1af5d2fSBarry Smith for (i=0; i<n; i++) { 615f1af5d2fSBarry Smith ic = 3*c[i]; 616f1af5d2fSBarry Smith t[ii] = b[ic]; 617f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 618f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 619f1af5d2fSBarry Smith ii += 3; 620f1af5d2fSBarry Smith } 621f1af5d2fSBarry Smith 622f1af5d2fSBarry Smith /* forward solve the U^T */ 623f1af5d2fSBarry Smith idx = 0; 624f1af5d2fSBarry Smith for (i=0; i<n; i++) { 625f1af5d2fSBarry Smith 626f1af5d2fSBarry Smith v = aa + 9*diag[i]; 627f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 628f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 629f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 630f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 631f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 632f1af5d2fSBarry Smith v += 9; 633f1af5d2fSBarry Smith 634f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 635f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 636f1af5d2fSBarry Smith while (nz--) { 637f1af5d2fSBarry Smith oidx = 3*(*vi++); 638f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 639f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 640f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 641f1af5d2fSBarry Smith v += 9; 642f1af5d2fSBarry Smith } 643f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 644f1af5d2fSBarry Smith idx += 3; 645f1af5d2fSBarry Smith } 646f1af5d2fSBarry Smith /* backward solve the L^T */ 647f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 648f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 649f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 650f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 651f1af5d2fSBarry Smith idt = 3*i; 652f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 653f1af5d2fSBarry Smith while (nz--) { 654f1af5d2fSBarry Smith idx = 3*(*vi--); 655f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 656f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 657f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 658f1af5d2fSBarry Smith v -= 9; 659f1af5d2fSBarry Smith } 660f1af5d2fSBarry Smith } 661f1af5d2fSBarry Smith 662f1af5d2fSBarry Smith /* copy t into x according to permutation */ 663f1af5d2fSBarry Smith ii = 0; 664f1af5d2fSBarry Smith for (i=0; i<n; i++) { 665f1af5d2fSBarry Smith ir = 3*r[i]; 666f1af5d2fSBarry Smith x[ir] = t[ii]; 667f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 668f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 669f1af5d2fSBarry Smith ii += 3; 670f1af5d2fSBarry Smith } 671f1af5d2fSBarry Smith 672f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 673f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 674f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 675f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 676b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 677f1af5d2fSBarry Smith PetscFunctionReturn(0); 678f1af5d2fSBarry Smith } 679f1af5d2fSBarry Smith 6804a2ae208SSatish Balay #undef __FUNCT__ 6814a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 6827c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 683f1af5d2fSBarry Smith { 684f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 685f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 686f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 687f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 688f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 689f1af5d2fSBarry Smith Scalar s1,s2,s3,s4,x1,x2,x3,x4; 690f1af5d2fSBarry Smith Scalar *x,*b,*t; 691f1af5d2fSBarry Smith 692f1af5d2fSBarry Smith PetscFunctionBegin; 693f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 694f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 695f1af5d2fSBarry Smith t = a->solve_work; 696f1af5d2fSBarry Smith 697f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 698f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 699f1af5d2fSBarry Smith 700f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 701f1af5d2fSBarry Smith ii = 0; 702f1af5d2fSBarry Smith for (i=0; i<n; i++) { 703f1af5d2fSBarry Smith ic = 4*c[i]; 704f1af5d2fSBarry Smith t[ii] = b[ic]; 705f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 706f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 707f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 708f1af5d2fSBarry Smith ii += 4; 709f1af5d2fSBarry Smith } 710f1af5d2fSBarry Smith 711f1af5d2fSBarry Smith /* forward solve the U^T */ 712f1af5d2fSBarry Smith idx = 0; 713f1af5d2fSBarry Smith for (i=0; i<n; i++) { 714f1af5d2fSBarry Smith 715f1af5d2fSBarry Smith v = aa + 16*diag[i]; 716f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 717f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 718f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 719f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 720f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 721f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 722f1af5d2fSBarry Smith v += 16; 723f1af5d2fSBarry Smith 724f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 725f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 726f1af5d2fSBarry Smith while (nz--) { 727f1af5d2fSBarry Smith oidx = 4*(*vi++); 728f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 729f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 730f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 731f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 732f1af5d2fSBarry Smith v += 16; 733f1af5d2fSBarry Smith } 734f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 735f1af5d2fSBarry Smith idx += 4; 736f1af5d2fSBarry Smith } 737f1af5d2fSBarry Smith /* backward solve the L^T */ 738f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 739f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 740f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 741f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 742f1af5d2fSBarry Smith idt = 4*i; 743f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 744f1af5d2fSBarry Smith while (nz--) { 745f1af5d2fSBarry Smith idx = 4*(*vi--); 746f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 747f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 748f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 749f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 750f1af5d2fSBarry Smith v -= 16; 751f1af5d2fSBarry Smith } 752f1af5d2fSBarry Smith } 753f1af5d2fSBarry Smith 754f1af5d2fSBarry Smith /* copy t into x according to permutation */ 755f1af5d2fSBarry Smith ii = 0; 756f1af5d2fSBarry Smith for (i=0; i<n; i++) { 757f1af5d2fSBarry Smith ir = 4*r[i]; 758f1af5d2fSBarry Smith x[ir] = t[ii]; 759f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 760f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 761f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 762f1af5d2fSBarry Smith ii += 4; 763f1af5d2fSBarry Smith } 764f1af5d2fSBarry Smith 765f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 766f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 767f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 768f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 769b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 770f1af5d2fSBarry Smith PetscFunctionReturn(0); 771f1af5d2fSBarry Smith } 772f1af5d2fSBarry Smith 7734a2ae208SSatish Balay #undef __FUNCT__ 7744a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 7757c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 776f1af5d2fSBarry Smith { 777f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 778f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 779f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 780f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 781f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 782f1af5d2fSBarry Smith Scalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 783f1af5d2fSBarry Smith Scalar *x,*b,*t; 784f1af5d2fSBarry Smith 785f1af5d2fSBarry Smith PetscFunctionBegin; 786f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 787f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 788f1af5d2fSBarry Smith t = a->solve_work; 789f1af5d2fSBarry Smith 790f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 791f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 792f1af5d2fSBarry Smith 793f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 794f1af5d2fSBarry Smith ii = 0; 795f1af5d2fSBarry Smith for (i=0; i<n; i++) { 796f1af5d2fSBarry Smith ic = 5*c[i]; 797f1af5d2fSBarry Smith t[ii] = b[ic]; 798f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 799f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 800f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 801f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 802f1af5d2fSBarry Smith ii += 5; 803f1af5d2fSBarry Smith } 804f1af5d2fSBarry Smith 805f1af5d2fSBarry Smith /* forward solve the U^T */ 806f1af5d2fSBarry Smith idx = 0; 807f1af5d2fSBarry Smith for (i=0; i<n; i++) { 808f1af5d2fSBarry Smith 809f1af5d2fSBarry Smith v = aa + 25*diag[i]; 810f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 811f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 812f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 813f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 814f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 815f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 816f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 817f1af5d2fSBarry Smith v += 25; 818f1af5d2fSBarry Smith 819f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 820f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 821f1af5d2fSBarry Smith while (nz--) { 822f1af5d2fSBarry Smith oidx = 5*(*vi++); 823f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 824f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 825f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 826f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 827f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 828f1af5d2fSBarry Smith v += 25; 829f1af5d2fSBarry Smith } 830f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 831f1af5d2fSBarry Smith idx += 5; 832f1af5d2fSBarry Smith } 833f1af5d2fSBarry Smith /* backward solve the L^T */ 834f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 835f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 836f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 837f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 838f1af5d2fSBarry Smith idt = 5*i; 839f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 840f1af5d2fSBarry Smith while (nz--) { 841f1af5d2fSBarry Smith idx = 5*(*vi--); 842f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 843f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 844f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 845f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 846f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 847f1af5d2fSBarry Smith v -= 25; 848f1af5d2fSBarry Smith } 849f1af5d2fSBarry Smith } 850f1af5d2fSBarry Smith 851f1af5d2fSBarry Smith /* copy t into x according to permutation */ 852f1af5d2fSBarry Smith ii = 0; 853f1af5d2fSBarry Smith for (i=0; i<n; i++) { 854f1af5d2fSBarry Smith ir = 5*r[i]; 855f1af5d2fSBarry Smith x[ir] = t[ii]; 856f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 857f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 858f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 859f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 860f1af5d2fSBarry Smith ii += 5; 861f1af5d2fSBarry Smith } 862f1af5d2fSBarry Smith 863f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 864f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 865f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 866f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 867b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 868f1af5d2fSBarry Smith PetscFunctionReturn(0); 869f1af5d2fSBarry Smith } 870f1af5d2fSBarry Smith 8714a2ae208SSatish Balay #undef __FUNCT__ 8724a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 8737c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 874f1af5d2fSBarry Smith { 875f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 876f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 877f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 878f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 879f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 880f1af5d2fSBarry Smith Scalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 881f1af5d2fSBarry Smith Scalar *x,*b,*t; 882f1af5d2fSBarry Smith 883f1af5d2fSBarry Smith PetscFunctionBegin; 884f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 885f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 886f1af5d2fSBarry Smith t = a->solve_work; 887f1af5d2fSBarry Smith 888f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 889f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 890f1af5d2fSBarry Smith 891f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 892f1af5d2fSBarry Smith ii = 0; 893f1af5d2fSBarry Smith for (i=0; i<n; i++) { 894f1af5d2fSBarry Smith ic = 6*c[i]; 895f1af5d2fSBarry Smith t[ii] = b[ic]; 896f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 897f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 898f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 899f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 900f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 901f1af5d2fSBarry Smith ii += 6; 902f1af5d2fSBarry Smith } 903f1af5d2fSBarry Smith 904f1af5d2fSBarry Smith /* forward solve the U^T */ 905f1af5d2fSBarry Smith idx = 0; 906f1af5d2fSBarry Smith for (i=0; i<n; i++) { 907f1af5d2fSBarry Smith 908f1af5d2fSBarry Smith v = aa + 36*diag[i]; 909f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 910f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 911f1af5d2fSBarry Smith x6 = t[5+idx]; 912f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 913f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 914f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 915f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 916f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 917f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 918f1af5d2fSBarry Smith v += 36; 919f1af5d2fSBarry Smith 920f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 921f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 922f1af5d2fSBarry Smith while (nz--) { 923f1af5d2fSBarry Smith oidx = 6*(*vi++); 924f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 925f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 926f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 927f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 928f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 929f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 930f1af5d2fSBarry Smith v += 36; 931f1af5d2fSBarry Smith } 932f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 933f1af5d2fSBarry Smith t[5+idx] = s6; 934f1af5d2fSBarry Smith idx += 6; 935f1af5d2fSBarry Smith } 936f1af5d2fSBarry Smith /* backward solve the L^T */ 937f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 938f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 939f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 940f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 941f1af5d2fSBarry Smith idt = 6*i; 942f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 943f1af5d2fSBarry Smith s6 = t[5+idt]; 944f1af5d2fSBarry Smith while (nz--) { 945f1af5d2fSBarry Smith idx = 6*(*vi--); 946f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 947f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 948f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 949f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 950f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 951f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 952f1af5d2fSBarry Smith v -= 36; 953f1af5d2fSBarry Smith } 954f1af5d2fSBarry Smith } 955f1af5d2fSBarry Smith 956f1af5d2fSBarry Smith /* copy t into x according to permutation */ 957f1af5d2fSBarry Smith ii = 0; 958f1af5d2fSBarry Smith for (i=0; i<n; i++) { 959f1af5d2fSBarry Smith ir = 6*r[i]; 960f1af5d2fSBarry Smith x[ir] = t[ii]; 961f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 962f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 963f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 964f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 965f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 966f1af5d2fSBarry Smith ii += 6; 967f1af5d2fSBarry Smith } 968f1af5d2fSBarry Smith 969f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 970f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 971f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 972f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 973b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 974f1af5d2fSBarry Smith PetscFunctionReturn(0); 975f1af5d2fSBarry Smith } 976f1af5d2fSBarry Smith 9774a2ae208SSatish Balay #undef __FUNCT__ 9784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 9797c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 980f1af5d2fSBarry Smith { 981f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 982f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 983f1af5d2fSBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 984f1af5d2fSBarry Smith int *diag = a->diag,ii,ic,ir,oidx; 985f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 986f1af5d2fSBarry Smith Scalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 987f1af5d2fSBarry Smith Scalar *x,*b,*t; 988f1af5d2fSBarry Smith 989f1af5d2fSBarry Smith PetscFunctionBegin; 990f1af5d2fSBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 991f1af5d2fSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 992f1af5d2fSBarry Smith t = a->solve_work; 993f1af5d2fSBarry Smith 994f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 995f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 996f1af5d2fSBarry Smith 997f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 998f1af5d2fSBarry Smith ii = 0; 999f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1000f1af5d2fSBarry Smith ic = 7*c[i]; 1001f1af5d2fSBarry Smith t[ii] = b[ic]; 1002f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1003f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1004f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1005f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1006f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1007f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1008f1af5d2fSBarry Smith ii += 7; 1009f1af5d2fSBarry Smith } 1010f1af5d2fSBarry Smith 1011f1af5d2fSBarry Smith /* forward solve the U^T */ 1012f1af5d2fSBarry Smith idx = 0; 1013f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1014f1af5d2fSBarry Smith 1015f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1016f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1017f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1018f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1019f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1020f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1021f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1022f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1023f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1024f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1025f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1026f1af5d2fSBarry Smith v += 49; 1027f1af5d2fSBarry Smith 1028f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1029f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1030f1af5d2fSBarry Smith while (nz--) { 1031f1af5d2fSBarry Smith oidx = 7*(*vi++); 1032f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1033f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1034f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1035f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1036f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1037f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1038f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1039f1af5d2fSBarry Smith v += 49; 1040f1af5d2fSBarry Smith } 1041f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1042f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1043f1af5d2fSBarry Smith idx += 7; 1044f1af5d2fSBarry Smith } 1045f1af5d2fSBarry Smith /* backward solve the L^T */ 1046f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1047f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1048f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1049f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1050f1af5d2fSBarry Smith idt = 7*i; 1051f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1052f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1053f1af5d2fSBarry Smith while (nz--) { 1054f1af5d2fSBarry Smith idx = 7*(*vi--); 1055f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1056f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1057f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1058f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1059f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1060f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1061f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1062f1af5d2fSBarry Smith v -= 49; 1063f1af5d2fSBarry Smith } 1064f1af5d2fSBarry Smith } 1065f1af5d2fSBarry Smith 1066f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1067f1af5d2fSBarry Smith ii = 0; 1068f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1069f1af5d2fSBarry Smith ir = 7*r[i]; 1070f1af5d2fSBarry Smith x[ir] = t[ii]; 1071f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1072f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1073f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1074f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1075f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1076f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1077f1af5d2fSBarry Smith ii += 7; 1078f1af5d2fSBarry Smith } 1079f1af5d2fSBarry Smith 1080f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1081f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1082f1af5d2fSBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1083f1af5d2fSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1084b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 1085f1af5d2fSBarry Smith PetscFunctionReturn(0); 1086f1af5d2fSBarry Smith } 1087f1af5d2fSBarry Smith 10884e2b4712SSatish Balay /* ----------------------------------------------------------- */ 10894a2ae208SSatish Balay #undef __FUNCT__ 10904a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 10914e2b4712SSatish Balay int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 10924e2b4712SSatish Balay { 10934e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 10944e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 10954e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 10964e2b4712SSatish Balay int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 10973f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 1098f1af5d2fSBarry Smith Scalar *x,*b,*s,*t,*ls; 10994e2b4712SSatish Balay 11004e2b4712SSatish Balay PetscFunctionBegin; 1101e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1102e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1103f1af5d2fSBarry Smith t = a->solve_work; 11044e2b4712SSatish Balay 11054e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11064e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11074e2b4712SSatish Balay 11084e2b4712SSatish Balay /* forward solve the lower triangular */ 1109f1af5d2fSBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr); 11104e2b4712SSatish Balay for (i=1; i<n; i++) { 11114e2b4712SSatish Balay v = aa + bs2*ai[i]; 11124e2b4712SSatish Balay vi = aj + ai[i]; 11134e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1114f1af5d2fSBarry Smith s = t + bs*i; 1115f1af5d2fSBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr); 11164e2b4712SSatish Balay while (nz--) { 1117f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11184e2b4712SSatish Balay v += bs2; 11194e2b4712SSatish Balay } 11204e2b4712SSatish Balay } 11214e2b4712SSatish Balay /* backward solve the upper triangular */ 1122273d9f13SBarry Smith ls = a->solve_work + A->n; 11234e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 11244e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11254e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11264e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 1127f1af5d2fSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr); 11284e2b4712SSatish Balay while (nz--) { 1129f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11304e2b4712SSatish Balay v += bs2; 11314e2b4712SSatish Balay } 1132f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1133f1af5d2fSBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr); 11344e2b4712SSatish Balay } 11354e2b4712SSatish Balay 11364e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11374e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1138e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1139e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1140b0a32e0cSBarry Smith PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 11414e2b4712SSatish Balay PetscFunctionReturn(0); 11424e2b4712SSatish Balay } 11434e2b4712SSatish Balay 11444a2ae208SSatish Balay #undef __FUNCT__ 11454a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 11464e2b4712SSatish Balay int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11474e2b4712SSatish Balay { 11484e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11494e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11504e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 11514e2b4712SSatish Balay int *diag = a->diag; 11523f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 1153f1af5d2fSBarry Smith Scalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1154f1af5d2fSBarry Smith Scalar *x,*b,*t; 11554e2b4712SSatish Balay 11564e2b4712SSatish Balay PetscFunctionBegin; 1157e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1158e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1159f1af5d2fSBarry Smith t = a->solve_work; 11604e2b4712SSatish Balay 11614e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11624e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11634e2b4712SSatish Balay 11644e2b4712SSatish Balay /* forward solve the lower triangular */ 11654e2b4712SSatish Balay idx = 7*(*r++); 1166f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1167f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1168f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 11694e2b4712SSatish Balay 11704e2b4712SSatish Balay for (i=1; i<n; i++) { 11714e2b4712SSatish Balay v = aa + 49*ai[i]; 11724e2b4712SSatish Balay vi = aj + ai[i]; 11734e2b4712SSatish Balay nz = diag[i] - ai[i]; 11744e2b4712SSatish Balay idx = 7*(*r++); 1175f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1176f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 11774e2b4712SSatish Balay while (nz--) { 11784e2b4712SSatish Balay idx = 7*(*vi++); 1179f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1180f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1181f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1182f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1183f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1184f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1185f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1186f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1187f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1188f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 11894e2b4712SSatish Balay v += 49; 11904e2b4712SSatish Balay } 11914e2b4712SSatish Balay idx = 7*i; 1192f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1193f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1194f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 11954e2b4712SSatish Balay } 11964e2b4712SSatish Balay /* backward solve the upper triangular */ 11974e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 11984e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 11994e2b4712SSatish Balay vi = aj + diag[i] + 1; 12004e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12014e2b4712SSatish Balay idt = 7*i; 1202f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1203f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1204f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12054e2b4712SSatish Balay while (nz--) { 12064e2b4712SSatish Balay idx = 7*(*vi++); 1207f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1208f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1209f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1210f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1211f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1212f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1213f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1214f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1215f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1216f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12174e2b4712SSatish Balay v += 49; 12184e2b4712SSatish Balay } 12194e2b4712SSatish Balay idc = 7*(*c--); 12204e2b4712SSatish Balay v = aa + 49*diag[i]; 1221f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1222f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1223f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1224f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1225f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1226f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1227f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1228f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1229f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1230f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1231f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1232f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1233f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1234f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12354e2b4712SSatish Balay } 12364e2b4712SSatish Balay 12374e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12384e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1239e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1240e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1241b0a32e0cSBarry Smith PetscLogFlops(2*49*(a->nz) - 7*A->n); 12424e2b4712SSatish Balay PetscFunctionReturn(0); 12434e2b4712SSatish Balay } 12444e2b4712SSatish Balay 12454a2ae208SSatish Balay #undef __FUNCT__ 12464a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 124715091d37SBarry Smith int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 124815091d37SBarry Smith { 124915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 125015091d37SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 125115091d37SBarry Smith int ierr,*diag = a->diag,jdx; 125215091d37SBarry Smith MatScalar *aa=a->a,*v; 1253f1af5d2fSBarry Smith Scalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 125415091d37SBarry Smith 125515091d37SBarry Smith PetscFunctionBegin; 125615091d37SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 125715091d37SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 125815091d37SBarry Smith /* forward solve the lower triangular */ 125915091d37SBarry Smith idx = 0; 126015091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 126115091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 126215091d37SBarry Smith x[6] = b[6+idx]; 126315091d37SBarry Smith for (i=1; i<n; i++) { 126415091d37SBarry Smith v = aa + 49*ai[i]; 126515091d37SBarry Smith vi = aj + ai[i]; 126615091d37SBarry Smith nz = diag[i] - ai[i]; 126715091d37SBarry Smith idx = 7*i; 1268f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1269f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1270f1af5d2fSBarry Smith s7 = b[6+idx]; 127115091d37SBarry Smith while (nz--) { 127215091d37SBarry Smith jdx = 7*(*vi++); 127315091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 127415091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 127515091d37SBarry Smith x7 = x[6+jdx]; 1276f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1277f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1278f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1279f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1280f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1281f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1282f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 128315091d37SBarry Smith v += 49; 128415091d37SBarry Smith } 1285f1af5d2fSBarry Smith x[idx] = s1; 1286f1af5d2fSBarry Smith x[1+idx] = s2; 1287f1af5d2fSBarry Smith x[2+idx] = s3; 1288f1af5d2fSBarry Smith x[3+idx] = s4; 1289f1af5d2fSBarry Smith x[4+idx] = s5; 1290f1af5d2fSBarry Smith x[5+idx] = s6; 1291f1af5d2fSBarry Smith x[6+idx] = s7; 129215091d37SBarry Smith } 129315091d37SBarry Smith /* backward solve the upper triangular */ 129415091d37SBarry Smith for (i=n-1; i>=0; i--){ 129515091d37SBarry Smith v = aa + 49*diag[i] + 49; 129615091d37SBarry Smith vi = aj + diag[i] + 1; 129715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 129815091d37SBarry Smith idt = 7*i; 1299f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1300f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1301f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1302f1af5d2fSBarry Smith s7 = x[6+idt]; 130315091d37SBarry Smith while (nz--) { 130415091d37SBarry Smith idx = 7*(*vi++); 130515091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 130615091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 130715091d37SBarry Smith x7 = x[6+idx]; 1308f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1309f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1310f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1311f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1312f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1313f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1314f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 131515091d37SBarry Smith v += 49; 131615091d37SBarry Smith } 131715091d37SBarry Smith v = aa + 49*diag[i]; 1318f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1319f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1320f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1321f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1322f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1323f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1324f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1325f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1326f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1327f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1328f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1329f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1330f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1331f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 133215091d37SBarry Smith } 133315091d37SBarry Smith 133415091d37SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 133515091d37SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1336b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 133715091d37SBarry Smith PetscFunctionReturn(0); 133815091d37SBarry Smith } 133915091d37SBarry Smith 13404a2ae208SSatish Balay #undef __FUNCT__ 13414a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 134215091d37SBarry Smith int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 134315091d37SBarry Smith { 134415091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 134515091d37SBarry Smith IS iscol=a->col,isrow=a->row; 134615091d37SBarry Smith int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 134715091d37SBarry Smith int *diag = a->diag; 134815091d37SBarry Smith MatScalar *aa=a->a,*v; 1349f1af5d2fSBarry Smith Scalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 135015091d37SBarry Smith 135115091d37SBarry Smith PetscFunctionBegin; 135215091d37SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 135315091d37SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1354f1af5d2fSBarry Smith t = a->solve_work; 135515091d37SBarry Smith 135615091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 135715091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 135815091d37SBarry Smith 135915091d37SBarry Smith /* forward solve the lower triangular */ 136015091d37SBarry Smith idx = 6*(*r++); 1361f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1362f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1363f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 136415091d37SBarry Smith for (i=1; i<n; i++) { 136515091d37SBarry Smith v = aa + 36*ai[i]; 136615091d37SBarry Smith vi = aj + ai[i]; 136715091d37SBarry Smith nz = diag[i] - ai[i]; 136815091d37SBarry Smith idx = 6*(*r++); 1369f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1370f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 137115091d37SBarry Smith while (nz--) { 137215091d37SBarry Smith idx = 6*(*vi++); 1373f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1374f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1375f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1376f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1377f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1378f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1379f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1380f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 138115091d37SBarry Smith v += 36; 138215091d37SBarry Smith } 138315091d37SBarry Smith idx = 6*i; 1384f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1385f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1386f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 138715091d37SBarry Smith } 138815091d37SBarry Smith /* backward solve the upper triangular */ 138915091d37SBarry Smith for (i=n-1; i>=0; i--){ 139015091d37SBarry Smith v = aa + 36*diag[i] + 36; 139115091d37SBarry Smith vi = aj + diag[i] + 1; 139215091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 139315091d37SBarry Smith idt = 6*i; 1394f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1395f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1396f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 139715091d37SBarry Smith while (nz--) { 139815091d37SBarry Smith idx = 6*(*vi++); 1399f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1400f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1401f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1402f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1403f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1404f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1405f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1406f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1407f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 140815091d37SBarry Smith v += 36; 140915091d37SBarry Smith } 141015091d37SBarry Smith idc = 6*(*c--); 141115091d37SBarry Smith v = aa + 36*diag[i]; 1412f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1413f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1414f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1415f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1416f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1417f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1418f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1419f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1420f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1421f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1422f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1423f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 142415091d37SBarry Smith } 142515091d37SBarry Smith 142615091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 142715091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 142815091d37SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 142915091d37SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1430b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 143115091d37SBarry Smith PetscFunctionReturn(0); 143215091d37SBarry Smith } 143315091d37SBarry Smith 14344a2ae208SSatish Balay #undef __FUNCT__ 14354a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 143615091d37SBarry Smith int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 143715091d37SBarry Smith { 143815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 143915091d37SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 144015091d37SBarry Smith int ierr,*diag = a->diag,jdx; 144115091d37SBarry Smith MatScalar *aa=a->a,*v; 1442f1af5d2fSBarry Smith Scalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 144315091d37SBarry Smith 144415091d37SBarry Smith PetscFunctionBegin; 144515091d37SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 144615091d37SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 144715091d37SBarry Smith /* forward solve the lower triangular */ 144815091d37SBarry Smith idx = 0; 144915091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 145015091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 145115091d37SBarry Smith for (i=1; i<n; i++) { 145215091d37SBarry Smith v = aa + 36*ai[i]; 145315091d37SBarry Smith vi = aj + ai[i]; 145415091d37SBarry Smith nz = diag[i] - ai[i]; 145515091d37SBarry Smith idx = 6*i; 1456f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1457f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 145815091d37SBarry Smith while (nz--) { 145915091d37SBarry Smith jdx = 6*(*vi++); 146015091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 146115091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1462f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1463f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1464f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1465f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1466f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1467f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 146815091d37SBarry Smith v += 36; 146915091d37SBarry Smith } 1470f1af5d2fSBarry Smith x[idx] = s1; 1471f1af5d2fSBarry Smith x[1+idx] = s2; 1472f1af5d2fSBarry Smith x[2+idx] = s3; 1473f1af5d2fSBarry Smith x[3+idx] = s4; 1474f1af5d2fSBarry Smith x[4+idx] = s5; 1475f1af5d2fSBarry Smith x[5+idx] = s6; 147615091d37SBarry Smith } 147715091d37SBarry Smith /* backward solve the upper triangular */ 147815091d37SBarry Smith for (i=n-1; i>=0; i--){ 147915091d37SBarry Smith v = aa + 36*diag[i] + 36; 148015091d37SBarry Smith vi = aj + diag[i] + 1; 148115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 148215091d37SBarry Smith idt = 6*i; 1483f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1484f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1485f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 148615091d37SBarry Smith while (nz--) { 148715091d37SBarry Smith idx = 6*(*vi++); 148815091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 148915091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1490f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1491f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1492f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1493f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1494f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1495f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 149615091d37SBarry Smith v += 36; 149715091d37SBarry Smith } 149815091d37SBarry Smith v = aa + 36*diag[i]; 1499f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1500f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1501f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1502f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1503f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1504f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 150515091d37SBarry Smith } 150615091d37SBarry Smith 150715091d37SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 150815091d37SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1509b0a32e0cSBarry Smith PetscLogFlops(2*36*(a->nz) - 6*A->n); 151015091d37SBarry Smith PetscFunctionReturn(0); 151115091d37SBarry Smith } 151215091d37SBarry Smith 15134a2ae208SSatish Balay #undef __FUNCT__ 15144a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 15154e2b4712SSatish Balay int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 15164e2b4712SSatish Balay { 15174e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15184e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 15194e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 15204e2b4712SSatish Balay int *diag = a->diag; 15213f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 1522f1af5d2fSBarry Smith Scalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 15234e2b4712SSatish Balay 15244e2b4712SSatish Balay PetscFunctionBegin; 1525e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1526e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1527f1af5d2fSBarry Smith t = a->solve_work; 15284e2b4712SSatish Balay 15294e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 15304e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 15314e2b4712SSatish Balay 15324e2b4712SSatish Balay /* forward solve the lower triangular */ 15334e2b4712SSatish Balay idx = 5*(*r++); 1534f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1535f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 15364e2b4712SSatish Balay for (i=1; i<n; i++) { 15374e2b4712SSatish Balay v = aa + 25*ai[i]; 15384e2b4712SSatish Balay vi = aj + ai[i]; 15394e2b4712SSatish Balay nz = diag[i] - ai[i]; 15404e2b4712SSatish Balay idx = 5*(*r++); 1541f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1542f1af5d2fSBarry Smith s5 = b[4+idx]; 15434e2b4712SSatish Balay while (nz--) { 15444e2b4712SSatish Balay idx = 5*(*vi++); 1545f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1546f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1547f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1548f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1549f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1550f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1551f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 15524e2b4712SSatish Balay v += 25; 15534e2b4712SSatish Balay } 15544e2b4712SSatish Balay idx = 5*i; 1555f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1556f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 15574e2b4712SSatish Balay } 15584e2b4712SSatish Balay /* backward solve the upper triangular */ 15594e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 15604e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 15614e2b4712SSatish Balay vi = aj + diag[i] + 1; 15624e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 15634e2b4712SSatish Balay idt = 5*i; 1564f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1565f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 15664e2b4712SSatish Balay while (nz--) { 15674e2b4712SSatish Balay idx = 5*(*vi++); 1568f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1569f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1570f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1571f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1572f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1573f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1574f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 15754e2b4712SSatish Balay v += 25; 15764e2b4712SSatish Balay } 15774e2b4712SSatish Balay idc = 5*(*c--); 15784e2b4712SSatish Balay v = aa + 25*diag[i]; 1579f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1580f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 1581f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1582f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 1583f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1584f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 1585f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1586f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 1587f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1588f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 15894e2b4712SSatish Balay } 15904e2b4712SSatish Balay 15914e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 15924e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1593e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1594e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1595b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 15964e2b4712SSatish Balay PetscFunctionReturn(0); 15974e2b4712SSatish Balay } 15984e2b4712SSatish Balay 15994a2ae208SSatish Balay #undef __FUNCT__ 16004a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 160115091d37SBarry Smith int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 160215091d37SBarry Smith { 160315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 160415091d37SBarry Smith int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 160515091d37SBarry Smith int ierr,*diag = a->diag,jdx; 160615091d37SBarry Smith MatScalar *aa=a->a,*v; 1607329f5518SBarry Smith Scalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 160815091d37SBarry Smith 160915091d37SBarry Smith PetscFunctionBegin; 161015091d37SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 161115091d37SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 161215091d37SBarry Smith /* forward solve the lower triangular */ 161315091d37SBarry Smith idx = 0; 161415091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 161515091d37SBarry Smith for (i=1; i<n; i++) { 161615091d37SBarry Smith v = aa + 25*ai[i]; 161715091d37SBarry Smith vi = aj + ai[i]; 161815091d37SBarry Smith nz = diag[i] - ai[i]; 161915091d37SBarry Smith idx = 5*i; 1620f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 162115091d37SBarry Smith while (nz--) { 162215091d37SBarry Smith jdx = 5*(*vi++); 162315091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1624f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1625f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1626f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1627f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1628f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 162915091d37SBarry Smith v += 25; 163015091d37SBarry Smith } 1631f1af5d2fSBarry Smith x[idx] = s1; 1632f1af5d2fSBarry Smith x[1+idx] = s2; 1633f1af5d2fSBarry Smith x[2+idx] = s3; 1634f1af5d2fSBarry Smith x[3+idx] = s4; 1635f1af5d2fSBarry Smith x[4+idx] = s5; 163615091d37SBarry Smith } 163715091d37SBarry Smith /* backward solve the upper triangular */ 163815091d37SBarry Smith for (i=n-1; i>=0; i--){ 163915091d37SBarry Smith v = aa + 25*diag[i] + 25; 164015091d37SBarry Smith vi = aj + diag[i] + 1; 164115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 164215091d37SBarry Smith idt = 5*i; 1643f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1644f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 164515091d37SBarry Smith while (nz--) { 164615091d37SBarry Smith idx = 5*(*vi++); 164715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1648f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1649f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1650f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1651f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1652f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 165315091d37SBarry Smith v += 25; 165415091d37SBarry Smith } 165515091d37SBarry Smith v = aa + 25*diag[i]; 1656f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1657f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1658f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1659f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1660f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 166115091d37SBarry Smith } 166215091d37SBarry Smith 166315091d37SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 166415091d37SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1665b0a32e0cSBarry Smith PetscLogFlops(2*25*(a->nz) - 5*A->n); 166615091d37SBarry Smith PetscFunctionReturn(0); 166715091d37SBarry Smith } 166815091d37SBarry Smith 16694a2ae208SSatish Balay #undef __FUNCT__ 16704a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 16714e2b4712SSatish Balay int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 16724e2b4712SSatish Balay { 16734e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 16744e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 16754e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 16764e2b4712SSatish Balay int *diag = a->diag; 16773f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 1678f1af5d2fSBarry Smith Scalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 16794e2b4712SSatish Balay 16804e2b4712SSatish Balay PetscFunctionBegin; 1681e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1682e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1683f1af5d2fSBarry Smith t = a->solve_work; 16844e2b4712SSatish Balay 16854e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 16864e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 16874e2b4712SSatish Balay 16884e2b4712SSatish Balay /* forward solve the lower triangular */ 16894e2b4712SSatish Balay idx = 4*(*r++); 1690f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1691f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 16924e2b4712SSatish Balay for (i=1; i<n; i++) { 16934e2b4712SSatish Balay v = aa + 16*ai[i]; 16944e2b4712SSatish Balay vi = aj + ai[i]; 16954e2b4712SSatish Balay nz = diag[i] - ai[i]; 16964e2b4712SSatish Balay idx = 4*(*r++); 1697f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 16984e2b4712SSatish Balay while (nz--) { 16994e2b4712SSatish Balay idx = 4*(*vi++); 1700f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1701f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1702f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1703f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1704f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17054e2b4712SSatish Balay v += 16; 17064e2b4712SSatish Balay } 17074e2b4712SSatish Balay idx = 4*i; 1708f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1709f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 17104e2b4712SSatish Balay } 17114e2b4712SSatish Balay /* backward solve the upper triangular */ 17124e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 17134e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 17144e2b4712SSatish Balay vi = aj + diag[i] + 1; 17154e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 17164e2b4712SSatish Balay idt = 4*i; 1717f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1718f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 17194e2b4712SSatish Balay while (nz--) { 17204e2b4712SSatish Balay idx = 4*(*vi++); 1721f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1722f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1723f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1724f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1725f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1726f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17274e2b4712SSatish Balay v += 16; 17284e2b4712SSatish Balay } 17294e2b4712SSatish Balay idc = 4*(*c--); 17304e2b4712SSatish Balay v = aa + 16*diag[i]; 1731f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1732f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1733f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1734f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 17354e2b4712SSatish Balay } 17364e2b4712SSatish Balay 17374e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 17384e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1739e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1740e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1741b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 17424e2b4712SSatish Balay PetscFunctionReturn(0); 17434e2b4712SSatish Balay } 17440ef38995SBarry Smith 17450ef38995SBarry Smith 17464e2b4712SSatish Balay /* 17474e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 17484e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 17494e2b4712SSatish Balay */ 17504a2ae208SSatish Balay #undef __FUNCT__ 17514a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 17524e2b4712SSatish Balay int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 17534e2b4712SSatish Balay { 17544e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 175530d4dcafSBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 175630d4dcafSBarry Smith int ierr,*diag = a->diag; 17573f1db9ecSBarry Smith MatScalar *aa=a->a; 175830d4dcafSBarry Smith Scalar *x,*b; 17594e2b4712SSatish Balay 17604e2b4712SSatish Balay PetscFunctionBegin; 1761e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1762e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 17634e2b4712SSatish Balay 1764aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 17652853dc0eSBarry Smith { 17662853dc0eSBarry Smith static Scalar w[2000]; /* very BAD need to fix */ 17672853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 17682853dc0eSBarry Smith } 1769aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 17702853dc0eSBarry Smith { 17712853dc0eSBarry Smith static Scalar w[2000]; /* very BAD need to fix */ 17722853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 17732853dc0eSBarry Smith } 1774aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 17752853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 1776e1293385SBarry Smith #else 177730d4dcafSBarry Smith { 1778f1af5d2fSBarry Smith Scalar s1,s2,s3,s4,x1,x2,x3,x4; 17793f1db9ecSBarry Smith MatScalar *v; 17804e555682SBarry Smith int jdx,idt,idx,nz,*vi,i,ai16; 1781e1293385SBarry Smith 17824e2b4712SSatish Balay /* forward solve the lower triangular */ 17834e2b4712SSatish Balay idx = 0; 1784e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 17854e2b4712SSatish Balay for (i=1; i<n; i++) { 17864e2b4712SSatish Balay v = aa + 16*ai[i]; 17874e2b4712SSatish Balay vi = aj + ai[i]; 17884e2b4712SSatish Balay nz = diag[i] - ai[i]; 1789e1293385SBarry Smith idx += 4; 1790f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 17914e2b4712SSatish Balay while (nz--) { 17924e2b4712SSatish Balay jdx = 4*(*vi++); 17934e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 1794f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1795f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1796f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1797f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 17984e2b4712SSatish Balay v += 16; 17994e2b4712SSatish Balay } 1800f1af5d2fSBarry Smith x[idx] = s1; 1801f1af5d2fSBarry Smith x[1+idx] = s2; 1802f1af5d2fSBarry Smith x[2+idx] = s3; 1803f1af5d2fSBarry Smith x[3+idx] = s4; 18044e2b4712SSatish Balay } 18054e2b4712SSatish Balay /* backward solve the upper triangular */ 18064e555682SBarry Smith idt = 4*(n-1); 18074e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 18084e555682SBarry Smith ai16 = 16*diag[i]; 18094e555682SBarry Smith v = aa + ai16 + 16; 18104e2b4712SSatish Balay vi = aj + diag[i] + 1; 18114e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 1812f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1813f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 18144e2b4712SSatish Balay while (nz--) { 18154e2b4712SSatish Balay idx = 4*(*vi++); 18164e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 1817f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1818f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1819f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1820f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 18214e2b4712SSatish Balay v += 16; 18224e2b4712SSatish Balay } 18234e555682SBarry Smith v = aa + ai16; 1824f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 1825f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 1826f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 1827f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 1828329f5518SBarry Smith idt -= 4; 18294e2b4712SSatish Balay } 183030d4dcafSBarry Smith } 1831e1293385SBarry Smith #endif 18324e2b4712SSatish Balay 1833e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1834e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1835b0a32e0cSBarry Smith PetscLogFlops(2*16*(a->nz) - 4*A->n); 18364e2b4712SSatish Balay PetscFunctionReturn(0); 18374e2b4712SSatish Balay } 18384e2b4712SSatish Balay 18393660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 18403660e330SKris Buschelman 18413660e330SKris Buschelman #include PETSC_HAVE_SSE 18423660e330SKris Buschelman 18433660e330SKris Buschelman #undef __FUNCT__ 18443660e330SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 18453660e330SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 18463660e330SKris Buschelman { 18473660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 18483660e330SKris Buschelman int n=a->mbs,*ai=a->i,*aj=a->j; 18493660e330SKris Buschelman int ierr,*diag = a->diag; 18503660e330SKris Buschelman MatScalar *aa=a->a; 18513660e330SKris Buschelman Scalar *x,*b; 18523660e330SKris Buschelman 18533660e330SKris Buschelman PetscFunctionBegin; 18543660e330SKris Buschelman SSE_SCOPE_BEGIN; 18553660e330SKris Buschelman /* 18563660e330SKris Buschelman Note: This code currently uses demotion of double 18573660e330SKris Buschelman to float when performing the mixed-mode computation. 18583660e330SKris Buschelman This may not be numerically reasonable for all applications. 18593660e330SKris Buschelman */ 18603660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 18613660e330SKris Buschelman 18623660e330SKris Buschelman ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 18633660e330SKris Buschelman ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 18643660e330SKris Buschelman { 18653660e330SKris Buschelman MatScalar *v; 18663660e330SKris Buschelman int jdx,idt,idx,nz,*vi,i,ai16; 18673660e330SKris Buschelman 18683660e330SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 18693660e330SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 18703660e330SKris Buschelman unsigned long offset = (unsigned long)ssealignedspace % 16; 18713660e330SKris Buschelman if (offset) offset = (16 - offset)/4; 18723660e330SKris Buschelman tmps = &ssealignedspace[offset]; 18733660e330SKris Buschelman tmpx = &ssealignedspace[offset+4]; 18743660e330SKris Buschelman 18753660e330SKris Buschelman /* forward solve the lower triangular */ 18763660e330SKris Buschelman idx = 0; 18773660e330SKris Buschelman x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 18783660e330SKris Buschelman v = aa + 16*ai[1]; 18793660e330SKris Buschelman 18803660e330SKris Buschelman for (i=1; i<n;) { 18813660e330SKris Buschelman PREFETCH_NTA(&v[8]); 18823660e330SKris Buschelman vi = aj + ai[i]; 18833660e330SKris Buschelman nz = diag[i] - ai[i]; 18843660e330SKris Buschelman idx += 4; 18853660e330SKris Buschelman 18863660e330SKris Buschelman /* Demote sum from double to float */ 18873660e330SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 18883660e330SKris Buschelman LOAD_PS(tmps,XMM7); 18893660e330SKris Buschelman 18903660e330SKris Buschelman while (nz--) { 18913660e330SKris Buschelman PREFETCH_NTA(&v[16]); 18923660e330SKris Buschelman jdx = 4*(*vi++); 18933660e330SKris Buschelman 18943660e330SKris Buschelman /* Demote solution (so far) from double to float */ 18953660e330SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]); 18963660e330SKris Buschelman 18973660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 18983660e330SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 18993660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 19003660e330SKris Buschelman 19013660e330SKris Buschelman /* First Column */ 19023660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 19033660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 19043660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 19053660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 19063660e330SKris Buschelman 19073660e330SKris Buschelman /* Second Column */ 19083660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 19093660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 19103660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 19113660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 19123660e330SKris Buschelman 19133660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 19143660e330SKris Buschelman 19153660e330SKris Buschelman /* Third Column */ 19163660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 19173660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 19183660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 19193660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 19203660e330SKris Buschelman 19213660e330SKris Buschelman /* Fourth Column */ 19223660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 19233660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 19243660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 19253660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 19263660e330SKris Buschelman SSE_INLINE_END_2 19273660e330SKris Buschelman 19283660e330SKris Buschelman v += 16; 19293660e330SKris Buschelman } 19303660e330SKris Buschelman v = aa + 16*ai[++i]; 19313660e330SKris Buschelman PREFETCH_NTA(v); 19323660e330SKris Buschelman STORE_PS(tmps,XMM7); 19333660e330SKris Buschelman 19343660e330SKris Buschelman /* Promote result from float to double */ 19353660e330SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps); 19363660e330SKris Buschelman } 19373660e330SKris Buschelman /* backward solve the upper triangular */ 19383660e330SKris Buschelman idt = 4*(n-1); 19393660e330SKris Buschelman ai16 = 16*diag[n-1]; 19403660e330SKris Buschelman v = aa + ai16 + 16; 19413660e330SKris Buschelman for (i=n-1; i>=0;){ 19423660e330SKris Buschelman PREFETCH_NTA(&v[8]); 19433660e330SKris Buschelman vi = aj + diag[i] + 1; 19443660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 19453660e330SKris Buschelman 19463660e330SKris Buschelman /* Demote accumulator from double to float */ 19473660e330SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]); 19483660e330SKris Buschelman LOAD_PS(tmps,XMM7); 19493660e330SKris Buschelman 19503660e330SKris Buschelman while (nz--) { 19513660e330SKris Buschelman PREFETCH_NTA(&v[16]); 19523660e330SKris Buschelman idx = 4*(*vi++); 19533660e330SKris Buschelman 19543660e330SKris Buschelman /* Demote solution (so far) from double to float */ 19553660e330SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 19563660e330SKris Buschelman 19573660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 19583660e330SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 19593660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 19603660e330SKris Buschelman 19613660e330SKris Buschelman /* First Column */ 19623660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 19633660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 19643660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 19653660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 19663660e330SKris Buschelman 19673660e330SKris Buschelman /* Second Column */ 19683660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 19693660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 19703660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 19713660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 19723660e330SKris Buschelman 19733660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 19743660e330SKris Buschelman 19753660e330SKris Buschelman /* Third Column */ 19763660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 19773660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 19783660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 19793660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 19803660e330SKris Buschelman 19813660e330SKris Buschelman /* Fourth Column */ 19823660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 19833660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 19843660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 19853660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 19863660e330SKris Buschelman SSE_INLINE_END_2 19873660e330SKris Buschelman v += 16; 19883660e330SKris Buschelman } 19893660e330SKris Buschelman v = aa + ai16; 19903660e330SKris Buschelman ai16 = 16*diag[--i]; 19913660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 19923660e330SKris Buschelman /* 19933660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 19943660e330SKris Buschelman which was inverted as part of the factorization 19953660e330SKris Buschelman */ 19963660e330SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 19973660e330SKris Buschelman /* First Column */ 19983660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 19993660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 20003660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 20013660e330SKris Buschelman 20023660e330SKris Buschelman /* Second Column */ 20033660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 20043660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 20053660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 20063660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 20073660e330SKris Buschelman 20083660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 20093660e330SKris Buschelman 20103660e330SKris Buschelman /* Third Column */ 20113660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 20123660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 20133660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 20143660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 20153660e330SKris Buschelman 20163660e330SKris Buschelman /* Fourth Column */ 20173660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 20183660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 20193660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 20203660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 20213660e330SKris Buschelman 20223660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 20233660e330SKris Buschelman SSE_INLINE_END_3 20243660e330SKris Buschelman 20253660e330SKris Buschelman /* Promote solution from float to double */ 20263660e330SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps); 20273660e330SKris Buschelman 20283660e330SKris Buschelman v = aa + ai16 + 16; 20293660e330SKris Buschelman idt -= 4; 20303660e330SKris Buschelman } 20313660e330SKris Buschelman } 20323660e330SKris Buschelman ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 20333660e330SKris Buschelman ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 20343660e330SKris Buschelman PetscLogFlops(2*16*(a->nz) - 4*A->n); 20353660e330SKris Buschelman SSE_SCOPE_END; 20363660e330SKris Buschelman PetscFunctionReturn(0); 20373660e330SKris Buschelman } 20383660e330SKris Buschelman 20393660e330SKris Buschelman #endif 20404a2ae208SSatish Balay #undef __FUNCT__ 20414a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 20424e2b4712SSatish Balay int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 20434e2b4712SSatish Balay { 20444e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 20454e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 20464e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 20474e2b4712SSatish Balay int *diag = a->diag; 20483f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 2049f1af5d2fSBarry Smith Scalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 20504e2b4712SSatish Balay 20514e2b4712SSatish Balay PetscFunctionBegin; 2052e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2053e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2054f1af5d2fSBarry Smith t = a->solve_work; 20554e2b4712SSatish Balay 20564e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 20574e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 20584e2b4712SSatish Balay 20594e2b4712SSatish Balay /* forward solve the lower triangular */ 20604e2b4712SSatish Balay idx = 3*(*r++); 2061f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 20624e2b4712SSatish Balay for (i=1; i<n; i++) { 20634e2b4712SSatish Balay v = aa + 9*ai[i]; 20644e2b4712SSatish Balay vi = aj + ai[i]; 20654e2b4712SSatish Balay nz = diag[i] - ai[i]; 20664e2b4712SSatish Balay idx = 3*(*r++); 2067f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 20684e2b4712SSatish Balay while (nz--) { 20694e2b4712SSatish Balay idx = 3*(*vi++); 2070f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2071f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2072f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2073f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 20744e2b4712SSatish Balay v += 9; 20754e2b4712SSatish Balay } 20764e2b4712SSatish Balay idx = 3*i; 2077f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 20784e2b4712SSatish Balay } 20794e2b4712SSatish Balay /* backward solve the upper triangular */ 20804e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 20814e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 20824e2b4712SSatish Balay vi = aj + diag[i] + 1; 20834e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 20844e2b4712SSatish Balay idt = 3*i; 2085f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 20864e2b4712SSatish Balay while (nz--) { 20874e2b4712SSatish Balay idx = 3*(*vi++); 2088f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2089f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2090f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2091f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 20924e2b4712SSatish Balay v += 9; 20934e2b4712SSatish Balay } 20944e2b4712SSatish Balay idc = 3*(*c--); 20954e2b4712SSatish Balay v = aa + 9*diag[i]; 2096f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2097f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2098f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 20994e2b4712SSatish Balay } 21004e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21014e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2102e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2103e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2104b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 21054e2b4712SSatish Balay PetscFunctionReturn(0); 21064e2b4712SSatish Balay } 21074e2b4712SSatish Balay 210815091d37SBarry Smith /* 210915091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 211015091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 211115091d37SBarry Smith */ 21124a2ae208SSatish Balay #undef __FUNCT__ 21134a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 211415091d37SBarry Smith int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 211515091d37SBarry Smith { 211615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 211715091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 211815091d37SBarry Smith int ierr,*diag = a->diag; 211915091d37SBarry Smith MatScalar *aa=a->a,*v; 2120f1af5d2fSBarry Smith Scalar *x,*b,s1,s2,s3,x1,x2,x3; 212115091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 212215091d37SBarry Smith 212315091d37SBarry Smith PetscFunctionBegin; 212415091d37SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 212515091d37SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 212615091d37SBarry Smith 212715091d37SBarry Smith 212815091d37SBarry Smith /* forward solve the lower triangular */ 212915091d37SBarry Smith idx = 0; 213015091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 213115091d37SBarry Smith for (i=1; i<n; i++) { 213215091d37SBarry Smith v = aa + 9*ai[i]; 213315091d37SBarry Smith vi = aj + ai[i]; 213415091d37SBarry Smith nz = diag[i] - ai[i]; 213515091d37SBarry Smith idx += 3; 2136f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 213715091d37SBarry Smith while (nz--) { 213815091d37SBarry Smith jdx = 3*(*vi++); 213915091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2140f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2141f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2142f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 214315091d37SBarry Smith v += 9; 214415091d37SBarry Smith } 2145f1af5d2fSBarry Smith x[idx] = s1; 2146f1af5d2fSBarry Smith x[1+idx] = s2; 2147f1af5d2fSBarry Smith x[2+idx] = s3; 214815091d37SBarry Smith } 214915091d37SBarry Smith /* backward solve the upper triangular */ 215015091d37SBarry Smith for (i=n-1; i>=0; i--){ 215115091d37SBarry Smith v = aa + 9*diag[i] + 9; 215215091d37SBarry Smith vi = aj + diag[i] + 1; 215315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 215415091d37SBarry Smith idt = 3*i; 2155f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2156f1af5d2fSBarry Smith s3 = x[2+idt]; 215715091d37SBarry Smith while (nz--) { 215815091d37SBarry Smith idx = 3*(*vi++); 215915091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2160f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2161f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2162f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 216315091d37SBarry Smith v += 9; 216415091d37SBarry Smith } 216515091d37SBarry Smith v = aa + 9*diag[i]; 2166f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2167f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2168f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 216915091d37SBarry Smith } 217015091d37SBarry Smith 217115091d37SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 217215091d37SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2173b0a32e0cSBarry Smith PetscLogFlops(2*9*(a->nz) - 3*A->n); 217415091d37SBarry Smith PetscFunctionReturn(0); 217515091d37SBarry Smith } 217615091d37SBarry Smith 21774a2ae208SSatish Balay #undef __FUNCT__ 21784a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 21794e2b4712SSatish Balay int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 21804e2b4712SSatish Balay { 21814e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21824e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 21834e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 21844e2b4712SSatish Balay int *diag = a->diag; 21853f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 2186f1af5d2fSBarry Smith Scalar *x,*b,s1,s2,x1,x2,*t; 21874e2b4712SSatish Balay 21884e2b4712SSatish Balay PetscFunctionBegin; 2189e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2190e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2191f1af5d2fSBarry Smith t = a->solve_work; 21924e2b4712SSatish Balay 21934e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21944e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 21954e2b4712SSatish Balay 21964e2b4712SSatish Balay /* forward solve the lower triangular */ 21974e2b4712SSatish Balay idx = 2*(*r++); 2198f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 21994e2b4712SSatish Balay for (i=1; i<n; i++) { 22004e2b4712SSatish Balay v = aa + 4*ai[i]; 22014e2b4712SSatish Balay vi = aj + ai[i]; 22024e2b4712SSatish Balay nz = diag[i] - ai[i]; 22034e2b4712SSatish Balay idx = 2*(*r++); 2204f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 22054e2b4712SSatish Balay while (nz--) { 22064e2b4712SSatish Balay idx = 2*(*vi++); 2207f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2208f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2209f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 22104e2b4712SSatish Balay v += 4; 22114e2b4712SSatish Balay } 22124e2b4712SSatish Balay idx = 2*i; 2213f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 22144e2b4712SSatish Balay } 22154e2b4712SSatish Balay /* backward solve the upper triangular */ 22164e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 22174e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 22184e2b4712SSatish Balay vi = aj + diag[i] + 1; 22194e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 22204e2b4712SSatish Balay idt = 2*i; 2221f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 22224e2b4712SSatish Balay while (nz--) { 22234e2b4712SSatish Balay idx = 2*(*vi++); 2224f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2225f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2226f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 22274e2b4712SSatish Balay v += 4; 22284e2b4712SSatish Balay } 22294e2b4712SSatish Balay idc = 2*(*c--); 22304e2b4712SSatish Balay v = aa + 4*diag[i]; 2231f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2232f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 22334e2b4712SSatish Balay } 22344e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22354e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2236e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2237e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2238b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 22394e2b4712SSatish Balay PetscFunctionReturn(0); 22404e2b4712SSatish Balay } 22414e2b4712SSatish Balay 224215091d37SBarry Smith /* 224315091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 224415091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 224515091d37SBarry Smith */ 22464a2ae208SSatish Balay #undef __FUNCT__ 22474a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 224815091d37SBarry Smith int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 224915091d37SBarry Smith { 225015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 225115091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 225215091d37SBarry Smith int ierr,*diag = a->diag; 225315091d37SBarry Smith MatScalar *aa=a->a,*v; 2254f1af5d2fSBarry Smith Scalar *x,*b,s1,s2,x1,x2; 225515091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 225615091d37SBarry Smith 225715091d37SBarry Smith PetscFunctionBegin; 225815091d37SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 225915091d37SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 226015091d37SBarry Smith 226115091d37SBarry Smith /* forward solve the lower triangular */ 226215091d37SBarry Smith idx = 0; 226315091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 226415091d37SBarry Smith for (i=1; i<n; i++) { 226515091d37SBarry Smith v = aa + 4*ai[i]; 226615091d37SBarry Smith vi = aj + ai[i]; 226715091d37SBarry Smith nz = diag[i] - ai[i]; 226815091d37SBarry Smith idx += 2; 2269f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 227015091d37SBarry Smith while (nz--) { 227115091d37SBarry Smith jdx = 2*(*vi++); 227215091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 2273f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2274f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 227515091d37SBarry Smith v += 4; 227615091d37SBarry Smith } 2277f1af5d2fSBarry Smith x[idx] = s1; 2278f1af5d2fSBarry Smith x[1+idx] = s2; 227915091d37SBarry Smith } 228015091d37SBarry Smith /* backward solve the upper triangular */ 228115091d37SBarry Smith for (i=n-1; i>=0; i--){ 228215091d37SBarry Smith v = aa + 4*diag[i] + 4; 228315091d37SBarry Smith vi = aj + diag[i] + 1; 228415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 228515091d37SBarry Smith idt = 2*i; 2286f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 228715091d37SBarry Smith while (nz--) { 228815091d37SBarry Smith idx = 2*(*vi++); 228915091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 2290f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2291f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 229215091d37SBarry Smith v += 4; 229315091d37SBarry Smith } 229415091d37SBarry Smith v = aa + 4*diag[i]; 2295f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 2296f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 229715091d37SBarry Smith } 229815091d37SBarry Smith 229915091d37SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 230015091d37SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2301b0a32e0cSBarry Smith PetscLogFlops(2*4*(a->nz) - 2*A->n); 230215091d37SBarry Smith PetscFunctionReturn(0); 230315091d37SBarry Smith } 230415091d37SBarry Smith 23054a2ae208SSatish Balay #undef __FUNCT__ 23064a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 23074e2b4712SSatish Balay int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 23084e2b4712SSatish Balay { 23094e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 23104e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 23114e2b4712SSatish Balay int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 23124e2b4712SSatish Balay int *diag = a->diag; 23133f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 2314f1af5d2fSBarry Smith Scalar *x,*b,s1,*t; 23154e2b4712SSatish Balay 23164e2b4712SSatish Balay PetscFunctionBegin; 23174e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 23184e2b4712SSatish Balay 2319e1311b90SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2320e1311b90SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2321f1af5d2fSBarry Smith t = a->solve_work; 23224e2b4712SSatish Balay 23234e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 23244e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 23254e2b4712SSatish Balay 23264e2b4712SSatish Balay /* forward solve the lower triangular */ 2327f1af5d2fSBarry Smith t[0] = b[*r++]; 23284e2b4712SSatish Balay for (i=1; i<n; i++) { 23294e2b4712SSatish Balay v = aa + ai[i]; 23304e2b4712SSatish Balay vi = aj + ai[i]; 23314e2b4712SSatish Balay nz = diag[i] - ai[i]; 2332f1af5d2fSBarry Smith s1 = b[*r++]; 23334e2b4712SSatish Balay while (nz--) { 2334f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 23354e2b4712SSatish Balay } 2336f1af5d2fSBarry Smith t[i] = s1; 23374e2b4712SSatish Balay } 23384e2b4712SSatish Balay /* backward solve the upper triangular */ 23394e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 23404e2b4712SSatish Balay v = aa + diag[i] + 1; 23414e2b4712SSatish Balay vi = aj + diag[i] + 1; 23424e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2343f1af5d2fSBarry Smith s1 = t[i]; 23444e2b4712SSatish Balay while (nz--) { 2345f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 23464e2b4712SSatish Balay } 2347f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 23484e2b4712SSatish Balay } 23494e2b4712SSatish Balay 23504e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 23514e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2352e1311b90SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2353e1311b90SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2354b0a32e0cSBarry Smith PetscLogFlops(2*1*(a->nz) - A->n); 23554e2b4712SSatish Balay PetscFunctionReturn(0); 23564e2b4712SSatish Balay } 235715091d37SBarry Smith /* 235815091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 235915091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 236015091d37SBarry Smith */ 23614a2ae208SSatish Balay #undef __FUNCT__ 23624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 236315091d37SBarry Smith int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 236415091d37SBarry Smith { 236515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 236615091d37SBarry Smith int n=a->mbs,*ai=a->i,*aj=a->j; 236715091d37SBarry Smith int ierr,*diag = a->diag; 236815091d37SBarry Smith MatScalar *aa=a->a; 236915091d37SBarry Smith Scalar *x,*b; 2370f1af5d2fSBarry Smith Scalar s1,x1; 237115091d37SBarry Smith MatScalar *v; 237215091d37SBarry Smith int jdx,idt,idx,nz,*vi,i; 237315091d37SBarry Smith 237415091d37SBarry Smith PetscFunctionBegin; 237515091d37SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 237615091d37SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 237715091d37SBarry Smith 237815091d37SBarry Smith /* forward solve the lower triangular */ 237915091d37SBarry Smith idx = 0; 238015091d37SBarry Smith x[0] = b[0]; 238115091d37SBarry Smith for (i=1; i<n; i++) { 238215091d37SBarry Smith v = aa + ai[i]; 238315091d37SBarry Smith vi = aj + ai[i]; 238415091d37SBarry Smith nz = diag[i] - ai[i]; 238515091d37SBarry Smith idx += 1; 2386f1af5d2fSBarry Smith s1 = b[idx]; 238715091d37SBarry Smith while (nz--) { 238815091d37SBarry Smith jdx = *vi++; 238915091d37SBarry Smith x1 = x[jdx]; 2390f1af5d2fSBarry Smith s1 -= v[0]*x1; 239115091d37SBarry Smith v += 1; 239215091d37SBarry Smith } 2393f1af5d2fSBarry Smith x[idx] = s1; 239415091d37SBarry Smith } 239515091d37SBarry Smith /* backward solve the upper triangular */ 239615091d37SBarry Smith for (i=n-1; i>=0; i--){ 239715091d37SBarry Smith v = aa + diag[i] + 1; 239815091d37SBarry Smith vi = aj + diag[i] + 1; 239915091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 240015091d37SBarry Smith idt = i; 2401f1af5d2fSBarry Smith s1 = x[idt]; 240215091d37SBarry Smith while (nz--) { 240315091d37SBarry Smith idx = *vi++; 240415091d37SBarry Smith x1 = x[idx]; 2405f1af5d2fSBarry Smith s1 -= v[0]*x1; 240615091d37SBarry Smith v += 1; 240715091d37SBarry Smith } 240815091d37SBarry Smith v = aa + diag[i]; 2409f1af5d2fSBarry Smith x[idt] = v[0]*s1; 241015091d37SBarry Smith } 241115091d37SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 241215091d37SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2413b0a32e0cSBarry Smith PetscLogFlops(2*(a->nz) - A->n); 241415091d37SBarry Smith PetscFunctionReturn(0); 241515091d37SBarry Smith } 24164e2b4712SSatish Balay 24174e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 24184e2b4712SSatish Balay /* 24194e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 24204e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 24214e2b4712SSatish Balay Not a good example of code reuse. 24224e2b4712SSatish Balay */ 2423ca44d042SBarry Smith EXTERN int MatMissingDiagonal_SeqBAIJ(Mat); 2424435faa5fSBarry Smith 24254a2ae208SSatish Balay #undef __FUNCT__ 24264a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 2427435faa5fSBarry Smith int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact) 24284e2b4712SSatish Balay { 24294e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 24304e2b4712SSatish Balay IS isicol; 24314e2b4712SSatish Balay int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 24324e2b4712SSatish Balay int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 2433335d9088SBarry Smith int *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0; 2434435faa5fSBarry Smith int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 24354533b203SBarry Smith PetscTruth col_identity,row_identity; 2436329f5518SBarry Smith PetscReal f; 24374e2b4712SSatish Balay 24384e2b4712SSatish Balay PetscFunctionBegin; 2439435faa5fSBarry Smith if (info) { 2440435faa5fSBarry Smith f = info->fill; 2441335d9088SBarry Smith levels = (int)info->levels; 2442335d9088SBarry Smith diagonal_fill = (int)info->diagonal_fill; 2443435faa5fSBarry Smith } else { 2444435faa5fSBarry Smith f = 1.0; 2445435faa5fSBarry Smith levels = 0; 2446435faa5fSBarry Smith diagonal_fill = 0; 2447435faa5fSBarry Smith } 24484c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 2449667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 2450667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 2451309c388cSBarry Smith 2452309c388cSBarry Smith if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 2453bb3d539aSBarry Smith ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 2454bb3d539aSBarry Smith (*fact)->factor = FACTOR_LU; 2455bb3d539aSBarry Smith b = (Mat_SeqBAIJ*)(*fact)->data; 2456bb3d539aSBarry Smith if (!b->diag) { 2457bb3d539aSBarry Smith ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 2458bb3d539aSBarry Smith } 2459bb3d539aSBarry Smith ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 2460bb3d539aSBarry Smith b->row = isrow; 2461bb3d539aSBarry Smith b->col = iscol; 2462bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 2463bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 2464bb3d539aSBarry Smith b->icol = isicol; 2465b0a32e0cSBarry Smith ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(Scalar),&b->solve_work);CHKERRQ(ierr); 2466309c388cSBarry Smith } else { /* general case perform the symbolic factorization */ 24674e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 24684e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 24694e2b4712SSatish Balay 24704e2b4712SSatish Balay /* get new row pointers */ 2471b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 24724e2b4712SSatish Balay ainew[0] = 0; 24734e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 24744e2b4712SSatish Balay jmax = (int)(f*ai[n] + 1); 247582502324SSatish Balay ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 24764e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 247782502324SSatish Balay ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 24784e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 2479b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 24804e2b4712SSatish Balay /* im is level for each filled value */ 2481b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 24824e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 2483b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 24844e2b4712SSatish Balay dloc[0] = 0; 24854e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 2486435faa5fSBarry Smith 2487435faa5fSBarry Smith /* copy prow into linked list */ 24884e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 248929bbc08cSBarry Smith if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 24904e2b4712SSatish Balay xi = aj + ai[r[prow]]; 24914e2b4712SSatish Balay fill[n] = n; 2492435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 24934e2b4712SSatish Balay while (nz--) { 24944e2b4712SSatish Balay fm = n; 24954e2b4712SSatish Balay idx = ic[*xi++]; 24964e2b4712SSatish Balay do { 24974e2b4712SSatish Balay m = fm; 24984e2b4712SSatish Balay fm = fill[m]; 24994e2b4712SSatish Balay } while (fm < idx); 25004e2b4712SSatish Balay fill[m] = idx; 25014e2b4712SSatish Balay fill[idx] = fm; 25024e2b4712SSatish Balay im[idx] = 0; 25034e2b4712SSatish Balay } 2504435faa5fSBarry Smith 2505435faa5fSBarry Smith /* make sure diagonal entry is included */ 2506435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 2507435faa5fSBarry Smith fm = n; 2508435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 2509435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 2510435faa5fSBarry Smith fill[fm] = prow; 2511435faa5fSBarry Smith im[prow] = 0; 2512435faa5fSBarry Smith nzf++; 2513335d9088SBarry Smith dcount++; 2514435faa5fSBarry Smith } 2515435faa5fSBarry Smith 25164e2b4712SSatish Balay nzi = 0; 25174e2b4712SSatish Balay row = fill[n]; 25184e2b4712SSatish Balay while (row < prow) { 25194e2b4712SSatish Balay incrlev = im[row] + 1; 25204e2b4712SSatish Balay nz = dloc[row]; 2521435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 25224e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 25234e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 25244e2b4712SSatish Balay fm = row; 25254e2b4712SSatish Balay while (nnz-- > 0) { 25264e2b4712SSatish Balay idx = *xi++; 25274e2b4712SSatish Balay if (*flev + incrlev > levels) { 25284e2b4712SSatish Balay flev++; 25294e2b4712SSatish Balay continue; 25304e2b4712SSatish Balay } 25314e2b4712SSatish Balay do { 25324e2b4712SSatish Balay m = fm; 25334e2b4712SSatish Balay fm = fill[m]; 25344e2b4712SSatish Balay } while (fm < idx); 25354e2b4712SSatish Balay if (fm != idx) { 25364e2b4712SSatish Balay im[idx] = *flev + incrlev; 25374e2b4712SSatish Balay fill[m] = idx; 25384e2b4712SSatish Balay fill[idx] = fm; 25394e2b4712SSatish Balay fm = idx; 25404e2b4712SSatish Balay nzf++; 2541ecf371e4SBarry Smith } else { 25424e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 25434e2b4712SSatish Balay } 25444e2b4712SSatish Balay flev++; 25454e2b4712SSatish Balay } 25464e2b4712SSatish Balay row = fill[row]; 25474e2b4712SSatish Balay nzi++; 25484e2b4712SSatish Balay } 25494e2b4712SSatish Balay /* copy new filled row into permanent storage */ 25504e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 25514e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 2552ecf371e4SBarry Smith 2553ecf371e4SBarry Smith /* estimate how much additional space we will need */ 2554ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2555ecf371e4SBarry Smith /* just double the memory each time */ 2556ecf371e4SBarry Smith int maxadd = jmax; 2557ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 25584e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 25594e2b4712SSatish Balay jmax += maxadd; 2560ecf371e4SBarry Smith 2561ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 256282502324SSatish Balay ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 2563549d3d68SSatish Balay ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 2564606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 25654e2b4712SSatish Balay ajnew = xi; 256682502324SSatish Balay ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 2567549d3d68SSatish Balay ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 2568606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 25694e2b4712SSatish Balay ajfill = xi; 2570ecf371e4SBarry Smith realloc++; /* count how many reallocations are needed */ 25714e2b4712SSatish Balay } 25724e2b4712SSatish Balay xi = ajnew + ainew[prow]; 25734e2b4712SSatish Balay flev = ajfill + ainew[prow]; 25744e2b4712SSatish Balay dloc[prow] = nzi; 25754e2b4712SSatish Balay fm = fill[n]; 25764e2b4712SSatish Balay while (nzf--) { 25774e2b4712SSatish Balay *xi++ = fm; 25784e2b4712SSatish Balay *flev++ = im[fm]; 25794e2b4712SSatish Balay fm = fill[fm]; 25804e2b4712SSatish Balay } 2581435faa5fSBarry Smith /* make sure row has diagonal entry */ 2582435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 258329bbc08cSBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 2584435faa5fSBarry Smith try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 2585435faa5fSBarry Smith } 25864e2b4712SSatish Balay } 2587606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 25884e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 25894e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2590606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 2591606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 25924e2b4712SSatish Balay 25934e2b4712SSatish Balay { 2594329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 2595b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 2596b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 2597b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 2598b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 2599335d9088SBarry Smith if (diagonal_fill) { 2600b0a32e0cSBarry Smith PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount); 2601335d9088SBarry Smith } 26024e2b4712SSatish Balay } 26034e2b4712SSatish Balay 26044e2b4712SSatish Balay /* put together the new matrix */ 26054e2b4712SSatish Balay ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 2606b0a32e0cSBarry Smith PetscLogObjectParent(*fact,isicol); 26074e2b4712SSatish Balay b = (Mat_SeqBAIJ*)(*fact)->data; 2608606d414cSSatish Balay ierr = PetscFree(b->imax);CHKERRQ(ierr); 26097c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 26103f1db9ecSBarry Smith len = bs2*ainew[n]*sizeof(MatScalar); 26114e2b4712SSatish Balay /* the next line frees the default space generated by the Create() */ 2612606d414cSSatish Balay ierr = PetscFree(b->a);CHKERRQ(ierr); 2613606d414cSSatish Balay ierr = PetscFree(b->ilen);CHKERRQ(ierr); 261482502324SSatish Balay ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 26154e2b4712SSatish Balay b->j = ajnew; 26164e2b4712SSatish Balay b->i = ainew; 26174e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 26184e2b4712SSatish Balay b->diag = dloc; 26194e2b4712SSatish Balay b->ilen = 0; 26204e2b4712SSatish Balay b->imax = 0; 26214e2b4712SSatish Balay b->row = isrow; 26224e2b4712SSatish Balay b->col = iscol; 2623c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 2624c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 2625e51c0b9cSSatish Balay b->icol = isicol; 2626b0a32e0cSBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(Scalar),&b->solve_work);CHKERRQ(ierr); 26274e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 26284e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 2629b0a32e0cSBarry Smith PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(Scalar)); 26304e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 26314e2b4712SSatish Balay (*fact)->factor = FACTOR_LU; 26324e2b4712SSatish Balay 26334e2b4712SSatish Balay (*fact)->info.factor_mallocs = realloc; 26344e2b4712SSatish Balay (*fact)->info.fill_ratio_given = f; 2635329f5518SBarry Smith (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 2636309c388cSBarry Smith } 26374e2b4712SSatish Balay 2638309c388cSBarry Smith if (row_identity && col_identity) { 2639*8661488fSKris Buschelman ierr = MatSeqBAIJ_UpdateFactorizerSolver_NaturalOrdering(*fact);CHKERRQ(ierr); 2640*8661488fSKris Buschelman } 2641*8661488fSKris Buschelman PetscFunctionReturn(0); 2642*8661488fSKris Buschelman } 2643*8661488fSKris Buschelman 2644*8661488fSKris Buschelman int MatSeqBAIJ_UpdateFactorizerSolver_NaturalOrdering(Mat inA) 2645*8661488fSKris Buschelman { 2646*8661488fSKris Buschelman /* 2647*8661488fSKris Buschelman Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 2648*8661488fSKris Buschelman with natural ordering 2649*8661488fSKris Buschelman */ 2650*8661488fSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 2651*8661488fSKris Buschelman 2652*8661488fSKris Buschelman PetscFunctionBegin; 2653*8661488fSKris Buschelman switch (a->bs) { 2654*8661488fSKris Buschelman case 1: 2655*8661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 2656*8661488fSKris Buschelman inA->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 2657*8661488fSKris Buschelman inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 2658*8661488fSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor and solve BS=1\n"); 2659309c388cSBarry Smith case 2: 2660*8661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 2661*8661488fSKris Buschelman inA->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 2662*8661488fSKris Buschelman inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 2663*8661488fSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor and solve BS=2\n"); 2664309c388cSBarry Smith break; 2665309c388cSBarry Smith case 3: 2666*8661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 2667*8661488fSKris Buschelman inA->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 2668*8661488fSKris Buschelman inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 2669*8661488fSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor and solve BS=3\n"); 2670309c388cSBarry Smith break; 2671309c388cSBarry Smith case 4: 2672f3e16f13SKris Buschelman #if defined(PETSC_HAVE_SSE) 26737de2e858SKris Buschelman { 26744533b203SBarry Smith PetscTruth sse_enabled; 2675*8661488fSKris Buschelman int ierr; 2676*8661488fSKris Buschelman 267783b3e5c4SKris Buschelman ierr = PetscSSEIsEnabled(&sse_enabled);CHKERRQ(ierr); 267883b3e5c4SKris Buschelman if (sse_enabled) { 2679*8661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 26803ba47ebaSKris Buschelman } else { 2681*8661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 26823ba47ebaSKris Buschelman } 26837de2e858SKris Buschelman } 26844533b203SBarry Smith #else 2685*8661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 26864533b203SBarry Smith #endif 2687*8661488fSKris Buschelman if (!a->single_precision_solves) { /* If this option was set, don't clobber this pointer */ 2688*8661488fSKris Buschelman inA->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 2689*8661488fSKris Buschelman } 2690*8661488fSKris Buschelman inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 2691*8661488fSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor and solve BS=4\n"); 2692309c388cSBarry Smith break; 2693309c388cSBarry Smith case 5: 2694*8661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 2695*8661488fSKris Buschelman inA->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 2696*8661488fSKris Buschelman inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 2697*8661488fSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor and solve BS=5\n"); 2698309c388cSBarry Smith break; 2699309c388cSBarry Smith case 6: 2700*8661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 2701*8661488fSKris Buschelman inA->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 2702*8661488fSKris Buschelman inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 2703*8661488fSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor and solve BS=6\n"); 2704309c388cSBarry Smith break; 2705309c388cSBarry Smith case 7: 2706*8661488fSKris Buschelman inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 2707*8661488fSKris Buschelman inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 2708*8661488fSKris Buschelman inA->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 2709*8661488fSKris Buschelman PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor and solve BS=7\n"); 2710309c388cSBarry Smith break; 2711309c388cSBarry Smith } 27124e2b4712SSatish Balay PetscFunctionReturn(0); 27134e2b4712SSatish Balay } 2714