1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry Smith v = aa 
+ diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + 
diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120f1af5d2fSBarry Smith { 121f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122dfbe8321SBarry Smith PetscErrorCode ierr; 123690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 125f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12687828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12787828ca2SBarry Smith PetscScalar *x,*b; 
128f1af5d2fSBarry Smith 129f1af5d2fSBarry Smith PetscFunctionBegin; 130ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133f1af5d2fSBarry Smith 134f1af5d2fSBarry Smith /* forward solve the U^T */ 135f1af5d2fSBarry Smith idx = 0; 136f1af5d2fSBarry Smith for (i=0; i<n; i++) { 137f1af5d2fSBarry Smith 138f1af5d2fSBarry Smith v = aa + 9*diag[i]; 139f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 140ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144f1af5d2fSBarry Smith v += 9; 145f1af5d2fSBarry Smith 146f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 147f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 148f1af5d2fSBarry Smith while (nz--) { 149f1af5d2fSBarry Smith oidx = 3*(*vi++); 150f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153f1af5d2fSBarry Smith v += 9; 154f1af5d2fSBarry Smith } 155f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156f1af5d2fSBarry Smith idx += 3; 157f1af5d2fSBarry Smith } 158f1af5d2fSBarry Smith /* backward solve the L^T */ 159f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 160f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 161f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 162f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 163f1af5d2fSBarry Smith idt = 3*i; 164f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165f1af5d2fSBarry Smith while (nz--) { 166f1af5d2fSBarry Smith idx = 3*(*vi--); 167f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169f1af5d2fSBarry Smith x[idx+2] 
-= v[6]*s1 + v[7]*s2 + v[8]*s3; 170f1af5d2fSBarry Smith v -= 9; 171f1af5d2fSBarry Smith } 172f1af5d2fSBarry Smith } 1731ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176f1af5d2fSBarry Smith PetscFunctionReturn(0); 177f1af5d2fSBarry Smith } 178f1af5d2fSBarry Smith 1794a2ae208SSatish Balay #undef __FUNCT__ 1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182f1af5d2fSBarry Smith { 183f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184dfbe8321SBarry Smith PetscErrorCode ierr; 185690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 187f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18987828ca2SBarry Smith PetscScalar *x,*b; 190f1af5d2fSBarry Smith 191f1af5d2fSBarry Smith PetscFunctionBegin; 192ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1931ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195f1af5d2fSBarry Smith 196f1af5d2fSBarry Smith /* forward solve the U^T */ 197f1af5d2fSBarry Smith idx = 0; 198f1af5d2fSBarry Smith for (i=0; i<n; i++) { 199f1af5d2fSBarry Smith 200f1af5d2fSBarry Smith v = aa + 16*diag[i]; 201f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 202ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + 
v[14]*x3 + v[15]*x4; 207f1af5d2fSBarry Smith v += 16; 208f1af5d2fSBarry Smith 209f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 210f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 211f1af5d2fSBarry Smith while (nz--) { 212f1af5d2fSBarry Smith oidx = 4*(*vi++); 213f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217f1af5d2fSBarry Smith v += 16; 218f1af5d2fSBarry Smith } 219f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220f1af5d2fSBarry Smith idx += 4; 221f1af5d2fSBarry Smith } 222f1af5d2fSBarry Smith /* backward solve the L^T */ 223f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 224f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 225f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 226f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 227f1af5d2fSBarry Smith idt = 4*i; 228f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229f1af5d2fSBarry Smith while (nz--) { 230f1af5d2fSBarry Smith idx = 4*(*vi--); 231f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235f1af5d2fSBarry Smith v -= 16; 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith } 2381ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241f1af5d2fSBarry Smith PetscFunctionReturn(0); 242f1af5d2fSBarry Smith } 243f1af5d2fSBarry Smith 2444a2ae208SSatish Balay #undef __FUNCT__ 2454a2ae208SSatish Balay #define 
__FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247f1af5d2fSBarry Smith { 248f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249dfbe8321SBarry Smith PetscErrorCode ierr; 250690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 252f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 25387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25487828ca2SBarry Smith PetscScalar *x,*b; 255f1af5d2fSBarry Smith 256f1af5d2fSBarry Smith PetscFunctionBegin; 257ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2581ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2591ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260f1af5d2fSBarry Smith 261f1af5d2fSBarry Smith /* forward solve the U^T */ 262f1af5d2fSBarry Smith idx = 0; 263f1af5d2fSBarry Smith for (i=0; i<n; i++) { 264f1af5d2fSBarry Smith 265f1af5d2fSBarry Smith v = aa + 25*diag[i]; 266f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 267ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273f1af5d2fSBarry Smith v += 25; 274f1af5d2fSBarry Smith 275f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 276f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 277f1af5d2fSBarry Smith while (nz--) { 278f1af5d2fSBarry Smith oidx = 5*(*vi++); 279f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280f1af5d2fSBarry Smith x[oidx+1] 
-= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284f1af5d2fSBarry Smith v += 25; 285f1af5d2fSBarry Smith } 286f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287f1af5d2fSBarry Smith idx += 5; 288f1af5d2fSBarry Smith } 289f1af5d2fSBarry Smith /* backward solve the L^T */ 290f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 291f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 292f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 293f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 294f1af5d2fSBarry Smith idt = 5*i; 295f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296f1af5d2fSBarry Smith while (nz--) { 297f1af5d2fSBarry Smith idx = 5*(*vi--); 298f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303f1af5d2fSBarry Smith v -= 25; 304f1af5d2fSBarry Smith } 305f1af5d2fSBarry Smith } 3061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309f1af5d2fSBarry Smith PetscFunctionReturn(0); 310f1af5d2fSBarry Smith } 311f1af5d2fSBarry Smith 3124a2ae208SSatish Balay #undef __FUNCT__ 3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314dfbe8321SBarry Smith 
PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315f1af5d2fSBarry Smith { 316f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317dfbe8321SBarry Smith PetscErrorCode ierr; 318690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 320f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 32187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 32287828ca2SBarry Smith PetscScalar *x,*b; 323f1af5d2fSBarry Smith 324f1af5d2fSBarry Smith PetscFunctionBegin; 325ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328f1af5d2fSBarry Smith 329f1af5d2fSBarry Smith /* forward solve the U^T */ 330f1af5d2fSBarry Smith idx = 0; 331f1af5d2fSBarry Smith for (i=0; i<n; i++) { 332f1af5d2fSBarry Smith 333f1af5d2fSBarry Smith v = aa + 36*diag[i]; 334f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 335ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336ef66eb69SBarry Smith x6 = x[5+idx]; 337f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343f1af5d2fSBarry Smith v += 36; 344f1af5d2fSBarry Smith 345f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 346f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 347f1af5d2fSBarry Smith while (nz--) { 348f1af5d2fSBarry Smith oidx = 6*(*vi++); 
349f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355f1af5d2fSBarry Smith v += 36; 356f1af5d2fSBarry Smith } 357f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358f1af5d2fSBarry Smith x[5+idx] = s6; 359f1af5d2fSBarry Smith idx += 6; 360f1af5d2fSBarry Smith } 361f1af5d2fSBarry Smith /* backward solve the L^T */ 362f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 363f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 364f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 365f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 366f1af5d2fSBarry Smith idt = 6*i; 367f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368f1af5d2fSBarry Smith s6 = x[5+idt]; 369f1af5d2fSBarry Smith while (nz--) { 370f1af5d2fSBarry Smith idx = 6*(*vi--); 371f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 372f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377f1af5d2fSBarry Smith v -= 36; 378f1af5d2fSBarry Smith } 
379f1af5d2fSBarry Smith } 3801ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383f1af5d2fSBarry Smith PetscFunctionReturn(0); 384f1af5d2fSBarry Smith } 385f1af5d2fSBarry Smith 3864a2ae208SSatish Balay #undef __FUNCT__ 3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389f1af5d2fSBarry Smith { 390f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391dfbe8321SBarry Smith PetscErrorCode ierr; 392690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 394f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 39587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 39687828ca2SBarry Smith PetscScalar *x,*b; 397f1af5d2fSBarry Smith 398f1af5d2fSBarry Smith PetscFunctionBegin; 399ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4001ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402f1af5d2fSBarry Smith 403f1af5d2fSBarry Smith /* forward solve the U^T */ 404f1af5d2fSBarry Smith idx = 0; 405f1af5d2fSBarry Smith for (i=0; i<n; i++) { 406f1af5d2fSBarry Smith 407f1af5d2fSBarry Smith v = aa + 49*diag[i]; 408f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 409ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 411f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + 
v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 417f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418f1af5d2fSBarry Smith v += 49; 419f1af5d2fSBarry Smith 420f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 421f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 422f1af5d2fSBarry Smith while (nz--) { 423f1af5d2fSBarry Smith oidx = 7*(*vi++); 424f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431f1af5d2fSBarry Smith v += 49; 432f1af5d2fSBarry Smith } 433f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 435f1af5d2fSBarry Smith idx += 7; 436f1af5d2fSBarry Smith } 437f1af5d2fSBarry Smith /* backward solve the L^T */ 438f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 439f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 440f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 441f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 
442f1af5d2fSBarry Smith idt = 7*i; 443f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 445f1af5d2fSBarry Smith while (nz--) { 446f1af5d2fSBarry Smith idx = 7*(*vi--); 447f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454f1af5d2fSBarry Smith v -= 49; 455f1af5d2fSBarry Smith } 456f1af5d2fSBarry Smith } 4571ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4581ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460f1af5d2fSBarry Smith PetscFunctionReturn(0); 461f1af5d2fSBarry Smith } 462f1af5d2fSBarry Smith 463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4644a2ae208SSatish Balay #undef __FUNCT__ 4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467f1af5d2fSBarry Smith { 468f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 4706849ba73SBarry Smith PetscErrorCode ierr; 4715d0c19d7SBarry Smith const 
PetscInt *r,*c,*rout,*cout; 4725d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473690b6cddSBarry Smith PetscInt *diag = a->diag; 474f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 47587828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 476f1af5d2fSBarry Smith 477f1af5d2fSBarry Smith PetscFunctionBegin; 4781ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480f1af5d2fSBarry Smith t = a->solve_work; 481f1af5d2fSBarry Smith 482f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484f1af5d2fSBarry Smith 485f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 486f1af5d2fSBarry Smith for (i=0; i<n; i++) { 487f1af5d2fSBarry Smith t[i] = b[c[i]]; 488f1af5d2fSBarry Smith } 489f1af5d2fSBarry Smith 490f1af5d2fSBarry Smith /* forward solve the U^T */ 491f1af5d2fSBarry Smith for (i=0; i<n; i++) { 492f1af5d2fSBarry Smith 493f1af5d2fSBarry Smith v = aa + diag[i]; 494f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 495f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 496f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 497f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 498f1af5d2fSBarry Smith while (nz--) { 499f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 500f1af5d2fSBarry Smith } 501f1af5d2fSBarry Smith t[i] = s1; 502f1af5d2fSBarry Smith } 503f1af5d2fSBarry Smith /* backward solve the L^T */ 504f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 505f1af5d2fSBarry Smith v = aa + diag[i] - 1; 506f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 507f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 508f1af5d2fSBarry Smith s1 = t[i]; 509f1af5d2fSBarry Smith while (nz--) { 510f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 511f1af5d2fSBarry Smith } 512f1af5d2fSBarry Smith } 513f1af5d2fSBarry Smith 514f1af5d2fSBarry Smith /* copy t into x according to permutation */ 515f1af5d2fSBarry 
Smith for (i=0; i<n; i++) { 516f1af5d2fSBarry Smith x[r[i]] = t[i]; 517f1af5d2fSBarry Smith } 518f1af5d2fSBarry Smith 519f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5211ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5221ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 524f1af5d2fSBarry Smith PetscFunctionReturn(0); 525f1af5d2fSBarry Smith } 526f1af5d2fSBarry Smith 5274a2ae208SSatish Balay #undef __FUNCT__ 5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530f1af5d2fSBarry Smith { 531f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5336849ba73SBarry Smith PetscErrorCode ierr; 5345d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5355d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 537f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53887828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 53987828ca2SBarry Smith PetscScalar *x,*b,*t; 540f1af5d2fSBarry Smith 541f1af5d2fSBarry Smith PetscFunctionBegin; 5421ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5431ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544f1af5d2fSBarry Smith t = a->solve_work; 545f1af5d2fSBarry Smith 546f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548f1af5d2fSBarry Smith 549f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 550f1af5d2fSBarry Smith ii = 0; 551f1af5d2fSBarry Smith for (i=0; i<n; i++) { 552f1af5d2fSBarry Smith ic = 2*c[i]; 553f1af5d2fSBarry 
Smith t[ii] = b[ic]; 554f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 555f1af5d2fSBarry Smith ii += 2; 556f1af5d2fSBarry Smith } 557f1af5d2fSBarry Smith 558f1af5d2fSBarry Smith /* forward solve the U^T */ 559f1af5d2fSBarry Smith idx = 0; 560f1af5d2fSBarry Smith for (i=0; i<n; i++) { 561f1af5d2fSBarry Smith 562f1af5d2fSBarry Smith v = aa + 4*diag[i]; 563f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 564f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 565f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 566f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 567f1af5d2fSBarry Smith v += 4; 568f1af5d2fSBarry Smith 569f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 570f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 571f1af5d2fSBarry Smith while (nz--) { 572f1af5d2fSBarry Smith oidx = 2*(*vi++); 573f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 574f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 575f1af5d2fSBarry Smith v += 4; 576f1af5d2fSBarry Smith } 577f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 578f1af5d2fSBarry Smith idx += 2; 579f1af5d2fSBarry Smith } 580f1af5d2fSBarry Smith /* backward solve the L^T */ 581f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 582f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 583f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 584f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 585f1af5d2fSBarry Smith idt = 2*i; 586f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 587f1af5d2fSBarry Smith while (nz--) { 588f1af5d2fSBarry Smith idx = 2*(*vi--); 589f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 590f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 591f1af5d2fSBarry Smith v -= 4; 592f1af5d2fSBarry Smith } 593f1af5d2fSBarry Smith } 594f1af5d2fSBarry Smith 595f1af5d2fSBarry Smith /* copy t into x according to permutation */ 596f1af5d2fSBarry Smith ii = 0; 597f1af5d2fSBarry Smith for (i=0; i<n; i++) { 598f1af5d2fSBarry Smith ir = 2*r[i]; 599f1af5d2fSBarry Smith x[ir] = t[ii]; 600f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 601f1af5d2fSBarry Smith 
ii += 2; 602f1af5d2fSBarry Smith } 603f1af5d2fSBarry Smith 604f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609f1af5d2fSBarry Smith PetscFunctionReturn(0); 610f1af5d2fSBarry Smith } 611f1af5d2fSBarry Smith 6124a2ae208SSatish Balay #undef __FUNCT__ 6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615f1af5d2fSBarry Smith { 616f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6186849ba73SBarry Smith PetscErrorCode ierr; 6195d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 6205d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 622f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 62387828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 62487828ca2SBarry Smith PetscScalar *x,*b,*t; 625f1af5d2fSBarry Smith 626f1af5d2fSBarry Smith PetscFunctionBegin; 6271ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6281ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629f1af5d2fSBarry Smith t = a->solve_work; 630f1af5d2fSBarry Smith 631f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633f1af5d2fSBarry Smith 634f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 635f1af5d2fSBarry Smith ii = 0; 636f1af5d2fSBarry Smith for (i=0; i<n; i++) { 637f1af5d2fSBarry Smith ic = 3*c[i]; 638f1af5d2fSBarry Smith t[ii] = b[ic]; 639f1af5d2fSBarry Smith 
t[ii+1] = b[ic+1]; 640f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 641f1af5d2fSBarry Smith ii += 3; 642f1af5d2fSBarry Smith } 643f1af5d2fSBarry Smith 644f1af5d2fSBarry Smith /* forward solve the U^T */ 645f1af5d2fSBarry Smith idx = 0; 646f1af5d2fSBarry Smith for (i=0; i<n; i++) { 647f1af5d2fSBarry Smith 648f1af5d2fSBarry Smith v = aa + 9*diag[i]; 649f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 650f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654f1af5d2fSBarry Smith v += 9; 655f1af5d2fSBarry Smith 656f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 657f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 658f1af5d2fSBarry Smith while (nz--) { 659f1af5d2fSBarry Smith oidx = 3*(*vi++); 660f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663f1af5d2fSBarry Smith v += 9; 664f1af5d2fSBarry Smith } 665f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666f1af5d2fSBarry Smith idx += 3; 667f1af5d2fSBarry Smith } 668f1af5d2fSBarry Smith /* backward solve the L^T */ 669f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 670f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 671f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 672f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 673f1af5d2fSBarry Smith idt = 3*i; 674f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675f1af5d2fSBarry Smith while (nz--) { 676f1af5d2fSBarry Smith idx = 3*(*vi--); 677f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680f1af5d2fSBarry Smith v -= 9; 681f1af5d2fSBarry Smith } 682f1af5d2fSBarry Smith } 683f1af5d2fSBarry 
Smith 684f1af5d2fSBarry Smith /* copy t into x according to permutation */ 685f1af5d2fSBarry Smith ii = 0; 686f1af5d2fSBarry Smith for (i=0; i<n; i++) { 687f1af5d2fSBarry Smith ir = 3*r[i]; 688f1af5d2fSBarry Smith x[ir] = t[ii]; 689f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 690f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 691f1af5d2fSBarry Smith ii += 3; 692f1af5d2fSBarry Smith } 693f1af5d2fSBarry Smith 694f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6961ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699f1af5d2fSBarry Smith PetscFunctionReturn(0); 700f1af5d2fSBarry Smith } 701f1af5d2fSBarry Smith 7024a2ae208SSatish Balay #undef __FUNCT__ 7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705f1af5d2fSBarry Smith { 706f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7086849ba73SBarry Smith PetscErrorCode ierr; 7095d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 7105d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 712f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 71387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 71487828ca2SBarry Smith PetscScalar *x,*b,*t; 715f1af5d2fSBarry Smith 716f1af5d2fSBarry Smith PetscFunctionBegin; 7171ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719f1af5d2fSBarry Smith t = a->solve_work; 720f1af5d2fSBarry Smith 721f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722f1af5d2fSBarry Smith ierr = 
ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723f1af5d2fSBarry Smith 724f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 725f1af5d2fSBarry Smith ii = 0; 726f1af5d2fSBarry Smith for (i=0; i<n; i++) { 727f1af5d2fSBarry Smith ic = 4*c[i]; 728f1af5d2fSBarry Smith t[ii] = b[ic]; 729f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 730f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 731f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 732f1af5d2fSBarry Smith ii += 4; 733f1af5d2fSBarry Smith } 734f1af5d2fSBarry Smith 735f1af5d2fSBarry Smith /* forward solve the U^T */ 736f1af5d2fSBarry Smith idx = 0; 737f1af5d2fSBarry Smith for (i=0; i<n; i++) { 738f1af5d2fSBarry Smith 739f1af5d2fSBarry Smith v = aa + 16*diag[i]; 740f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 741f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746f1af5d2fSBarry Smith v += 16; 747f1af5d2fSBarry Smith 748f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 749f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 750f1af5d2fSBarry Smith while (nz--) { 751f1af5d2fSBarry Smith oidx = 4*(*vi++); 752f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756f1af5d2fSBarry Smith v += 16; 757f1af5d2fSBarry Smith } 758f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759f1af5d2fSBarry Smith idx += 4; 760f1af5d2fSBarry Smith } 761f1af5d2fSBarry Smith /* backward solve the L^T */ 762f1af5d2fSBarry Smith for (i=n-1; 
i>=0; i--){ 763f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 764f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 765f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 766f1af5d2fSBarry Smith idt = 4*i; 767f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768f1af5d2fSBarry Smith while (nz--) { 769f1af5d2fSBarry Smith idx = 4*(*vi--); 770f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774f1af5d2fSBarry Smith v -= 16; 775f1af5d2fSBarry Smith } 776f1af5d2fSBarry Smith } 777f1af5d2fSBarry Smith 778f1af5d2fSBarry Smith /* copy t into x according to permutation */ 779f1af5d2fSBarry Smith ii = 0; 780f1af5d2fSBarry Smith for (i=0; i<n; i++) { 781f1af5d2fSBarry Smith ir = 4*r[i]; 782f1af5d2fSBarry Smith x[ir] = t[ii]; 783f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 784f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 785f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 786f1af5d2fSBarry Smith ii += 4; 787f1af5d2fSBarry Smith } 788f1af5d2fSBarry Smith 789f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794f1af5d2fSBarry Smith PetscFunctionReturn(0); 795f1af5d2fSBarry Smith } 796f1af5d2fSBarry Smith
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
/*
   BAIJ5_TROW_ - five-term product of the entries w[o..o+4] with (p1..p5),
   summed left to right.  Used only by MatSolveTranspose_SeqBAIJ_5 below.
*/
#define BAIJ5_TROW_(w,o,p1,p2,p3,p4,p5) \
  ((w)[(o)]*(p1) + (w)[(o)+1]*(p2) + (w)[(o)+2]*(p3) + (w)[(o)+3]*(p4) + (w)[(o)+4]*(p5))
/*
   MatSolveTranspose_SeqBAIJ_5 - Transpose solve using the factors held in a
   SeqBAIJ matrix whose blocks are 5x5 (25 consecutive MatScalar entries each).

   Input Parameters:
+  A  - the matrix; a->row / a->col provide the permutations, a->diag the
        position of each diagonal block, a->solve_work the scratch array
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   The right-hand side is gathered into the scratch array through the column
   permutation, U^T and then L^T are applied in place, and the result is
   scattered into xx through the row permutation.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *baij = (Mat_SeqBAIJ*)A->data;
  IS             colperm = baij->col,rowperm = baij->row;
  PetscErrorCode ierr;
  const PetscInt *rowidx,*colidx;
  const PetscInt *bi = baij->i,*bj = baij->j,*bdiag = baij->diag;
  PetscInt       mbs = baij->mbs,blk,k,cnt,src,dst,nbr;
  MatScalar      *vals = baij->a,*w;
  PetscScalar    y1,y2,y3,y4,y5,z1,z2,z3,z4,z5;
  PetscScalar    *xarr,*barr,*work;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&barr);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&xarr);CHKERRQ(ierr);
  work = baij->solve_work;

  ierr = ISGetIndices(rowperm,&rowidx);CHKERRQ(ierr);
  ierr = ISGetIndices(colperm,&colidx);CHKERRQ(ierr);

  /* gather b into the scratch array following the column permutation */
  for (blk=0; blk<mbs; blk++) {
    src = 5*colidx[blk];
    dst = 5*blk;
    work[dst]   = barr[src];
    work[dst+1] = barr[src+1];
    work[dst+2] = barr[src+2];
    work[dst+3] = barr[src+3];
    work[dst+4] = barr[src+4];
  }

  /* forward solve the U^T */
  for (blk=0; blk<mbs; blk++) {
    dst = 5*blk;
    w   = vals + 25*bdiag[blk];
    /* multiply by the inverse of the block diagonal */
    z1 = work[dst];   z2 = work[dst+1]; z3 = work[dst+2];
    z4 = work[dst+3]; z5 = work[dst+4];
    y1 = BAIJ5_TROW_(w, 0,z1,z2,z3,z4,z5);
    y2 = BAIJ5_TROW_(w, 5,z1,z2,z3,z4,z5);
    y3 = BAIJ5_TROW_(w,10,z1,z2,z3,z4,z5);
    y4 = BAIJ5_TROW_(w,15,z1,z2,z3,z4,z5);
    y5 = BAIJ5_TROW_(w,20,z1,z2,z3,z4,z5);

    /* push this block's contribution to the blocks stored after the diagonal */
    cnt = bi[blk+1] - bdiag[blk] - 1;
    for (k=1; k<=cnt; k++) {
      w   = vals + 25*(bdiag[blk] + k);
      nbr = 5*bj[bdiag[blk] + k];
      work[nbr]   -= BAIJ5_TROW_(w, 0,y1,y2,y3,y4,y5);
      work[nbr+1] -= BAIJ5_TROW_(w, 5,y1,y2,y3,y4,y5);
      work[nbr+2] -= BAIJ5_TROW_(w,10,y1,y2,y3,y4,y5);
      work[nbr+3] -= BAIJ5_TROW_(w,15,y1,y2,y3,y4,y5);
      work[nbr+4] -= BAIJ5_TROW_(w,20,y1,y2,y3,y4,y5);
    }
    work[dst]   = y1; work[dst+1] = y2; work[dst+2] = y3;
    work[dst+3] = y4; work[dst+4] = y5;
  }

  /* backward solve the L^T */
  for (blk=mbs-1; blk>=0; blk--) {
    dst = 5*blk;
    y1  = work[dst];   y2 = work[dst+1]; y3 = work[dst+2];
    y4  = work[dst+3]; y5 = work[dst+4];
    /* walk the blocks stored before the diagonal, nearest first */
    cnt = bdiag[blk] - bi[blk];
    for (k=1; k<=cnt; k++) {
      w   = vals + 25*(bdiag[blk] - k);
      nbr = 5*bj[bdiag[blk] - k];
      work[nbr]   -= BAIJ5_TROW_(w, 0,y1,y2,y3,y4,y5);
      work[nbr+1] -= BAIJ5_TROW_(w, 5,y1,y2,y3,y4,y5);
      work[nbr+2] -= BAIJ5_TROW_(w,10,y1,y2,y3,y4,y5);
      work[nbr+3] -= BAIJ5_TROW_(w,15,y1,y2,y3,y4,y5);
      work[nbr+4] -= BAIJ5_TROW_(w,20,y1,y2,y3,y4,y5);
    }
  }

  /* scatter the scratch array into x following the row permutation */
  for (blk=0; blk<mbs; blk++) {
    dst = 5*rowidx[blk];
    src = 5*blk;
    xarr[dst]   = work[src];
    xarr[dst+1] = work[src+1];
    xarr[dst+2] = work[src+2];
    xarr[dst+3] = work[src+3];
    xarr[dst+4] = work[src+4];
  }

  ierr = ISRestoreIndices(rowperm,&rowidx);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colperm,&colidx);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&barr);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&xarr);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(baij->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#undef BAIJ5_TROW_
896f1af5d2fSBarry Smith 8974a2ae208SSatish Balay #undef __FUNCT__ 8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900f1af5d2fSBarry Smith { 901f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9036849ba73SBarry Smith PetscErrorCode ierr; 9045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 907f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 90887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 90987828ca2SBarry Smith PetscScalar *x,*b,*t; 910f1af5d2fSBarry Smith 911f1af5d2fSBarry Smith PetscFunctionBegin; 9121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914f1af5d2fSBarry Smith t = a->solve_work; 915f1af5d2fSBarry Smith 916f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918f1af5d2fSBarry Smith 919f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 920f1af5d2fSBarry Smith ii = 0; 921f1af5d2fSBarry Smith for (i=0; i<n; i++) { 922f1af5d2fSBarry Smith ic = 6*c[i]; 923f1af5d2fSBarry Smith t[ii] = b[ic]; 924f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 925f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 926f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 927f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 928f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 929f1af5d2fSBarry Smith ii += 6; 930f1af5d2fSBarry Smith } 931f1af5d2fSBarry Smith 932f1af5d2fSBarry Smith /* forward solve the U^T */ 933f1af5d2fSBarry Smith idx = 0; 934f1af5d2fSBarry Smith for (i=0; i<n; i++) { 935f1af5d2fSBarry Smith 936f1af5d2fSBarry Smith v = aa + 36*diag[i]; 937f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 938f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939f1af5d2fSBarry Smith x6 = t[5+idx]; 940f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946f1af5d2fSBarry Smith v += 36; 947f1af5d2fSBarry Smith 948f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 949f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 950f1af5d2fSBarry Smith while (nz--) { 951f1af5d2fSBarry Smith oidx = 6*(*vi++); 
952f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958f1af5d2fSBarry Smith v += 36; 959f1af5d2fSBarry Smith } 960f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961f1af5d2fSBarry Smith t[5+idx] = s6; 962f1af5d2fSBarry Smith idx += 6; 963f1af5d2fSBarry Smith } 964f1af5d2fSBarry Smith /* backward solve the L^T */ 965f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 966f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 967f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 968f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 969f1af5d2fSBarry Smith idt = 6*i; 970f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971f1af5d2fSBarry Smith s6 = t[5+idt]; 972f1af5d2fSBarry Smith while (nz--) { 973f1af5d2fSBarry Smith idx = 6*(*vi--); 974f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980f1af5d2fSBarry Smith v -= 36; 981f1af5d2fSBarry Smith } 
982f1af5d2fSBarry Smith } 983f1af5d2fSBarry Smith 984f1af5d2fSBarry Smith /* copy t into x according to permutation */ 985f1af5d2fSBarry Smith ii = 0; 986f1af5d2fSBarry Smith for (i=0; i<n; i++) { 987f1af5d2fSBarry Smith ir = 6*r[i]; 988f1af5d2fSBarry Smith x[ir] = t[ii]; 989f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 990f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 991f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 992f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 993f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 994f1af5d2fSBarry Smith ii += 6; 995f1af5d2fSBarry Smith } 996f1af5d2fSBarry Smith 997f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9991ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002f1af5d2fSBarry Smith PetscFunctionReturn(0); 1003f1af5d2fSBarry Smith } 1004f1af5d2fSBarry Smith 10054a2ae208SSatish Balay #undef __FUNCT__ 10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008f1af5d2fSBarry Smith { 1009f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10116849ba73SBarry Smith PetscErrorCode ierr; 10125d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10135d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 101687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 101787828ca2SBarry Smith PetscScalar *x,*b,*t; 1018f1af5d2fSBarry Smith 1019f1af5d2fSBarry Smith PetscFunctionBegin; 10201ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10211ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 1022f1af5d2fSBarry Smith t = a->solve_work; 1023f1af5d2fSBarry Smith 1024f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026f1af5d2fSBarry Smith 1027f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1028f1af5d2fSBarry Smith ii = 0; 1029f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1030f1af5d2fSBarry Smith ic = 7*c[i]; 1031f1af5d2fSBarry Smith t[ii] = b[ic]; 1032f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1033f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1034f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1035f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1036f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1037f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1038f1af5d2fSBarry Smith ii += 7; 1039f1af5d2fSBarry Smith } 1040f1af5d2fSBarry Smith 1041f1af5d2fSBarry Smith /* forward solve the U^T */ 1042f1af5d2fSBarry Smith idx = 0; 1043f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1044f1af5d2fSBarry Smith 1045f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1046f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1047f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1049f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055f1af5d2fSBarry Smith s7 = v[42]*x1 + 
v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056f1af5d2fSBarry Smith v += 49; 1057f1af5d2fSBarry Smith 1058f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1059f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1060f1af5d2fSBarry Smith while (nz--) { 1061f1af5d2fSBarry Smith oidx = 7*(*vi++); 1062f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069f1af5d2fSBarry Smith v += 49; 1070f1af5d2fSBarry Smith } 1071f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1073f1af5d2fSBarry Smith idx += 7; 1074f1af5d2fSBarry Smith } 1075f1af5d2fSBarry Smith /* backward solve the L^T */ 1076f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1077f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1078f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1079f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1080f1af5d2fSBarry Smith idt = 7*i; 1081f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1083f1af5d2fSBarry Smith while (nz--) { 1084f1af5d2fSBarry Smith idx = 7*(*vi--); 1085f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 
1086f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092f1af5d2fSBarry Smith v -= 49; 1093f1af5d2fSBarry Smith } 1094f1af5d2fSBarry Smith } 1095f1af5d2fSBarry Smith 1096f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1097f1af5d2fSBarry Smith ii = 0; 1098f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1099f1af5d2fSBarry Smith ir = 7*r[i]; 1100f1af5d2fSBarry Smith x[ir] = t[ii]; 1101f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1102f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1103f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1104f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1105f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1106f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1107f1af5d2fSBarry Smith ii += 7; 1108f1af5d2fSBarry Smith } 1109f1af5d2fSBarry Smith 1110f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11121ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115f1af5d2fSBarry Smith PetscFunctionReturn(0); 1116f1af5d2fSBarry Smith } 1117f1af5d2fSBarry Smith 11184e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11194a2ae208SSatish Balay #undef __FUNCT__ 
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11224e2b4712SSatish Balay { 11234e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11244e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11256849ba73SBarry Smith PetscErrorCode ierr; 11265d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11275d0c19d7SBarry Smith PetscInt i,n=a->mbs; 11285d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11293f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 113087828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11314e2b4712SSatish Balay 11324e2b4712SSatish Balay PetscFunctionBegin; 11331ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135f1af5d2fSBarry Smith t = a->solve_work; 11364e2b4712SSatish Balay 11374e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11384e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11394e2b4712SSatish Balay 11404e2b4712SSatish Balay /* forward solve the lower triangular */ 114187828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11424e2b4712SSatish Balay for (i=1; i<n; i++) { 11434e2b4712SSatish Balay v = aa + bs2*ai[i]; 11444e2b4712SSatish Balay vi = aj + ai[i]; 11454e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1146f1af5d2fSBarry Smith s = t + bs*i; 114787828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11484e2b4712SSatish Balay while (nz--) { 1149f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11504e2b4712SSatish Balay v += bs2; 11514e2b4712SSatish Balay } 11524e2b4712SSatish Balay } 11534e2b4712SSatish Balay /* backward solve the upper triangular */ 1154d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 11554e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 
11564e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11574e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11584e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 115987828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11604e2b4712SSatish Balay while (nz--) { 1161f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11624e2b4712SSatish Balay v += bs2; 11634e2b4712SSatish Balay } 1164f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 116587828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11664e2b4712SSatish Balay } 11674e2b4712SSatish Balay 11684e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11694e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11701ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 11734e2b4712SSatish Balay PetscFunctionReturn(0); 11744e2b4712SSatish Balay } 11754e2b4712SSatish Balay
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7"
/*
   BAIJ7_SDOT_ - seven-term product of the entries w[o], w[o+7], ..., w[o+42]
   (stride 7) with (p1..p7), summed left to right.  Used only by
   MatSolve_SeqBAIJ_7 below.
*/
#define BAIJ7_SDOT_(w,o,p1,p2,p3,p4,p5,p6,p7) \
  ((w)[(o)]*(p1) + (w)[(o)+7]*(p2) + (w)[(o)+14]*(p3) + (w)[(o)+21]*(p4) + \
   (w)[(o)+28]*(p5) + (w)[(o)+35]*(p6) + (w)[(o)+42]*(p7))
/*
   MatSolve_SeqBAIJ_7 - Triangular solves using the factors held in a SeqBAIJ
   matrix whose blocks are 7x7 (49 consecutive MatScalar entries each).

   Input Parameters:
+  A  - the matrix; a->row / a->col provide the permutations, a->diag the
        position of each diagonal block, a->solve_work the scratch array
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   The forward sweep reads b through the row permutation and stores its result
   in the scratch array; the backward sweep overwrites the scratch array and
   writes each finished block into x through the column permutation.

   NOTE(review): entry 0 of the row permutation is read before any loop bound
   is tested, so this presumably relies on callers never passing an empty
   matrix - confirm.
*/
PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *baij = (Mat_SeqBAIJ*)A->data;
  IS             colperm = baij->col,rowperm = baij->row;
  PetscErrorCode ierr;
  const PetscInt *rowidx,*colidx;
  const PetscInt *bi = baij->i,*bj = baij->j,*bdiag = baij->diag;
  PetscInt       mbs = baij->mbs,blk,k,cnt,src,dst,nbr;
  MatScalar      *vals = baij->a,*w;
  PetscScalar    y1,y2,y3,y4,y5,y6,y7,z1,z2,z3,z4,z5,z6,z7;
  PetscScalar    *xarr,*barr,*work;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&barr);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&xarr);CHKERRQ(ierr);
  work = baij->solve_work;

  ierr = ISGetIndices(rowperm,&rowidx);CHKERRQ(ierr);
  ierr = ISGetIndices(colperm,&colidx);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  /* block row 0 is copied straight from b */
  src = 7*rowidx[0];
  work[0] = barr[src];   work[1] = barr[src+1]; work[2] = barr[src+2];
  work[3] = barr[src+3]; work[4] = barr[src+4]; work[5] = barr[src+5];
  work[6] = barr[src+6];

  for (blk=1; blk<mbs; blk++) {
    src = 7*rowidx[blk];
    y1  = barr[src];   y2 = barr[src+1]; y3 = barr[src+2]; y4 = barr[src+3];
    y5  = barr[src+4]; y6 = barr[src+5]; y7 = barr[src+6];
    /* subtract the blocks stored before the diagonal times earlier results */
    cnt = bdiag[blk] - bi[blk];
    for (k=0; k<cnt; k++) {
      w   = vals + 49*(bi[blk] + k);
      nbr = 7*bj[bi[blk] + k];
      z1  = work[nbr];   z2 = work[nbr+1]; z3 = work[nbr+2]; z4 = work[nbr+3];
      z5  = work[nbr+4]; z6 = work[nbr+5]; z7 = work[nbr+6];
      y1 -= BAIJ7_SDOT_(w,0,z1,z2,z3,z4,z5,z6,z7);
      y2 -= BAIJ7_SDOT_(w,1,z1,z2,z3,z4,z5,z6,z7);
      y3 -= BAIJ7_SDOT_(w,2,z1,z2,z3,z4,z5,z6,z7);
      y4 -= BAIJ7_SDOT_(w,3,z1,z2,z3,z4,z5,z6,z7);
      y5 -= BAIJ7_SDOT_(w,4,z1,z2,z3,z4,z5,z6,z7);
      y6 -= BAIJ7_SDOT_(w,5,z1,z2,z3,z4,z5,z6,z7);
      y7 -= BAIJ7_SDOT_(w,6,z1,z2,z3,z4,z5,z6,z7);
    }
    dst = 7*blk;
    work[dst]   = y1; work[dst+1] = y2; work[dst+2] = y3; work[dst+3] = y4;
    work[dst+4] = y5; work[dst+5] = y6; work[dst+6] = y7;
  }

  /* backward solve the upper triangular */
  for (blk=mbs-1; blk>=0; blk--) {
    dst = 7*blk;
    y1  = work[dst];   y2 = work[dst+1]; y3 = work[dst+2]; y4 = work[dst+3];
    y5  = work[dst+4]; y6 = work[dst+5]; y7 = work[dst+6];
    /* subtract the blocks stored after the diagonal times finished results */
    cnt = bi[blk+1] - bdiag[blk] - 1;
    for (k=1; k<=cnt; k++) {
      w   = vals + 49*(bdiag[blk] + k);
      nbr = 7*bj[bdiag[blk] + k];
      z1  = work[nbr];   z2 = work[nbr+1]; z3 = work[nbr+2]; z4 = work[nbr+3];
      z5  = work[nbr+4]; z6 = work[nbr+5]; z7 = work[nbr+6];
      y1 -= BAIJ7_SDOT_(w,0,z1,z2,z3,z4,z5,z6,z7);
      y2 -= BAIJ7_SDOT_(w,1,z1,z2,z3,z4,z5,z6,z7);
      y3 -= BAIJ7_SDOT_(w,2,z1,z2,z3,z4,z5,z6,z7);
      y4 -= BAIJ7_SDOT_(w,3,z1,z2,z3,z4,z5,z6,z7);
      y5 -= BAIJ7_SDOT_(w,4,z1,z2,z3,z4,z5,z6,z7);
      y6 -= BAIJ7_SDOT_(w,5,z1,z2,z3,z4,z5,z6,z7);
      y7 -= BAIJ7_SDOT_(w,6,z1,z2,z3,z4,z5,z6,z7);
    }
    /* apply the stored diagonal block; keep the result in the scratch array
       for later block rows and publish it to x at the permuted position */
    src = 7*colidx[blk];
    w   = vals + 49*bdiag[blk];
    xarr[src]   = work[dst]   = BAIJ7_SDOT_(w,0,y1,y2,y3,y4,y5,y6,y7);
    xarr[1+src] = work[1+dst] = BAIJ7_SDOT_(w,1,y1,y2,y3,y4,y5,y6,y7);
    xarr[2+src] = work[2+dst] = BAIJ7_SDOT_(w,2,y1,y2,y3,y4,y5,y6,y7);
    xarr[3+src] = work[3+dst] = BAIJ7_SDOT_(w,3,y1,y2,y3,y4,y5,y6,y7);
    xarr[4+src] = work[4+dst] = BAIJ7_SDOT_(w,4,y1,y2,y3,y4,y5,y6,y7);
    xarr[5+src] = work[5+dst] = BAIJ7_SDOT_(w,5,y1,y2,y3,y4,y5,y6,y7);
    xarr[6+src] = work[6+dst] = BAIJ7_SDOT_(w,6,y1,y2,y3,y4,y5,y6,y7);
  }

  ierr = ISRestoreIndices(rowperm,&rowidx);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colperm,&colidx);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&barr);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&xarr);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*49*(baij->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#undef BAIJ7_SDOT_
12774e2b4712SSatish Balay 12784a2ae208SSatish Balay #undef __FUNCT__ 12798f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 12808f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 12818f690400SShri Abhyankar { 12828f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 12838f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 12848f690400SShri Abhyankar PetscErrorCode ierr; 12858f690400SShri Abhyankar const
PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi; 128629b92fc1SShri Abhyankar PetscInt i,n=a->mbs,nz,idx,idt,idc,k,m; 12878f690400SShri Abhyankar MatScalar *aa=a->a,*v; 12888f690400SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 12898f690400SShri Abhyankar PetscScalar *x,*b,*t; 12908f690400SShri Abhyankar 12918f690400SShri Abhyankar PetscFunctionBegin; 12928f690400SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 12938f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 12948f690400SShri Abhyankar t = a->solve_work; 12958f690400SShri Abhyankar 12968f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 129729b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 12988f690400SShri Abhyankar 12998f690400SShri Abhyankar /* forward solve the lower triangular */ 130029b92fc1SShri Abhyankar idx = 7*r[0]; 13018f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 13028f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 13038f690400SShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 13048f690400SShri Abhyankar 13058f690400SShri Abhyankar for (i=1; i<n; i++) { 13068f690400SShri Abhyankar v = aa + 49*ai[i]; 13078f690400SShri Abhyankar vi = aj + ai[i]; 13088f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 130929b92fc1SShri Abhyankar idx = 7*r[i]; 13108f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 13118f690400SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 131229b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 131329b92fc1SShri Abhyankar idx = 7*vi[m]; 13148f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 13158f690400SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 13168f690400SShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 13178f690400SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 13188f690400SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + 
v[43]*x7; 13198f690400SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 13208f690400SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 13218f690400SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 13228f690400SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 13238f690400SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13248f690400SShri Abhyankar v += 49; 13258f690400SShri Abhyankar } 13268f690400SShri Abhyankar idx = 7*i; 13278f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 13288f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 13298f690400SShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 13308f690400SShri Abhyankar } 13318f690400SShri Abhyankar /* backward solve the upper triangular */ 13328f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 13338f690400SShri Abhyankar k = 2*n-i; 13348f690400SShri Abhyankar v = aa + 49*ai[k]; 13358f690400SShri Abhyankar vi = aj + ai[k]; 13368f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 13378f690400SShri Abhyankar idt = 7*i; 13388f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 13398f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 13408f690400SShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 134129b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 134229b92fc1SShri Abhyankar idx = 7*vi[m]; 13438f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 13448f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 13458f690400SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 13468f690400SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 13478f690400SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 13488f690400SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + 
v[44]*x7; 13498f690400SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 13508f690400SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 13518f690400SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 13528f690400SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13538f690400SShri Abhyankar v += 49; 13548f690400SShri Abhyankar } 135529b92fc1SShri Abhyankar idc = 7*c[i]; 13568f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 13578f690400SShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 13588f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 13598f690400SShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 13608f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 13618f690400SShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 13628f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 13638f690400SShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 13648f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 13658f690400SShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 13668f690400SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 13678f690400SShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 13688f690400SShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 13698f690400SShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 13708f690400SShri Abhyankar } 13718f690400SShri Abhyankar 13728f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 13738f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 13748f690400SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 13758f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 13768f690400SShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 
7.0*A->cmap->n);CHKERRQ(ierr); 13778f690400SShri Abhyankar PetscFunctionReturn(0); 13788f690400SShri Abhyankar } 13798f690400SShri Abhyankar 13808f690400SShri Abhyankar #undef __FUNCT__ 13814a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1382dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 138315091d37SBarry Smith { 138415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1385690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1386dfbe8321SBarry Smith PetscErrorCode ierr; 1387690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1388d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1389d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1390d9fead3dSBarry Smith const PetscScalar *b; 139115091d37SBarry Smith 139215091d37SBarry Smith PetscFunctionBegin; 1393d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 13941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 139515091d37SBarry Smith /* forward solve the lower triangular */ 139615091d37SBarry Smith idx = 0; 139715091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 139815091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 139915091d37SBarry Smith x[6] = b[6+idx]; 140015091d37SBarry Smith for (i=1; i<n; i++) { 140115091d37SBarry Smith v = aa + 49*ai[i]; 140215091d37SBarry Smith vi = aj + ai[i]; 140315091d37SBarry Smith nz = diag[i] - ai[i]; 140415091d37SBarry Smith idx = 7*i; 1405f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1406f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1407f1af5d2fSBarry Smith s7 = b[6+idx]; 140815091d37SBarry Smith while (nz--) { 140915091d37SBarry Smith jdx = 7*(*vi++); 141015091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 141115091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 141215091d37SBarry Smith x7 = x[6+jdx]; 
1413f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1414f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1415f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1416f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1417f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1418f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1419f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 142015091d37SBarry Smith v += 49; 142115091d37SBarry Smith } 1422f1af5d2fSBarry Smith x[idx] = s1; 1423f1af5d2fSBarry Smith x[1+idx] = s2; 1424f1af5d2fSBarry Smith x[2+idx] = s3; 1425f1af5d2fSBarry Smith x[3+idx] = s4; 1426f1af5d2fSBarry Smith x[4+idx] = s5; 1427f1af5d2fSBarry Smith x[5+idx] = s6; 1428f1af5d2fSBarry Smith x[6+idx] = s7; 142915091d37SBarry Smith } 143015091d37SBarry Smith /* backward solve the upper triangular */ 143115091d37SBarry Smith for (i=n-1; i>=0; i--){ 143215091d37SBarry Smith v = aa + 49*diag[i] + 49; 143315091d37SBarry Smith vi = aj + diag[i] + 1; 143415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 143515091d37SBarry Smith idt = 7*i; 1436f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1437f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1438f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1439f1af5d2fSBarry Smith s7 = x[6+idt]; 144015091d37SBarry Smith while (nz--) { 144115091d37SBarry Smith idx = 7*(*vi++); 144215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 144315091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 144415091d37SBarry Smith x7 = x[6+idx]; 1445f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1446f1af5d2fSBarry 
Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1447f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1448f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1449f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1450f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1451f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 145215091d37SBarry Smith v += 49; 145315091d37SBarry Smith } 145415091d37SBarry Smith v = aa + 49*diag[i]; 1455f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1456f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1457f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1458f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1459f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1460f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1461f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1462f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1463f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1464f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1465f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1466f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1467f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1468f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 146915091d37SBarry Smith } 147015091d37SBarry Smith 1471d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14721ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1473dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 147415091d37SBarry Smith 
PetscFunctionReturn(0); 147515091d37SBarry Smith } 147615091d37SBarry Smith 14774a2ae208SSatish Balay #undef __FUNCT__ 1478cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1479cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1480cee9d6f2SShri Abhyankar { 1481cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 14826464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1483cee9d6f2SShri Abhyankar PetscErrorCode ierr; 1484cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 1485cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1486cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 1487cee9d6f2SShri Abhyankar PetscScalar *x; 1488cee9d6f2SShri Abhyankar const PetscScalar *b; 1489cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1490cee9d6f2SShri Abhyankar 1491cee9d6f2SShri Abhyankar PetscFunctionBegin; 1492cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1493cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1494cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 1495cee9d6f2SShri Abhyankar idx = 0; 1496cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1497cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1498cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 1499cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 1500cee9d6f2SShri Abhyankar vi = aj + ai[i]; 1501cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 1502cee9d6f2SShri Abhyankar idx = bs*i; 1503cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1504cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 15056464896eSShri Abhyankar for(k=0;k<nz;k++) { 15066464896eSShri Abhyankar jdx = bs*vi[k]; 1507cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1508cee9d6f2SShri Abhyankar 
x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1509cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1510cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1511cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1512cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1513cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1514cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1515cee9d6f2SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1516cee9d6f2SShri Abhyankar v += bs2; 1517cee9d6f2SShri Abhyankar } 1518cee9d6f2SShri Abhyankar 1519cee9d6f2SShri Abhyankar x[idx] = s1; 1520cee9d6f2SShri Abhyankar x[1+idx] = s2; 1521cee9d6f2SShri Abhyankar x[2+idx] = s3; 1522cee9d6f2SShri Abhyankar x[3+idx] = s4; 1523cee9d6f2SShri Abhyankar x[4+idx] = s5; 1524cee9d6f2SShri Abhyankar x[5+idx] = s6; 1525cee9d6f2SShri Abhyankar x[6+idx] = s7; 1526cee9d6f2SShri Abhyankar } 1527cee9d6f2SShri Abhyankar 1528cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 1529cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 1530cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 1531cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 1532cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 1533cee9d6f2SShri Abhyankar idt = bs*i; 1534cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1535cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 15366464896eSShri Abhyankar for(k=0;k<nz;k++) { 15376464896eSShri Abhyankar idx = bs*vi[k]; 1538cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1539cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 
1540cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1541cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1542cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1543cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1544cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1545cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1546cee9d6f2SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1547cee9d6f2SShri Abhyankar v += bs2; 1548cee9d6f2SShri Abhyankar } 1549cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 1550cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1551cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1552cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1553cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1554cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1555cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1556cee9d6f2SShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1557cee9d6f2SShri Abhyankar } 1558cee9d6f2SShri Abhyankar 1559cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1560cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1561cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 
1562cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 1563cee9d6f2SShri Abhyankar } 1564cee9d6f2SShri Abhyankar 1565cee9d6f2SShri Abhyankar #undef __FUNCT__ 15664a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1567dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 156815091d37SBarry Smith { 156915091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 157015091d37SBarry Smith IS iscol=a->col,isrow=a->row; 15716849ba73SBarry Smith PetscErrorCode ierr; 15725d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 15735d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1574d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1575d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1576d9fead3dSBarry Smith const PetscScalar *b; 157715091d37SBarry Smith PetscFunctionBegin; 1578d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1580f1af5d2fSBarry Smith t = a->solve_work; 158115091d37SBarry Smith 158215091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 158315091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 158415091d37SBarry Smith 158515091d37SBarry Smith /* forward solve the lower triangular */ 158615091d37SBarry Smith idx = 6*(*r++); 1587f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1588f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1589f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 159015091d37SBarry Smith for (i=1; i<n; i++) { 159115091d37SBarry Smith v = aa + 36*ai[i]; 159215091d37SBarry Smith vi = aj + ai[i]; 159315091d37SBarry Smith nz = diag[i] - ai[i]; 159415091d37SBarry Smith idx = 6*(*r++); 1595f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1596f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 159715091d37SBarry Smith while (nz--) { 159815091d37SBarry Smith idx = 
6*(*vi++); 1599f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1600f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1601f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1602f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1603f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1604f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1605f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1606f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 160715091d37SBarry Smith v += 36; 160815091d37SBarry Smith } 160915091d37SBarry Smith idx = 6*i; 1610f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1611f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1612f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 161315091d37SBarry Smith } 161415091d37SBarry Smith /* backward solve the upper triangular */ 161515091d37SBarry Smith for (i=n-1; i>=0; i--){ 161615091d37SBarry Smith v = aa + 36*diag[i] + 36; 161715091d37SBarry Smith vi = aj + diag[i] + 1; 161815091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 161915091d37SBarry Smith idt = 6*i; 1620f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1621f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1622f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 162315091d37SBarry Smith while (nz--) { 162415091d37SBarry Smith idx = 6*(*vi++); 1625f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1626f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1627f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1628f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1629f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1630f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 
1631f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1632f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1633f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 163415091d37SBarry Smith v += 36; 163515091d37SBarry Smith } 163615091d37SBarry Smith idc = 6*(*c--); 163715091d37SBarry Smith v = aa + 36*diag[i]; 1638f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1639f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1640f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1641f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1642f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1643f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1644f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1645f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1646f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1647f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1648f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1649f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 165015091d37SBarry Smith } 165115091d37SBarry Smith 165215091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 165315091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1654d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16551ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1656dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 165715091d37SBarry Smith PetscFunctionReturn(0); 165815091d37SBarry Smith } 165915091d37SBarry Smith 16604a2ae208SSatish Balay #undef __FUNCT__ 16618f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 16628f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 16638f690400SShri Abhyankar { 
16648f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16658f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 16668f690400SShri Abhyankar PetscErrorCode ierr; 16678f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 166829b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 16698f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 16708f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 16718f690400SShri Abhyankar const PetscScalar *b; 16728f690400SShri Abhyankar PetscFunctionBegin; 16738f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16748f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 16758f690400SShri Abhyankar t = a->solve_work; 16768f690400SShri Abhyankar 16778f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 167829b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 16798f690400SShri Abhyankar 16808f690400SShri Abhyankar /* forward solve the lower triangular */ 168129b92fc1SShri Abhyankar idx = 6*r[0]; 16828f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 16838f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 16848f690400SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 16858f690400SShri Abhyankar for (i=1; i<n; i++) { 16868f690400SShri Abhyankar v = aa + 36*ai[i]; 16878f690400SShri Abhyankar vi = aj + ai[i]; 16888f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 168929b92fc1SShri Abhyankar idx = 6*r[i]; 16908f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 16918f690400SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 169229b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 169329b92fc1SShri Abhyankar idx = 6*vi[m]; 16948f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 16958f690400SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 16968f690400SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 
16978f690400SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 16988f690400SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 16998f690400SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 17008f690400SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 17018f690400SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 17028f690400SShri Abhyankar v += 36; 17038f690400SShri Abhyankar } 17048f690400SShri Abhyankar idx = 6*i; 17058f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 17068f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 17078f690400SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 17088f690400SShri Abhyankar } 17098f690400SShri Abhyankar /* backward solve the upper triangular */ 17108f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 17118f690400SShri Abhyankar k = 2*n-i; 17128f690400SShri Abhyankar v = aa + 36*ai[k]; 17138f690400SShri Abhyankar vi = aj + ai[k]; 17148f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 17158f690400SShri Abhyankar idt = 6*i; 17168f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 17178f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 17188f690400SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 171929b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 172029b92fc1SShri Abhyankar idx = 6*vi[m]; 17218f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 17228f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 17238f690400SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 17248f690400SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 17258f690400SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 17268f690400SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 17278f690400SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 17288f690400SShri Abhyankar s5 -= v[4]*x1 
+ v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 17298f690400SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 17308f690400SShri Abhyankar v += 36; 17318f690400SShri Abhyankar } 173229b92fc1SShri Abhyankar idc = 6*c[i]; 17338f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 17348f690400SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 17358f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 17368f690400SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 17378f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 17388f690400SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 17398f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 17408f690400SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 17418f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 17428f690400SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 17438f690400SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 17448f690400SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 17458f690400SShri Abhyankar } 17468f690400SShri Abhyankar 17478f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 17488f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 17498f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17508f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 17518f690400SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 17528f690400SShri Abhyankar PetscFunctionReturn(0); 17538f690400SShri Abhyankar } 17548f690400SShri Abhyankar 17558f690400SShri Abhyankar 17568f690400SShri Abhyankar #undef __FUNCT__ 17574a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1758dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 175915091d37SBarry Smith { 176015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 
1761690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1762dfbe8321SBarry Smith PetscErrorCode ierr; 1763690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1764d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1765d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1766d9fead3dSBarry Smith const PetscScalar *b; 176715091d37SBarry Smith 176815091d37SBarry Smith PetscFunctionBegin; 1769d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 177115091d37SBarry Smith /* forward solve the lower triangular */ 177215091d37SBarry Smith idx = 0; 177315091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 177415091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 177515091d37SBarry Smith for (i=1; i<n; i++) { 177615091d37SBarry Smith v = aa + 36*ai[i]; 177715091d37SBarry Smith vi = aj + ai[i]; 177815091d37SBarry Smith nz = diag[i] - ai[i]; 177915091d37SBarry Smith idx = 6*i; 1780f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1781f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 178215091d37SBarry Smith while (nz--) { 178315091d37SBarry Smith jdx = 6*(*vi++); 178415091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 178515091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1786f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1787f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1788f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1789f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1790f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1791f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 179215091d37SBarry Smith v += 36; 
179315091d37SBarry Smith } 1794f1af5d2fSBarry Smith x[idx] = s1; 1795f1af5d2fSBarry Smith x[1+idx] = s2; 1796f1af5d2fSBarry Smith x[2+idx] = s3; 1797f1af5d2fSBarry Smith x[3+idx] = s4; 1798f1af5d2fSBarry Smith x[4+idx] = s5; 1799f1af5d2fSBarry Smith x[5+idx] = s6; 180015091d37SBarry Smith } 180115091d37SBarry Smith /* backward solve the upper triangular */ 180215091d37SBarry Smith for (i=n-1; i>=0; i--){ 180315091d37SBarry Smith v = aa + 36*diag[i] + 36; 180415091d37SBarry Smith vi = aj + diag[i] + 1; 180515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 180615091d37SBarry Smith idt = 6*i; 1807f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1808f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1809f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 181015091d37SBarry Smith while (nz--) { 181115091d37SBarry Smith idx = 6*(*vi++); 181215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 181315091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1814f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1815f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1816f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1817f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1818f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1819f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 182015091d37SBarry Smith v += 36; 182115091d37SBarry Smith } 182215091d37SBarry Smith v = aa + 36*diag[i]; 1823f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1824f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1825f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1826f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + 
v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1827f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1828f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 182915091d37SBarry Smith } 183015091d37SBarry Smith 1831d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18321ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1833dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 183415091d37SBarry Smith PetscFunctionReturn(0); 183515091d37SBarry Smith } 183615091d37SBarry Smith 18374a2ae208SSatish Balay #undef __FUNCT__ 1838cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 1839cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1840cee9d6f2SShri Abhyankar { 1841cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 18426464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1843cee9d6f2SShri Abhyankar PetscErrorCode ierr; 1844cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 1845cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1846cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 1847cee9d6f2SShri Abhyankar PetscScalar *x; 1848cee9d6f2SShri Abhyankar const PetscScalar *b; 1849cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1850cee9d6f2SShri Abhyankar 1851cee9d6f2SShri Abhyankar PetscFunctionBegin; 1852cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1853cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1854cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 1855cee9d6f2SShri Abhyankar idx = 0; 1856cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1857cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 1858cee9d6f2SShri Abhyankar for 
(i=1; i<n; i++) { 1859cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 1860cee9d6f2SShri Abhyankar vi = aj + ai[i]; 1861cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 1862cee9d6f2SShri Abhyankar idx = bs*i; 1863cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1864cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 18656464896eSShri Abhyankar for(k=0;k<nz;k++){ 18666464896eSShri Abhyankar jdx = bs*vi[k]; 1867cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1868cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 1869cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1870cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1871cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1872cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1873cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1874cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1875cee9d6f2SShri Abhyankar v += bs2; 1876cee9d6f2SShri Abhyankar } 1877cee9d6f2SShri Abhyankar 1878cee9d6f2SShri Abhyankar x[idx] = s1; 1879cee9d6f2SShri Abhyankar x[1+idx] = s2; 1880cee9d6f2SShri Abhyankar x[2+idx] = s3; 1881cee9d6f2SShri Abhyankar x[3+idx] = s4; 1882cee9d6f2SShri Abhyankar x[4+idx] = s5; 1883cee9d6f2SShri Abhyankar x[5+idx] = s6; 1884cee9d6f2SShri Abhyankar } 1885cee9d6f2SShri Abhyankar 1886cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 1887cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 1888cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 1889cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 1890cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 1891cee9d6f2SShri Abhyankar idt = bs*i; 1892cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1893cee9d6f2SShri 
Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 18946464896eSShri Abhyankar for(k=0;k<nz;k++){ 18956464896eSShri Abhyankar idx = bs*vi[k]; 1896cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1897cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 1898cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1899cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1900cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1901cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1902cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1903cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1904cee9d6f2SShri Abhyankar v += bs2; 1905cee9d6f2SShri Abhyankar } 1906cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 1907cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1908cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1909cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1910cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1911cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1912cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1913cee9d6f2SShri Abhyankar } 1914cee9d6f2SShri Abhyankar 1915cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1916cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1917cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1918cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 1919cee9d6f2SShri Abhyankar } 
19208f690400SShri Abhyankar 1921cee9d6f2SShri Abhyankar #undef __FUNCT__ 19224a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1923dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 19244e2b4712SSatish Balay { 19254e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 19264e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 19276849ba73SBarry Smith PetscErrorCode ierr; 19285d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 19295d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1930d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1931d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1932d9fead3dSBarry Smith const PetscScalar *b; 19334e2b4712SSatish Balay 19344e2b4712SSatish Balay PetscFunctionBegin; 1935d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19361ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1937f1af5d2fSBarry Smith t = a->solve_work; 19384e2b4712SSatish Balay 19394e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 19404e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 19414e2b4712SSatish Balay 19424e2b4712SSatish Balay /* forward solve the lower triangular */ 19434e2b4712SSatish Balay idx = 5*(*r++); 1944f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1945f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 19464e2b4712SSatish Balay for (i=1; i<n; i++) { 19474e2b4712SSatish Balay v = aa + 25*ai[i]; 19484e2b4712SSatish Balay vi = aj + ai[i]; 19494e2b4712SSatish Balay nz = diag[i] - ai[i]; 19504e2b4712SSatish Balay idx = 5*(*r++); 1951f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1952f1af5d2fSBarry Smith s5 = b[4+idx]; 19534e2b4712SSatish Balay while (nz--) { 19544e2b4712SSatish Balay idx = 5*(*vi++); 1955f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1956f1af5d2fSBarry Smith 
x4 = t[3+idx];x5 = t[4+idx]; 1957f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1958f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1959f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1960f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1961f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 19624e2b4712SSatish Balay v += 25; 19634e2b4712SSatish Balay } 19644e2b4712SSatish Balay idx = 5*i; 1965f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1966f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 19674e2b4712SSatish Balay } 19684e2b4712SSatish Balay /* backward solve the upper triangular */ 19694e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 19704e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 19714e2b4712SSatish Balay vi = aj + diag[i] + 1; 19724e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 19734e2b4712SSatish Balay idt = 5*i; 1974f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1975f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 19764e2b4712SSatish Balay while (nz--) { 19774e2b4712SSatish Balay idx = 5*(*vi++); 1978f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1979f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1980f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1981f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1982f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1983f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1984f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 19854e2b4712SSatish Balay v += 25; 19864e2b4712SSatish Balay } 19874e2b4712SSatish Balay idc = 5*(*c--); 19884e2b4712SSatish Balay v = aa + 25*diag[i]; 1989f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1990f1af5d2fSBarry Smith 
v[15]*s4+v[20]*s5; 1991f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1992f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 1993f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1994f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 1995f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1996f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 1997f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1998f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 19994e2b4712SSatish Balay } 20004e2b4712SSatish Balay 20014e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 20024e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2003d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20041ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2005dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 20064e2b4712SSatish Balay PetscFunctionReturn(0); 20074e2b4712SSatish Balay } 20084e2b4712SSatish Balay 20094a2ae208SSatish Balay #undef __FUNCT__ 20108f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 20118f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 20128f690400SShri Abhyankar { 20138f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 20148f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 20158f690400SShri Abhyankar PetscErrorCode ierr; 20168f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 201729b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 20188f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 20198f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 20208f690400SShri Abhyankar const PetscScalar *b; 20218f690400SShri Abhyankar 20228f690400SShri Abhyankar PetscFunctionBegin; 20238f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20248f690400SShri Abhyankar 
ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 20258f690400SShri Abhyankar t = a->solve_work; 20268f690400SShri Abhyankar 20278f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 202829b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 20298f690400SShri Abhyankar 20308f690400SShri Abhyankar /* forward solve the lower triangular */ 203129b92fc1SShri Abhyankar idx = 5*r[0]; 20328f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 20338f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 20348f690400SShri Abhyankar for (i=1; i<n; i++) { 20358f690400SShri Abhyankar v = aa + 25*ai[i]; 20368f690400SShri Abhyankar vi = aj + ai[i]; 20378f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 203829b92fc1SShri Abhyankar idx = 5*r[i]; 20398f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 20408f690400SShri Abhyankar s5 = b[4+idx]; 204129b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 204229b92fc1SShri Abhyankar idx = 5*vi[m]; 20438f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 20448f690400SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 20458f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 20468f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 20478f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 20488f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 20498f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 20508f690400SShri Abhyankar v += 25; 20518f690400SShri Abhyankar } 20528f690400SShri Abhyankar idx = 5*i; 20538f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 20548f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 20558f690400SShri Abhyankar } 20568f690400SShri Abhyankar /* backward solve the upper triangular */ 20578f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 20588f690400SShri Abhyankar k = 2*n-i; 
20598f690400SShri Abhyankar v = aa + 25*ai[k]; 20608f690400SShri Abhyankar vi = aj + ai[k]; 20618f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 20628f690400SShri Abhyankar idt = 5*i; 20638f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 20648f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 206529b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 206629b92fc1SShri Abhyankar idx = 5*vi[m]; 20678f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 20688f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 20698f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 20708f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 20718f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 20728f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 20738f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 20748f690400SShri Abhyankar v += 25; 20758f690400SShri Abhyankar } 207629b92fc1SShri Abhyankar idc = 5*c[i]; 20778f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 20788f690400SShri Abhyankar v[15]*s4+v[20]*s5; 20798f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 20808f690400SShri Abhyankar v[16]*s4+v[21]*s5; 20818f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 20828f690400SShri Abhyankar v[17]*s4+v[22]*s5; 20838f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 20848f690400SShri Abhyankar v[18]*s4+v[23]*s5; 20858f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 20868f690400SShri Abhyankar v[19]*s4+v[24]*s5; 20878f690400SShri Abhyankar } 20888f690400SShri Abhyankar 20898f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 20908f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 20918f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 
20928f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 20938f690400SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 20948f690400SShri Abhyankar PetscFunctionReturn(0); 20958f690400SShri Abhyankar } 20968f690400SShri Abhyankar #undef __FUNCT__ 20974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2098dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 209915091d37SBarry Smith { 210015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2101690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2102dfbe8321SBarry Smith PetscErrorCode ierr; 2103690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2104d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2105d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2106d9fead3dSBarry Smith const PetscScalar *b; 210715091d37SBarry Smith 210815091d37SBarry Smith PetscFunctionBegin; 2109d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21101ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 211115091d37SBarry Smith /* forward solve the lower triangular */ 211215091d37SBarry Smith idx = 0; 211315091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 211415091d37SBarry Smith for (i=1; i<n; i++) { 211515091d37SBarry Smith v = aa + 25*ai[i]; 211615091d37SBarry Smith vi = aj + ai[i]; 211715091d37SBarry Smith nz = diag[i] - ai[i]; 211815091d37SBarry Smith idx = 5*i; 2119f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 212015091d37SBarry Smith while (nz--) { 212115091d37SBarry Smith jdx = 5*(*vi++); 212215091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2123f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2124f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 
2125f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2126f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2127f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 212815091d37SBarry Smith v += 25; 212915091d37SBarry Smith } 2130f1af5d2fSBarry Smith x[idx] = s1; 2131f1af5d2fSBarry Smith x[1+idx] = s2; 2132f1af5d2fSBarry Smith x[2+idx] = s3; 2133f1af5d2fSBarry Smith x[3+idx] = s4; 2134f1af5d2fSBarry Smith x[4+idx] = s5; 213515091d37SBarry Smith } 213615091d37SBarry Smith /* backward solve the upper triangular */ 213715091d37SBarry Smith for (i=n-1; i>=0; i--){ 213815091d37SBarry Smith v = aa + 25*diag[i] + 25; 213915091d37SBarry Smith vi = aj + diag[i] + 1; 214015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 214115091d37SBarry Smith idt = 5*i; 2142f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2143f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 214415091d37SBarry Smith while (nz--) { 214515091d37SBarry Smith idx = 5*(*vi++); 214615091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2147f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2148f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2149f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2150f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2151f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 215215091d37SBarry Smith v += 25; 215315091d37SBarry Smith } 215415091d37SBarry Smith v = aa + 25*diag[i]; 2155f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2156f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2157f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2158f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 
+ v[23]*s5; 2159f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 216015091d37SBarry Smith } 216115091d37SBarry Smith 2162d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21631ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2164dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 216515091d37SBarry Smith PetscFunctionReturn(0); 216615091d37SBarry Smith } 216715091d37SBarry Smith 21684a2ae208SSatish Balay #undef __FUNCT__ 2169cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2170cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2171cee9d6f2SShri Abhyankar { 2172cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 21736464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2174cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2175cee9d6f2SShri Abhyankar PetscInt jdx; 2176cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2177cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2178cee9d6f2SShri Abhyankar const PetscScalar *b; 2179cee9d6f2SShri Abhyankar 2180cee9d6f2SShri Abhyankar PetscFunctionBegin; 2181cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2182cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2183cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2184cee9d6f2SShri Abhyankar idx = 0; 2185cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2186cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2187cee9d6f2SShri Abhyankar v = aa + 25*ai[i]; 2188cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2189cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2190cee9d6f2SShri Abhyankar idx = 5*i; 2191cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 
21926464896eSShri Abhyankar for(k=0;k<nz;k++) { 21936464896eSShri Abhyankar jdx = 5*vi[k]; 2194cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2195cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2196cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2197cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2198cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2199cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2200cee9d6f2SShri Abhyankar v += 25; 2201cee9d6f2SShri Abhyankar } 2202cee9d6f2SShri Abhyankar x[idx] = s1; 2203cee9d6f2SShri Abhyankar x[1+idx] = s2; 2204cee9d6f2SShri Abhyankar x[2+idx] = s3; 2205cee9d6f2SShri Abhyankar x[3+idx] = s4; 2206cee9d6f2SShri Abhyankar x[4+idx] = s5; 2207cee9d6f2SShri Abhyankar } 2208cee9d6f2SShri Abhyankar 2209cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2210cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2211cee9d6f2SShri Abhyankar v = aa + 25*ai[2*n-i]; 2212cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2213cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2214cee9d6f2SShri Abhyankar idt = 5*i; 2215cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 2216cee9d6f2SShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 22176464896eSShri Abhyankar for(k=0;k<nz;k++){ 22186464896eSShri Abhyankar idx = 5*vi[k]; 2219cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2220cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2221cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2222cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2223cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2224cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + 
v[14]*x3 + v[19]*x4 + v[24]*x5; 2225cee9d6f2SShri Abhyankar v += 25; 2226cee9d6f2SShri Abhyankar } 2227cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2228cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2229cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2230cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2231cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2232cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2233cee9d6f2SShri Abhyankar } 2234cee9d6f2SShri Abhyankar 2235cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2236cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2237cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2238cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2239cee9d6f2SShri Abhyankar } 2240cee9d6f2SShri Abhyankar 2241cee9d6f2SShri Abhyankar #undef __FUNCT__ 22424a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2243dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 22444e2b4712SSatish Balay { 22454e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 22464e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 22476849ba73SBarry Smith PetscErrorCode ierr; 22485d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 22495d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2250d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2251d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2252d9fead3dSBarry Smith const PetscScalar *b; 22534e2b4712SSatish Balay 22544e2b4712SSatish Balay PetscFunctionBegin; 2255d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22561ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2257f1af5d2fSBarry Smith t = 
a->solve_work; 22584e2b4712SSatish Balay 22594e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 22604e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 22614e2b4712SSatish Balay 22624e2b4712SSatish Balay /* forward solve the lower triangular */ 22634e2b4712SSatish Balay idx = 4*(*r++); 2264f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2265f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 22664e2b4712SSatish Balay for (i=1; i<n; i++) { 22674e2b4712SSatish Balay v = aa + 16*ai[i]; 22684e2b4712SSatish Balay vi = aj + ai[i]; 22694e2b4712SSatish Balay nz = diag[i] - ai[i]; 22704e2b4712SSatish Balay idx = 4*(*r++); 2271f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 22724e2b4712SSatish Balay while (nz--) { 22734e2b4712SSatish Balay idx = 4*(*vi++); 2274f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2275f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2276f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2277f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2278f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 22794e2b4712SSatish Balay v += 16; 22804e2b4712SSatish Balay } 22814e2b4712SSatish Balay idx = 4*i; 2282f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2283f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 22844e2b4712SSatish Balay } 22854e2b4712SSatish Balay /* backward solve the upper triangular */ 22864e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 22874e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 22884e2b4712SSatish Balay vi = aj + diag[i] + 1; 22894e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 22904e2b4712SSatish Balay idt = 4*i; 2291f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2292f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 22934e2b4712SSatish Balay while (nz--) { 22944e2b4712SSatish Balay idx = 4*(*vi++); 2295f1af5d2fSBarry Smith x1 = t[idx]; x2 
= t[1+idx]; 2296f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2297f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2298f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2299f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2300f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 23014e2b4712SSatish Balay v += 16; 23024e2b4712SSatish Balay } 23034e2b4712SSatish Balay idc = 4*(*c--); 23044e2b4712SSatish Balay v = aa + 16*diag[i]; 2305f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2306f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2307f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2308f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 23094e2b4712SSatish Balay } 23104e2b4712SSatish Balay 23114e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 23124e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2313d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23141ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2315dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 23164e2b4712SSatish Balay PetscFunctionReturn(0); 23174e2b4712SSatish Balay } 2318f26ec98cSKris Buschelman 2319f26ec98cSKris Buschelman #undef __FUNCT__ 23208f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 23218f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 23228f690400SShri Abhyankar { 23238f690400SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 23248f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 23258f690400SShri Abhyankar PetscErrorCode ierr; 232629b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 23278f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 23288f690400SShri 
Abhyankar const MatScalar *aa=a->a,*v; 23298f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 23308f690400SShri Abhyankar const PetscScalar *b; 23318f690400SShri Abhyankar 23328f690400SShri Abhyankar PetscFunctionBegin; 23338f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23348f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23358f690400SShri Abhyankar t = a->solve_work; 23368f690400SShri Abhyankar 23378f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 233829b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 23398f690400SShri Abhyankar 23408f690400SShri Abhyankar /* forward solve the lower triangular */ 234129b92fc1SShri Abhyankar idx = 4*r[0]; 23428f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 23438f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 23448f690400SShri Abhyankar for (i=1; i<n; i++) { 23458f690400SShri Abhyankar v = aa + 16*ai[i]; 23468f690400SShri Abhyankar vi = aj + ai[i]; 23478f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 234829b92fc1SShri Abhyankar idx = 4*r[i]; 23498f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 235029b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 235129b92fc1SShri Abhyankar idx = 4*vi[m]; 23528f690400SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 23538f690400SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 23548f690400SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 23558f690400SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 23568f690400SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 23578f690400SShri Abhyankar v += 16; 23588f690400SShri Abhyankar } 23598f690400SShri Abhyankar idx = 4*i; 23608f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 23618f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 23628f690400SShri Abhyankar } 23638f690400SShri Abhyankar /* backward solve the upper triangular 
*/ 23648f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 23658f690400SShri Abhyankar k = 2*n-i; 23668f690400SShri Abhyankar v = aa + 16*ai[k]; 23678f690400SShri Abhyankar vi = aj + ai[k]; 23688f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 23698f690400SShri Abhyankar idt = 4*i; 23708f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 23718f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 237229b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 237329b92fc1SShri Abhyankar idx = 4*vi[m]; 23748f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 23758f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 23768f690400SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 23778f690400SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 23788f690400SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 23798f690400SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 23808f690400SShri Abhyankar v += 16; 23818f690400SShri Abhyankar } 238229b92fc1SShri Abhyankar idc = 4*c[i]; 23838f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 23848f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 23858f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 23868f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 23878f690400SShri Abhyankar } 23888f690400SShri Abhyankar 23898f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 23908f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 23918f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23928f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 23938f690400SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 23948f690400SShri Abhyankar PetscFunctionReturn(0); 23958f690400SShri Abhyankar } 23968f690400SShri Abhyankar 23978f690400SShri Abhyankar #undef __FUNCT__ 2398f26ec98cSKris Buschelman 
#define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2399dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2400f26ec98cSKris Buschelman { 2401f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2402f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 24036849ba73SBarry Smith PetscErrorCode ierr; 24045d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 24055d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2406d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2407d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2408d9fead3dSBarry Smith PetscScalar *x; 2409d9fead3dSBarry Smith const PetscScalar *b; 2410f26ec98cSKris Buschelman 2411f26ec98cSKris Buschelman PetscFunctionBegin; 2412d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2414f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 2415f26ec98cSKris Buschelman 2416f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2417f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2418f26ec98cSKris Buschelman 2419f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2420f26ec98cSKris Buschelman idx = 4*(*r++); 2421f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 2422f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 2423f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 2424f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 2425f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2426f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2427f26ec98cSKris Buschelman vi = aj + ai[i]; 2428f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2429f26ec98cSKris Buschelman idx = 4*(*r++); 2430f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2431f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2432f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2433f26ec98cSKris 
Buschelman s4 = (MatScalar)b[3+idx]; 2434f26ec98cSKris Buschelman while (nz--) { 2435f26ec98cSKris Buschelman idx = 4*(*vi++); 2436f26ec98cSKris Buschelman x1 = t[idx]; 2437f26ec98cSKris Buschelman x2 = t[1+idx]; 2438f26ec98cSKris Buschelman x3 = t[2+idx]; 2439f26ec98cSKris Buschelman x4 = t[3+idx]; 2440f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2441f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2442f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2443f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2444f26ec98cSKris Buschelman v += 16; 2445f26ec98cSKris Buschelman } 2446f26ec98cSKris Buschelman idx = 4*i; 2447f26ec98cSKris Buschelman t[idx] = s1; 2448f26ec98cSKris Buschelman t[1+idx] = s2; 2449f26ec98cSKris Buschelman t[2+idx] = s3; 2450f26ec98cSKris Buschelman t[3+idx] = s4; 2451f26ec98cSKris Buschelman } 2452f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2453f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2454f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 2455f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2456f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2457f26ec98cSKris Buschelman idt = 4*i; 2458f26ec98cSKris Buschelman s1 = t[idt]; 2459f26ec98cSKris Buschelman s2 = t[1+idt]; 2460f26ec98cSKris Buschelman s3 = t[2+idt]; 2461f26ec98cSKris Buschelman s4 = t[3+idt]; 2462f26ec98cSKris Buschelman while (nz--) { 2463f26ec98cSKris Buschelman idx = 4*(*vi++); 2464f26ec98cSKris Buschelman x1 = t[idx]; 2465f26ec98cSKris Buschelman x2 = t[1+idx]; 2466f26ec98cSKris Buschelman x3 = t[2+idx]; 2467f26ec98cSKris Buschelman x4 = t[3+idx]; 2468f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2469f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2470f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2471f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 
2472f26ec98cSKris Buschelman v += 16; 2473f26ec98cSKris Buschelman } 2474f26ec98cSKris Buschelman idc = 4*(*c--); 2475f26ec98cSKris Buschelman v = aa + 16*diag[i]; 2476f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2477f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2478f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2479f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2480f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 2481f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 2482f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 2483f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 2484f26ec98cSKris Buschelman } 2485f26ec98cSKris Buschelman 2486f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2487f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2488d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24891ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2490dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2491f26ec98cSKris Buschelman PetscFunctionReturn(0); 2492f26ec98cSKris Buschelman } 2493f26ec98cSKris Buschelman 249424c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 249524c233c2SKris Buschelman 249624c233c2SKris Buschelman #include PETSC_HAVE_SSE 249724c233c2SKris Buschelman 249824c233c2SKris Buschelman #undef __FUNCT__ 249924c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2500dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 250124c233c2SKris Buschelman { 250224c233c2SKris Buschelman /* 250324c233c2SKris Buschelman Note: This code uses demotion of double 250424c233c2SKris Buschelman to float when performing the mixed-mode computation. 250524c233c2SKris Buschelman This may not be numerically reasonable for all applications. 
250624c233c2SKris Buschelman */ 250724c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 250824c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 25096849ba73SBarry Smith PetscErrorCode ierr; 25105d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 25115d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 251224c233c2SKris Buschelman MatScalar *aa=a->a,*v; 251387828ca2SBarry Smith PetscScalar *x,*b,*t; 251424c233c2SKris Buschelman 251524c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 251624c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 251724c233c2SKris Buschelman unsigned long offset; 251824c233c2SKris Buschelman 251924c233c2SKris Buschelman PetscFunctionBegin; 252024c233c2SKris Buschelman SSE_SCOPE_BEGIN; 252124c233c2SKris Buschelman 252224c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 252324c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 252424c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 252524c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 252624c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 252724c233c2SKris Buschelman 25281ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25291ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 253024c233c2SKris Buschelman t = a->solve_work; 253124c233c2SKris Buschelman 253224c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 253324c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 253424c233c2SKris Buschelman 253524c233c2SKris Buschelman /* forward solve the lower triangular */ 253624c233c2SKris Buschelman idx = 4*(*r++); 253724c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 253824c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 253924c233c2SKris Buschelman v = aa + 16*ai[1]; 254024c233c2SKris Buschelman 254124c233c2SKris Buschelman for (i=1; i<n;) { 
254224c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 254324c233c2SKris Buschelman vi = aj + ai[i]; 254424c233c2SKris Buschelman nz = diag[i] - ai[i]; 254524c233c2SKris Buschelman idx = 4*(*r++); 254624c233c2SKris Buschelman 254724c233c2SKris Buschelman /* Demote sum from double to float */ 254824c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 254924c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 255024c233c2SKris Buschelman 255124c233c2SKris Buschelman while (nz--) { 255224c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 255324c233c2SKris Buschelman idx = 4*(*vi++); 255424c233c2SKris Buschelman 255524c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 255624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 255724c233c2SKris Buschelman 255824c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 255924c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 256024c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 256124c233c2SKris Buschelman 256224c233c2SKris Buschelman /* First Column */ 256324c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 256424c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 256524c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 256624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 256724c233c2SKris Buschelman 256824c233c2SKris Buschelman /* Second Column */ 256924c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 257024c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 257124c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 257224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 257324c233c2SKris Buschelman 257424c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 257524c233c2SKris Buschelman 257624c233c2SKris Buschelman /* Third Column */ 257724c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 257824c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 257924c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 258024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 
258124c233c2SKris Buschelman 258224c233c2SKris Buschelman /* Fourth Column */ 258324c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 258424c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 258524c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 258624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 258724c233c2SKris Buschelman SSE_INLINE_END_2 258824c233c2SKris Buschelman 258924c233c2SKris Buschelman v += 16; 259024c233c2SKris Buschelman } 259124c233c2SKris Buschelman idx = 4*i; 259224c233c2SKris Buschelman v = aa + 16*ai[++i]; 259324c233c2SKris Buschelman PREFETCH_NTA(v); 259424c233c2SKris Buschelman STORE_PS(tmps,XMM7); 259524c233c2SKris Buschelman 259624c233c2SKris Buschelman /* Promote result from float to double */ 259724c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 259824c233c2SKris Buschelman } 259924c233c2SKris Buschelman /* backward solve the upper triangular */ 260024c233c2SKris Buschelman idt = 4*(n-1); 260124c233c2SKris Buschelman ai16 = 16*diag[n-1]; 260224c233c2SKris Buschelman v = aa + ai16 + 16; 260324c233c2SKris Buschelman for (i=n-1; i>=0;){ 260424c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 260524c233c2SKris Buschelman vi = aj + diag[i] + 1; 260624c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 260724c233c2SKris Buschelman 260824c233c2SKris Buschelman /* Demote accumulator from double to float */ 260924c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 261024c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 261124c233c2SKris Buschelman 261224c233c2SKris Buschelman while (nz--) { 261324c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 261424c233c2SKris Buschelman idx = 4*(*vi++); 261524c233c2SKris Buschelman 261624c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 261724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 261824c233c2SKris Buschelman 261924c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 262024c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 
262124c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 262224c233c2SKris Buschelman 262324c233c2SKris Buschelman /* First Column */ 262424c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 262524c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 262624c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 262724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 262824c233c2SKris Buschelman 262924c233c2SKris Buschelman /* Second Column */ 263024c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 263124c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 263224c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 263324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 263424c233c2SKris Buschelman 263524c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 263624c233c2SKris Buschelman 263724c233c2SKris Buschelman /* Third Column */ 263824c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 263924c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 264024c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 264124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 264224c233c2SKris Buschelman 264324c233c2SKris Buschelman /* Fourth Column */ 264424c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 264524c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 264624c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 264724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 264824c233c2SKris Buschelman SSE_INLINE_END_2 264924c233c2SKris Buschelman v += 16; 265024c233c2SKris Buschelman } 265124c233c2SKris Buschelman v = aa + ai16; 265224c233c2SKris Buschelman ai16 = 16*diag[--i]; 265324c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 265424c233c2SKris Buschelman /* 265524c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 265624c233c2SKris Buschelman which was inverted as part of the factorization 265724c233c2SKris Buschelman */ 265824c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 265924c233c2SKris Buschelman /* First Column */ 266024c233c2SKris Buschelman 
SSE_COPY_PS(XMM0,XMM7) 266124c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 266224c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 266324c233c2SKris Buschelman 266424c233c2SKris Buschelman /* Second Column */ 266524c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 266624c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 266724c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 266824c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 266924c233c2SKris Buschelman 267024c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 267124c233c2SKris Buschelman 267224c233c2SKris Buschelman /* Third Column */ 267324c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 267424c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 267524c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 267624c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 267724c233c2SKris Buschelman 267824c233c2SKris Buschelman /* Fourth Column */ 267924c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 268024c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 268124c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 268224c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 268324c233c2SKris Buschelman 268424c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 268524c233c2SKris Buschelman SSE_INLINE_END_3 268624c233c2SKris Buschelman 268724c233c2SKris Buschelman /* Promote solution from float to double */ 268824c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 268924c233c2SKris Buschelman 269024c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 269124c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 269224c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! 
*/ 269324c233c2SKris Buschelman idc = 4*(*c--); 269424c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 269524c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 269624c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 269724c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 269824c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 269924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 270024c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 270124c233c2SKris Buschelman SSE_INLINE_END_2 270224c233c2SKris Buschelman v = aa + ai16 + 16; 270324c233c2SKris Buschelman idt -= 4; 270424c233c2SKris Buschelman } 270524c233c2SKris Buschelman 270624c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 270724c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 27081ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 27091ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2710dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 271124c233c2SKris Buschelman SSE_SCOPE_END; 271224c233c2SKris Buschelman PetscFunctionReturn(0); 271324c233c2SKris Buschelman } 271424c233c2SKris Buschelman 271524c233c2SKris Buschelman #endif 27160ef38995SBarry Smith 27170ef38995SBarry Smith 27184e2b4712SSatish Balay /* 27194e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 27204e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 
27214e2b4712SSatish Balay */ 27224a2ae208SSatish Balay #undef __FUNCT__ 27234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2724dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 27254e2b4712SSatish Balay { 27264e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2727356650c2SBarry Smith PetscInt n=a->mbs; 2728356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 2729dfbe8321SBarry Smith PetscErrorCode ierr; 2730356650c2SBarry Smith const PetscInt *diag = a->diag; 2731d9fead3dSBarry Smith const MatScalar *aa=a->a; 2732d9fead3dSBarry Smith PetscScalar *x; 2733d9fead3dSBarry Smith const PetscScalar *b; 27344e2b4712SSatish Balay 27354e2b4712SSatish Balay PetscFunctionBegin; 2736d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27371ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 27384e2b4712SSatish Balay 2739aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 27402853dc0eSBarry Smith { 274187828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 27422853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 27432853dc0eSBarry Smith } 2744aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 27452853dc0eSBarry Smith { 274687828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 27472853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 27482853dc0eSBarry Smith } 2749aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 27502853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2751e1293385SBarry Smith #else 275230d4dcafSBarry Smith { 275387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2754d9fead3dSBarry Smith const MatScalar *v; 2755356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 2756356650c2SBarry Smith const PetscInt *vi; 2757e1293385SBarry Smith 27584e2b4712SSatish Balay /* forward solve the lower 
triangular */ 27594e2b4712SSatish Balay idx = 0; 2760e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 27614e2b4712SSatish Balay for (i=1; i<n; i++) { 27624e2b4712SSatish Balay v = aa + 16*ai[i]; 27634e2b4712SSatish Balay vi = aj + ai[i]; 27644e2b4712SSatish Balay nz = diag[i] - ai[i]; 2765e1293385SBarry Smith idx += 4; 2766f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 27674e2b4712SSatish Balay while (nz--) { 27684e2b4712SSatish Balay jdx = 4*(*vi++); 27694e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2770f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2771f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2772f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2773f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 27744e2b4712SSatish Balay v += 16; 27754e2b4712SSatish Balay } 2776f1af5d2fSBarry Smith x[idx] = s1; 2777f1af5d2fSBarry Smith x[1+idx] = s2; 2778f1af5d2fSBarry Smith x[2+idx] = s3; 2779f1af5d2fSBarry Smith x[3+idx] = s4; 27804e2b4712SSatish Balay } 27814e2b4712SSatish Balay /* backward solve the upper triangular */ 27824e555682SBarry Smith idt = 4*(n-1); 27834e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 27844e555682SBarry Smith ai16 = 16*diag[i]; 27854e555682SBarry Smith v = aa + ai16 + 16; 27864e2b4712SSatish Balay vi = aj + diag[i] + 1; 27874e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2788f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2789f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 27904e2b4712SSatish Balay while (nz--) { 27914e2b4712SSatish Balay idx = 4*(*vi++); 27924e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2793f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2794f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2795f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2796f1af5d2fSBarry Smith s4 -= 
v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 27974e2b4712SSatish Balay v += 16; 27984e2b4712SSatish Balay } 27994e555682SBarry Smith v = aa + ai16; 2800f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2801f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2802f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2803f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2804329f5518SBarry Smith idt -= 4; 28054e2b4712SSatish Balay } 280630d4dcafSBarry Smith } 2807e1293385SBarry Smith #endif 28084e2b4712SSatish Balay 2809d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28101ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2811dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 28124e2b4712SSatish Balay PetscFunctionReturn(0); 28134e2b4712SSatish Balay } 28144e2b4712SSatish Balay 2815f26ec98cSKris Buschelman #undef __FUNCT__ 2816cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 2817cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2818cee9d6f2SShri Abhyankar { 2819cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 28206464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2821cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2822cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 2823cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2824cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2825cee9d6f2SShri Abhyankar PetscScalar *x; 2826cee9d6f2SShri Abhyankar const PetscScalar *b; 2827cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2828cee9d6f2SShri Abhyankar 2829cee9d6f2SShri Abhyankar PetscFunctionBegin; 2830cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2831cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
2832cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2833cee9d6f2SShri Abhyankar idx = 0; 2834cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2835cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2836cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 2837cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2838cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2839cee9d6f2SShri Abhyankar idx = bs*i; 2840cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 28416464896eSShri Abhyankar for(k=0;k<nz;k++) { 28426464896eSShri Abhyankar jdx = bs*vi[k]; 2843cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2844cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2845cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2846cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2847cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2848cee9d6f2SShri Abhyankar 2849cee9d6f2SShri Abhyankar v += bs2; 2850cee9d6f2SShri Abhyankar } 2851cee9d6f2SShri Abhyankar 2852cee9d6f2SShri Abhyankar x[idx] = s1; 2853cee9d6f2SShri Abhyankar x[1+idx] = s2; 2854cee9d6f2SShri Abhyankar x[2+idx] = s3; 2855cee9d6f2SShri Abhyankar x[3+idx] = s4; 2856cee9d6f2SShri Abhyankar } 2857cee9d6f2SShri Abhyankar 2858cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2859cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2860cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 2861cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2862cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2863cee9d6f2SShri Abhyankar idt = bs*i; 2864cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2865cee9d6f2SShri Abhyankar 28666464896eSShri Abhyankar for(k=0;k<nz;k++){ 28676464896eSShri Abhyankar idx = bs*vi[k]; 2868cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2869cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + 
v[4]*x2 + v[8]*x3 + v[12]*x4; 2870cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2871cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2872cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2873cee9d6f2SShri Abhyankar 2874cee9d6f2SShri Abhyankar v += bs2; 2875cee9d6f2SShri Abhyankar } 2876cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2877cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2878cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 2879cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2880cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2881cee9d6f2SShri Abhyankar 2882cee9d6f2SShri Abhyankar } 2883cee9d6f2SShri Abhyankar 2884cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2885cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2886cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2887cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2888cee9d6f2SShri Abhyankar } 2889cee9d6f2SShri Abhyankar 2890*b2b2dd24SShri Abhyankar #undef __FUNCT__ 2891*b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2" 2892*b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2893*b2b2dd24SShri Abhyankar { 2894*b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2895*b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2896*b2b2dd24SShri Abhyankar PetscErrorCode ierr; 2897*b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 2898*b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2899*b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 2900*b2b2dd24SShri Abhyankar PetscScalar *x; 2901*b2b2dd24SShri Abhyankar const PetscScalar *b; 2902*b2b2dd24SShri Abhyankar PetscScalar 
s1,s2,s3,s4,x1,x2,x3,x4; 2903cee9d6f2SShri Abhyankar 2904*b2b2dd24SShri Abhyankar PetscFunctionBegin; 2905*b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2906*b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2907*b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 2908*b2b2dd24SShri Abhyankar idx = 0; 2909*b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2910*b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 2911*b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 2912*b2b2dd24SShri Abhyankar vi = aj + ai[i]; 2913*b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 2914*b2b2dd24SShri Abhyankar idx = bs*i; 2915*b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2916*b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 2917*b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 2918*b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2919*b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2920*b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2921*b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2922*b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2923*b2b2dd24SShri Abhyankar 2924*b2b2dd24SShri Abhyankar v += bs2; 2925*b2b2dd24SShri Abhyankar } 2926*b2b2dd24SShri Abhyankar 2927*b2b2dd24SShri Abhyankar x[idx] = s1; 2928*b2b2dd24SShri Abhyankar x[1+idx] = s2; 2929*b2b2dd24SShri Abhyankar x[2+idx] = s3; 2930*b2b2dd24SShri Abhyankar x[3+idx] = s4; 2931*b2b2dd24SShri Abhyankar } 2932*b2b2dd24SShri Abhyankar 2933*b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 2934*b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 2935*b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 2936*b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 2937*b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 2938*b2b2dd24SShri Abhyankar idt = bs*i; 2939*b2b2dd24SShri Abhyankar s1 = x[idt]; 
s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2940*b2b2dd24SShri Abhyankar 2941*b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 2942*b2b2dd24SShri Abhyankar idx = bs*vi[k]; 2943*b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2944*b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2945*b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2946*b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2947*b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2948*b2b2dd24SShri Abhyankar 2949*b2b2dd24SShri Abhyankar v += bs2; 2950*b2b2dd24SShri Abhyankar } 2951*b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 2952*b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2953*b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 2954*b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2955*b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2956*b2b2dd24SShri Abhyankar 2957*b2b2dd24SShri Abhyankar } 2958*b2b2dd24SShri Abhyankar 2959*b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2960*b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2961*b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2962*b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 2963*b2b2dd24SShri Abhyankar } 2964cee9d6f2SShri Abhyankar 2965cee9d6f2SShri Abhyankar #undef __FUNCT__ 2966f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2967dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2968f26ec98cSKris Buschelman { 2969f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2970690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2971dfbe8321SBarry Smith PetscErrorCode ierr; 2972690b6cddSBarry Smith PetscInt *diag = a->diag; 
2973f26ec98cSKris Buschelman MatScalar *aa=a->a; 2974f26ec98cSKris Buschelman PetscScalar *x,*b; 2975f26ec98cSKris Buschelman 2976f26ec98cSKris Buschelman PetscFunctionBegin; 29771ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 29781ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2979f26ec98cSKris Buschelman 2980f26ec98cSKris Buschelman { 2981f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2982f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 2983690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2984f26ec98cSKris Buschelman 2985f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2986f26ec98cSKris Buschelman idx = 0; 2987f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 2988f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 2989f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 2990f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 2991f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2992f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2993f26ec98cSKris Buschelman vi = aj + ai[i]; 2994f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2995f26ec98cSKris Buschelman idx += 4; 2996f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2997f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2998f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2999f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3000f26ec98cSKris Buschelman while (nz--) { 3001f26ec98cSKris Buschelman jdx = 4*(*vi++); 3002f26ec98cSKris Buschelman x1 = t[jdx]; 3003f26ec98cSKris Buschelman x2 = t[1+jdx]; 3004f26ec98cSKris Buschelman x3 = t[2+jdx]; 3005f26ec98cSKris Buschelman x4 = t[3+jdx]; 3006f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3007f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3008f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3009f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3010f26ec98cSKris Buschelman v += 16; 3011f26ec98cSKris Buschelman } 
3012f26ec98cSKris Buschelman t[idx] = s1; 3013f26ec98cSKris Buschelman t[1+idx] = s2; 3014f26ec98cSKris Buschelman t[2+idx] = s3; 3015f26ec98cSKris Buschelman t[3+idx] = s4; 3016f26ec98cSKris Buschelman } 3017f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3018f26ec98cSKris Buschelman idt = 4*(n-1); 3019f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3020f26ec98cSKris Buschelman ai16 = 16*diag[i]; 3021f26ec98cSKris Buschelman v = aa + ai16 + 16; 3022f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3023f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3024f26ec98cSKris Buschelman s1 = t[idt]; 3025f26ec98cSKris Buschelman s2 = t[1+idt]; 3026f26ec98cSKris Buschelman s3 = t[2+idt]; 3027f26ec98cSKris Buschelman s4 = t[3+idt]; 3028f26ec98cSKris Buschelman while (nz--) { 3029f26ec98cSKris Buschelman idx = 4*(*vi++); 3030f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 3031f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 3032f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 3033f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 3034f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3035f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3036f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3037f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3038f26ec98cSKris Buschelman v += 16; 3039f26ec98cSKris Buschelman } 3040f26ec98cSKris Buschelman v = aa + ai16; 3041f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3042f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3043f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3044f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3045f26ec98cSKris Buschelman idt -= 4; 3046f26ec98cSKris Buschelman } 3047f26ec98cSKris Buschelman } 3048f26ec98cSKris Buschelman 
#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj - SSE version of the
   4x4-block, natural-ordering triangular solve.

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ holding the factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution

   Notes:
   The whole solve is carried out on a MatScalar view (t) of the array of xx;
   the right-hand side is demoted block by block with CONVERT_DOUBLE4_FLOAT4
   and the result is promoted back to PetscScalar in place at the very end.

   The column indices a->j are read through an unsigned short pointer.
   NOTE(review): this presumably requires that the factor's column indices
   were stored as 16-bit values by whoever installed this solver - confirm.

   NOTE(review): ai[1], b[0..3] and diag[n-1] are read unconditionally, so the
   routine presumably relies on never being called with zero block rows - confirm.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  unsigned short *aj=(unsigned short *)a->j;
  PetscErrorCode ierr;
  int            *ai=a->i,n=a->mbs,*diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v,*t=(MatScalar *)x;
    int            nz,i,idt,ai16;
    unsigned int   jdx,idx;
    unsigned short *vi;

    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx  = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v    = aa + 16*((unsigned int)ai[1]);

    /* i is advanced inside the body so that the next row's blocks can be prefetched */
    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   = aj      + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      /* ai has an entry for index n, so ai[++i] is in range on the last pass */
      v    = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;){
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      v    = aa + ai16;
      /*
         Step to the previous block row and look up its diagonal so it can be
         prefetched.  After the last row (i becomes -1) there is no previous
         diagonal: keep the old offset instead of reading diag[-1].  The stale
         value is then only used as a prefetch address.
      */
      if (--i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    idt = 4*(n-1);
    for (i=n-1;i>=0;i--) {
      /*
         CONVERT_FLOAT4_DOUBLE4 is not used here: it counts from 0 to 3, which
         doesn't work for this in-place promotion, so the four entries are
         converted from the highest index down.
      */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif

#if defined (PETSC_HAVE_SSE)

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - SSE version of the
   4x4-block, natural-ordering triangular solve, reading the column indices
   a->j as int.

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ holding the factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution

   Notes:
   The whole solve is carried out on a MatScalar view (t) of the array of xx;
   the right-hand side is demoted block by block with CONVERT_DOUBLE4_FLOAT4
   and the result is promoted back to PetscScalar in place at the very end.

   NOTE(review): ai[1], b[0..3] and diag[n-1] are read unconditionally, so the
   routine presumably relies on never being called with zero block rows - confirm.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  int            *aj=a->j;
  PetscErrorCode ierr;
  int            *ai=a->i,n=a->mbs,*diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v,*t=(MatScalar *)x;
    int       nz,i,idt,ai16;
    int       jdx,idx;
    int       *vi;

    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx  = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v    = aa + 16*ai[1];

    /* i is advanced inside the body so that the next row's blocks can be prefetched */
    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   = aj      + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*(*vi++);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      /* ai has an entry for index n, so ai[++i] is in range on the last pass */
      v    = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;){
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      v    = aa + ai16;
      /*
         Step to the previous block row and look up its diagonal so it can be
         prefetched.  After the last row (i becomes -1) there is no previous
         diagonal: keep the old offset instead of reading diag[-1].  The stale
         value is then only used as a prefetch address.
      */
      if (--i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    idt = 4*(n-1);
    for (i=n-1;i>=0;i--) {
      /*
         CONVERT_FLOAT4_DOUBLE4 is not used here: it counts from 0 to 3, which
         doesn't work for this in-place promotion, so the four entries are
         converted from the highest index down.
      */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif

*/ 34471ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 34481ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3449dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 34507cf1b8d3SKris Buschelman SSE_SCOPE_END; 34517cf1b8d3SKris Buschelman PetscFunctionReturn(0); 34527cf1b8d3SKris Buschelman } 34537cf1b8d3SKris Buschelman 34543660e330SKris Buschelman #endif 34558f690400SShri Abhyankar 34564a2ae208SSatish Balay #undef __FUNCT__ 34574a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3458dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 34594e2b4712SSatish Balay { 34604e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 34614e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 34626849ba73SBarry Smith PetscErrorCode ierr; 34635d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 34645d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3465d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3466d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3467d9fead3dSBarry Smith const PetscScalar *b; 34684e2b4712SSatish Balay 34694e2b4712SSatish Balay PetscFunctionBegin; 3470d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 34711ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3472f1af5d2fSBarry Smith t = a->solve_work; 34734e2b4712SSatish Balay 34744e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 34754e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 34764e2b4712SSatish Balay 34774e2b4712SSatish Balay /* forward solve the lower triangular */ 34784e2b4712SSatish Balay idx = 3*(*r++); 3479f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 34804e2b4712SSatish Balay for (i=1; i<n; i++) { 34814e2b4712SSatish Balay v = aa + 9*ai[i]; 34824e2b4712SSatish Balay vi = aj + ai[i]; 34834e2b4712SSatish Balay nz = 
diag[i] - ai[i]; 34844e2b4712SSatish Balay idx = 3*(*r++); 3485f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 34864e2b4712SSatish Balay while (nz--) { 34874e2b4712SSatish Balay idx = 3*(*vi++); 3488f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3489f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3490f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3491f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 34924e2b4712SSatish Balay v += 9; 34934e2b4712SSatish Balay } 34944e2b4712SSatish Balay idx = 3*i; 3495f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 34964e2b4712SSatish Balay } 34974e2b4712SSatish Balay /* backward solve the upper triangular */ 34984e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 34994e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 35004e2b4712SSatish Balay vi = aj + diag[i] + 1; 35014e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 35024e2b4712SSatish Balay idt = 3*i; 3503f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 35044e2b4712SSatish Balay while (nz--) { 35054e2b4712SSatish Balay idx = 3*(*vi++); 3506f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3507f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3508f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3509f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 35104e2b4712SSatish Balay v += 9; 35114e2b4712SSatish Balay } 35124e2b4712SSatish Balay idc = 3*(*c--); 35134e2b4712SSatish Balay v = aa + 9*diag[i]; 3514f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3515f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3516f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 35174e2b4712SSatish Balay } 35184e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 35194e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3520d9fead3dSBarry Smith ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 35211ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3522dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 35234e2b4712SSatish Balay PetscFunctionReturn(0); 35244e2b4712SSatish Balay } 35254e2b4712SSatish Balay 35268f690400SShri Abhyankar #undef __FUNCT__ 35278f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 35288f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 35298f690400SShri Abhyankar { 35308f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 35318f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 35328f690400SShri Abhyankar PetscErrorCode ierr; 353329b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 35348f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 35358f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 35368f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 35378f690400SShri Abhyankar const PetscScalar *b; 35388f690400SShri Abhyankar 35398f690400SShri Abhyankar PetscFunctionBegin; 35408f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 35418f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 35428f690400SShri Abhyankar t = a->solve_work; 35438f690400SShri Abhyankar 35448f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 354529b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 35468f690400SShri Abhyankar 35478f690400SShri Abhyankar /* forward solve the lower triangular */ 354829b92fc1SShri Abhyankar idx = 3*r[0]; 35498f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 35508f690400SShri Abhyankar for (i=1; i<n; i++) { 35518f690400SShri Abhyankar v = aa + 9*ai[i]; 35528f690400SShri Abhyankar vi = aj + ai[i]; 35538f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 355429b92fc1SShri Abhyankar idx = 3*r[i]; 
35558f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 355629b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 355729b92fc1SShri Abhyankar idx = 3*vi[m]; 35588f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 35598f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 35608f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 35618f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 35628f690400SShri Abhyankar v += 9; 35638f690400SShri Abhyankar } 35648f690400SShri Abhyankar idx = 3*i; 35658f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 35668f690400SShri Abhyankar } 35678f690400SShri Abhyankar /* backward solve the upper triangular */ 35688f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 35698f690400SShri Abhyankar k = 2*n-i; 35708f690400SShri Abhyankar v = aa + 9*ai[k]; 35718f690400SShri Abhyankar vi = aj + ai[k]; 35728f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 35738f690400SShri Abhyankar idt = 3*i; 35748f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 357529b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 357629b92fc1SShri Abhyankar idx = 3*vi[m]; 35778f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 35788f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 35798f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 35808f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 35818f690400SShri Abhyankar v += 9; 35828f690400SShri Abhyankar } 358329b92fc1SShri Abhyankar idc = 3*c[i]; 35848f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 35858f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 35868f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 35878f690400SShri Abhyankar } 35888f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 35898f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 35908f690400SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 35918f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 35928f690400SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 35938f690400SShri Abhyankar PetscFunctionReturn(0); 35948f690400SShri Abhyankar } 35958f690400SShri Abhyankar 359615091d37SBarry Smith /* 359715091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 359815091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 359915091d37SBarry Smith */ 36004a2ae208SSatish Balay #undef __FUNCT__ 36014a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 3602dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 360315091d37SBarry Smith { 360415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3605690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3606dfbe8321SBarry Smith PetscErrorCode ierr; 3607690b6cddSBarry Smith PetscInt *diag = a->diag; 3608d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3609d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 3610d9fead3dSBarry Smith const PetscScalar *b; 3611690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 361215091d37SBarry Smith 361315091d37SBarry Smith PetscFunctionBegin; 3614d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 36151ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 361615091d37SBarry Smith 361715091d37SBarry Smith /* forward solve the lower triangular */ 361815091d37SBarry Smith idx = 0; 361915091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 362015091d37SBarry Smith for (i=1; i<n; i++) { 362115091d37SBarry Smith v = aa + 9*ai[i]; 362215091d37SBarry Smith vi = aj + ai[i]; 362315091d37SBarry Smith nz = diag[i] - ai[i]; 362415091d37SBarry Smith idx += 3; 3625f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 362615091d37SBarry Smith while (nz--) { 
362715091d37SBarry Smith jdx = 3*(*vi++); 362815091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 3629f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3630f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3631f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 363215091d37SBarry Smith v += 9; 363315091d37SBarry Smith } 3634f1af5d2fSBarry Smith x[idx] = s1; 3635f1af5d2fSBarry Smith x[1+idx] = s2; 3636f1af5d2fSBarry Smith x[2+idx] = s3; 363715091d37SBarry Smith } 363815091d37SBarry Smith /* backward solve the upper triangular */ 363915091d37SBarry Smith for (i=n-1; i>=0; i--){ 364015091d37SBarry Smith v = aa + 9*diag[i] + 9; 364115091d37SBarry Smith vi = aj + diag[i] + 1; 364215091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 364315091d37SBarry Smith idt = 3*i; 3644f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3645f1af5d2fSBarry Smith s3 = x[2+idt]; 364615091d37SBarry Smith while (nz--) { 364715091d37SBarry Smith idx = 3*(*vi++); 364815091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 3649f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3650f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3651f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 365215091d37SBarry Smith v += 9; 365315091d37SBarry Smith } 365415091d37SBarry Smith v = aa + 9*diag[i]; 3655f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3656f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3657f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 365815091d37SBarry Smith } 365915091d37SBarry Smith 3660d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 36611ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3662dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 366315091d37SBarry Smith PetscFunctionReturn(0); 366415091d37SBarry Smith } 366515091d37SBarry Smith 36664a2ae208SSatish Balay #undef __FUNCT__ 3667cee9d6f2SShri Abhyankar #define 
__FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 3668cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3669cee9d6f2SShri Abhyankar { 3670cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3671ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3672cee9d6f2SShri Abhyankar PetscErrorCode ierr; 3673cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 3674cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3675cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 3676cee9d6f2SShri Abhyankar PetscScalar *x; 3677cee9d6f2SShri Abhyankar const PetscScalar *b; 3678cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 3679cee9d6f2SShri Abhyankar 3680cee9d6f2SShri Abhyankar PetscFunctionBegin; 3681cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3682cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3683cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 3684cee9d6f2SShri Abhyankar idx = 0; 3685cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 3686cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 3687cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 3688cee9d6f2SShri Abhyankar vi = aj + ai[i]; 3689cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 3690cee9d6f2SShri Abhyankar idx = bs*i; 3691cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3692ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 3693ce3d78c0SShri Abhyankar jdx = bs*vi[k]; 3694cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 3695cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3696cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3697cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3698cee9d6f2SShri Abhyankar 3699cee9d6f2SShri Abhyankar v += bs2; 3700cee9d6f2SShri Abhyankar } 3701cee9d6f2SShri Abhyankar 3702cee9d6f2SShri Abhyankar x[idx] = s1; 3703cee9d6f2SShri Abhyankar x[1+idx] = s2; 
3704cee9d6f2SShri Abhyankar x[2+idx] = s3; 3705cee9d6f2SShri Abhyankar } 3706cee9d6f2SShri Abhyankar 3707cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 3708cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 3709cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 3710cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 3711cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 3712cee9d6f2SShri Abhyankar idt = bs*i; 3713cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 3714cee9d6f2SShri Abhyankar 3715ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 3716ce3d78c0SShri Abhyankar idx = bs*vi[k]; 3717cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3718cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3719cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3720cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3721cee9d6f2SShri Abhyankar 3722cee9d6f2SShri Abhyankar v += bs2; 3723cee9d6f2SShri Abhyankar } 3724cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 3725cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3726cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3727cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3728cee9d6f2SShri Abhyankar 3729cee9d6f2SShri Abhyankar } 3730cee9d6f2SShri Abhyankar 3731cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3732cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3733cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3734cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 3735cee9d6f2SShri Abhyankar } 3736cee9d6f2SShri Abhyankar 3737cee9d6f2SShri Abhyankar #undef __FUNCT__ 3738*b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2" 3739*b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 3740*b2b2dd24SShri Abhyankar { 3741*b2b2dd24SShri 
Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3742*b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3743*b2b2dd24SShri Abhyankar PetscErrorCode ierr; 3744*b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 3745*b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3746*b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 3747*b2b2dd24SShri Abhyankar PetscScalar *x; 3748*b2b2dd24SShri Abhyankar const PetscScalar *b; 3749*b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 3750*b2b2dd24SShri Abhyankar 3751*b2b2dd24SShri Abhyankar PetscFunctionBegin; 3752*b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3753*b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3754*b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 3755*b2b2dd24SShri Abhyankar idx = 0; 3756*b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 3757*b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 3758*b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 3759*b2b2dd24SShri Abhyankar vi = aj + ai[i]; 3760*b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 3761*b2b2dd24SShri Abhyankar idx = bs*i; 3762*b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3763*b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 3764*b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 3765*b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 3766*b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3767*b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3768*b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3769*b2b2dd24SShri Abhyankar 3770*b2b2dd24SShri Abhyankar v += bs2; 3771*b2b2dd24SShri Abhyankar } 3772*b2b2dd24SShri Abhyankar 3773*b2b2dd24SShri Abhyankar x[idx] = s1; 3774*b2b2dd24SShri Abhyankar x[1+idx] = s2; 3775*b2b2dd24SShri Abhyankar x[2+idx] = s3; 3776*b2b2dd24SShri Abhyankar } 3777*b2b2dd24SShri Abhyankar 3778*b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 
3779*b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 3780*b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 3781*b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 3782*b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 3783*b2b2dd24SShri Abhyankar idt = bs*i; 3784*b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 3785*b2b2dd24SShri Abhyankar 3786*b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 3787*b2b2dd24SShri Abhyankar idx = bs*vi[k]; 3788*b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3789*b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3790*b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3791*b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3792*b2b2dd24SShri Abhyankar 3793*b2b2dd24SShri Abhyankar v += bs2; 3794*b2b2dd24SShri Abhyankar } 3795*b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 3796*b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3797*b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3798*b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3799*b2b2dd24SShri Abhyankar 3800*b2b2dd24SShri Abhyankar } 3801*b2b2dd24SShri Abhyankar 3802*b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3803*b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3804*b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3805*b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 3806*b2b2dd24SShri Abhyankar } 3807*b2b2dd24SShri Abhyankar 3808*b2b2dd24SShri Abhyankar #undef __FUNCT__ 38094a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 3810dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 38114e2b4712SSatish Balay { 38124e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 38134e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 38146849ba73SBarry Smith PetscErrorCode ierr; 38155d0c19d7SBarry Smith PetscInt 
i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 38165d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3817d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3818d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 3819d9fead3dSBarry Smith const PetscScalar *b; 38204e2b4712SSatish Balay 38214e2b4712SSatish Balay PetscFunctionBegin; 3822d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 38231ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3824f1af5d2fSBarry Smith t = a->solve_work; 38254e2b4712SSatish Balay 38264e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 38274e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 38284e2b4712SSatish Balay 38294e2b4712SSatish Balay /* forward solve the lower triangular */ 38304e2b4712SSatish Balay idx = 2*(*r++); 3831f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 38324e2b4712SSatish Balay for (i=1; i<n; i++) { 38334e2b4712SSatish Balay v = aa + 4*ai[i]; 38344e2b4712SSatish Balay vi = aj + ai[i]; 38354e2b4712SSatish Balay nz = diag[i] - ai[i]; 38364e2b4712SSatish Balay idx = 2*(*r++); 3837f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 38384e2b4712SSatish Balay while (nz--) { 38394e2b4712SSatish Balay idx = 2*(*vi++); 3840f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3841f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3842f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 38434e2b4712SSatish Balay v += 4; 38444e2b4712SSatish Balay } 38454e2b4712SSatish Balay idx = 2*i; 3846f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 38474e2b4712SSatish Balay } 38484e2b4712SSatish Balay /* backward solve the upper triangular */ 38494e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 38504e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 38514e2b4712SSatish Balay vi = aj + diag[i] + 1; 38524e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 38534e2b4712SSatish Balay idt = 2*i; 3854f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 
38554e2b4712SSatish Balay while (nz--) { 38564e2b4712SSatish Balay idx = 2*(*vi++); 3857f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3858f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3859f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 38604e2b4712SSatish Balay v += 4; 38614e2b4712SSatish Balay } 38624e2b4712SSatish Balay idc = 2*(*c--); 38634e2b4712SSatish Balay v = aa + 4*diag[i]; 3864f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3865f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 38664e2b4712SSatish Balay } 38674e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 38684e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3869d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 38701ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3871dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 38724e2b4712SSatish Balay PetscFunctionReturn(0); 38734e2b4712SSatish Balay } 38744e2b4712SSatish Balay 38758f690400SShri Abhyankar #undef __FUNCT__ 38768f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 38778f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 38788f690400SShri Abhyankar { 38798f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 38808f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 38818f690400SShri Abhyankar PetscErrorCode ierr; 388229b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 38838f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 38848f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 38858f690400SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 38868f690400SShri Abhyankar const PetscScalar *b; 38878f690400SShri Abhyankar 38888f690400SShri Abhyankar PetscFunctionBegin; 38898f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 38908f690400SShri Abhyankar ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 38918f690400SShri Abhyankar t = a->solve_work; 38928f690400SShri Abhyankar 38938f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 389429b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 38958f690400SShri Abhyankar 38968f690400SShri Abhyankar /* forward solve the lower triangular */ 389729b92fc1SShri Abhyankar idx = 2*r[0]; 38988f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 38998f690400SShri Abhyankar for (i=1; i<n; i++) { 39008f690400SShri Abhyankar v = aa + 4*ai[i]; 39018f690400SShri Abhyankar vi = aj + ai[i]; 39028f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 390329b92fc1SShri Abhyankar idx = 2*r[i]; 39048f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 390529b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 390629b92fc1SShri Abhyankar jdx = 2*vi[m]; 39078f690400SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 39088f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 39098f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 39108f690400SShri Abhyankar v += 4; 39118f690400SShri Abhyankar } 39128f690400SShri Abhyankar idx = 2*i; 39138f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 39148f690400SShri Abhyankar } 39158f690400SShri Abhyankar /* backward solve the upper triangular */ 39168f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 39178f690400SShri Abhyankar k = 2*n-i; 39188f690400SShri Abhyankar v = aa + 4*ai[k]; 39198f690400SShri Abhyankar vi = aj + ai[k]; 39208f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 39218f690400SShri Abhyankar idt = 2*i; 39228f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 392329b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 392429b92fc1SShri Abhyankar idx = 2*vi[m]; 39258f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 39268f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 39278f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 39288f690400SShri Abhyankar v += 4; 39298f690400SShri Abhyankar } 393029b92fc1SShri Abhyankar idc = 2*c[i]; 39318f690400SShri 
Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 39328f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 39338f690400SShri Abhyankar } 39348f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 39358f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 39368f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39378f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 39388f690400SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 39398f690400SShri Abhyankar PetscFunctionReturn(0); 39408f690400SShri Abhyankar } 39418f690400SShri Abhyankar 39428f690400SShri Abhyankar 394315091d37SBarry Smith /* 394415091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 394515091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 394615091d37SBarry Smith */ 39474a2ae208SSatish Balay #undef __FUNCT__ 39484a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 3949dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 395015091d37SBarry Smith { 395115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3952690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3953dfbe8321SBarry Smith PetscErrorCode ierr; 3954690b6cddSBarry Smith PetscInt *diag = a->diag; 3955d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3956d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 3957d9fead3dSBarry Smith const PetscScalar *b; 3958690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 395915091d37SBarry Smith 396015091d37SBarry Smith PetscFunctionBegin; 3961d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39621ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 396315091d37SBarry Smith 396415091d37SBarry Smith /* forward solve the lower triangular */ 396515091d37SBarry Smith idx = 0; 396615091d37SBarry Smith x[0] = 
b[0]; x[1] = b[1]; 396715091d37SBarry Smith for (i=1; i<n; i++) { 396815091d37SBarry Smith v = aa + 4*ai[i]; 396915091d37SBarry Smith vi = aj + ai[i]; 397015091d37SBarry Smith nz = diag[i] - ai[i]; 397115091d37SBarry Smith idx += 2; 3972f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 397315091d37SBarry Smith while (nz--) { 397415091d37SBarry Smith jdx = 2*(*vi++); 397515091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 3976f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3977f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 397815091d37SBarry Smith v += 4; 397915091d37SBarry Smith } 3980f1af5d2fSBarry Smith x[idx] = s1; 3981f1af5d2fSBarry Smith x[1+idx] = s2; 398215091d37SBarry Smith } 398315091d37SBarry Smith /* backward solve the upper triangular */ 398415091d37SBarry Smith for (i=n-1; i>=0; i--){ 398515091d37SBarry Smith v = aa + 4*diag[i] + 4; 398615091d37SBarry Smith vi = aj + diag[i] + 1; 398715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 398815091d37SBarry Smith idt = 2*i; 3989f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 399015091d37SBarry Smith while (nz--) { 399115091d37SBarry Smith idx = 2*(*vi++); 399215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 3993f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3994f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 399515091d37SBarry Smith v += 4; 399615091d37SBarry Smith } 399715091d37SBarry Smith v = aa + 4*diag[i]; 3998f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 3999f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 400015091d37SBarry Smith } 400115091d37SBarry Smith 4002d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 40031ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4004dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 400515091d37SBarry Smith PetscFunctionReturn(0); 400615091d37SBarry Smith } 400715091d37SBarry Smith 40084a2ae208SSatish Balay #undef __FUNCT__ 4009cee9d6f2SShri Abhyankar #define __FUNCT__ 
"MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4010cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4011cee9d6f2SShri Abhyankar { 4012cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4013ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4014cee9d6f2SShri Abhyankar PetscErrorCode ierr; 4015cee9d6f2SShri Abhyankar PetscInt jdx; 4016cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 4017cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 4018cee9d6f2SShri Abhyankar const PetscScalar *b; 4019cee9d6f2SShri Abhyankar 4020cee9d6f2SShri Abhyankar PetscFunctionBegin; 4021cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4022cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4023cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 4024cee9d6f2SShri Abhyankar idx = 0; 4025cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4026cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 4027cee9d6f2SShri Abhyankar v = aa + 4*ai[i]; 4028cee9d6f2SShri Abhyankar vi = aj + ai[i]; 4029cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 4030cee9d6f2SShri Abhyankar idx = 2*i; 4031cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4032ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4033ce3d78c0SShri Abhyankar jdx = 2*vi[k]; 4034cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4035cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4036cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4037cee9d6f2SShri Abhyankar v += 4; 4038cee9d6f2SShri Abhyankar } 4039cee9d6f2SShri Abhyankar x[idx] = s1; 4040cee9d6f2SShri Abhyankar x[1+idx] = s2; 4041cee9d6f2SShri Abhyankar } 4042cee9d6f2SShri Abhyankar 4043cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 4044cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 4045cee9d6f2SShri Abhyankar v = aa + 4*ai[2*n-i]; 4046cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 4047cee9d6f2SShri Abhyankar nz = 
ai[2*n-i +1] - ai[2*n-i]-1; 4048cee9d6f2SShri Abhyankar idt = 2*i; 4049cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4050ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4051ce3d78c0SShri Abhyankar idx = 2*vi[k]; 4052cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4053cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4054cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4055cee9d6f2SShri Abhyankar v += 4; 4056cee9d6f2SShri Abhyankar } 4057cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 4058cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4059cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4060cee9d6f2SShri Abhyankar } 4061cee9d6f2SShri Abhyankar 4062cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4063cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4064cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4065cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 4066cee9d6f2SShri Abhyankar } 4067cee9d6f2SShri Abhyankar 4068cee9d6f2SShri Abhyankar #undef __FUNCT__ 4069*b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2" 4070*b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4071*b2b2dd24SShri Abhyankar { 4072*b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4073*b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4074*b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4075*b2b2dd24SShri Abhyankar PetscInt jdx; 4076*b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4077*b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 4078*b2b2dd24SShri Abhyankar const PetscScalar *b; 4079*b2b2dd24SShri Abhyankar 4080*b2b2dd24SShri Abhyankar PetscFunctionBegin; 4081*b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4082*b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
4083*b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4084*b2b2dd24SShri Abhyankar idx = 0; 4085*b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4086*b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4087*b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 4088*b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4089*b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4090*b2b2dd24SShri Abhyankar idx = 2*i; 4091*b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4092*b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4093*b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 4094*b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4095*b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4096*b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4097*b2b2dd24SShri Abhyankar v += 4; 4098*b2b2dd24SShri Abhyankar } 4099*b2b2dd24SShri Abhyankar x[idx] = s1; 4100*b2b2dd24SShri Abhyankar x[1+idx] = s2; 4101*b2b2dd24SShri Abhyankar } 4102*b2b2dd24SShri Abhyankar 4103*b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4104*b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4105*b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 4106*b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4107*b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4108*b2b2dd24SShri Abhyankar idt = 2*i; 4109*b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4110*b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4111*b2b2dd24SShri Abhyankar idx = 2*vi[k]; 4112*b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4113*b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4114*b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4115*b2b2dd24SShri Abhyankar v += 4; 4116*b2b2dd24SShri Abhyankar } 4117*b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4118*b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4119*b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4120*b2b2dd24SShri Abhyankar } 4121*b2b2dd24SShri Abhyankar 4122*b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4123*b2b2dd24SShri Abhyankar ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 4124*b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4125*b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4126*b2b2dd24SShri Abhyankar } 4127*b2b2dd24SShri Abhyankar 4128*b2b2dd24SShri Abhyankar #undef __FUNCT__ 41294a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4130dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 41314e2b4712SSatish Balay { 41324e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 41334e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 41346849ba73SBarry Smith PetscErrorCode ierr; 41355d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 41365d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 41373f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 413887828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 41394e2b4712SSatish Balay 41404e2b4712SSatish Balay PetscFunctionBegin; 41414e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 41424e2b4712SSatish Balay 41431ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 41441ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4145f1af5d2fSBarry Smith t = a->solve_work; 41464e2b4712SSatish Balay 41474e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 41484e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 41494e2b4712SSatish Balay 41504e2b4712SSatish Balay /* forward solve the lower triangular */ 4151f1af5d2fSBarry Smith t[0] = b[*r++]; 41524e2b4712SSatish Balay for (i=1; i<n; i++) { 41534e2b4712SSatish Balay v = aa + ai[i]; 41544e2b4712SSatish Balay vi = aj + ai[i]; 41554e2b4712SSatish Balay nz = diag[i] - ai[i]; 4156f1af5d2fSBarry Smith s1 = b[*r++]; 41574e2b4712SSatish Balay while (nz--) { 4158f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 41594e2b4712SSatish Balay } 4160f1af5d2fSBarry Smith t[i] = s1; 41614e2b4712SSatish Balay } 41624e2b4712SSatish Balay /* backward solve the upper 
triangular */ 41634e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 41644e2b4712SSatish Balay v = aa + diag[i] + 1; 41654e2b4712SSatish Balay vi = aj + diag[i] + 1; 41664e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4167f1af5d2fSBarry Smith s1 = t[i]; 41684e2b4712SSatish Balay while (nz--) { 4169f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 41704e2b4712SSatish Balay } 4171f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 41724e2b4712SSatish Balay } 41734e2b4712SSatish Balay 41744e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 41754e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 41761ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 41771ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4178dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 41794e2b4712SSatish Balay PetscFunctionReturn(0); 41804e2b4712SSatish Balay } 418115091d37SBarry Smith /* 418215091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 418315091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
418415091d37SBarry Smith */ 41854a2ae208SSatish Balay #undef __FUNCT__ 41864a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4187dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 418815091d37SBarry Smith { 418915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4190690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4191dfbe8321SBarry Smith PetscErrorCode ierr; 4192690b6cddSBarry Smith PetscInt *diag = a->diag; 419315091d37SBarry Smith MatScalar *aa=a->a; 419487828ca2SBarry Smith PetscScalar *x,*b; 419587828ca2SBarry Smith PetscScalar s1,x1; 419615091d37SBarry Smith MatScalar *v; 4197690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 419815091d37SBarry Smith 419915091d37SBarry Smith PetscFunctionBegin; 42001ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 42011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 420215091d37SBarry Smith 420315091d37SBarry Smith /* forward solve the lower triangular */ 420415091d37SBarry Smith idx = 0; 420515091d37SBarry Smith x[0] = b[0]; 420615091d37SBarry Smith for (i=1; i<n; i++) { 420715091d37SBarry Smith v = aa + ai[i]; 420815091d37SBarry Smith vi = aj + ai[i]; 420915091d37SBarry Smith nz = diag[i] - ai[i]; 421015091d37SBarry Smith idx += 1; 4211f1af5d2fSBarry Smith s1 = b[idx]; 421215091d37SBarry Smith while (nz--) { 421315091d37SBarry Smith jdx = *vi++; 421415091d37SBarry Smith x1 = x[jdx]; 4215f1af5d2fSBarry Smith s1 -= v[0]*x1; 421615091d37SBarry Smith v += 1; 421715091d37SBarry Smith } 4218f1af5d2fSBarry Smith x[idx] = s1; 421915091d37SBarry Smith } 422015091d37SBarry Smith /* backward solve the upper triangular */ 422115091d37SBarry Smith for (i=n-1; i>=0; i--){ 422215091d37SBarry Smith v = aa + diag[i] + 1; 422315091d37SBarry Smith vi = aj + diag[i] + 1; 422415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 422515091d37SBarry Smith idt = i; 4226f1af5d2fSBarry Smith s1 = x[idt]; 422715091d37SBarry Smith while (nz--) { 
422815091d37SBarry Smith idx = *vi++; 422915091d37SBarry Smith x1 = x[idx]; 4230f1af5d2fSBarry Smith s1 -= v[0]*x1; 423115091d37SBarry Smith v += 1; 423215091d37SBarry Smith } 423315091d37SBarry Smith v = aa + diag[i]; 4234f1af5d2fSBarry Smith x[idt] = v[0]*s1; 423515091d37SBarry Smith } 42361ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 42371ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4238dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 423915091d37SBarry Smith PetscFunctionReturn(0); 424015091d37SBarry Smith } 42414e2b4712SSatish Balay 42424e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 424316a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 42446bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 42456bce7ff8SHong Zhang 424684a281e5SHong Zhang extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 42478f690400SShri Abhyankar extern PetscErrorCode MatSolve_SeqBAIJ_N_newdatastruct(Mat,Vec,Vec); 42488f690400SShri Abhyankar 42496bce7ff8SHong Zhang #undef __FUNCT__ 42506bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 42516bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 42526bce7ff8SHong Zhang { 42536bce7ff8SHong Zhang Mat C=B; 42546bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 42556bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 42566bce7ff8SHong Zhang PetscErrorCode ierr; 42576bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 42586bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 42596bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4260b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4261914a18a2SHong Zhang PetscInt 
bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4262914a18a2SHong Zhang MatScalar *v_work; 42636bce7ff8SHong Zhang 42646bce7ff8SHong Zhang PetscFunctionBegin; 42656bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 42666bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4267914a18a2SHong Zhang ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4268914a18a2SHong Zhang ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 42696bce7ff8SHong Zhang ics = ic; 42706bce7ff8SHong Zhang 4271914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 4272914a18a2SHong Zhang ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 4273b588c5a2SHong Zhang mwork = v_work + bs; 4274b588c5a2SHong Zhang v_pivots = (PetscInt*)(mwork + bs2); 4275914a18a2SHong Zhang 42766bce7ff8SHong Zhang for (i=0; i<n; i++){ 42776bce7ff8SHong Zhang /* zero rtmp */ 42786bce7ff8SHong Zhang /* L part */ 42796bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 42806bce7ff8SHong Zhang bjtmp = bj + bi[i]; 4281914a18a2SHong Zhang for (j=0; j<nz; j++){ 4282914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4283914a18a2SHong Zhang } 42846bce7ff8SHong Zhang 42856bce7ff8SHong Zhang /* U part */ 42866bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i]; 42876bce7ff8SHong Zhang bjtmp = bj + bi[2*n-i]; 4288914a18a2SHong Zhang for (j=0; j<nz; j++){ 4289914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4290914a18a2SHong Zhang } 42916bce7ff8SHong Zhang 42926bce7ff8SHong Zhang /* load in initial (unfactored row) */ 42936bce7ff8SHong Zhang nz = ai[r[i]+1] - ai[r[i]]; 42946bce7ff8SHong Zhang ajtmp = aj + ai[r[i]]; 4295914a18a2SHong Zhang v = aa + bs2*ai[r[i]]; 42966bce7ff8SHong Zhang for (j=0; j<nz; j++) { 4297914a18a2SHong Zhang ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 42986bce7ff8SHong 
Zhang } 42996bce7ff8SHong Zhang 43006bce7ff8SHong Zhang /* elimination */ 43016bce7ff8SHong Zhang bjtmp = bj + bi[i]; 43026bce7ff8SHong Zhang nzL = bi[i+1] - bi[i]; 4303b1646270SShri Abhyankar for(k=0;k < nzL;k++) { 4304b1646270SShri Abhyankar row = bjtmp[k]; 4305914a18a2SHong Zhang pc = rtmp + bs2*row; 4306914a18a2SHong Zhang for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 4307914a18a2SHong Zhang if (flg) { 4308914a18a2SHong Zhang pv = b->a + bs2*bdiag[row]; 4309b588c5a2SHong Zhang Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 43106bce7ff8SHong Zhang pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 4311914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-row]; 43126bce7ff8SHong Zhang nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 4313914a18a2SHong Zhang for (j=0; j<nz; j++) { 4314914a18a2SHong Zhang Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 4315914a18a2SHong Zhang } 4316b588c5a2SHong Zhang ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 43176bce7ff8SHong Zhang } 43186bce7ff8SHong Zhang } 43196bce7ff8SHong Zhang 43206bce7ff8SHong Zhang /* finished row so stick it into b->a */ 43216bce7ff8SHong Zhang /* L part */ 4322914a18a2SHong Zhang pv = b->a + bs2*bi[i] ; 43236bce7ff8SHong Zhang pj = b->j + bi[i] ; 43246bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 43256bce7ff8SHong Zhang for (j=0; j<nz; j++) { 4326914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 43276bce7ff8SHong Zhang } 43286bce7ff8SHong Zhang 43296bce7ff8SHong Zhang /* Mark diagonal and invert diagonal for simplier triangular solves */ 4330914a18a2SHong Zhang pv = b->a + bs2*bdiag[i]; 43316bce7ff8SHong Zhang pj = b->j + bdiag[i]; 4332914a18a2SHong Zhang /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 4333914a18a2SHong Zhang ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 
4334914a18a2SHong Zhang ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 43356bce7ff8SHong Zhang 43366bce7ff8SHong Zhang /* U part */ 4337914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-i]; 43386bce7ff8SHong Zhang pj = b->j + bi[2*n-i]; 43396bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i] - 1; 4340914a18a2SHong Zhang for (j=0; j<nz; j++){ 4341914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4342914a18a2SHong Zhang } 43436bce7ff8SHong Zhang } 43446bce7ff8SHong Zhang 43456bce7ff8SHong Zhang ierr = PetscFree(rtmp);CHKERRQ(ierr); 43466bce7ff8SHong Zhang ierr = PetscFree(v_work);CHKERRQ(ierr); 43476bce7ff8SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 43486bce7ff8SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 434927019359SHong Zhang 43506bce7ff8SHong Zhang C->assembled = PETSC_TRUE; 4351914a18a2SHong Zhang ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 43526bce7ff8SHong Zhang PetscFunctionReturn(0); 43536bce7ff8SHong Zhang } 43546bce7ff8SHong Zhang 43556bce7ff8SHong Zhang /* 43566bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 435716a2bf60SHong Zhang See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 435816a2bf60SHong Zhang because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 
43596bce7ff8SHong Zhang */ 43606bce7ff8SHong Zhang #undef __FUNCT__ 43616bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 43626bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 43636bce7ff8SHong Zhang { 43646bce7ff8SHong Zhang 43656bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 43666bce7ff8SHong Zhang PetscErrorCode ierr; 436716a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 436816a2bf60SHong Zhang PetscInt i,j,nz,*bi,*bj,*bdiag; 43696bce7ff8SHong Zhang 43706bce7ff8SHong Zhang PetscFunctionBegin; 437116a2bf60SHong Zhang /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */ 437216a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 43736bce7ff8SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 437416a2bf60SHong Zhang 437516a2bf60SHong Zhang /* allocate matrix arrays for new data structure */ 437616a2bf60SHong Zhang ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr); 437716a2bf60SHong Zhang ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr); 437816a2bf60SHong Zhang b->singlemalloc = PETSC_TRUE; 437916a2bf60SHong Zhang if (!b->diag){ 438016a2bf60SHong Zhang ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 438116a2bf60SHong Zhang } 4382914a18a2SHong Zhang bdiag = b->diag; 43836bce7ff8SHong Zhang 438416a2bf60SHong Zhang if (n > 0) { 438516a2bf60SHong Zhang ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 43866bce7ff8SHong Zhang } 43876bce7ff8SHong Zhang 43886bce7ff8SHong Zhang /* set bi and bj with new data structure */ 43896bce7ff8SHong Zhang bi = b->i; 43906bce7ff8SHong Zhang bj = b->j; 43916bce7ff8SHong Zhang 43926bce7ff8SHong Zhang /* L part */ 43936bce7ff8SHong 
Zhang bi[0] = 0; 439416a2bf60SHong Zhang for (i=0; i<n; i++){ 43956bce7ff8SHong Zhang nz = adiag[i] - ai[i]; 4396914a18a2SHong Zhang bi[i+1] = bi[i] + nz; 43976bce7ff8SHong Zhang aj = a->j + ai[i]; 43986bce7ff8SHong Zhang for (j=0; j<nz; j++){ 43996bce7ff8SHong Zhang *bj = aj[j]; bj++; 44006bce7ff8SHong Zhang } 44016bce7ff8SHong Zhang } 44026bce7ff8SHong Zhang 44036bce7ff8SHong Zhang /* U part */ 440416a2bf60SHong Zhang bi[n+1] = bi[n]; 440516a2bf60SHong Zhang for (i=n-1; i>=0; i--){ 44066bce7ff8SHong Zhang nz = ai[i+1] - adiag[i] - 1; 440716a2bf60SHong Zhang bi[2*n-i+1] = bi[2*n-i] + nz + 1; 44086bce7ff8SHong Zhang aj = a->j + adiag[i] + 1; 44096bce7ff8SHong Zhang for (j=0; j<nz; j++){ 44106bce7ff8SHong Zhang *bj = aj[j]; bj++; 44116bce7ff8SHong Zhang } 44126bce7ff8SHong Zhang /* diag[i] */ 44136bce7ff8SHong Zhang *bj = i; bj++; 441416a2bf60SHong Zhang bdiag[i] = bi[2*n-i+1]-1; 44156bce7ff8SHong Zhang } 44166bce7ff8SHong Zhang PetscFunctionReturn(0); 44176bce7ff8SHong Zhang } 44186bce7ff8SHong Zhang 441916a2bf60SHong Zhang #undef __FUNCT__ 442016a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 442116a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 442216a2bf60SHong Zhang { 442316a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 442416a2bf60SHong Zhang IS isicol; 442516a2bf60SHong Zhang PetscErrorCode ierr; 442616a2bf60SHong Zhang const PetscInt *r,*ic; 44277fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 442816a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 442916a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 443016a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 44317fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 443216a2bf60SHong Zhang PetscReal f; 443316a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 443416a2bf60SHong Zhang PetscBT lnkbt; 443516a2bf60SHong Zhang 
PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 443616a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 443716a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 443816a2bf60SHong Zhang PetscTruth missing; 44397fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 444016a2bf60SHong Zhang 444116a2bf60SHong Zhang PetscFunctionBegin; 444216a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 444316a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 444416a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 444516a2bf60SHong Zhang 444616a2bf60SHong Zhang f = info->fill; 444716a2bf60SHong Zhang levels = (PetscInt)info->levels; 444816a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 444916a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 445016a2bf60SHong Zhang 445116a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 445216a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 44537fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 445416a2bf60SHong Zhang 44557fa3a6a0SHong Zhang if (!levels && both_identity) { 445616a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 445716a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 445816a2bf60SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 44597fa3a6a0SHong Zhang /* set MatSolve routines */ 44607fa3a6a0SHong Zhang switch (bs){ 44617fa3a6a0SHong Zhang case 2: 44627fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 44637fa3a6a0SHong Zhang break; 44647fa3a6a0SHong Zhang case 3: 44657fa3a6a0SHong Zhang fact->ops->solve = 
MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 44667fa3a6a0SHong Zhang break; 44677fa3a6a0SHong Zhang case 4: 44687fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 44697fa3a6a0SHong Zhang break; 44707fa3a6a0SHong Zhang case 5: 44717fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 44727fa3a6a0SHong Zhang break; 44737fa3a6a0SHong Zhang case 6: 44747fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 44757fa3a6a0SHong Zhang break; 44767fa3a6a0SHong Zhang case 7: 44777fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 44787fa3a6a0SHong Zhang break; 44797fa3a6a0SHong Zhang default: 44807fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 44817fa3a6a0SHong Zhang break; 44827fa3a6a0SHong Zhang } 448316a2bf60SHong Zhang 448416a2bf60SHong Zhang fact->factor = MAT_FACTOR_ILU; 448516a2bf60SHong Zhang (fact)->info.factor_mallocs = 0; 448616a2bf60SHong Zhang (fact)->info.fill_ratio_given = info->fill; 448716a2bf60SHong Zhang (fact)->info.fill_ratio_needed = 1.0; 448816a2bf60SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 448916a2bf60SHong Zhang b->row = isrow; 449016a2bf60SHong Zhang b->col = iscol; 449116a2bf60SHong Zhang b->icol = isicol; 449216a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 449316a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 449416a2bf60SHong Zhang b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 4495b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 449616a2bf60SHong Zhang PetscFunctionReturn(0); 449716a2bf60SHong Zhang } 449816a2bf60SHong Zhang 449916a2bf60SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 450016a2bf60SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 450116a2bf60SHong Zhang 450216a2bf60SHong Zhang /* get new row pointers */ 450316a2bf60SHong Zhang ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 450416a2bf60SHong Zhang bi[0] = 0; 450516a2bf60SHong Zhang /* bdiag is location of diagonal in factor */ 450616a2bf60SHong Zhang ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 450716a2bf60SHong Zhang bdiag[0] = 0; 450816a2bf60SHong Zhang 450916a2bf60SHong Zhang ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr); 451016a2bf60SHong Zhang bjlvl_ptr = (PetscInt**)(bj_ptr + n); 451116a2bf60SHong Zhang 451216a2bf60SHong Zhang /* create a linked list for storing column indices of the active row */ 451316a2bf60SHong Zhang nlnk = n + 1; 451416a2bf60SHong Zhang ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 451516a2bf60SHong Zhang 451616a2bf60SHong Zhang /* initial FreeSpace size is f*(ai[n]+1) */ 451716a2bf60SHong Zhang ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 451816a2bf60SHong Zhang current_space = free_space; 451916a2bf60SHong Zhang ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 452016a2bf60SHong Zhang current_space_lvl = free_space_lvl; 452116a2bf60SHong Zhang 452216a2bf60SHong Zhang for (i=0; i<n; i++) { 452316a2bf60SHong Zhang nzi = 0; 452416a2bf60SHong Zhang /* copy current row into linked list */ 452516a2bf60SHong Zhang nnz = ai[r[i]+1] - ai[r[i]]; 452616a2bf60SHong Zhang if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 452716a2bf60SHong 
Zhang cols = aj + ai[r[i]]; 452816a2bf60SHong Zhang lnk[i] = -1; /* marker to indicate if diagonal exists */ 452916a2bf60SHong Zhang ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 453016a2bf60SHong Zhang nzi += nlnk; 453116a2bf60SHong Zhang 453216a2bf60SHong Zhang /* make sure diagonal entry is included */ 453316a2bf60SHong Zhang if (diagonal_fill && lnk[i] == -1) { 453416a2bf60SHong Zhang fm = n; 453516a2bf60SHong Zhang while (lnk[fm] < i) fm = lnk[fm]; 453616a2bf60SHong Zhang lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 453716a2bf60SHong Zhang lnk[fm] = i; 453816a2bf60SHong Zhang lnk_lvl[i] = 0; 453916a2bf60SHong Zhang nzi++; dcount++; 454016a2bf60SHong Zhang } 454116a2bf60SHong Zhang 454216a2bf60SHong Zhang /* add pivot rows into the active row */ 454316a2bf60SHong Zhang nzbd = 0; 454416a2bf60SHong Zhang prow = lnk[n]; 454516a2bf60SHong Zhang while (prow < i) { 454616a2bf60SHong Zhang nnz = bdiag[prow]; 454716a2bf60SHong Zhang cols = bj_ptr[prow] + nnz + 1; 454816a2bf60SHong Zhang cols_lvl = bjlvl_ptr[prow] + nnz + 1; 454916a2bf60SHong Zhang nnz = bi[prow+1] - bi[prow] - nnz - 1; 455016a2bf60SHong Zhang ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 455116a2bf60SHong Zhang nzi += nlnk; 455216a2bf60SHong Zhang prow = lnk[prow]; 455316a2bf60SHong Zhang nzbd++; 455416a2bf60SHong Zhang } 455516a2bf60SHong Zhang bdiag[i] = nzbd; 455616a2bf60SHong Zhang bi[i+1] = bi[i] + nzi; 455716a2bf60SHong Zhang 455816a2bf60SHong Zhang /* if free space is not available, make more free space */ 455916a2bf60SHong Zhang if (current_space->local_remaining<nzi) { 456016a2bf60SHong Zhang nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 456116a2bf60SHong Zhang ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 456216a2bf60SHong Zhang ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 456316a2bf60SHong Zhang reallocs++; 456416a2bf60SHong Zhang } 
456516a2bf60SHong Zhang 456616a2bf60SHong Zhang /* copy data into free_space and free_space_lvl, then initialize lnk */ 456716a2bf60SHong Zhang ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 456816a2bf60SHong Zhang bj_ptr[i] = current_space->array; 456916a2bf60SHong Zhang bjlvl_ptr[i] = current_space_lvl->array; 457016a2bf60SHong Zhang 457116a2bf60SHong Zhang /* make sure the active row i has diagonal entry */ 457216a2bf60SHong Zhang if (*(bj_ptr[i]+bdiag[i]) != i) { 457316a2bf60SHong Zhang SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 457416a2bf60SHong Zhang try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 457516a2bf60SHong Zhang } 457616a2bf60SHong Zhang 457716a2bf60SHong Zhang current_space->array += nzi; 457816a2bf60SHong Zhang current_space->local_used += nzi; 457916a2bf60SHong Zhang current_space->local_remaining -= nzi; 458016a2bf60SHong Zhang current_space_lvl->array += nzi; 458116a2bf60SHong Zhang current_space_lvl->local_used += nzi; 458216a2bf60SHong Zhang current_space_lvl->local_remaining -= nzi; 458316a2bf60SHong Zhang } 458416a2bf60SHong Zhang 458516a2bf60SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 458616a2bf60SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 458716a2bf60SHong Zhang 458816a2bf60SHong Zhang /* destroy list of free space and other temporary arrays */ 458916a2bf60SHong Zhang ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 459016a2bf60SHong Zhang 459116a2bf60SHong Zhang /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 4592783ef271SHong Zhang ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 459316a2bf60SHong Zhang 459416a2bf60SHong Zhang ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 459516a2bf60SHong Zhang ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 
459616a2bf60SHong Zhang ierr = PetscFree(bj_ptr);CHKERRQ(ierr); 459716a2bf60SHong Zhang 459816a2bf60SHong Zhang #if defined(PETSC_USE_INFO) 459916a2bf60SHong Zhang { 460016a2bf60SHong Zhang PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 460116a2bf60SHong Zhang ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 460216a2bf60SHong Zhang ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 460316a2bf60SHong Zhang ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 460416a2bf60SHong Zhang ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 460516a2bf60SHong Zhang if (diagonal_fill) { 460616a2bf60SHong Zhang ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 460716a2bf60SHong Zhang } 460816a2bf60SHong Zhang } 460916a2bf60SHong Zhang #endif 461016a2bf60SHong Zhang 461116a2bf60SHong Zhang /* put together the new matrix */ 461216a2bf60SHong Zhang ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 461316a2bf60SHong Zhang ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 461416a2bf60SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 461516a2bf60SHong Zhang b->free_a = PETSC_TRUE; 461616a2bf60SHong Zhang b->free_ij = PETSC_TRUE; 461716a2bf60SHong Zhang b->singlemalloc = PETSC_FALSE; 46187fa3a6a0SHong Zhang ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 461916a2bf60SHong Zhang b->j = bj; 462016a2bf60SHong Zhang b->i = bi; 462116a2bf60SHong Zhang b->diag = bdiag; 46227f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 462316a2bf60SHong Zhang b->ilen = 0; 462416a2bf60SHong Zhang b->imax = 0; 462516a2bf60SHong Zhang b->row = isrow; 462616a2bf60SHong Zhang b->col = iscol; 462716a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 462816a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 462916a2bf60SHong Zhang 
b->icol = isicol; 46307fa3a6a0SHong Zhang ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 463116a2bf60SHong Zhang /* In b structure: Free imax, ilen, old a, old j. 463216a2bf60SHong Zhang Allocate bdiag, solve_work, new a, new j */ 46337fa3a6a0SHong Zhang ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 463416a2bf60SHong Zhang b->maxnz = b->nz = bi[2*n+1] ; 463516a2bf60SHong Zhang (fact)->info.factor_mallocs = reallocs; 463616a2bf60SHong Zhang (fact)->info.fill_ratio_given = f; 463716a2bf60SHong Zhang (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]); 463816a2bf60SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 46397fa3a6a0SHong Zhang /* set MatSolve routines */ 46407fa3a6a0SHong Zhang if (both_identity){ 46417fa3a6a0SHong Zhang switch (bs){ 46427fa3a6a0SHong Zhang case 2: 46437fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 46447fa3a6a0SHong Zhang break; 46457fa3a6a0SHong Zhang case 3: 46467fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 46477fa3a6a0SHong Zhang break; 46487fa3a6a0SHong Zhang case 4: 46497fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 46507fa3a6a0SHong Zhang break; 46517fa3a6a0SHong Zhang case 5: 46527fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 46537fa3a6a0SHong Zhang break; 46547fa3a6a0SHong Zhang case 6: 46557fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 46567fa3a6a0SHong Zhang break; 46577fa3a6a0SHong Zhang case 7: 46587fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 46597fa3a6a0SHong Zhang break; 46607fa3a6a0SHong Zhang default: 46617fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 46627fa3a6a0SHong Zhang break; 
46637fa3a6a0SHong Zhang } 46647fa3a6a0SHong Zhang } else { 46657fa3a6a0SHong Zhang switch (bs){ 46667fa3a6a0SHong Zhang case 2: 46677fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct; 46687fa3a6a0SHong Zhang break; 46697fa3a6a0SHong Zhang case 3: 46707fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct; 46717fa3a6a0SHong Zhang break; 46727fa3a6a0SHong Zhang case 4: 46737fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct; 46747fa3a6a0SHong Zhang break; 46757fa3a6a0SHong Zhang case 5: 46767fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct; 46777fa3a6a0SHong Zhang break; 46787fa3a6a0SHong Zhang case 6: 46797fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct; 46807fa3a6a0SHong Zhang break; 46817fa3a6a0SHong Zhang case 7: 46827fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct; 46837fa3a6a0SHong Zhang break; 46847fa3a6a0SHong Zhang default: 46857fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 46867fa3a6a0SHong Zhang break; 46877fa3a6a0SHong Zhang } 46887fa3a6a0SHong Zhang } 468916a2bf60SHong Zhang PetscFunctionReturn(0); 469016a2bf60SHong Zhang } 469116a2bf60SHong Zhang 46924e2b4712SSatish Balay /* 46934e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 46944e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 46954e2b4712SSatish Balay Not a good example of code reuse. 
46964e2b4712SSatish Balay */ 46974a2ae208SSatish Balay #undef __FUNCT__ 46984a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 46990481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 47004e2b4712SSatish Balay { 47014e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 47024e2b4712SSatish Balay IS isicol; 47036849ba73SBarry Smith PetscErrorCode ierr; 47045d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 47055d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 4706a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 4707d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 470841df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 4709329f5518SBarry Smith PetscReal f; 471016a2bf60SHong Zhang PetscTruth newdatastruct=PETSC_FALSE; 47114e2b4712SSatish Balay 47124e2b4712SSatish Balay PetscFunctionBegin; 471316a2bf60SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 471416a2bf60SHong Zhang if (newdatastruct){ 471516a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 471616a2bf60SHong Zhang PetscFunctionReturn(0); 471716a2bf60SHong Zhang } 471816a2bf60SHong Zhang 47196bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 47206bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 47216bce7ff8SHong Zhang 4722435faa5fSBarry Smith f = info->fill; 4723690b6cddSBarry Smith levels = (PetscInt)info->levels; 4724690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 47254c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 472616a2bf60SHong Zhang 4727667159a5SBarry Smith 
ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4728667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 47297d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 4730309c388cSBarry Smith 473141df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 473216a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 47336bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 47346bce7ff8SHong Zhang 4735719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 4736719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 4737bb3d539aSBarry Smith b->row = isrow; 4738bb3d539aSBarry Smith b->col = iscol; 4739bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4740bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4741bb3d539aSBarry Smith b->icol = isicol; 4742bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 4743b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 47446bce7ff8SHong Zhang PetscFunctionReturn(0); 47456bce7ff8SHong Zhang } 47466bce7ff8SHong Zhang 47476bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 47484e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 47494e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 47504e2b4712SSatish Balay 47514e2b4712SSatish Balay /* get new row pointers */ 4752690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 47534e2b4712SSatish Balay ainew[0] = 0; 47544e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 4755690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 4756690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 47574e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 4758690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 47594e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 4760690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 47614e2b4712SSatish Balay /* im is level for each filled value */ 4762690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 47634e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 4764690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 47654e2b4712SSatish Balay dloc[0] = 0; 47664e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 4767435faa5fSBarry Smith 4768435faa5fSBarry Smith /* copy prow into linked list */ 47694e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 47703b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 47714e2b4712SSatish Balay xi = aj + ai[r[prow]]; 
47724e2b4712SSatish Balay fill[n] = n; 4773435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 47744e2b4712SSatish Balay while (nz--) { 47754e2b4712SSatish Balay fm = n; 47764e2b4712SSatish Balay idx = ic[*xi++]; 47774e2b4712SSatish Balay do { 47784e2b4712SSatish Balay m = fm; 47794e2b4712SSatish Balay fm = fill[m]; 47804e2b4712SSatish Balay } while (fm < idx); 47814e2b4712SSatish Balay fill[m] = idx; 47824e2b4712SSatish Balay fill[idx] = fm; 47834e2b4712SSatish Balay im[idx] = 0; 47844e2b4712SSatish Balay } 4785435faa5fSBarry Smith 4786435faa5fSBarry Smith /* make sure diagonal entry is included */ 4787435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 4788435faa5fSBarry Smith fm = n; 4789435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 4790435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 4791435faa5fSBarry Smith fill[fm] = prow; 4792435faa5fSBarry Smith im[prow] = 0; 4793435faa5fSBarry Smith nzf++; 4794335d9088SBarry Smith dcount++; 4795435faa5fSBarry Smith } 4796435faa5fSBarry Smith 47974e2b4712SSatish Balay nzi = 0; 47984e2b4712SSatish Balay row = fill[n]; 47994e2b4712SSatish Balay while (row < prow) { 48004e2b4712SSatish Balay incrlev = im[row] + 1; 48014e2b4712SSatish Balay nz = dloc[row]; 4802435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 48034e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 48044e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 48054e2b4712SSatish Balay fm = row; 48064e2b4712SSatish Balay while (nnz-- > 0) { 48074e2b4712SSatish Balay idx = *xi++; 48084e2b4712SSatish Balay if (*flev + incrlev > levels) { 48094e2b4712SSatish Balay flev++; 48104e2b4712SSatish Balay continue; 48114e2b4712SSatish Balay } 48124e2b4712SSatish Balay do { 48134e2b4712SSatish Balay m = fm; 48144e2b4712SSatish Balay fm = fill[m]; 48154e2b4712SSatish Balay } while (fm < idx); 48164e2b4712SSatish Balay if (fm != idx) { 48174e2b4712SSatish Balay im[idx] = *flev + 
incrlev; 48184e2b4712SSatish Balay fill[m] = idx; 48194e2b4712SSatish Balay fill[idx] = fm; 48204e2b4712SSatish Balay fm = idx; 48214e2b4712SSatish Balay nzf++; 4822ecf371e4SBarry Smith } else { 48234e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 48244e2b4712SSatish Balay } 48254e2b4712SSatish Balay flev++; 48264e2b4712SSatish Balay } 48274e2b4712SSatish Balay row = fill[row]; 48284e2b4712SSatish Balay nzi++; 48294e2b4712SSatish Balay } 48304e2b4712SSatish Balay /* copy new filled row into permanent storage */ 48314e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 48324e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 4833ecf371e4SBarry Smith 4834ecf371e4SBarry Smith /* estimate how much additional space we will need */ 4835ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 4836ecf371e4SBarry Smith /* just double the memory each time */ 4837690b6cddSBarry Smith PetscInt maxadd = jmax; 4838ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 48394e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 48404e2b4712SSatish Balay jmax += maxadd; 4841ecf371e4SBarry Smith 4842ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 48435d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 48445d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4845606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 48465d0c19d7SBarry Smith ajnew = xitmp; 48475d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 48485d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4849606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 48505d0c19d7SBarry Smith ajfill = xitmp; 4851eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 48524e2b4712SSatish Balay } 48535d0c19d7SBarry Smith xitmp = 
ajnew + ainew[prow]; 48544e2b4712SSatish Balay flev = ajfill + ainew[prow]; 48554e2b4712SSatish Balay dloc[prow] = nzi; 48564e2b4712SSatish Balay fm = fill[n]; 48574e2b4712SSatish Balay while (nzf--) { 48585d0c19d7SBarry Smith *xitmp++ = fm; 48594e2b4712SSatish Balay *flev++ = im[fm]; 48604e2b4712SSatish Balay fm = fill[fm]; 48614e2b4712SSatish Balay } 4862435faa5fSBarry Smith /* make sure row has diagonal entry */ 4863435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 486477431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 48652401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 4866435faa5fSBarry Smith } 48674e2b4712SSatish Balay } 4868606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 48694e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 48704e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4871606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 4872606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 48734e2b4712SSatish Balay 48746cf91177SBarry Smith #if defined(PETSC_USE_INFO) 48754e2b4712SSatish Balay { 4876329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 4877ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 4878ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 4879ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 4880ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 4881335d9088SBarry Smith if (diagonal_fill) { 4882ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 4883335d9088SBarry Smith } 48844e2b4712SSatish Balay } 488563ba0a88SBarry Smith #endif 48864e2b4712SSatish Balay 48874e2b4712SSatish 
Balay /* put together the new matrix */ 4888719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 4889719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 4890719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 4891e6b907acSBarry Smith b->free_a = PETSC_TRUE; 4892e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 48937c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 4894a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 48954e2b4712SSatish Balay b->j = ajnew; 48964e2b4712SSatish Balay b->i = ainew; 48974e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 48984e2b4712SSatish Balay b->diag = dloc; 48997f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 49004e2b4712SSatish Balay b->ilen = 0; 49014e2b4712SSatish Balay b->imax = 0; 49024e2b4712SSatish Balay b->row = isrow; 49034e2b4712SSatish Balay b->col = iscol; 4904bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4905c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4906c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4907e51c0b9cSSatish Balay b->icol = isicol; 490887828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 49094e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 
49104e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 4911719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 49124e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 49134e2b4712SSatish Balay 4914719d5645SBarry Smith (fact)->info.factor_mallocs = reallocate; 4915719d5645SBarry Smith (fact)->info.fill_ratio_given = f; 4916719d5645SBarry Smith (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 49176bce7ff8SHong Zhang 491841df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 49198661488fSKris Buschelman PetscFunctionReturn(0); 49208661488fSKris Buschelman } 49218661488fSKris Buschelman 4922732ee342SKris Buschelman #undef __FUNCT__ 49237e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 4924dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 49257e7071cdSKris Buschelman { 492612272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 492712272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 49285a9542e3SKris Buschelman PetscFunctionBegin; 49297cf1b8d3SKris Buschelman /* Undo Column scaling */ 49307cf1b8d3SKris Buschelman /* while (nz--) { */ 49317cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 49327cf1b8d3SKris Buschelman /* } */ 4933c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 4934c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 49357cf1b8d3SKris Buschelman PetscFunctionReturn(0); 49367cf1b8d3SKris Buschelman } 49377cf1b8d3SKris Buschelman 49387cf1b8d3SKris Buschelman #undef __FUNCT__ 49397cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 4940dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 49417cf1b8d3SKris Buschelman { 49427cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4943b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 49442aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 49455a9542e3SKris Buschelman PetscFunctionBegin; 49460b9da03eSKris Buschelman /* Is this really necessary? */ 494720235379SKris Buschelman while (nz--) { 49480b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 49497e7071cdSKris Buschelman } 4950c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 49517e7071cdSKris Buschelman PetscFunctionReturn(0); 49527e7071cdSKris Buschelman } 49537e7071cdSKris Buschelman 4954732ee342SKris Buschelman 4955