1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry Smith v = aa 
+ diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + 
diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120f1af5d2fSBarry Smith { 121f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122dfbe8321SBarry Smith PetscErrorCode ierr; 123690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 125f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12687828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12787828ca2SBarry Smith PetscScalar *x,*b; 
128f1af5d2fSBarry Smith 129f1af5d2fSBarry Smith PetscFunctionBegin; 130ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133f1af5d2fSBarry Smith 134f1af5d2fSBarry Smith /* forward solve the U^T */ 135f1af5d2fSBarry Smith idx = 0; 136f1af5d2fSBarry Smith for (i=0; i<n; i++) { 137f1af5d2fSBarry Smith 138f1af5d2fSBarry Smith v = aa + 9*diag[i]; 139f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 140ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144f1af5d2fSBarry Smith v += 9; 145f1af5d2fSBarry Smith 146f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 147f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 148f1af5d2fSBarry Smith while (nz--) { 149f1af5d2fSBarry Smith oidx = 3*(*vi++); 150f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153f1af5d2fSBarry Smith v += 9; 154f1af5d2fSBarry Smith } 155f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156f1af5d2fSBarry Smith idx += 3; 157f1af5d2fSBarry Smith } 158f1af5d2fSBarry Smith /* backward solve the L^T */ 159f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 160f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 161f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 162f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 163f1af5d2fSBarry Smith idt = 3*i; 164f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165f1af5d2fSBarry Smith while (nz--) { 166f1af5d2fSBarry Smith idx = 3*(*vi--); 167f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169f1af5d2fSBarry Smith x[idx+2] 
-= v[6]*s1 + v[7]*s2 + v[8]*s3; 170f1af5d2fSBarry Smith v -= 9; 171f1af5d2fSBarry Smith } 172f1af5d2fSBarry Smith } 1731ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176f1af5d2fSBarry Smith PetscFunctionReturn(0); 177f1af5d2fSBarry Smith } 178f1af5d2fSBarry Smith 1794a2ae208SSatish Balay #undef __FUNCT__ 1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182f1af5d2fSBarry Smith { 183f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184dfbe8321SBarry Smith PetscErrorCode ierr; 185690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 187f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18987828ca2SBarry Smith PetscScalar *x,*b; 190f1af5d2fSBarry Smith 191f1af5d2fSBarry Smith PetscFunctionBegin; 192ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1931ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195f1af5d2fSBarry Smith 196f1af5d2fSBarry Smith /* forward solve the U^T */ 197f1af5d2fSBarry Smith idx = 0; 198f1af5d2fSBarry Smith for (i=0; i<n; i++) { 199f1af5d2fSBarry Smith 200f1af5d2fSBarry Smith v = aa + 16*diag[i]; 201f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 202ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + 
v[14]*x3 + v[15]*x4; 207f1af5d2fSBarry Smith v += 16; 208f1af5d2fSBarry Smith 209f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 210f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 211f1af5d2fSBarry Smith while (nz--) { 212f1af5d2fSBarry Smith oidx = 4*(*vi++); 213f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217f1af5d2fSBarry Smith v += 16; 218f1af5d2fSBarry Smith } 219f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220f1af5d2fSBarry Smith idx += 4; 221f1af5d2fSBarry Smith } 222f1af5d2fSBarry Smith /* backward solve the L^T */ 223f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 224f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 225f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 226f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 227f1af5d2fSBarry Smith idt = 4*i; 228f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229f1af5d2fSBarry Smith while (nz--) { 230f1af5d2fSBarry Smith idx = 4*(*vi--); 231f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235f1af5d2fSBarry Smith v -= 16; 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith } 2381ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241f1af5d2fSBarry Smith PetscFunctionReturn(0); 242f1af5d2fSBarry Smith } 243f1af5d2fSBarry Smith 2444a2ae208SSatish Balay #undef __FUNCT__ 2454a2ae208SSatish Balay #define 
__FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247f1af5d2fSBarry Smith { 248f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249dfbe8321SBarry Smith PetscErrorCode ierr; 250690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 252f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 25387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25487828ca2SBarry Smith PetscScalar *x,*b; 255f1af5d2fSBarry Smith 256f1af5d2fSBarry Smith PetscFunctionBegin; 257ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2581ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2591ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260f1af5d2fSBarry Smith 261f1af5d2fSBarry Smith /* forward solve the U^T */ 262f1af5d2fSBarry Smith idx = 0; 263f1af5d2fSBarry Smith for (i=0; i<n; i++) { 264f1af5d2fSBarry Smith 265f1af5d2fSBarry Smith v = aa + 25*diag[i]; 266f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 267ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273f1af5d2fSBarry Smith v += 25; 274f1af5d2fSBarry Smith 275f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 276f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 277f1af5d2fSBarry Smith while (nz--) { 278f1af5d2fSBarry Smith oidx = 5*(*vi++); 279f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280f1af5d2fSBarry Smith x[oidx+1] 
-= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284f1af5d2fSBarry Smith v += 25; 285f1af5d2fSBarry Smith } 286f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287f1af5d2fSBarry Smith idx += 5; 288f1af5d2fSBarry Smith } 289f1af5d2fSBarry Smith /* backward solve the L^T */ 290f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 291f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 292f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 293f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 294f1af5d2fSBarry Smith idt = 5*i; 295f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296f1af5d2fSBarry Smith while (nz--) { 297f1af5d2fSBarry Smith idx = 5*(*vi--); 298f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303f1af5d2fSBarry Smith v -= 25; 304f1af5d2fSBarry Smith } 305f1af5d2fSBarry Smith } 3061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309f1af5d2fSBarry Smith PetscFunctionReturn(0); 310f1af5d2fSBarry Smith } 311f1af5d2fSBarry Smith 3124a2ae208SSatish Balay #undef __FUNCT__ 3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314dfbe8321SBarry Smith 
/*
   MatSolveTranspose_SeqBAIJ_6_NaturalOrdering - transpose triangular solves
   for block size 6 with no row/column permutation.

   bb is copied into xx and the solve runs in place on xx.  Each block is 36
   consecutive MatScalars in a->a; a->diag[i] is the block index of the
   diagonal block of block row i, which is applied by multiplication (it is
   stored already inverted, per the in-loop comment).  Returns 0 or a PETSc
   error code propagated by CHKERRQ.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
  PetscScalar    *x;

  PetscFunctionBegin;
  /* xx starts as a copy of bb, so bb's array is never needed directly */
  ierr = VecCopy(bb,xx);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v     = aa + 36*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
    x6    = x[5+idx];
    s1 = v[0]*x1  +  v[1]*x2   + v[2]*x3  + v[3]*x4  + v[4]*x5  + v[5]*x6;
    s2 = v[6]*x1  +  v[7]*x2   + v[8]*x3  + v[9]*x4  + v[10]*x5 + v[11]*x6;
    s3 = v[12]*x1 +  v[13]*x2  + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
    s4 = v[18]*x1 +  v[19]*x2  + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
    s5 = v[24]*x1 +  v[25]*x2  + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
    s6 = v[30]*x1 +  v[31]*x2  + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
    v += 36;

    /* blocks after the diagonal in block row i */
    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 6*(*vi++);
      x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4  + v[4]*s5  + v[5]*s6;
      x[oidx+1] -= v[6]*s1  +  v[7]*s2  + v[8]*s3  + v[9]*s4  + v[10]*s5 + v[11]*s6;
      x[oidx+2] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
      x[oidx+3] -= v[18]*s1 +  v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
      x[oidx+4] -= v[24]*s1 +  v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
      x[oidx+5] -= v[30]*s1 +  v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
      v  += 36;
    }
    x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
    x[5+idx] = s6;
    idx += 6;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    /* blocks before the diagonal in block row i, walked right to left */
    v    = aa + 36*diag[i] - 36;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 6*i;
    s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
    s6 = x[5+idt];
    while (nz--) {
      idx   = 6*(*vi--);
      x[idx]   -=  v[0]*s1  +  v[1]*s2  +  v[2]*s3  +  v[3]*s4  +  v[4]*s5  +  v[5]*s6;
      x[idx+1] -=  v[6]*s1  +  v[7]*s2  +  v[8]*s3  +  v[9]*s4  +  v[10]*s5 +  v[11]*s6;
      x[idx+2] -=  v[12]*s1 +  v[13]*s2 +  v[14]*s3 +  v[15]*s4 +  v[16]*s5 +  v[17]*s6;
      x[idx+3] -=  v[18]*s1 +  v[19]*s2 +  v[20]*s3 +  v[21]*s4 +  v[22]*s5 +  v[23]*s6;
      x[idx+4] -=  v[24]*s1 +  v[25]*s2 +  v[26]*s3 +  v[27]*s4 +  v[28]*s5 +  v[29]*s6;
      x[idx+5] -=  v[30]*s1 +  v[31]*s2 +  v[32]*s3 +  v[33]*s4 +  v[34]*s5 +  v[35]*s6;
      v -= 36;
    }
  }
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
/*
   MatSolveTranspose_SeqBAIJ_7_NaturalOrdering - transpose triangular solves
   for block size 7 with no row/column permutation.

   bb is copied into xx and the solve runs in place on xx.  Each block is 49
   consecutive MatScalars in a->a; a->diag[i] is the block index of the
   diagonal block of block row i, which is applied by multiplication (it is
   stored already inverted, per the in-loop comment).  Returns 0 or a PETSc
   error code propagated by CHKERRQ.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  PetscScalar    *x;

  PetscFunctionBegin;
  /* xx starts as a copy of bb, so bb's array is never needed directly */
  ierr = VecCopy(bb,xx);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v     = aa + 49*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
    x6    = x[5+idx]; x7 = x[6+idx];
    s1 = v[0]*x1  +  v[1]*x2   + v[2]*x3  + v[3]*x4  + v[4]*x5  + v[5]*x6  + v[6]*x7;
    s2 = v[7]*x1  +  v[8]*x2   + v[9]*x3  + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
    s3 = v[14]*x1 +  v[15]*x2  + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
    s4 = v[21]*x1 +  v[22]*x2  + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
    s5 = v[28]*x1 +  v[29]*x2  + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
    s6 = v[35]*x1 +  v[36]*x2  + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
    s7 = v[42]*x1 +  v[43]*x2  + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
    v += 49;

    /* blocks after the diagonal in block row i */
    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 7*(*vi++);
      x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4  + v[4]*s5  + v[5]*s6  + v[6]*s7;
      x[oidx+1] -= v[7]*s1  +  v[8]*s2  + v[9]*s3  + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
      x[oidx+2] -= v[14]*s1 +  v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
      x[oidx+3] -= v[21]*s1 +  v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
      x[oidx+4] -= v[28]*s1 +  v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
      x[oidx+5] -= v[35]*s1 +  v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
      x[oidx+6] -= v[42]*s1 +  v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
      v  += 49;
    }
    x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
    x[5+idx] = s6;x[6+idx] = s7;
    idx += 7;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    /* blocks before the diagonal in block row i, walked right to left */
    v    = aa + 49*diag[i] - 49;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 7*i;
    s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
    s6 = x[5+idt];s7 = x[6+idt];
    while (nz--) {
      idx   = 7*(*vi--);
      x[idx]   -=  v[0]*s1  +  v[1]*s2  +  v[2]*s3  +  v[3]*s4  +  v[4]*s5  +  v[5]*s6  +  v[6]*s7;
      x[idx+1] -=  v[7]*s1  +  v[8]*s2  +  v[9]*s3  +  v[10]*s4 +  v[11]*s5 +  v[12]*s6 +  v[13]*s7;
      x[idx+2] -=  v[14]*s1 +  v[15]*s2 +  v[16]*s3 +  v[17]*s4 +  v[18]*s5 +  v[19]*s6 +  v[20]*s7;
      x[idx+3] -=  v[21]*s1 +  v[22]*s2 +  v[23]*s3 +  v[24]*s4 +  v[25]*s5 +  v[26]*s6 +  v[27]*s7;
      x[idx+4] -=  v[28]*s1 +  v[29]*s2 +  v[30]*s3 +  v[31]*s4 +  v[32]*s5 +  v[33]*s6 +  v[34]*s7;
      x[idx+5] -=  v[35]*s1 +  v[36]*s2 +  v[37]*s3 +  v[38]*s4 +  v[39]*s5 +  v[40]*s6 +  v[41]*s7;
      x[idx+6] -=  v[42]*s1 +  v[43]*s2 +  v[44]*s3 +  v[45]*s4 +  v[46]*s5 +  v[47]*s6 +  v[48]*s7;
      v -= 49;
    }
  }
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*---------------------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
/*
   MatSolveTranspose_SeqBAIJ_1 - transpose triangular solves for block size 1
   with row and column permutations.

   Input:
     A  - matrix whose data is a Mat_SeqBAIJ holding the factors, a->diag,
          the index sets a->row / a->col and the scratch array a->solve_work
     bb - right-hand side
   Output:
     xx - solution

   bb is gathered into a->solve_work through the column index set, the two
   sweeps run in place on that scratch array, and the result is scattered
   into xx through the row index set.  Because a->solve_work belongs to the
   matrix, concurrent solves on the same A would share it.  Returns 0 or a
   PETSc error code propagated by CHKERRQ.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
  PetscInt       *diag = a->diag;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,*x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  for (i=0; i<n; i++) {
    t[i] = b[c[i]];
  }

  /* forward solve the U^T */
  for (i=0; i<n; i++) {

    v     = aa + diag[i];
    /* multiply by the inverse of the block diagonal */
    s1    = (*v++)*t[i];
    /* entries after the diagonal in row i */
    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      t[*vi++]  -=  (*v++)*s1;
    }
    t[i]   = s1;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    /* entries before the diagonal in row i, walked right to left */
    v    = aa + diag[i] - 1;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    s1   = t[i];
    while (nz--) {
      t[*vi--]   -=  (*v--)*s1;
    }
  }

  /* copy t into x according to permutation */
  for (i=0; i<n; i++) {
    x[r[i]]   = t[i];
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
/*
   MatSolveTranspose_SeqBAIJ_2 - transpose triangular solves for block size 2
   with row and column permutations.

   bb is gathered block-by-block into a->solve_work through the column index
   set, the two sweeps run in place on that scratch array, and the result is
   scattered into xx through the row index set.  Each block is 4 consecutive
   MatScalars in a->a.  Returns 0 or a PETSc error code propagated by CHKERRQ.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,ii,ic,ir,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,x1,x2;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ic      = 2*c[i];
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    ii += 2;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v     = aa + 4*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = t[idx];   x2 = t[1+idx];
    s1 = v[0]*x1  +  v[1]*x2;
    s2 = v[2]*x1  +  v[3]*x2;
    v += 4;

    /* blocks after the diagonal in block row i */
    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 2*(*vi++);
      t[oidx]   -= v[0]*s1  +  v[1]*s2;
      t[oidx+1] -= v[2]*s1  +  v[3]*s2;
      v  += 4;
    }
    t[idx]   = s1;t[1+idx] = s2;
    idx += 2;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    /* blocks before the diagonal in block row i, walked right to left */
    v    = aa + 4*diag[i] - 4;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 2*i;
    s1 = t[idt];  s2 = t[1+idt];
    while (nz--) {
      idx   = 2*(*vi--);
      t[idx]   -=  v[0]*s1 +  v[1]*s2;
      t[idx+1] -=  v[2]*s1 +  v[3]*s2;
      v -= 4;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 2*r[i];
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    ii += 2;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
/*
   MatSolveTranspose_SeqBAIJ_3 - transpose triangular solves for block size 3
   with row and column permutations.

   bb is gathered block-by-block into a->solve_work through the column index
   set, the two sweeps run in place on that scratch array, and the result is
   scattered into xx through the row index set.  Each block is 9 consecutive
   MatScalars in a->a.  Returns 0 or a PETSc error code propagated by CHKERRQ.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,ii,ic,ir,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,x1,x2,x3;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ic      = 3*c[i];
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    t[ii+2] = b[ic+2];
    ii += 3;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v     = aa + 9*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
    s1 = v[0]*x1  +  v[1]*x2 + v[2]*x3;
    s2 = v[3]*x1  +  v[4]*x2 + v[5]*x3;
    s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
    v += 9;

    /* blocks after the diagonal in block row i */
    vi    = aj + diag[i] + 1;
    nz    = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 3*(*vi++);
      t[oidx]   -= v[0]*s1  +  v[1]*s2 + v[2]*s3;
      t[oidx+1] -= v[3]*s1  +  v[4]*s2 + v[5]*s3;
      t[oidx+2] -= v[6]*s1  +  v[7]*s2 + v[8]*s3;
      v  += 9;
    }
    t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
    idx += 3;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    /* blocks before the diagonal in block row i, walked right to left */
    v    = aa + 9*diag[i] - 9;
    vi   = aj + diag[i] - 1;
    nz   = diag[i] - ai[i];
    idt  = 3*i;
    s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
    while (nz--) {
      idx   = 3*(*vi--);
      t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
      t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
      t[idx+2] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3;
      v -= 9;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 3*r[i];
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    x[ir+2] = t[ii+2];
    ii += 3;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscInt       *diag = a->diag,ii,ic,ir,oidx;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr =
ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723f1af5d2fSBarry Smith 724f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 725f1af5d2fSBarry Smith ii = 0; 726f1af5d2fSBarry Smith for (i=0; i<n; i++) { 727f1af5d2fSBarry Smith ic = 4*c[i]; 728f1af5d2fSBarry Smith t[ii] = b[ic]; 729f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 730f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 731f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 732f1af5d2fSBarry Smith ii += 4; 733f1af5d2fSBarry Smith } 734f1af5d2fSBarry Smith 735f1af5d2fSBarry Smith /* forward solve the U^T */ 736f1af5d2fSBarry Smith idx = 0; 737f1af5d2fSBarry Smith for (i=0; i<n; i++) { 738f1af5d2fSBarry Smith 739f1af5d2fSBarry Smith v = aa + 16*diag[i]; 740f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 741f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746f1af5d2fSBarry Smith v += 16; 747f1af5d2fSBarry Smith 748f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 749f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 750f1af5d2fSBarry Smith while (nz--) { 751f1af5d2fSBarry Smith oidx = 4*(*vi++); 752f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756f1af5d2fSBarry Smith v += 16; 757f1af5d2fSBarry Smith } 758f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759f1af5d2fSBarry Smith idx += 4; 760f1af5d2fSBarry Smith } 761f1af5d2fSBarry Smith /* backward solve the L^T */ 762f1af5d2fSBarry Smith for (i=n-1; 
i>=0; i--){ 763f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 764f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 765f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 766f1af5d2fSBarry Smith idt = 4*i; 767f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768f1af5d2fSBarry Smith while (nz--) { 769f1af5d2fSBarry Smith idx = 4*(*vi--); 770f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774f1af5d2fSBarry Smith v -= 16; 775f1af5d2fSBarry Smith } 776f1af5d2fSBarry Smith } 777f1af5d2fSBarry Smith 778f1af5d2fSBarry Smith /* copy t into x according to permutation */ 779f1af5d2fSBarry Smith ii = 0; 780f1af5d2fSBarry Smith for (i=0; i<n; i++) { 781f1af5d2fSBarry Smith ir = 4*r[i]; 782f1af5d2fSBarry Smith x[ir] = t[ii]; 783f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 784f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 785f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 786f1af5d2fSBarry Smith ii += 4; 787f1af5d2fSBarry Smith } 788f1af5d2fSBarry Smith 789f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794f1af5d2fSBarry Smith PetscFunctionReturn(0); 795f1af5d2fSBarry Smith } 796f1af5d2fSBarry Smith 7974a2ae208SSatish Balay #undef __FUNCT__ 7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800f1af5d2fSBarry Smith { 801f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802f1af5d2fSBarry Smith IS 
iscol=a->col,isrow=a->row; 8036849ba73SBarry Smith PetscErrorCode ierr; 8045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 807f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 80887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 80987828ca2SBarry Smith PetscScalar *x,*b,*t; 810f1af5d2fSBarry Smith 811f1af5d2fSBarry Smith PetscFunctionBegin; 8121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814f1af5d2fSBarry Smith t = a->solve_work; 815f1af5d2fSBarry Smith 816f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818f1af5d2fSBarry Smith 819f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 820f1af5d2fSBarry Smith ii = 0; 821f1af5d2fSBarry Smith for (i=0; i<n; i++) { 822f1af5d2fSBarry Smith ic = 5*c[i]; 823f1af5d2fSBarry Smith t[ii] = b[ic]; 824f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 825f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 826f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 827f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 828f1af5d2fSBarry Smith ii += 5; 829f1af5d2fSBarry Smith } 830f1af5d2fSBarry Smith 831f1af5d2fSBarry Smith /* forward solve the U^T */ 832f1af5d2fSBarry Smith idx = 0; 833f1af5d2fSBarry Smith for (i=0; i<n; i++) { 834f1af5d2fSBarry Smith 835f1af5d2fSBarry Smith v = aa + 25*diag[i]; 836f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 837f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 
841f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843f1af5d2fSBarry Smith v += 25; 844f1af5d2fSBarry Smith 845f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 846f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 847f1af5d2fSBarry Smith while (nz--) { 848f1af5d2fSBarry Smith oidx = 5*(*vi++); 849f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854f1af5d2fSBarry Smith v += 25; 855f1af5d2fSBarry Smith } 856f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857f1af5d2fSBarry Smith idx += 5; 858f1af5d2fSBarry Smith } 859f1af5d2fSBarry Smith /* backward solve the L^T */ 860f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 861f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 862f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 863f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 864f1af5d2fSBarry Smith idt = 5*i; 865f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866f1af5d2fSBarry Smith while (nz--) { 867f1af5d2fSBarry Smith idx = 5*(*vi--); 868f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873f1af5d2fSBarry Smith v -= 25; 
874f1af5d2fSBarry Smith } 875f1af5d2fSBarry Smith } 876f1af5d2fSBarry Smith 877f1af5d2fSBarry Smith /* copy t into x according to permutation */ 878f1af5d2fSBarry Smith ii = 0; 879f1af5d2fSBarry Smith for (i=0; i<n; i++) { 880f1af5d2fSBarry Smith ir = 5*r[i]; 881f1af5d2fSBarry Smith x[ir] = t[ii]; 882f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 883f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 884f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 885f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 886f1af5d2fSBarry Smith ii += 5; 887f1af5d2fSBarry Smith } 888f1af5d2fSBarry Smith 889f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894f1af5d2fSBarry Smith PetscFunctionReturn(0); 895f1af5d2fSBarry Smith } 896f1af5d2fSBarry Smith 8974a2ae208SSatish Balay #undef __FUNCT__ 8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900f1af5d2fSBarry Smith { 901f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9036849ba73SBarry Smith PetscErrorCode ierr; 9045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 907f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 90887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 90987828ca2SBarry Smith PetscScalar *x,*b,*t; 910f1af5d2fSBarry Smith 911f1af5d2fSBarry Smith PetscFunctionBegin; 9121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
914f1af5d2fSBarry Smith t = a->solve_work; 915f1af5d2fSBarry Smith 916f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918f1af5d2fSBarry Smith 919f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 920f1af5d2fSBarry Smith ii = 0; 921f1af5d2fSBarry Smith for (i=0; i<n; i++) { 922f1af5d2fSBarry Smith ic = 6*c[i]; 923f1af5d2fSBarry Smith t[ii] = b[ic]; 924f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 925f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 926f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 927f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 928f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 929f1af5d2fSBarry Smith ii += 6; 930f1af5d2fSBarry Smith } 931f1af5d2fSBarry Smith 932f1af5d2fSBarry Smith /* forward solve the U^T */ 933f1af5d2fSBarry Smith idx = 0; 934f1af5d2fSBarry Smith for (i=0; i<n; i++) { 935f1af5d2fSBarry Smith 936f1af5d2fSBarry Smith v = aa + 36*diag[i]; 937f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 938f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939f1af5d2fSBarry Smith x6 = t[5+idx]; 940f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946f1af5d2fSBarry Smith v += 36; 947f1af5d2fSBarry Smith 948f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 949f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 950f1af5d2fSBarry Smith while (nz--) { 951f1af5d2fSBarry Smith oidx = 6*(*vi++); 
952f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958f1af5d2fSBarry Smith v += 36; 959f1af5d2fSBarry Smith } 960f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961f1af5d2fSBarry Smith t[5+idx] = s6; 962f1af5d2fSBarry Smith idx += 6; 963f1af5d2fSBarry Smith } 964f1af5d2fSBarry Smith /* backward solve the L^T */ 965f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 966f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 967f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 968f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 969f1af5d2fSBarry Smith idt = 6*i; 970f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971f1af5d2fSBarry Smith s6 = t[5+idt]; 972f1af5d2fSBarry Smith while (nz--) { 973f1af5d2fSBarry Smith idx = 6*(*vi--); 974f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980f1af5d2fSBarry Smith v -= 36; 981f1af5d2fSBarry Smith } 
982f1af5d2fSBarry Smith } 983f1af5d2fSBarry Smith 984f1af5d2fSBarry Smith /* copy t into x according to permutation */ 985f1af5d2fSBarry Smith ii = 0; 986f1af5d2fSBarry Smith for (i=0; i<n; i++) { 987f1af5d2fSBarry Smith ir = 6*r[i]; 988f1af5d2fSBarry Smith x[ir] = t[ii]; 989f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 990f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 991f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 992f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 993f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 994f1af5d2fSBarry Smith ii += 6; 995f1af5d2fSBarry Smith } 996f1af5d2fSBarry Smith 997f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9991ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002f1af5d2fSBarry Smith PetscFunctionReturn(0); 1003f1af5d2fSBarry Smith } 1004f1af5d2fSBarry Smith 10054a2ae208SSatish Balay #undef __FUNCT__ 10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008f1af5d2fSBarry Smith { 1009f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10116849ba73SBarry Smith PetscErrorCode ierr; 10125d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10135d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 101687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 101787828ca2SBarry Smith PetscScalar *x,*b,*t; 1018f1af5d2fSBarry Smith 1019f1af5d2fSBarry Smith PetscFunctionBegin; 10201ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10211ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 1022f1af5d2fSBarry Smith t = a->solve_work; 1023f1af5d2fSBarry Smith 1024f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026f1af5d2fSBarry Smith 1027f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1028f1af5d2fSBarry Smith ii = 0; 1029f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1030f1af5d2fSBarry Smith ic = 7*c[i]; 1031f1af5d2fSBarry Smith t[ii] = b[ic]; 1032f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1033f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1034f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1035f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1036f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1037f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1038f1af5d2fSBarry Smith ii += 7; 1039f1af5d2fSBarry Smith } 1040f1af5d2fSBarry Smith 1041f1af5d2fSBarry Smith /* forward solve the U^T */ 1042f1af5d2fSBarry Smith idx = 0; 1043f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1044f1af5d2fSBarry Smith 1045f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1046f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1047f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1049f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055f1af5d2fSBarry Smith s7 = v[42]*x1 + 
v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056f1af5d2fSBarry Smith v += 49; 1057f1af5d2fSBarry Smith 1058f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1059f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1060f1af5d2fSBarry Smith while (nz--) { 1061f1af5d2fSBarry Smith oidx = 7*(*vi++); 1062f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069f1af5d2fSBarry Smith v += 49; 1070f1af5d2fSBarry Smith } 1071f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1073f1af5d2fSBarry Smith idx += 7; 1074f1af5d2fSBarry Smith } 1075f1af5d2fSBarry Smith /* backward solve the L^T */ 1076f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1077f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1078f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1079f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1080f1af5d2fSBarry Smith idt = 7*i; 1081f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1083f1af5d2fSBarry Smith while (nz--) { 1084f1af5d2fSBarry Smith idx = 7*(*vi--); 1085f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 
1086f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092f1af5d2fSBarry Smith v -= 49; 1093f1af5d2fSBarry Smith } 1094f1af5d2fSBarry Smith } 1095f1af5d2fSBarry Smith 1096f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1097f1af5d2fSBarry Smith ii = 0; 1098f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1099f1af5d2fSBarry Smith ir = 7*r[i]; 1100f1af5d2fSBarry Smith x[ir] = t[ii]; 1101f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1102f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1103f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1104f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1105f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1106f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1107f1af5d2fSBarry Smith ii += 7; 1108f1af5d2fSBarry Smith } 1109f1af5d2fSBarry Smith 1110f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11121ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115f1af5d2fSBarry Smith PetscFunctionReturn(0); 1116f1af5d2fSBarry Smith } 1117f1af5d2fSBarry Smith 11184e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11194a2ae208SSatish Balay #undef __FUNCT__ 
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11224e2b4712SSatish Balay { 11234e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11244e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11256849ba73SBarry Smith PetscErrorCode ierr; 11265d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11275d0c19d7SBarry Smith PetscInt i,n=a->mbs; 11285d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11293f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 113087828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11314e2b4712SSatish Balay 11324e2b4712SSatish Balay PetscFunctionBegin; 11331ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135f1af5d2fSBarry Smith t = a->solve_work; 11364e2b4712SSatish Balay 11374e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11384e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11394e2b4712SSatish Balay 11404e2b4712SSatish Balay /* forward solve the lower triangular */ 114187828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11424e2b4712SSatish Balay for (i=1; i<n; i++) { 11434e2b4712SSatish Balay v = aa + bs2*ai[i]; 11444e2b4712SSatish Balay vi = aj + ai[i]; 11454e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1146f1af5d2fSBarry Smith s = t + bs*i; 114787828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11484e2b4712SSatish Balay while (nz--) { 1149f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11504e2b4712SSatish Balay v += bs2; 11514e2b4712SSatish Balay } 11524e2b4712SSatish Balay } 11534e2b4712SSatish Balay /* backward solve the upper triangular */ 1154d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 11554e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 
11564e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11574e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11584e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 115987828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11604e2b4712SSatish Balay while (nz--) { 1161f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11624e2b4712SSatish Balay v += bs2; 11634e2b4712SSatish Balay } 1164f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 116587828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11664e2b4712SSatish Balay } 11674e2b4712SSatish Balay 11684e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11694e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11701ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 11734e2b4712SSatish Balay PetscFunctionReturn(0); 11744e2b4712SSatish Balay } 11754e2b4712SSatish Balay 11764a2ae208SSatish Balay #undef __FUNCT__ 11774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1178dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11794e2b4712SSatish Balay { 11804e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11814e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11826849ba73SBarry Smith PetscErrorCode ierr; 11835d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 11845d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 11853f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 118687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 118787828ca2SBarry Smith PetscScalar *x,*b,*t; 11884e2b4712SSatish Balay 11894e2b4712SSatish Balay PetscFunctionBegin; 11901ebc52fbSHong 
Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1192f1af5d2fSBarry Smith t = a->solve_work; 11934e2b4712SSatish Balay 11944e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11954e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11964e2b4712SSatish Balay 11974e2b4712SSatish Balay /* forward solve the lower triangular */ 11984e2b4712SSatish Balay idx = 7*(*r++); 1199f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1200f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1201f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 12024e2b4712SSatish Balay 12034e2b4712SSatish Balay for (i=1; i<n; i++) { 12044e2b4712SSatish Balay v = aa + 49*ai[i]; 12054e2b4712SSatish Balay vi = aj + ai[i]; 12064e2b4712SSatish Balay nz = diag[i] - ai[i]; 12074e2b4712SSatish Balay idx = 7*(*r++); 1208f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1209f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 12104e2b4712SSatish Balay while (nz--) { 12114e2b4712SSatish Balay idx = 7*(*vi++); 1212f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1213f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1214f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1215f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1216f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1217f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1218f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1219f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1220f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1221f1af5d2fSBarry 
Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12224e2b4712SSatish Balay v += 49; 12234e2b4712SSatish Balay } 12244e2b4712SSatish Balay idx = 7*i; 1225f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1226f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1227f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12284e2b4712SSatish Balay } 12294e2b4712SSatish Balay /* backward solve the upper triangular */ 12304e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12314e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12324e2b4712SSatish Balay vi = aj + diag[i] + 1; 12334e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12344e2b4712SSatish Balay idt = 7*i; 1235f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1236f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1237f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12384e2b4712SSatish Balay while (nz--) { 12394e2b4712SSatish Balay idx = 7*(*vi++); 1240f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1241f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1243f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1244f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1245f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1246f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1247f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1248f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1249f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12504e2b4712SSatish Balay v += 49; 12514e2b4712SSatish Balay } 12524e2b4712SSatish Balay idc = 7*(*c--); 
12534e2b4712SSatish Balay v = aa + 49*diag[i]; 1254f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1255f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1256f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1257f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1258f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1259f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1260f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1261f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1262f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1263f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1264f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1265f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1266f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1267f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12684e2b4712SSatish Balay } 12694e2b4712SSatish Balay 12704e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12714e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12721ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12731ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1274dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 12754e2b4712SSatish Balay PetscFunctionReturn(0); 12764e2b4712SSatish Balay } 12774e2b4712SSatish Balay 12784a2ae208SSatish Balay #undef __FUNCT__ 12798f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 12808f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 12818f690400SShri Abhyankar { 12828f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 12838f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 12848f690400SShri Abhyankar PetscErrorCode ierr; 12858f690400SShri Abhyankar const 
PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi; 128629b92fc1SShri Abhyankar PetscInt i,n=a->mbs,nz,idx,idt,idc,k,m; 12878f690400SShri Abhyankar MatScalar *aa=a->a,*v; 12888f690400SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 12898f690400SShri Abhyankar PetscScalar *x,*b,*t; 12908f690400SShri Abhyankar 12918f690400SShri Abhyankar PetscFunctionBegin; 12928f690400SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 12938f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 12948f690400SShri Abhyankar t = a->solve_work; 12958f690400SShri Abhyankar 12968f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 129729b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 12988f690400SShri Abhyankar 12998f690400SShri Abhyankar /* forward solve the lower triangular */ 130029b92fc1SShri Abhyankar idx = 7*r[0]; 13018f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 13028f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 13038f690400SShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 13048f690400SShri Abhyankar 13058f690400SShri Abhyankar for (i=1; i<n; i++) { 13068f690400SShri Abhyankar v = aa + 49*ai[i]; 13078f690400SShri Abhyankar vi = aj + ai[i]; 13088f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 130929b92fc1SShri Abhyankar idx = 7*r[i]; 13108f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 13118f690400SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 131229b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 131329b92fc1SShri Abhyankar idx = 7*vi[m]; 13148f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 13158f690400SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 13168f690400SShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 13178f690400SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 13188f690400SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + 
v[43]*x7; 13198f690400SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 13208f690400SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 13218f690400SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 13228f690400SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 13238f690400SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13248f690400SShri Abhyankar v += 49; 13258f690400SShri Abhyankar } 13268f690400SShri Abhyankar idx = 7*i; 13278f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 13288f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 13298f690400SShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 13308f690400SShri Abhyankar } 13318f690400SShri Abhyankar /* backward solve the upper triangular */ 13328f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 13338f690400SShri Abhyankar k = 2*n-i; 13348f690400SShri Abhyankar v = aa + 49*ai[k]; 13358f690400SShri Abhyankar vi = aj + ai[k]; 13368f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 13378f690400SShri Abhyankar idt = 7*i; 13388f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 13398f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 13408f690400SShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 134129b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 134229b92fc1SShri Abhyankar idx = 7*vi[m]; 13438f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 13448f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 13458f690400SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 13468f690400SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 13478f690400SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 13488f690400SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + 
v[44]*x7; 13498f690400SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 13508f690400SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 13518f690400SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 13528f690400SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13538f690400SShri Abhyankar v += 49; 13548f690400SShri Abhyankar } 135529b92fc1SShri Abhyankar idc = 7*c[i]; 13568f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 13578f690400SShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 13588f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 13598f690400SShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 13608f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 13618f690400SShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 13628f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 13638f690400SShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 13648f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 13658f690400SShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 13668f690400SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 13678f690400SShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 13688f690400SShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 13698f690400SShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 13708f690400SShri Abhyankar } 13718f690400SShri Abhyankar 13728f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 13738f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 13748f690400SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 13758f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 13768f690400SShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 
7.0*A->cmap->n);CHKERRQ(ierr); 13778f690400SShri Abhyankar PetscFunctionReturn(0); 13788f690400SShri Abhyankar } 13798f690400SShri Abhyankar 13808f690400SShri Abhyankar #undef __FUNCT__ 138135aa4fcfSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct_v2" 138235aa4fcfSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct_v2(Mat A,Vec bb,Vec xx) 138335aa4fcfSShri Abhyankar { 138435aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 138535aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 138635aa4fcfSShri Abhyankar PetscErrorCode ierr; 138735aa4fcfSShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 138835aa4fcfSShri Abhyankar PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 138935aa4fcfSShri Abhyankar MatScalar *aa=a->a,*v; 139035aa4fcfSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 139135aa4fcfSShri Abhyankar PetscScalar *x,*b,*t; 139235aa4fcfSShri Abhyankar 139335aa4fcfSShri Abhyankar PetscFunctionBegin; 139435aa4fcfSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 139535aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 139635aa4fcfSShri Abhyankar t = a->solve_work; 139735aa4fcfSShri Abhyankar 139835aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 139935aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 140035aa4fcfSShri Abhyankar 140135aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 140235aa4fcfSShri Abhyankar idx = 7*r[0]; 140335aa4fcfSShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 140435aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 140535aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 140635aa4fcfSShri Abhyankar 140735aa4fcfSShri Abhyankar for (i=1; i<n; i++) { 140835aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 140935aa4fcfSShri Abhyankar vi = aj + ai[i]; 141035aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 141135aa4fcfSShri Abhyankar idx = 7*r[i]; 
141235aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 141335aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 141435aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 141535aa4fcfSShri Abhyankar idx = 7*vi[m]; 141635aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 141735aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 141835aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 141935aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 142035aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 142135aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 142235aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 142335aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 142435aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 142535aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 142635aa4fcfSShri Abhyankar v += 49; 142735aa4fcfSShri Abhyankar } 142835aa4fcfSShri Abhyankar idx = 7*i; 142935aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 143035aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 143135aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 143235aa4fcfSShri Abhyankar } 143335aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 143435aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 143535aa4fcfSShri Abhyankar v = aa + 49*(adiag[i+1]+1); 143635aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 143735aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 143835aa4fcfSShri Abhyankar idt = 7*i; 143935aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 144035aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 144135aa4fcfSShri Abhyankar s6 = 
t[5+idt];s7 = t[6+idt]; 144235aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 144335aa4fcfSShri Abhyankar idx = 7*vi[m]; 144435aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 144535aa4fcfSShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 144635aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 144735aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 144835aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 144935aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 145035aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 145135aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 145235aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 145335aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 145435aa4fcfSShri Abhyankar v += 49; 145535aa4fcfSShri Abhyankar } 145635aa4fcfSShri Abhyankar idc = 7*c[i]; 145735aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 145835aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 145935aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 146035aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 146135aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 146235aa4fcfSShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 146335aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 146435aa4fcfSShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 146535aa4fcfSShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 146635aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 146735aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 146835aa4fcfSShri Abhyankar 
v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 146935aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 147035aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 147135aa4fcfSShri Abhyankar } 147235aa4fcfSShri Abhyankar 147335aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 147435aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 147535aa4fcfSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 147635aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 147735aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 147835aa4fcfSShri Abhyankar PetscFunctionReturn(0); 147935aa4fcfSShri Abhyankar } 148035aa4fcfSShri Abhyankar 148135aa4fcfSShri Abhyankar #undef __FUNCT__ 14824a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1483dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 148415091d37SBarry Smith { 148515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1486690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1487dfbe8321SBarry Smith PetscErrorCode ierr; 1488690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1489d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1490d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1491d9fead3dSBarry Smith const PetscScalar *b; 149215091d37SBarry Smith 149315091d37SBarry Smith PetscFunctionBegin; 1494d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14951ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 149615091d37SBarry Smith /* forward solve the lower triangular */ 149715091d37SBarry Smith idx = 0; 149815091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 149915091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 150015091d37SBarry Smith x[6] = b[6+idx]; 150115091d37SBarry Smith for (i=1; i<n; i++) { 
150215091d37SBarry Smith v = aa + 49*ai[i]; 150315091d37SBarry Smith vi = aj + ai[i]; 150415091d37SBarry Smith nz = diag[i] - ai[i]; 150515091d37SBarry Smith idx = 7*i; 1506f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1507f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1508f1af5d2fSBarry Smith s7 = b[6+idx]; 150915091d37SBarry Smith while (nz--) { 151015091d37SBarry Smith jdx = 7*(*vi++); 151115091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 151215091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 151315091d37SBarry Smith x7 = x[6+jdx]; 1514f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1515f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1516f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1517f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1518f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1519f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1520f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 152115091d37SBarry Smith v += 49; 152215091d37SBarry Smith } 1523f1af5d2fSBarry Smith x[idx] = s1; 1524f1af5d2fSBarry Smith x[1+idx] = s2; 1525f1af5d2fSBarry Smith x[2+idx] = s3; 1526f1af5d2fSBarry Smith x[3+idx] = s4; 1527f1af5d2fSBarry Smith x[4+idx] = s5; 1528f1af5d2fSBarry Smith x[5+idx] = s6; 1529f1af5d2fSBarry Smith x[6+idx] = s7; 153015091d37SBarry Smith } 153115091d37SBarry Smith /* backward solve the upper triangular */ 153215091d37SBarry Smith for (i=n-1; i>=0; i--){ 153315091d37SBarry Smith v = aa + 49*diag[i] + 49; 153415091d37SBarry Smith vi = aj + diag[i] + 1; 153515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 153615091d37SBarry 
Smith idt = 7*i; 1537f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1538f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1539f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1540f1af5d2fSBarry Smith s7 = x[6+idt]; 154115091d37SBarry Smith while (nz--) { 154215091d37SBarry Smith idx = 7*(*vi++); 154315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 154415091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 154515091d37SBarry Smith x7 = x[6+idx]; 1546f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1547f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1548f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1549f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1550f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1551f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1552f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 155315091d37SBarry Smith v += 49; 155415091d37SBarry Smith } 155515091d37SBarry Smith v = aa + 49*diag[i]; 1556f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1557f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1558f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1559f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1560f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1561f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1562f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1563f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1564f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1565f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + 
v[46]*s7; 1566f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1567f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1568f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1569f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 157015091d37SBarry Smith } 157115091d37SBarry Smith 1572d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15731ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1574dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 157515091d37SBarry Smith PetscFunctionReturn(0); 157615091d37SBarry Smith } 157715091d37SBarry Smith 15784a2ae208SSatish Balay #undef __FUNCT__ 1579cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1580cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1581cee9d6f2SShri Abhyankar { 1582cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 15836464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1584cee9d6f2SShri Abhyankar PetscErrorCode ierr; 1585cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 1586cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1587cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 1588cee9d6f2SShri Abhyankar PetscScalar *x; 1589cee9d6f2SShri Abhyankar const PetscScalar *b; 1590cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1591cee9d6f2SShri Abhyankar 1592cee9d6f2SShri Abhyankar PetscFunctionBegin; 1593cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1594cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1595cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 1596cee9d6f2SShri Abhyankar idx = 0; 1597cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1598cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = 
b[5+idx];x[6] = b[6+idx]; 1599cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 1600cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 1601cee9d6f2SShri Abhyankar vi = aj + ai[i]; 1602cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 1603cee9d6f2SShri Abhyankar idx = bs*i; 1604cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1605cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 16066464896eSShri Abhyankar for(k=0;k<nz;k++) { 16076464896eSShri Abhyankar jdx = bs*vi[k]; 1608cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1609cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1610cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1611cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1612cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1613cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1614cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1615cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1616cee9d6f2SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1617cee9d6f2SShri Abhyankar v += bs2; 1618cee9d6f2SShri Abhyankar } 1619cee9d6f2SShri Abhyankar 1620cee9d6f2SShri Abhyankar x[idx] = s1; 1621cee9d6f2SShri Abhyankar x[1+idx] = s2; 1622cee9d6f2SShri Abhyankar x[2+idx] = s3; 1623cee9d6f2SShri Abhyankar x[3+idx] = s4; 1624cee9d6f2SShri Abhyankar x[4+idx] = s5; 1625cee9d6f2SShri Abhyankar x[5+idx] = s6; 1626cee9d6f2SShri Abhyankar x[6+idx] = s7; 1627cee9d6f2SShri Abhyankar } 1628cee9d6f2SShri Abhyankar 1629cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 1630cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 
1631cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 1632cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 1633cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 1634cee9d6f2SShri Abhyankar idt = bs*i; 1635cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1636cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 16376464896eSShri Abhyankar for(k=0;k<nz;k++) { 16386464896eSShri Abhyankar idx = bs*vi[k]; 1639cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1640cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1641cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1642cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1643cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1644cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1645cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1646cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1647cee9d6f2SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1648cee9d6f2SShri Abhyankar v += bs2; 1649cee9d6f2SShri Abhyankar } 1650cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 1651cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1652cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1653cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1654cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1655cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + 
v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1656cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1657cee9d6f2SShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1658cee9d6f2SShri Abhyankar } 1659cee9d6f2SShri Abhyankar 1660cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1661cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1662cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1663cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 1664cee9d6f2SShri Abhyankar } 1665cee9d6f2SShri Abhyankar 1666cee9d6f2SShri Abhyankar #undef __FUNCT__ 166753cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2" 166853cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 166953cca76cSShri Abhyankar { 167053cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 167153cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 167253cca76cSShri Abhyankar PetscErrorCode ierr; 167353cca76cSShri Abhyankar PetscInt idx,jdx,idt; 167453cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 167553cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 167653cca76cSShri Abhyankar PetscScalar *x; 167753cca76cSShri Abhyankar const PetscScalar *b; 167853cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 167953cca76cSShri Abhyankar 168053cca76cSShri Abhyankar PetscFunctionBegin; 168153cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 168253cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 168353cca76cSShri Abhyankar /* forward solve the lower triangular */ 168453cca76cSShri Abhyankar idx = 0; 168553cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 
168653cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 168753cca76cSShri Abhyankar for (i=1; i<n; i++) { 168853cca76cSShri Abhyankar v = aa + bs2*ai[i]; 168953cca76cSShri Abhyankar vi = aj + ai[i]; 169053cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 169153cca76cSShri Abhyankar idx = bs*i; 169253cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 169353cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 169453cca76cSShri Abhyankar for(k=0;k<nz;k++) { 169553cca76cSShri Abhyankar jdx = bs*vi[k]; 169653cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 169753cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 169853cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 169953cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 170053cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 170153cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 170253cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 170353cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 170453cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 170553cca76cSShri Abhyankar v += bs2; 170653cca76cSShri Abhyankar } 170753cca76cSShri Abhyankar 170853cca76cSShri Abhyankar x[idx] = s1; 170953cca76cSShri Abhyankar x[1+idx] = s2; 171053cca76cSShri Abhyankar x[2+idx] = s3; 171153cca76cSShri Abhyankar x[3+idx] = s4; 171253cca76cSShri Abhyankar x[4+idx] = s5; 171353cca76cSShri Abhyankar x[5+idx] = s6; 171453cca76cSShri Abhyankar x[6+idx] = s7; 171553cca76cSShri Abhyankar } 171653cca76cSShri Abhyankar 171753cca76cSShri Abhyankar /* backward solve the upper triangular */ 
171853cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 171953cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 172053cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 172153cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 172253cca76cSShri Abhyankar idt = bs*i; 172353cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 172453cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 172553cca76cSShri Abhyankar for(k=0;k<nz;k++) { 172653cca76cSShri Abhyankar idx = bs*vi[k]; 172753cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 172853cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 172953cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 173053cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 173153cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 173253cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 173353cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 173453cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 173553cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 173653cca76cSShri Abhyankar v += bs2; 173753cca76cSShri Abhyankar } 173853cca76cSShri Abhyankar /* x = inv_diagonal*x */ 173953cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 174053cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 174153cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 174253cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 
174353cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 174453cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 174553cca76cSShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 174653cca76cSShri Abhyankar } 174753cca76cSShri Abhyankar 174853cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 174953cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175053cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 175153cca76cSShri Abhyankar PetscFunctionReturn(0); 175253cca76cSShri Abhyankar } 175353cca76cSShri Abhyankar 175453cca76cSShri Abhyankar #undef __FUNCT__ 17554a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1756dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 175715091d37SBarry Smith { 175815091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 175915091d37SBarry Smith IS iscol=a->col,isrow=a->row; 17606849ba73SBarry Smith PetscErrorCode ierr; 17615d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 17625d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1763d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1764d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1765d9fead3dSBarry Smith const PetscScalar *b; 176615091d37SBarry Smith PetscFunctionBegin; 1767d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17681ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1769f1af5d2fSBarry Smith t = a->solve_work; 177015091d37SBarry Smith 177115091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 177215091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 177315091d37SBarry Smith 177415091d37SBarry Smith /* forward solve the lower 
triangular */ 177515091d37SBarry Smith idx = 6*(*r++); 1776f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1777f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1778f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 177915091d37SBarry Smith for (i=1; i<n; i++) { 178015091d37SBarry Smith v = aa + 36*ai[i]; 178115091d37SBarry Smith vi = aj + ai[i]; 178215091d37SBarry Smith nz = diag[i] - ai[i]; 178315091d37SBarry Smith idx = 6*(*r++); 1784f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1785f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 178615091d37SBarry Smith while (nz--) { 178715091d37SBarry Smith idx = 6*(*vi++); 1788f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1789f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1790f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1791f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1792f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1793f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1794f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1795f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 179615091d37SBarry Smith v += 36; 179715091d37SBarry Smith } 179815091d37SBarry Smith idx = 6*i; 1799f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1800f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1801f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 180215091d37SBarry Smith } 180315091d37SBarry Smith /* backward solve the upper triangular */ 180415091d37SBarry Smith for (i=n-1; i>=0; i--){ 180515091d37SBarry Smith v = aa + 36*diag[i] + 36; 180615091d37SBarry Smith vi = aj + diag[i] + 1; 180715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 180815091d37SBarry Smith idt = 6*i; 1809f1af5d2fSBarry Smith s1 = t[idt]; 
s2 = t[1+idt]; 1810f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1811f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 181215091d37SBarry Smith while (nz--) { 181315091d37SBarry Smith idx = 6*(*vi++); 1814f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1815f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1816f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1817f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1818f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1819f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1820f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1821f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1822f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 182315091d37SBarry Smith v += 36; 182415091d37SBarry Smith } 182515091d37SBarry Smith idc = 6*(*c--); 182615091d37SBarry Smith v = aa + 36*diag[i]; 1827f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1828f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1829f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1830f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1831f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1832f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1833f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1834f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1835f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1836f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1837f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1838f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 183915091d37SBarry Smith } 184015091d37SBarry Smith 184115091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 184215091d37SBarry Smith ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1843d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18441ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1845dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 184615091d37SBarry Smith PetscFunctionReturn(0); 184715091d37SBarry Smith } 184815091d37SBarry Smith 18494a2ae208SSatish Balay #undef __FUNCT__ 18508f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 18518f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 18528f690400SShri Abhyankar { 18538f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18548f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 18558f690400SShri Abhyankar PetscErrorCode ierr; 18568f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 185729b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 18588f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 18598f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 18608f690400SShri Abhyankar const PetscScalar *b; 18618f690400SShri Abhyankar PetscFunctionBegin; 18628f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18638f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 18648f690400SShri Abhyankar t = a->solve_work; 18658f690400SShri Abhyankar 18668f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 186729b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 18688f690400SShri Abhyankar 18698f690400SShri Abhyankar /* forward solve the lower triangular */ 187029b92fc1SShri Abhyankar idx = 6*r[0]; 18718f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 18728f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 18738f690400SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 18748f690400SShri Abhyankar for (i=1; i<n; i++) { 
18758f690400SShri Abhyankar v = aa + 36*ai[i]; 18768f690400SShri Abhyankar vi = aj + ai[i]; 18778f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 187829b92fc1SShri Abhyankar idx = 6*r[i]; 18798f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 18808f690400SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 188129b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 188229b92fc1SShri Abhyankar idx = 6*vi[m]; 18838f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 18848f690400SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 18858f690400SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 18868f690400SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 18878f690400SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 18888f690400SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 18898f690400SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 18908f690400SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 18918f690400SShri Abhyankar v += 36; 18928f690400SShri Abhyankar } 18938f690400SShri Abhyankar idx = 6*i; 18948f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 18958f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 18968f690400SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 18978f690400SShri Abhyankar } 18988f690400SShri Abhyankar /* backward solve the upper triangular */ 18998f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 19008f690400SShri Abhyankar k = 2*n-i; 19018f690400SShri Abhyankar v = aa + 36*ai[k]; 19028f690400SShri Abhyankar vi = aj + ai[k]; 19038f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 19048f690400SShri Abhyankar idt = 6*i; 19058f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 19068f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 19078f690400SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 190829b92fc1SShri Abhyankar 
for(m=0;m<nz;m++){ 190929b92fc1SShri Abhyankar idx = 6*vi[m]; 19108f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 19118f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 19128f690400SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 19138f690400SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 19148f690400SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 19158f690400SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 19168f690400SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 19178f690400SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 19188f690400SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 19198f690400SShri Abhyankar v += 36; 19208f690400SShri Abhyankar } 192129b92fc1SShri Abhyankar idc = 6*c[i]; 19228f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 19238f690400SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 19248f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 19258f690400SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 19268f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 19278f690400SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 19288f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 19298f690400SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 19308f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 19318f690400SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 19328f690400SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 19338f690400SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 19348f690400SShri Abhyankar } 19358f690400SShri Abhyankar 19368f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 19378f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 19388f690400SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19398f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 19408f690400SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 19418f690400SShri Abhyankar PetscFunctionReturn(0); 19428f690400SShri Abhyankar } 19438f690400SShri Abhyankar 19446506fda5SShri Abhyankar #undef __FUNCT__ 19456506fda5SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2" 19466506fda5SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx) 19476506fda5SShri Abhyankar { 19486506fda5SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 19496506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 19506506fda5SShri Abhyankar PetscErrorCode ierr; 19516506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 19526506fda5SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 19536506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 19546506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 19556506fda5SShri Abhyankar const PetscScalar *b; 19566506fda5SShri Abhyankar PetscFunctionBegin; 19576506fda5SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19586506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 19596506fda5SShri Abhyankar t = a->solve_work; 19606506fda5SShri Abhyankar 19616506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 19626506fda5SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 19636506fda5SShri Abhyankar 19646506fda5SShri Abhyankar /* forward solve the lower triangular */ 19656506fda5SShri Abhyankar idx = 6*r[0]; 19666506fda5SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 19676506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 19686506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 19696506fda5SShri Abhyankar for (i=1; i<n; i++) { 19706506fda5SShri Abhyankar v = aa + 36*ai[i]; 
19716506fda5SShri Abhyankar vi = aj + ai[i]; 19726506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 19736506fda5SShri Abhyankar idx = 6*r[i]; 19746506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 19756506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 19766506fda5SShri Abhyankar for(m=0;m<nz;m++){ 19776506fda5SShri Abhyankar idx = 6*vi[m]; 19786506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 19796506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 19806506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 19816506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 19826506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 19836506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 19846506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 19856506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 19866506fda5SShri Abhyankar v += 36; 19876506fda5SShri Abhyankar } 19886506fda5SShri Abhyankar idx = 6*i; 19896506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 19906506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 19916506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 19926506fda5SShri Abhyankar } 19936506fda5SShri Abhyankar /* backward solve the upper triangular */ 19946506fda5SShri Abhyankar for (i=n-1; i>=0; i--){ 19956506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 19966506fda5SShri Abhyankar vi = aj + adiag[i+1]+1; 19976506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 19986506fda5SShri Abhyankar idt = 6*i; 19996506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 20006506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 20016506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 20026506fda5SShri Abhyankar for(m=0;m<nz;m++){ 20036506fda5SShri Abhyankar idx = 6*vi[m]; 
20046506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 20056506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 20066506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 20076506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 20086506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 20096506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 20106506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 20116506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 20126506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 20136506fda5SShri Abhyankar v += 36; 20146506fda5SShri Abhyankar } 20156506fda5SShri Abhyankar idc = 6*c[i]; 20166506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 20176506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 20186506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 20196506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 20206506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 20216506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 20226506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 20236506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 20246506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 20256506fda5SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 20266506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 20276506fda5SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 20286506fda5SShri Abhyankar } 20296506fda5SShri Abhyankar 20306506fda5SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 20316506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 20326506fda5SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20336506fda5SShri Abhyankar ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 20346506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 20356506fda5SShri Abhyankar PetscFunctionReturn(0); 20366506fda5SShri Abhyankar } 20378f690400SShri Abhyankar 20388f690400SShri Abhyankar #undef __FUNCT__ 20394a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 2040dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 204115091d37SBarry Smith { 204215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2043690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2044dfbe8321SBarry Smith PetscErrorCode ierr; 2045690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2046d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2047d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2048d9fead3dSBarry Smith const PetscScalar *b; 204915091d37SBarry Smith 205015091d37SBarry Smith PetscFunctionBegin; 2051d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20521ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 205315091d37SBarry Smith /* forward solve the lower triangular */ 205415091d37SBarry Smith idx = 0; 205515091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 205615091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 205715091d37SBarry Smith for (i=1; i<n; i++) { 205815091d37SBarry Smith v = aa + 36*ai[i]; 205915091d37SBarry Smith vi = aj + ai[i]; 206015091d37SBarry Smith nz = diag[i] - ai[i]; 206115091d37SBarry Smith idx = 6*i; 2062f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2063f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 206415091d37SBarry Smith while (nz--) { 206515091d37SBarry Smith jdx = 6*(*vi++); 206615091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 206715091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2068f1af5d2fSBarry Smith s1 -= v[0]*x1 + 
v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2069f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2070f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2071f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2072f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2073f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 207415091d37SBarry Smith v += 36; 207515091d37SBarry Smith } 2076f1af5d2fSBarry Smith x[idx] = s1; 2077f1af5d2fSBarry Smith x[1+idx] = s2; 2078f1af5d2fSBarry Smith x[2+idx] = s3; 2079f1af5d2fSBarry Smith x[3+idx] = s4; 2080f1af5d2fSBarry Smith x[4+idx] = s5; 2081f1af5d2fSBarry Smith x[5+idx] = s6; 208215091d37SBarry Smith } 208315091d37SBarry Smith /* backward solve the upper triangular */ 208415091d37SBarry Smith for (i=n-1; i>=0; i--){ 208515091d37SBarry Smith v = aa + 36*diag[i] + 36; 208615091d37SBarry Smith vi = aj + diag[i] + 1; 208715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 208815091d37SBarry Smith idt = 6*i; 2089f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2090f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 2091f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 209215091d37SBarry Smith while (nz--) { 209315091d37SBarry Smith idx = 6*(*vi++); 209415091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 209515091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2096f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2097f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2098f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2099f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2100f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + 
v[34]*x6; 2101f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 210215091d37SBarry Smith v += 36; 210315091d37SBarry Smith } 210415091d37SBarry Smith v = aa + 36*diag[i]; 2105f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2106f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2107f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2108f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2109f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2110f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 211115091d37SBarry Smith } 211215091d37SBarry Smith 2113d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21141ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2115dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 211615091d37SBarry Smith PetscFunctionReturn(0); 211715091d37SBarry Smith } 211815091d37SBarry Smith 21194a2ae208SSatish Balay #undef __FUNCT__ 2120cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2121cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2122cee9d6f2SShri Abhyankar { 2123cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 21246464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2125cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2126cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 2127cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2128cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2129cee9d6f2SShri Abhyankar PetscScalar *x; 2130cee9d6f2SShri Abhyankar const PetscScalar *b; 2131cee9d6f2SShri Abhyankar PetscScalar 
s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2132cee9d6f2SShri Abhyankar 2133cee9d6f2SShri Abhyankar PetscFunctionBegin; 2134cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2135cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2136cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2137cee9d6f2SShri Abhyankar idx = 0; 2138cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2139cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 2140cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2141cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 2142cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2143cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2144cee9d6f2SShri Abhyankar idx = bs*i; 2145cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2146cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 21476464896eSShri Abhyankar for(k=0;k<nz;k++){ 21486464896eSShri Abhyankar jdx = bs*vi[k]; 2149cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2150cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 2151cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2152cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2153cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2154cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2155cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2156cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2157cee9d6f2SShri Abhyankar v += bs2; 2158cee9d6f2SShri Abhyankar } 2159cee9d6f2SShri Abhyankar 2160cee9d6f2SShri Abhyankar x[idx] = s1; 2161cee9d6f2SShri Abhyankar x[1+idx] = s2; 2162cee9d6f2SShri Abhyankar x[2+idx] = s3; 2163cee9d6f2SShri Abhyankar x[3+idx] = s4; 2164cee9d6f2SShri 
Abhyankar x[4+idx] = s5; 2165cee9d6f2SShri Abhyankar x[5+idx] = s6; 2166cee9d6f2SShri Abhyankar } 2167cee9d6f2SShri Abhyankar 2168cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2169cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2170cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 2171cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2172cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2173cee9d6f2SShri Abhyankar idt = bs*i; 2174cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2175cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 21766464896eSShri Abhyankar for(k=0;k<nz;k++){ 21776464896eSShri Abhyankar idx = bs*vi[k]; 2178cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2179cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 2180cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2181cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2182cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2183cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2184cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2185cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2186cee9d6f2SShri Abhyankar v += bs2; 2187cee9d6f2SShri Abhyankar } 2188cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2189cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2190cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2191cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2192cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2193cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + 
v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2194cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2195cee9d6f2SShri Abhyankar } 2196cee9d6f2SShri Abhyankar 2197cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2198cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2199cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2200cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2201cee9d6f2SShri Abhyankar } 22028f690400SShri Abhyankar 2203cee9d6f2SShri Abhyankar #undef __FUNCT__ 220453cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2" 220553cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 220653cca76cSShri Abhyankar { 220753cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 220853cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 220953cca76cSShri Abhyankar PetscErrorCode ierr; 221053cca76cSShri Abhyankar PetscInt idx,jdx,idt; 221153cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 221253cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 221353cca76cSShri Abhyankar PetscScalar *x; 221453cca76cSShri Abhyankar const PetscScalar *b; 221553cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 221653cca76cSShri Abhyankar 221753cca76cSShri Abhyankar PetscFunctionBegin; 221853cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 221953cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 222053cca76cSShri Abhyankar /* forward solve the lower triangular */ 222153cca76cSShri Abhyankar idx = 0; 222253cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 222353cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 222453cca76cSShri Abhyankar for (i=1; i<n; i++) { 222553cca76cSShri Abhyankar v = aa + 
bs2*ai[i]; 222653cca76cSShri Abhyankar vi = aj + ai[i]; 222753cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 222853cca76cSShri Abhyankar idx = bs*i; 222953cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 223053cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 223153cca76cSShri Abhyankar for(k=0;k<nz;k++){ 223253cca76cSShri Abhyankar jdx = bs*vi[k]; 223353cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 223453cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 223553cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 223653cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 223753cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 223853cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 223953cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 224053cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 224153cca76cSShri Abhyankar v += bs2; 224253cca76cSShri Abhyankar } 224353cca76cSShri Abhyankar 224453cca76cSShri Abhyankar x[idx] = s1; 224553cca76cSShri Abhyankar x[1+idx] = s2; 224653cca76cSShri Abhyankar x[2+idx] = s3; 224753cca76cSShri Abhyankar x[3+idx] = s4; 224853cca76cSShri Abhyankar x[4+idx] = s5; 224953cca76cSShri Abhyankar x[5+idx] = s6; 225053cca76cSShri Abhyankar } 225153cca76cSShri Abhyankar 225253cca76cSShri Abhyankar /* backward solve the upper triangular */ 225353cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 225453cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 225553cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 225653cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 225753cca76cSShri Abhyankar idt = bs*i; 225853cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 225953cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 226053cca76cSShri 
Abhyankar for(k=0;k<nz;k++){ 226153cca76cSShri Abhyankar idx = bs*vi[k]; 226253cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 226353cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 226453cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 226553cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 226653cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 226753cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 226853cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 226953cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 227053cca76cSShri Abhyankar v += bs2; 227153cca76cSShri Abhyankar } 227253cca76cSShri Abhyankar /* x = inv_diagonal*x */ 227353cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 227453cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 227553cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 227653cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 227753cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 227853cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 227953cca76cSShri Abhyankar } 228053cca76cSShri Abhyankar 228153cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 228253cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 228353cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 228453cca76cSShri Abhyankar PetscFunctionReturn(0); 228553cca76cSShri Abhyankar } 228653cca76cSShri Abhyankar 228753cca76cSShri Abhyankar #undef 
__FUNCT__ 22884a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2289dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 22904e2b4712SSatish Balay { 22914e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 22924e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 22936849ba73SBarry Smith PetscErrorCode ierr; 22945d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 22955d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2296d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2297d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2298d9fead3dSBarry Smith const PetscScalar *b; 22994e2b4712SSatish Balay 23004e2b4712SSatish Balay PetscFunctionBegin; 2301d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23021ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2303f1af5d2fSBarry Smith t = a->solve_work; 23044e2b4712SSatish Balay 23054e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 23064e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 23074e2b4712SSatish Balay 23084e2b4712SSatish Balay /* forward solve the lower triangular */ 23094e2b4712SSatish Balay idx = 5*(*r++); 2310f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2311f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 23124e2b4712SSatish Balay for (i=1; i<n; i++) { 23134e2b4712SSatish Balay v = aa + 25*ai[i]; 23144e2b4712SSatish Balay vi = aj + ai[i]; 23154e2b4712SSatish Balay nz = diag[i] - ai[i]; 23164e2b4712SSatish Balay idx = 5*(*r++); 2317f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2318f1af5d2fSBarry Smith s5 = b[4+idx]; 23194e2b4712SSatish Balay while (nz--) { 23204e2b4712SSatish Balay idx = 5*(*vi++); 2321f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2322f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2323f1af5d2fSBarry Smith s1 -= 
v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2324f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2325f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2326f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2327f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 23284e2b4712SSatish Balay v += 25; 23294e2b4712SSatish Balay } 23304e2b4712SSatish Balay idx = 5*i; 2331f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2332f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 23334e2b4712SSatish Balay } 23344e2b4712SSatish Balay /* backward solve the upper triangular */ 23354e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 23364e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 23374e2b4712SSatish Balay vi = aj + diag[i] + 1; 23384e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 23394e2b4712SSatish Balay idt = 5*i; 2340f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2341f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 23424e2b4712SSatish Balay while (nz--) { 23434e2b4712SSatish Balay idx = 5*(*vi++); 2344f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2345f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2346f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2347f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2348f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2349f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2350f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 23514e2b4712SSatish Balay v += 25; 23524e2b4712SSatish Balay } 23534e2b4712SSatish Balay idc = 5*(*c--); 23544e2b4712SSatish Balay v = aa + 25*diag[i]; 2355f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2356f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 2357f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = 
v[1]*s1+v[6]*s2+v[11]*s3+ 2358f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 2359f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2360f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 2361f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2362f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 2363f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2364f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 23654e2b4712SSatish Balay } 23664e2b4712SSatish Balay 23674e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 23684e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2369d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23701ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2371dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 23724e2b4712SSatish Balay PetscFunctionReturn(0); 23734e2b4712SSatish Balay } 23744e2b4712SSatish Balay 23754a2ae208SSatish Balay #undef __FUNCT__ 23768f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 23778f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 23788f690400SShri Abhyankar { 23798f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 23808f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 23818f690400SShri Abhyankar PetscErrorCode ierr; 23828f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 238329b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 23848f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 23858f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 23868f690400SShri Abhyankar const PetscScalar *b; 23878f690400SShri Abhyankar 23888f690400SShri Abhyankar PetscFunctionBegin; 23898f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23908f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23918f690400SShri 
Abhyankar t = a->solve_work; 23928f690400SShri Abhyankar 23938f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 239429b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 23958f690400SShri Abhyankar 23968f690400SShri Abhyankar /* forward solve the lower triangular */ 239729b92fc1SShri Abhyankar idx = 5*r[0]; 23988f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 23998f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 24008f690400SShri Abhyankar for (i=1; i<n; i++) { 24018f690400SShri Abhyankar v = aa + 25*ai[i]; 24028f690400SShri Abhyankar vi = aj + ai[i]; 24038f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 240429b92fc1SShri Abhyankar idx = 5*r[i]; 24058f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 24068f690400SShri Abhyankar s5 = b[4+idx]; 240729b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 240829b92fc1SShri Abhyankar idx = 5*vi[m]; 24098f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 24108f690400SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 24118f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 24128f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 24138f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 24148f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 24158f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 24168f690400SShri Abhyankar v += 25; 24178f690400SShri Abhyankar } 24188f690400SShri Abhyankar idx = 5*i; 24198f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 24208f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 24218f690400SShri Abhyankar } 24228f690400SShri Abhyankar /* backward solve the upper triangular */ 24238f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 24248f690400SShri Abhyankar k = 2*n-i; 24258f690400SShri Abhyankar v = aa + 25*ai[k]; 
24268f690400SShri Abhyankar vi = aj + ai[k]; 24278f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 24288f690400SShri Abhyankar idt = 5*i; 24298f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 24308f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 243129b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 243229b92fc1SShri Abhyankar idx = 5*vi[m]; 24338f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 24348f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 24358f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 24368f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 24378f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 24388f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 24398f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 24408f690400SShri Abhyankar v += 25; 24418f690400SShri Abhyankar } 244229b92fc1SShri Abhyankar idc = 5*c[i]; 24438f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 24448f690400SShri Abhyankar v[15]*s4+v[20]*s5; 24458f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 24468f690400SShri Abhyankar v[16]*s4+v[21]*s5; 24478f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 24488f690400SShri Abhyankar v[17]*s4+v[22]*s5; 24498f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 24508f690400SShri Abhyankar v[18]*s4+v[23]*s5; 24518f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 24528f690400SShri Abhyankar v[19]*s4+v[24]*s5; 24538f690400SShri Abhyankar } 24548f690400SShri Abhyankar 24558f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 24568f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 24578f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24588f690400SShri Abhyankar ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 24598f690400SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 24608f690400SShri Abhyankar PetscFunctionReturn(0); 24618f690400SShri Abhyankar } 246278bb4007SShri Abhyankar 246378bb4007SShri Abhyankar #undef __FUNCT__ 246478bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2" 246578bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx) 246678bb4007SShri Abhyankar { 246778bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 246878bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 246978bb4007SShri Abhyankar PetscErrorCode ierr; 247078bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 247178bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 247278bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 247378bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 247478bb4007SShri Abhyankar const PetscScalar *b; 247578bb4007SShri Abhyankar 247678bb4007SShri Abhyankar PetscFunctionBegin; 247778bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 247878bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 247978bb4007SShri Abhyankar t = a->solve_work; 248078bb4007SShri Abhyankar 248178bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 248278bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 248378bb4007SShri Abhyankar 248478bb4007SShri Abhyankar /* forward solve the lower triangular */ 248578bb4007SShri Abhyankar idx = 5*r[0]; 248678bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 248778bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 248878bb4007SShri Abhyankar for (i=1; i<n; i++) { 248978bb4007SShri Abhyankar v = aa + 25*ai[i]; 249078bb4007SShri Abhyankar vi = aj + ai[i]; 249178bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 249278bb4007SShri 
Abhyankar idx = 5*r[i]; 249378bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 249478bb4007SShri Abhyankar s5 = b[4+idx]; 249578bb4007SShri Abhyankar for(m=0;m<nz;m++){ 249678bb4007SShri Abhyankar idx = 5*vi[m]; 249778bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 249878bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 249978bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 250078bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 250178bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 250278bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 250378bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 250478bb4007SShri Abhyankar v += 25; 250578bb4007SShri Abhyankar } 250678bb4007SShri Abhyankar idx = 5*i; 250778bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 250878bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 250978bb4007SShri Abhyankar } 251078bb4007SShri Abhyankar /* backward solve the upper triangular */ 251178bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 251278bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 251378bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 251478bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 251578bb4007SShri Abhyankar idt = 5*i; 251678bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 251778bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 251878bb4007SShri Abhyankar for(m=0;m<nz;m++){ 251978bb4007SShri Abhyankar idx = 5*vi[m]; 252078bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 252178bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 252278bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 252378bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 252478bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 
252578bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 252678bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 252778bb4007SShri Abhyankar v += 25; 252878bb4007SShri Abhyankar } 252978bb4007SShri Abhyankar idc = 5*c[i]; 253078bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 253178bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 253278bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 253378bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 253478bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 253578bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 253678bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 253778bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 253878bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 253978bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 254078bb4007SShri Abhyankar } 254178bb4007SShri Abhyankar 254278bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 254378bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 254478bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 254578bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 254678bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 254778bb4007SShri Abhyankar PetscFunctionReturn(0); 254878bb4007SShri Abhyankar } 254978bb4007SShri Abhyankar 25508f690400SShri Abhyankar #undef __FUNCT__ 25514a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2552dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 255315091d37SBarry Smith { 255415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2555690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2556dfbe8321SBarry Smith PetscErrorCode ierr; 2557690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2558d9fead3dSBarry Smith 
const MatScalar *aa=a->a,*v; 2559d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2560d9fead3dSBarry Smith const PetscScalar *b; 256115091d37SBarry Smith 256215091d37SBarry Smith PetscFunctionBegin; 2563d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25641ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 256515091d37SBarry Smith /* forward solve the lower triangular */ 256615091d37SBarry Smith idx = 0; 256715091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 256815091d37SBarry Smith for (i=1; i<n; i++) { 256915091d37SBarry Smith v = aa + 25*ai[i]; 257015091d37SBarry Smith vi = aj + ai[i]; 257115091d37SBarry Smith nz = diag[i] - ai[i]; 257215091d37SBarry Smith idx = 5*i; 2573f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 257415091d37SBarry Smith while (nz--) { 257515091d37SBarry Smith jdx = 5*(*vi++); 257615091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2577f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2578f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2579f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2580f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2581f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 258215091d37SBarry Smith v += 25; 258315091d37SBarry Smith } 2584f1af5d2fSBarry Smith x[idx] = s1; 2585f1af5d2fSBarry Smith x[1+idx] = s2; 2586f1af5d2fSBarry Smith x[2+idx] = s3; 2587f1af5d2fSBarry Smith x[3+idx] = s4; 2588f1af5d2fSBarry Smith x[4+idx] = s5; 258915091d37SBarry Smith } 259015091d37SBarry Smith /* backward solve the upper triangular */ 259115091d37SBarry Smith for (i=n-1; i>=0; i--){ 259215091d37SBarry Smith v = aa + 25*diag[i] + 25; 259315091d37SBarry Smith vi = aj + diag[i] + 1; 259415091d37SBarry Smith nz 
= ai[i+1] - diag[i] - 1; 259515091d37SBarry Smith idt = 5*i; 2596f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2597f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 259815091d37SBarry Smith while (nz--) { 259915091d37SBarry Smith idx = 5*(*vi++); 260015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2601f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2602f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2603f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2604f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2605f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 260615091d37SBarry Smith v += 25; 260715091d37SBarry Smith } 260815091d37SBarry Smith v = aa + 25*diag[i]; 2609f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2610f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2611f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2612f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2613f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 261415091d37SBarry Smith } 261515091d37SBarry Smith 2616d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26171ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2618dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 261915091d37SBarry Smith PetscFunctionReturn(0); 262015091d37SBarry Smith } 262115091d37SBarry Smith 26224a2ae208SSatish Balay #undef __FUNCT__ 2623cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2624cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2625cee9d6f2SShri 
Abhyankar { 2626cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 26276464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2628cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2629cee9d6f2SShri Abhyankar PetscInt jdx; 2630cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2631cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2632cee9d6f2SShri Abhyankar const PetscScalar *b; 2633cee9d6f2SShri Abhyankar 2634cee9d6f2SShri Abhyankar PetscFunctionBegin; 2635cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2636cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2637cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2638cee9d6f2SShri Abhyankar idx = 0; 2639cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2640cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2641cee9d6f2SShri Abhyankar v = aa + 25*ai[i]; 2642cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2643cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2644cee9d6f2SShri Abhyankar idx = 5*i; 2645cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 26466464896eSShri Abhyankar for(k=0;k<nz;k++) { 26476464896eSShri Abhyankar jdx = 5*vi[k]; 2648cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2649cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2650cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2651cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2652cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2653cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2654cee9d6f2SShri Abhyankar v += 25; 2655cee9d6f2SShri Abhyankar } 2656cee9d6f2SShri Abhyankar x[idx] = s1; 2657cee9d6f2SShri Abhyankar x[1+idx] = s2; 2658cee9d6f2SShri 
Abhyankar x[2+idx] = s3; 2659cee9d6f2SShri Abhyankar x[3+idx] = s4; 2660cee9d6f2SShri Abhyankar x[4+idx] = s5; 2661cee9d6f2SShri Abhyankar } 2662cee9d6f2SShri Abhyankar 2663cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2664cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2665cee9d6f2SShri Abhyankar v = aa + 25*ai[2*n-i]; 2666cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2667cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2668cee9d6f2SShri Abhyankar idt = 5*i; 2669cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 2670cee9d6f2SShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 26716464896eSShri Abhyankar for(k=0;k<nz;k++){ 26726464896eSShri Abhyankar idx = 5*vi[k]; 2673cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2674cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2675cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2676cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2677cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2678cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2679cee9d6f2SShri Abhyankar v += 25; 2680cee9d6f2SShri Abhyankar } 2681cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2682cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2683cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2684cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2685cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2686cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2687cee9d6f2SShri Abhyankar } 2688cee9d6f2SShri Abhyankar 2689cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2690cee9d6f2SShri Abhyankar ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 2691cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2692cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2693cee9d6f2SShri Abhyankar } 2694cee9d6f2SShri Abhyankar 2695cee9d6f2SShri Abhyankar #undef __FUNCT__ 269653cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2" 269753cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 269853cca76cSShri Abhyankar { 269953cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 270053cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 270153cca76cSShri Abhyankar PetscErrorCode ierr; 270253cca76cSShri Abhyankar PetscInt jdx; 270353cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 270453cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 270553cca76cSShri Abhyankar const PetscScalar *b; 270653cca76cSShri Abhyankar 270753cca76cSShri Abhyankar PetscFunctionBegin; 270853cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 270953cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 271053cca76cSShri Abhyankar /* forward solve the lower triangular */ 271153cca76cSShri Abhyankar idx = 0; 271253cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 271353cca76cSShri Abhyankar for (i=1; i<n; i++) { 271453cca76cSShri Abhyankar v = aa + 25*ai[i]; 271553cca76cSShri Abhyankar vi = aj + ai[i]; 271653cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 271753cca76cSShri Abhyankar idx = 5*i; 271853cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 271953cca76cSShri Abhyankar for(k=0;k<nz;k++) { 272053cca76cSShri Abhyankar jdx = 5*vi[k]; 272153cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 272253cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 
+ v[15]*x4 + v[20]*x5; 272353cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 272453cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 272553cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 272653cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 272753cca76cSShri Abhyankar v += 25; 272853cca76cSShri Abhyankar } 272953cca76cSShri Abhyankar x[idx] = s1; 273053cca76cSShri Abhyankar x[1+idx] = s2; 273153cca76cSShri Abhyankar x[2+idx] = s3; 273253cca76cSShri Abhyankar x[3+idx] = s4; 273353cca76cSShri Abhyankar x[4+idx] = s5; 273453cca76cSShri Abhyankar } 273553cca76cSShri Abhyankar 273653cca76cSShri Abhyankar /* backward solve the upper triangular */ 273753cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 273853cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 273953cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 274053cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 274153cca76cSShri Abhyankar idt = 5*i; 274253cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 274353cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 274453cca76cSShri Abhyankar for(k=0;k<nz;k++){ 274553cca76cSShri Abhyankar idx = 5*vi[k]; 274653cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 274753cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 274853cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 274953cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 275053cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 275153cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 275253cca76cSShri Abhyankar v += 25; 275353cca76cSShri Abhyankar } 275453cca76cSShri Abhyankar /* x = inv_diagonal*x */ 275553cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 
275653cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 275753cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 275853cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 275953cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 276053cca76cSShri Abhyankar } 276153cca76cSShri Abhyankar 276253cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 276353cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 276453cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 276553cca76cSShri Abhyankar PetscFunctionReturn(0); 276653cca76cSShri Abhyankar } 276753cca76cSShri Abhyankar 276853cca76cSShri Abhyankar #undef __FUNCT__ 27694a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2770dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 27714e2b4712SSatish Balay { 27724e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 27734e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 27746849ba73SBarry Smith PetscErrorCode ierr; 27755d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 27765d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2777d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2778d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2779d9fead3dSBarry Smith const PetscScalar *b; 27804e2b4712SSatish Balay 27814e2b4712SSatish Balay PetscFunctionBegin; 2782d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27831ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2784f1af5d2fSBarry Smith t = a->solve_work; 27854e2b4712SSatish Balay 27864e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 27874e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 27884e2b4712SSatish Balay 
27894e2b4712SSatish Balay /* forward solve the lower triangular */ 27904e2b4712SSatish Balay idx = 4*(*r++); 2791f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2792f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 27934e2b4712SSatish Balay for (i=1; i<n; i++) { 27944e2b4712SSatish Balay v = aa + 16*ai[i]; 27954e2b4712SSatish Balay vi = aj + ai[i]; 27964e2b4712SSatish Balay nz = diag[i] - ai[i]; 27974e2b4712SSatish Balay idx = 4*(*r++); 2798f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 27994e2b4712SSatish Balay while (nz--) { 28004e2b4712SSatish Balay idx = 4*(*vi++); 2801f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2802f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2803f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2804f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2805f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 28064e2b4712SSatish Balay v += 16; 28074e2b4712SSatish Balay } 28084e2b4712SSatish Balay idx = 4*i; 2809f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2810f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 28114e2b4712SSatish Balay } 28124e2b4712SSatish Balay /* backward solve the upper triangular */ 28134e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 28144e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 28154e2b4712SSatish Balay vi = aj + diag[i] + 1; 28164e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 28174e2b4712SSatish Balay idt = 4*i; 2818f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2819f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 28204e2b4712SSatish Balay while (nz--) { 28214e2b4712SSatish Balay idx = 4*(*vi++); 2822f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2823f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2824f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2825f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2826f1af5d2fSBarry Smith s3 -= 
v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2827f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 28284e2b4712SSatish Balay v += 16; 28294e2b4712SSatish Balay } 28304e2b4712SSatish Balay idc = 4*(*c--); 28314e2b4712SSatish Balay v = aa + 16*diag[i]; 2832f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2833f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2834f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2835f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 28364e2b4712SSatish Balay } 28374e2b4712SSatish Balay 28384e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28394e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2840d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28411ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2842dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 28434e2b4712SSatish Balay PetscFunctionReturn(0); 28444e2b4712SSatish Balay } 2845f26ec98cSKris Buschelman 2846f26ec98cSKris Buschelman #undef __FUNCT__ 28478f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 28488f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 28498f690400SShri Abhyankar { 28508f690400SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 28518f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 28528f690400SShri Abhyankar PetscErrorCode ierr; 285329b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 28548f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 28558f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 28568f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 28578f690400SShri Abhyankar const PetscScalar *b; 28588f690400SShri Abhyankar 28598f690400SShri Abhyankar PetscFunctionBegin; 
28608f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28618f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28628f690400SShri Abhyankar t = a->solve_work; 28638f690400SShri Abhyankar 28648f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 286529b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 28668f690400SShri Abhyankar 28678f690400SShri Abhyankar /* forward solve the lower triangular */ 286829b92fc1SShri Abhyankar idx = 4*r[0]; 28698f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 28708f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 28718f690400SShri Abhyankar for (i=1; i<n; i++) { 28728f690400SShri Abhyankar v = aa + 16*ai[i]; 28738f690400SShri Abhyankar vi = aj + ai[i]; 28748f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 287529b92fc1SShri Abhyankar idx = 4*r[i]; 28768f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 287729b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 287829b92fc1SShri Abhyankar idx = 4*vi[m]; 28798f690400SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 28808f690400SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 28818f690400SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 28828f690400SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 28838f690400SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 28848f690400SShri Abhyankar v += 16; 28858f690400SShri Abhyankar } 28868f690400SShri Abhyankar idx = 4*i; 28878f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 28888f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 28898f690400SShri Abhyankar } 28908f690400SShri Abhyankar /* backward solve the upper triangular */ 28918f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 28928f690400SShri Abhyankar k = 2*n-i; 28938f690400SShri Abhyankar v = aa + 16*ai[k]; 28948f690400SShri Abhyankar vi = aj + ai[k]; 28958f690400SShri Abhyankar nz = ai[k+1] - ai[k] 
- 1; 28968f690400SShri Abhyankar idt = 4*i; 28978f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 28988f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 289929b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 290029b92fc1SShri Abhyankar idx = 4*vi[m]; 29018f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 29028f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 29038f690400SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 29048f690400SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 29058f690400SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 29068f690400SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 29078f690400SShri Abhyankar v += 16; 29088f690400SShri Abhyankar } 290929b92fc1SShri Abhyankar idc = 4*c[i]; 29108f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 29118f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 29128f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 29138f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 29148f690400SShri Abhyankar } 29158f690400SShri Abhyankar 29168f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 29178f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 29188f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29198f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 29208f690400SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 29218f690400SShri Abhyankar PetscFunctionReturn(0); 29228f690400SShri Abhyankar } 29238f690400SShri Abhyankar 29248f690400SShri Abhyankar #undef __FUNCT__ 292578bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2" 292678bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx) 292778bb4007SShri Abhyankar { 292878bb4007SShri Abhyankar Mat_SeqBAIJ *a = 
(Mat_SeqBAIJ *)A->data; 292978bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 293078bb4007SShri Abhyankar PetscErrorCode ierr; 293178bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 293278bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 293378bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 293478bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 293578bb4007SShri Abhyankar const PetscScalar *b; 293678bb4007SShri Abhyankar 293778bb4007SShri Abhyankar PetscFunctionBegin; 293878bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 293978bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 294078bb4007SShri Abhyankar t = a->solve_work; 294178bb4007SShri Abhyankar 294278bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 294378bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 294478bb4007SShri Abhyankar 294578bb4007SShri Abhyankar /* forward solve the lower triangular */ 294678bb4007SShri Abhyankar idx = 4*r[0]; 294778bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 294878bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 294978bb4007SShri Abhyankar for (i=1; i<n; i++) { 295078bb4007SShri Abhyankar v = aa + 16*ai[i]; 295178bb4007SShri Abhyankar vi = aj + ai[i]; 295278bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 295378bb4007SShri Abhyankar idx = 4*r[i]; 295478bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 295578bb4007SShri Abhyankar for(m=0;m<nz;m++){ 295678bb4007SShri Abhyankar idx = 4*vi[m]; 295778bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 295878bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 295978bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 296078bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 296178bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 
296278bb4007SShri Abhyankar v += 16; 296378bb4007SShri Abhyankar } 296478bb4007SShri Abhyankar idx = 4*i; 296578bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 296678bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 296778bb4007SShri Abhyankar } 296878bb4007SShri Abhyankar /* backward solve the upper triangular */ 296978bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 297078bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 297178bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 297278bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 297378bb4007SShri Abhyankar idt = 4*i; 297478bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 297578bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 297678bb4007SShri Abhyankar for(m=0;m<nz;m++){ 297778bb4007SShri Abhyankar idx = 4*vi[m]; 297878bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 297978bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 298078bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 298178bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 298278bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 298378bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 298478bb4007SShri Abhyankar v += 16; 298578bb4007SShri Abhyankar } 298678bb4007SShri Abhyankar idc = 4*c[i]; 298778bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 298878bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 298978bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 299078bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 299178bb4007SShri Abhyankar } 299278bb4007SShri Abhyankar 299378bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 299478bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 299578bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 299678bb4007SShri Abhyankar ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 299778bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 299878bb4007SShri Abhyankar PetscFunctionReturn(0); 299978bb4007SShri Abhyankar } 300078bb4007SShri Abhyankar 300178bb4007SShri Abhyankar #undef __FUNCT__ 3002f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3003dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3004f26ec98cSKris Buschelman { 3005f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3006f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 30076849ba73SBarry Smith PetscErrorCode ierr; 30085d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 30095d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3010d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3011d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3012d9fead3dSBarry Smith PetscScalar *x; 3013d9fead3dSBarry Smith const PetscScalar *b; 3014f26ec98cSKris Buschelman 3015f26ec98cSKris Buschelman PetscFunctionBegin; 3016d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30171ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3018f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 3019f26ec98cSKris Buschelman 3020f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3021f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3022f26ec98cSKris Buschelman 3023f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3024f26ec98cSKris Buschelman idx = 4*(*r++); 3025f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 3026f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 3027f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 3028f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 3029f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3030f26ec98cSKris Buschelman v = aa + 16*ai[i]; 
3031f26ec98cSKris Buschelman vi = aj + ai[i]; 3032f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3033f26ec98cSKris Buschelman idx = 4*(*r++); 3034f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3035f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3036f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3037f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3038f26ec98cSKris Buschelman while (nz--) { 3039f26ec98cSKris Buschelman idx = 4*(*vi++); 3040f26ec98cSKris Buschelman x1 = t[idx]; 3041f26ec98cSKris Buschelman x2 = t[1+idx]; 3042f26ec98cSKris Buschelman x3 = t[2+idx]; 3043f26ec98cSKris Buschelman x4 = t[3+idx]; 3044f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3045f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3046f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3047f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3048f26ec98cSKris Buschelman v += 16; 3049f26ec98cSKris Buschelman } 3050f26ec98cSKris Buschelman idx = 4*i; 3051f26ec98cSKris Buschelman t[idx] = s1; 3052f26ec98cSKris Buschelman t[1+idx] = s2; 3053f26ec98cSKris Buschelman t[2+idx] = s3; 3054f26ec98cSKris Buschelman t[3+idx] = s4; 3055f26ec98cSKris Buschelman } 3056f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3057f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3058f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 3059f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3060f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3061f26ec98cSKris Buschelman idt = 4*i; 3062f26ec98cSKris Buschelman s1 = t[idt]; 3063f26ec98cSKris Buschelman s2 = t[1+idt]; 3064f26ec98cSKris Buschelman s3 = t[2+idt]; 3065f26ec98cSKris Buschelman s4 = t[3+idt]; 3066f26ec98cSKris Buschelman while (nz--) { 3067f26ec98cSKris Buschelman idx = 4*(*vi++); 3068f26ec98cSKris Buschelman x1 = t[idx]; 3069f26ec98cSKris Buschelman x2 = t[1+idx]; 3070f26ec98cSKris Buschelman x3 = t[2+idx]; 3071f26ec98cSKris Buschelman x4 = 
#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - SSE variant of the block-size-4
   triangular solves with row/column index sets.

   Input:   A  - matrix whose Mat_SeqBAIJ data holds the factor arrays
            bb - right-hand side, read through the row indices
   Output:  xx - result, written through the column indices
   Returns: 0 on success; PETSc errors are propagated through CHKERRQ.

   Each row's accumulator is converted to four floats (tmps), updated in
   register XMM7 by the inline SSE sequences, and converted back into the
   work vector t = a->solve_work.  Row i keeps its lower blocks in entries
   ai[i] .. diag[i]-1, the diagonal block at diag[i] and the upper blocks in
   entries diag[i]+1 .. ai[i+1]-1.

   NOTE(review): the final copy into xx uses SSE_STREAM_PS on &x[idc]; this
   presumably requires the xx array to be 16-byte aligned - confirm.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  /*
     Note: This code uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
  const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
  MatScalar      *aa=a->a,*v;
  PetscScalar    *x,*b,*t;

  /* Make space in temp stack for 16 Byte Aligned arrays */
  /* 11 floats = up to 3 floats of alignment padding + 4 (tmps) + 4 (tmpx) */
  float           ssealignedspace[11],*tmps,*tmpx;
  unsigned long   offset;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

    /* advance to the first 16-byte boundary inside ssealignedspace */
    offset = (unsigned long)ssealignedspace % 16;
    if (offset) offset = (16 - offset)/4;
    tmps = &ssealignedspace[offset];
    tmpx = &ssealignedspace[offset+4];
    PREFETCH_NTA(aa+16*ai[1]);

    ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
    ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
    t  = a->solve_work;

    ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
    ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

    /* forward solve the lower triangular */
    idx  = 4*(*r++);
    t[0] = b[idx];   t[1] = b[1+idx];
    t[2] = b[2+idx]; t[3] = b[3+idx];
    v    =  aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx  =  4*(*r++);

      /* Demote sum from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float.                 */
        /* The forward results live in t (see the store at the end of the */
        /* row); x has not been written yet, so it must not be read here. */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      idx = 4*i;
      v   = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(tmps,XMM7);

      /* Promote result from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
    }
    /* backward solve the upper triangular */
    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;){
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      /* Demote accumulator from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      v    = aa + ai16;
      /* Step to the previous row.  On the last pass i becomes -1: keep the */
      /* old ai16 (it only feeds prefetches) instead of reading diag[-1].   */
      i--;
      if (i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      /* Promote solution from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

      /* Apply reordering to t and stream into x.    */
      /* This way, x doesn't pollute the cache.      */
      /* Be careful with size: 2 doubles = 4 floats! */
      idc  = 4*(*c--);
      SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
        /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
        /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
      SSE_INLINE_END_2
      v    = aa + ai16 + 16;
      idt -= 4;
    }

    ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
    ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
    ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
33254e2b4712SSatish Balay */ 33264a2ae208SSatish Balay #undef __FUNCT__ 33274a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3328dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 33294e2b4712SSatish Balay { 33304e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3331356650c2SBarry Smith PetscInt n=a->mbs; 3332356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 3333dfbe8321SBarry Smith PetscErrorCode ierr; 3334356650c2SBarry Smith const PetscInt *diag = a->diag; 3335d9fead3dSBarry Smith const MatScalar *aa=a->a; 3336d9fead3dSBarry Smith PetscScalar *x; 3337d9fead3dSBarry Smith const PetscScalar *b; 33384e2b4712SSatish Balay 33394e2b4712SSatish Balay PetscFunctionBegin; 3340d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 33411ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 33424e2b4712SSatish Balay 3343aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 33442853dc0eSBarry Smith { 334587828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 33462853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 33472853dc0eSBarry Smith } 3348aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 33492853dc0eSBarry Smith { 335087828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 33512853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 33522853dc0eSBarry Smith } 3353aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 33542853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3355e1293385SBarry Smith #else 335630d4dcafSBarry Smith { 335787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3358d9fead3dSBarry Smith const MatScalar *v; 3359356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 3360356650c2SBarry Smith const PetscInt *vi; 3361e1293385SBarry Smith 33624e2b4712SSatish Balay /* forward solve the lower 
triangular */ 33634e2b4712SSatish Balay idx = 0; 3364e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 33654e2b4712SSatish Balay for (i=1; i<n; i++) { 33664e2b4712SSatish Balay v = aa + 16*ai[i]; 33674e2b4712SSatish Balay vi = aj + ai[i]; 33684e2b4712SSatish Balay nz = diag[i] - ai[i]; 3369e1293385SBarry Smith idx += 4; 3370f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 33714e2b4712SSatish Balay while (nz--) { 33724e2b4712SSatish Balay jdx = 4*(*vi++); 33734e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3374f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3375f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3376f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3377f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 33784e2b4712SSatish Balay v += 16; 33794e2b4712SSatish Balay } 3380f1af5d2fSBarry Smith x[idx] = s1; 3381f1af5d2fSBarry Smith x[1+idx] = s2; 3382f1af5d2fSBarry Smith x[2+idx] = s3; 3383f1af5d2fSBarry Smith x[3+idx] = s4; 33844e2b4712SSatish Balay } 33854e2b4712SSatish Balay /* backward solve the upper triangular */ 33864e555682SBarry Smith idt = 4*(n-1); 33874e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 33884e555682SBarry Smith ai16 = 16*diag[i]; 33894e555682SBarry Smith v = aa + ai16 + 16; 33904e2b4712SSatish Balay vi = aj + diag[i] + 1; 33914e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3392f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3393f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 33944e2b4712SSatish Balay while (nz--) { 33954e2b4712SSatish Balay idx = 4*(*vi++); 33964e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3397f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3398f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3399f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3400f1af5d2fSBarry Smith s4 -= 
v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 34014e2b4712SSatish Balay v += 16; 34024e2b4712SSatish Balay } 34034e555682SBarry Smith v = aa + ai16; 3404f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3405f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3406f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3407f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3408329f5518SBarry Smith idt -= 4; 34094e2b4712SSatish Balay } 341030d4dcafSBarry Smith } 3411e1293385SBarry Smith #endif 34124e2b4712SSatish Balay 3413d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 34141ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3415dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 34164e2b4712SSatish Balay PetscFunctionReturn(0); 34174e2b4712SSatish Balay } 34184e2b4712SSatish Balay 3419f26ec98cSKris Buschelman #undef __FUNCT__ 3420cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3421cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3422cee9d6f2SShri Abhyankar { 3423cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 34246464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3425cee9d6f2SShri Abhyankar PetscErrorCode ierr; 3426cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 3427cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3428cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 3429cee9d6f2SShri Abhyankar PetscScalar *x; 3430cee9d6f2SShri Abhyankar const PetscScalar *b; 3431cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3432cee9d6f2SShri Abhyankar 3433cee9d6f2SShri Abhyankar PetscFunctionBegin; 3434cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3435cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
3436cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 3437cee9d6f2SShri Abhyankar idx = 0; 3438cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3439cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 3440cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 3441cee9d6f2SShri Abhyankar vi = aj + ai[i]; 3442cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 3443cee9d6f2SShri Abhyankar idx = bs*i; 3444cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 34456464896eSShri Abhyankar for(k=0;k<nz;k++) { 34466464896eSShri Abhyankar jdx = bs*vi[k]; 3447cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3448cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3449cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3450cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3451cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3452cee9d6f2SShri Abhyankar 3453cee9d6f2SShri Abhyankar v += bs2; 3454cee9d6f2SShri Abhyankar } 3455cee9d6f2SShri Abhyankar 3456cee9d6f2SShri Abhyankar x[idx] = s1; 3457cee9d6f2SShri Abhyankar x[1+idx] = s2; 3458cee9d6f2SShri Abhyankar x[2+idx] = s3; 3459cee9d6f2SShri Abhyankar x[3+idx] = s4; 3460cee9d6f2SShri Abhyankar } 3461cee9d6f2SShri Abhyankar 3462cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 3463cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 3464cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 3465cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 3466cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 3467cee9d6f2SShri Abhyankar idt = bs*i; 3468cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3469cee9d6f2SShri Abhyankar 34706464896eSShri Abhyankar for(k=0;k<nz;k++){ 34716464896eSShri Abhyankar idx = bs*vi[k]; 3472cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3473cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + 
v[4]*x2 + v[8]*x3 + v[12]*x4; 3474cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3475cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3476cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3477cee9d6f2SShri Abhyankar 3478cee9d6f2SShri Abhyankar v += bs2; 3479cee9d6f2SShri Abhyankar } 3480cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 3481cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3482cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3483cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3484cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3485cee9d6f2SShri Abhyankar 3486cee9d6f2SShri Abhyankar } 3487cee9d6f2SShri Abhyankar 3488cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3489cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3490cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3491cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 3492cee9d6f2SShri Abhyankar } 3493cee9d6f2SShri Abhyankar

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2 - Triangular solve with
   the factored block matrix held in A, without row/column permutations.

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ holding the factors
-  bb - right-hand side (only read)

   Output Parameter:
.  xx - solution vector

   Returns 0 on success, otherwise the PETSc error code propagated by CHKERRQ.

   Notes:
   The forward sweep walks the rows through a->i / a->j.  The backward sweep
   locates row i through a->diag: its off-diagonal blocks start at
   adiag[i+1]+1 and there are adiag[i]-adiag[i+1]-1 of them.

   The arithmetic is unrolled for four components per block row and sixteen
   entries per block, so it assumes bs == 4 and bs2 == 16.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
  PetscErrorCode    ierr;
  PetscInt          idx,jdx,idt;
  PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  idx  = 0;
  /* the first block row has nothing to its left: copy the right-hand side */
  x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = bs*i;
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    for (k=0; k<nz; k++) {
      jdx = bs*vi[k];
      x1  = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; x4 = x[3+jdx];
      /* s -= block * x_j, the block being addressed column by column */
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

      v += bs2;
    }

    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + bs2*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1]-1;
    idt = bs*i;
    s1  = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt];

    for (k=0; k<nz; k++) {
      idx = bs*vi[k];
      x1  = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

      v += bs2;
    }
    /* x = inv_diagonal*x ; v now addresses the block right after the nz off-diagonal blocks */
    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - Triangular solve with 4x4
   blocks, without permutations, accumulating in MatScalar.

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ holding the factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution vector

   Returns 0 on success, otherwise the PETSc error code propagated by CHKERRQ.

   Notes:
   The forward sweep stores its results as MatScalar through t, which aliases
   the storage of x.  The backward sweep reads the current row through t and
   the already finished rows through x, then writes the final PetscScalar
   values into x.

   NOTE(review): the in-place scheme presumably relies on MatScalar being no
   wider than PetscScalar - confirm against the build configuration.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
  PetscErrorCode ierr;
  PetscInt       *diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  {
    MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
    MatScalar *v,*t=(MatScalar *)x;      /* t views the storage of x as MatScalar */
    PetscInt  jdx,idt,idx,nz,*vi,i,ai16;

    /* forward solve the lower triangular */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i=1; i<n; i++) {
      v    = aa + 16*ai[i];
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];            /* blocks strictly left of the diagonal */
      idx += 4;
      s1   = (MatScalar)b[idx];
      s2   = (MatScalar)b[1+idx];
      s3   = (MatScalar)b[2+idx];
      s4   = (MatScalar)b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = t[jdx];
        x2  = t[1+jdx];
        x3  = t[2+jdx];
        x4  = t[3+jdx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      t[idx]   = s1;
      t[1+idx] = s2;
      t[2+idx] = s3;
      t[3+idx] = s4;
    }

    /* backward solve the upper triangular */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;             /* first block right of the diagonal */
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;
      s1   = t[idt];
      s2   = t[1+idt];
      s3   = t[2+idt];
      s4   = t[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        /* these rows were finished earlier in this sweep, so they are read through x */
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1+idx];
        x3  = (MatScalar)x[2+idx];
        x4  = (MatScalar)x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      /* multiply by the block stored at the diagonal position */
      v        = aa + ai16;
      x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
      x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
      x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
      x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
      idt     -= 4;
    }
  }

  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj - SSE triangular solve
   with 4x4 blocks, without permutations, reading the column indices a->j as
   unsigned short.

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ holding the factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution vector

   Returns 0 on success, otherwise the PETSc error code propagated by CHKERRQ.

   Notes:
   Both sweeps work on t, which aliases the storage of x as MatScalar.  A
   final loop, running from the last block row to the first, converts t into
   PetscScalar values in x.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  unsigned short *aj=(unsigned short *)a->j;
  PetscErrorCode ierr;
  int            *ai=a->i,n=a->mbs,*diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v,*t=(MatScalar *)x;
    int            nz,i,idt,ai16;
    unsigned int   jdx,idx;
    unsigned short *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v   = aa + 16*((unsigned int)ai[1]);

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      /* the loop counter is advanced here, together with the next row's block pointer */
      v    = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      v    = aa + ai16;
      /*
         Step to the previous block row.  Its diagonal offset is looked up
         only while such a row exists: on the final pass i becomes -1 and
         diag[-1] must not be read.  ai16 then keeps the current row's value,
         which is used below for a prefetch only.
      */
      if (--i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    idt = 4*(n-1);
    for (i=n-1;i>=0;i--) {
      /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt     -= 4;
    }

  } /* End of artificial scope. */
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - SSE triangular solve with
   4x4 blocks, without permutations, reading the column indices a->j as int.

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ holding the factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution vector

   Returns 0 on success, otherwise the PETSc error code propagated by CHKERRQ.

   Notes:
   Both sweeps work on t, which aliases the storage of x as MatScalar.  A
   final loop, running from the last block row to the first, converts t into
   PetscScalar values in x.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  int            *aj=a->j;
  PetscErrorCode ierr;
  int            *ai=a->i,n=a->mbs,*diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v,*t=(MatScalar *)x;
    int       nz,i,idt,ai16;
    int       jdx,idx;
    int       *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v   = aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*(*vi++);
        /*          jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      /* the loop counter is advanced here, together with the next row's block pointer */
      v    = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);
        /*          idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      v    = aa + ai16;
      /*
         Step to the previous block row.  Its diagonal offset is looked up
         only while such a row exists: on the final pass i becomes -1 and
         diag[-1] must not be read.  ai16 then keeps the current row's value,
         which is used below for a prefetch only.
      */
      if (--i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    idt = 4*(n-1);
    for (i=n-1;i>=0;i--) {
      /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt     -= 4;
    }

  } /* End of artificial scope. */
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_3"
/*
   MatSolve_SeqBAIJ_3 - Triangular solve with 3x3 blocks using the row and
   column index sets stored in the matrix.

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ holding the factors, the index
        sets a->row / a->col and the work array a->solve_work
-  bb - right-hand side (only read)

   Output Parameter:
.  xx - solution vector

   Returns 0 on success, otherwise the PETSc error code propagated by CHKERRQ.

   Notes:
   The right-hand side is gathered through the row indices into the work
   array t during the forward sweep.  The backward sweep scatters each
   finished block row into x through the column indices, walking them from
   the last entry to the first.
*/
PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
  const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);   /* consumed back to front */

  /* forward solve the lower triangular */
  idx  = 3*(*r++);
  t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
  for (i=1; i<n; i++) {
    v   = aa + 9*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];               /* blocks strictly left of the diagonal */
    idx = 3*(*r++);
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
    while (nz--) {
      idx = 3*(*vi++);
      x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
      s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
      s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
      s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
      v  += 9;
    }
    idx    = 3*i;
    t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 9*diag[i] + 9;            /* first block right of the diagonal */
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 3*i;
    s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
    while (nz--) {
      idx = 3*(*vi++);
      x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
      s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
      s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
      s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
      v  += 9;
    }
    idc      = 3*(*c--);
    /* multiply by the block stored at the diagonal position; keep the result in t and scatter it into x */
    v        = aa + 9*diag[i];
    x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
    x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
    x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
  }
  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

41308f690400SShri Abhyankar #undef __FUNCT__ 41318f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 41328f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 41338f690400SShri Abhyankar { 41348f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 41358f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 41368f690400SShri Abhyankar PetscErrorCode ierr; 413729b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 41388f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 41398f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 41408f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 41418f690400SShri Abhyankar const PetscScalar *b; 41428f690400SShri Abhyankar 41438f690400SShri Abhyankar PetscFunctionBegin; 41448f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41458f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 41468f690400SShri Abhyankar t = a->solve_work; 41478f690400SShri Abhyankar 41488f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 414929b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 41508f690400SShri Abhyankar 41518f690400SShri Abhyankar /* forward solve the lower triangular */ 415229b92fc1SShri Abhyankar idx = 3*r[0]; 41538f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 41548f690400SShri Abhyankar for (i=1; i<n; i++) { 41558f690400SShri Abhyankar v = aa + 9*ai[i]; 41568f690400SShri Abhyankar vi = aj + ai[i]; 41578f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 415829b92fc1SShri Abhyankar idx = 3*r[i]; 
41598f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 416029b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 416129b92fc1SShri Abhyankar idx = 3*vi[m]; 41628f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 41638f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 41648f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 41658f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 41668f690400SShri Abhyankar v += 9; 41678f690400SShri Abhyankar } 41688f690400SShri Abhyankar idx = 3*i; 41698f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 41708f690400SShri Abhyankar } 41718f690400SShri Abhyankar /* backward solve the upper triangular */ 41728f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 41738f690400SShri Abhyankar k = 2*n-i; 41748f690400SShri Abhyankar v = aa + 9*ai[k]; 41758f690400SShri Abhyankar vi = aj + ai[k]; 41768f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 41778f690400SShri Abhyankar idt = 3*i; 41788f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 417929b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 418029b92fc1SShri Abhyankar idx = 3*vi[m]; 41818f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 41828f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 41838f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 41848f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 41858f690400SShri Abhyankar v += 9; 41868f690400SShri Abhyankar } 418729b92fc1SShri Abhyankar idc = 3*c[i]; 41888f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 41898f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 41908f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 41918f690400SShri Abhyankar } 41928f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 41938f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 41948f690400SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41958f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 41968f690400SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 41978f690400SShri Abhyankar PetscFunctionReturn(0); 41988f690400SShri Abhyankar } 41998f690400SShri Abhyankar 42000c4413a7SShri Abhyankar #undef __FUNCT__ 42010c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2" 42020c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx) 42030c4413a7SShri Abhyankar { 42040c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 42050c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 42060c4413a7SShri Abhyankar PetscErrorCode ierr; 42070c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 42080c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 42090c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 42100c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 42110c4413a7SShri Abhyankar const PetscScalar *b; 42120c4413a7SShri Abhyankar 42130c4413a7SShri Abhyankar PetscFunctionBegin; 42140c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42150c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 42160c4413a7SShri Abhyankar t = a->solve_work; 42170c4413a7SShri Abhyankar 42180c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 42190c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 42200c4413a7SShri Abhyankar 42210c4413a7SShri Abhyankar /* forward solve the lower triangular */ 42220c4413a7SShri Abhyankar idx = 3*r[0]; 42230c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 42240c4413a7SShri Abhyankar for (i=1; i<n; i++) { 42250c4413a7SShri Abhyankar v = aa + 9*ai[i]; 42260c4413a7SShri Abhyankar vi = aj + ai[i]; 42270c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 
42280c4413a7SShri Abhyankar idx = 3*r[i]; 42290c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 42300c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 42310c4413a7SShri Abhyankar idx = 3*vi[m]; 42320c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 42330c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 42340c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 42350c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 42360c4413a7SShri Abhyankar v += 9; 42370c4413a7SShri Abhyankar } 42380c4413a7SShri Abhyankar idx = 3*i; 42390c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 42400c4413a7SShri Abhyankar } 42410c4413a7SShri Abhyankar /* backward solve the upper triangular */ 42420c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 42430c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 42440c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 42450c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 42460c4413a7SShri Abhyankar idt = 3*i; 42470c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 42480c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 42490c4413a7SShri Abhyankar idx = 3*vi[m]; 42500c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 42510c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 42520c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 42530c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 42540c4413a7SShri Abhyankar v += 9; 42550c4413a7SShri Abhyankar } 42560c4413a7SShri Abhyankar idc = 3*c[i]; 42570c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 42580c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 42590c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 42600c4413a7SShri Abhyankar } 42610c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 42620c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 42630c4413a7SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42640c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 42650c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 42660c4413a7SShri Abhyankar PetscFunctionReturn(0); 42670c4413a7SShri Abhyankar } 42680c4413a7SShri Abhyankar 426915091d37SBarry Smith /* 427015091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 427115091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 427215091d37SBarry Smith */ 42734a2ae208SSatish Balay #undef __FUNCT__ 42744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4275dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 427615091d37SBarry Smith { 427715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4278690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4279dfbe8321SBarry Smith PetscErrorCode ierr; 4280690b6cddSBarry Smith PetscInt *diag = a->diag; 4281d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4282d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4283d9fead3dSBarry Smith const PetscScalar *b; 4284690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 428515091d37SBarry Smith 428615091d37SBarry Smith PetscFunctionBegin; 4287d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42881ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 428915091d37SBarry Smith 429015091d37SBarry Smith /* forward solve the lower triangular */ 429115091d37SBarry Smith idx = 0; 429215091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 429315091d37SBarry Smith for (i=1; i<n; i++) { 429415091d37SBarry Smith v = aa + 9*ai[i]; 429515091d37SBarry Smith vi = aj + ai[i]; 429615091d37SBarry Smith nz = diag[i] - ai[i]; 429715091d37SBarry Smith idx += 3; 4298f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 429915091d37SBarry Smith while (nz--) { 
430015091d37SBarry Smith jdx = 3*(*vi++); 430115091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4302f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4303f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4304f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 430515091d37SBarry Smith v += 9; 430615091d37SBarry Smith } 4307f1af5d2fSBarry Smith x[idx] = s1; 4308f1af5d2fSBarry Smith x[1+idx] = s2; 4309f1af5d2fSBarry Smith x[2+idx] = s3; 431015091d37SBarry Smith } 431115091d37SBarry Smith /* backward solve the upper triangular */ 431215091d37SBarry Smith for (i=n-1; i>=0; i--){ 431315091d37SBarry Smith v = aa + 9*diag[i] + 9; 431415091d37SBarry Smith vi = aj + diag[i] + 1; 431515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 431615091d37SBarry Smith idt = 3*i; 4317f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4318f1af5d2fSBarry Smith s3 = x[2+idt]; 431915091d37SBarry Smith while (nz--) { 432015091d37SBarry Smith idx = 3*(*vi++); 432115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4322f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4323f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4324f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 432515091d37SBarry Smith v += 9; 432615091d37SBarry Smith } 432715091d37SBarry Smith v = aa + 9*diag[i]; 4328f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4329f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4330f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 433115091d37SBarry Smith } 433215091d37SBarry Smith 4333d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43341ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4335dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 433615091d37SBarry Smith PetscFunctionReturn(0); 433715091d37SBarry Smith } 433815091d37SBarry Smith 43394a2ae208SSatish Balay #undef __FUNCT__ 4340cee9d6f2SShri Abhyankar #define 
__FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4341cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4342cee9d6f2SShri Abhyankar { 4343cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4344ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4345cee9d6f2SShri Abhyankar PetscErrorCode ierr; 4346cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 4347cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4348cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 4349cee9d6f2SShri Abhyankar PetscScalar *x; 4350cee9d6f2SShri Abhyankar const PetscScalar *b; 4351cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4352cee9d6f2SShri Abhyankar 4353cee9d6f2SShri Abhyankar PetscFunctionBegin; 4354cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4355cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4356cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 4357cee9d6f2SShri Abhyankar idx = 0; 4358cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4359cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 4360cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 4361cee9d6f2SShri Abhyankar vi = aj + ai[i]; 4362cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 4363cee9d6f2SShri Abhyankar idx = bs*i; 4364cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4365ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4366ce3d78c0SShri Abhyankar jdx = bs*vi[k]; 4367cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4368cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4369cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4370cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4371cee9d6f2SShri Abhyankar 4372cee9d6f2SShri Abhyankar v += bs2; 4373cee9d6f2SShri Abhyankar } 4374cee9d6f2SShri Abhyankar 4375cee9d6f2SShri Abhyankar x[idx] = s1; 4376cee9d6f2SShri Abhyankar x[1+idx] = s2; 
4377cee9d6f2SShri Abhyankar x[2+idx] = s3; 4378cee9d6f2SShri Abhyankar } 4379cee9d6f2SShri Abhyankar 4380cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 4381cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 4382cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 4383cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 4384cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 4385cee9d6f2SShri Abhyankar idt = bs*i; 4386cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4387cee9d6f2SShri Abhyankar 4388ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4389ce3d78c0SShri Abhyankar idx = bs*vi[k]; 4390cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4391cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4392cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4393cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4394cee9d6f2SShri Abhyankar 4395cee9d6f2SShri Abhyankar v += bs2; 4396cee9d6f2SShri Abhyankar } 4397cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 4398cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4399cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4400cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4401cee9d6f2SShri Abhyankar 4402cee9d6f2SShri Abhyankar } 4403cee9d6f2SShri Abhyankar 4404cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4405cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4406cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4407cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 4408cee9d6f2SShri Abhyankar } 4409cee9d6f2SShri Abhyankar 4410cee9d6f2SShri Abhyankar #undef __FUNCT__ 4411b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2" 4412b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4413b2b2dd24SShri Abhyankar { 4414b2b2dd24SShri Abhyankar 
Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4415b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4416b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4417b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 4418b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4419b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4420b2b2dd24SShri Abhyankar PetscScalar *x; 4421b2b2dd24SShri Abhyankar const PetscScalar *b; 4422b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4423b2b2dd24SShri Abhyankar 4424b2b2dd24SShri Abhyankar PetscFunctionBegin; 4425b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4426b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4427b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4428b2b2dd24SShri Abhyankar idx = 0; 4429b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4430b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4431b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4432b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4433b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4434b2b2dd24SShri Abhyankar idx = bs*i; 4435b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4436b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4437b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4438b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4439b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4440b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4441b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4442b2b2dd24SShri Abhyankar 4443b2b2dd24SShri Abhyankar v += bs2; 4444b2b2dd24SShri Abhyankar } 4445b2b2dd24SShri Abhyankar 4446b2b2dd24SShri Abhyankar x[idx] = s1; 4447b2b2dd24SShri Abhyankar x[1+idx] = s2; 4448b2b2dd24SShri Abhyankar x[2+idx] = s3; 4449b2b2dd24SShri Abhyankar } 4450b2b2dd24SShri Abhyankar 4451b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4452b2b2dd24SShri Abhyankar for (i=n-1; i>=0; 
i--){ 4453b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4454b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4455b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4456b2b2dd24SShri Abhyankar idt = bs*i; 4457b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4458b2b2dd24SShri Abhyankar 4459b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4460b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4461b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4462b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4463b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4464b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4465b2b2dd24SShri Abhyankar 4466b2b2dd24SShri Abhyankar v += bs2; 4467b2b2dd24SShri Abhyankar } 4468b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4469b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4470b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4471b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4472b2b2dd24SShri Abhyankar 4473b2b2dd24SShri Abhyankar } 4474b2b2dd24SShri Abhyankar 4475b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4476b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4477b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4478b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4479b2b2dd24SShri Abhyankar } 4480b2b2dd24SShri Abhyankar 4481b2b2dd24SShri Abhyankar #undef __FUNCT__ 44824a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4483dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 44844e2b4712SSatish Balay { 44854e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 44864e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 44876849ba73SBarry Smith PetscErrorCode ierr; 44885d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 44895d0c19d7SBarry Smith const PetscInt *r,*c,*diag = 
a->diag,*rout,*cout; 4490d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4491d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 4492d9fead3dSBarry Smith const PetscScalar *b; 44934e2b4712SSatish Balay 44944e2b4712SSatish Balay PetscFunctionBegin; 4495d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 44961ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4497f1af5d2fSBarry Smith t = a->solve_work; 44984e2b4712SSatish Balay 44994e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 45004e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 45014e2b4712SSatish Balay 45024e2b4712SSatish Balay /* forward solve the lower triangular */ 45034e2b4712SSatish Balay idx = 2*(*r++); 4504f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 45054e2b4712SSatish Balay for (i=1; i<n; i++) { 45064e2b4712SSatish Balay v = aa + 4*ai[i]; 45074e2b4712SSatish Balay vi = aj + ai[i]; 45084e2b4712SSatish Balay nz = diag[i] - ai[i]; 45094e2b4712SSatish Balay idx = 2*(*r++); 4510f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 45114e2b4712SSatish Balay while (nz--) { 45124e2b4712SSatish Balay idx = 2*(*vi++); 4513f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4514f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4515f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 45164e2b4712SSatish Balay v += 4; 45174e2b4712SSatish Balay } 45184e2b4712SSatish Balay idx = 2*i; 4519f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 45204e2b4712SSatish Balay } 45214e2b4712SSatish Balay /* backward solve the upper triangular */ 45224e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 45234e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 45244e2b4712SSatish Balay vi = aj + diag[i] + 1; 45254e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 45264e2b4712SSatish Balay idt = 2*i; 4527f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 45284e2b4712SSatish Balay while (nz--) { 45294e2b4712SSatish Balay idx = 2*(*vi++); 4530f1af5d2fSBarry Smith 
x1 = t[idx]; x2 = t[1+idx]; 4531f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4532f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 45334e2b4712SSatish Balay v += 4; 45344e2b4712SSatish Balay } 45354e2b4712SSatish Balay idc = 2*(*c--); 45364e2b4712SSatish Balay v = aa + 4*diag[i]; 4537f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4538f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 45394e2b4712SSatish Balay } 45404e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 45414e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4542d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45431ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4544dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 45454e2b4712SSatish Balay PetscFunctionReturn(0); 45464e2b4712SSatish Balay } 45474e2b4712SSatish Balay 45488f690400SShri Abhyankar #undef __FUNCT__ 45498f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 45508f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 45518f690400SShri Abhyankar { 45528f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 45538f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 45548f690400SShri Abhyankar PetscErrorCode ierr; 455529b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 45568f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 45578f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 45588f690400SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 45598f690400SShri Abhyankar const PetscScalar *b; 45608f690400SShri Abhyankar 45618f690400SShri Abhyankar PetscFunctionBegin; 45628f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45638f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 45648f690400SShri Abhyankar t = a->solve_work; 45658f690400SShri Abhyankar 
45668f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 456729b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 45688f690400SShri Abhyankar 45698f690400SShri Abhyankar /* forward solve the lower triangular */ 457029b92fc1SShri Abhyankar idx = 2*r[0]; 45718f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 45728f690400SShri Abhyankar for (i=1; i<n; i++) { 45738f690400SShri Abhyankar v = aa + 4*ai[i]; 45748f690400SShri Abhyankar vi = aj + ai[i]; 45758f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 457629b92fc1SShri Abhyankar idx = 2*r[i]; 45778f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 457829b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 457929b92fc1SShri Abhyankar jdx = 2*vi[m]; 45808f690400SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 45818f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 45828f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 45838f690400SShri Abhyankar v += 4; 45848f690400SShri Abhyankar } 45858f690400SShri Abhyankar idx = 2*i; 45868f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 45878f690400SShri Abhyankar } 45888f690400SShri Abhyankar /* backward solve the upper triangular */ 45898f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 45908f690400SShri Abhyankar k = 2*n-i; 45918f690400SShri Abhyankar v = aa + 4*ai[k]; 45928f690400SShri Abhyankar vi = aj + ai[k]; 45938f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 45948f690400SShri Abhyankar idt = 2*i; 45958f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 459629b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 459729b92fc1SShri Abhyankar idx = 2*vi[m]; 45988f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 45998f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 46008f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 46018f690400SShri Abhyankar v += 4; 46028f690400SShri Abhyankar } 460329b92fc1SShri Abhyankar idc = 2*c[i]; 46048f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 46058f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 
46068f690400SShri Abhyankar } 46078f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 46088f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 46098f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46108f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 46118f690400SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 46128f690400SShri Abhyankar PetscFunctionReturn(0); 46138f690400SShri Abhyankar } 46148f690400SShri Abhyankar 46150c4413a7SShri Abhyankar #undef __FUNCT__ 46160c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2" 46170c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx) 46180c4413a7SShri Abhyankar { 46190c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 46200c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 46210c4413a7SShri Abhyankar PetscErrorCode ierr; 46220c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 46230c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 46240c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 46250c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 46260c4413a7SShri Abhyankar const PetscScalar *b; 46270c4413a7SShri Abhyankar 46280c4413a7SShri Abhyankar PetscFunctionBegin; 46290c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46300c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 46310c4413a7SShri Abhyankar t = a->solve_work; 46320c4413a7SShri Abhyankar 46330c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 46340c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 46350c4413a7SShri Abhyankar 46360c4413a7SShri Abhyankar /* forward solve the lower triangular */ 46370c4413a7SShri Abhyankar idx = 2*r[0]; 46380c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 
46390c4413a7SShri Abhyankar for (i=1; i<n; i++) { 46400c4413a7SShri Abhyankar v = aa + 4*ai[i]; 46410c4413a7SShri Abhyankar vi = aj + ai[i]; 46420c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 46430c4413a7SShri Abhyankar idx = 2*r[i]; 46440c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 46450c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 46460c4413a7SShri Abhyankar jdx = 2*vi[m]; 46470c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 46480c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 46490c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 46500c4413a7SShri Abhyankar v += 4; 46510c4413a7SShri Abhyankar } 46520c4413a7SShri Abhyankar idx = 2*i; 46530c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 46540c4413a7SShri Abhyankar } 46550c4413a7SShri Abhyankar /* backward solve the upper triangular */ 46560c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 46570c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 46580c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 46590c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 46600c4413a7SShri Abhyankar idt = 2*i; 46610c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 46620c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 46630c4413a7SShri Abhyankar idx = 2*vi[m]; 46640c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 46650c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 46660c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 46670c4413a7SShri Abhyankar v += 4; 46680c4413a7SShri Abhyankar } 46690c4413a7SShri Abhyankar idc = 2*c[i]; 46700c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 46710c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 46720c4413a7SShri Abhyankar } 46730c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 46740c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 46750c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46760c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 46770c4413a7SShri Abhyankar ierr = 
PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 46780c4413a7SShri Abhyankar PetscFunctionReturn(0); 46790c4413a7SShri Abhyankar } 46808f690400SShri Abhyankar 468115091d37SBarry Smith /* 468215091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 468315091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 468415091d37SBarry Smith */ 46854a2ae208SSatish Balay #undef __FUNCT__ 46864a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4687dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 468815091d37SBarry Smith { 468915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4690690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4691dfbe8321SBarry Smith PetscErrorCode ierr; 4692690b6cddSBarry Smith PetscInt *diag = a->diag; 4693d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4694d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 4695d9fead3dSBarry Smith const PetscScalar *b; 4696690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 469715091d37SBarry Smith 469815091d37SBarry Smith PetscFunctionBegin; 4699d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47001ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 470115091d37SBarry Smith 470215091d37SBarry Smith /* forward solve the lower triangular */ 470315091d37SBarry Smith idx = 0; 470415091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 470515091d37SBarry Smith for (i=1; i<n; i++) { 470615091d37SBarry Smith v = aa + 4*ai[i]; 470715091d37SBarry Smith vi = aj + ai[i]; 470815091d37SBarry Smith nz = diag[i] - ai[i]; 470915091d37SBarry Smith idx += 2; 4710f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 471115091d37SBarry Smith while (nz--) { 471215091d37SBarry Smith jdx = 2*(*vi++); 471315091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 4714f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4715f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 
471615091d37SBarry Smith v += 4; 471715091d37SBarry Smith } 4718f1af5d2fSBarry Smith x[idx] = s1; 4719f1af5d2fSBarry Smith x[1+idx] = s2; 472015091d37SBarry Smith } 472115091d37SBarry Smith /* backward solve the upper triangular */ 472215091d37SBarry Smith for (i=n-1; i>=0; i--){ 472315091d37SBarry Smith v = aa + 4*diag[i] + 4; 472415091d37SBarry Smith vi = aj + diag[i] + 1; 472515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 472615091d37SBarry Smith idt = 2*i; 4727f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 472815091d37SBarry Smith while (nz--) { 472915091d37SBarry Smith idx = 2*(*vi++); 473015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 4731f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4732f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 473315091d37SBarry Smith v += 4; 473415091d37SBarry Smith } 473515091d37SBarry Smith v = aa + 4*diag[i]; 4736f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 4737f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 473815091d37SBarry Smith } 473915091d37SBarry Smith 4740d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47411ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4742dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 474315091d37SBarry Smith PetscFunctionReturn(0); 474415091d37SBarry Smith } 474515091d37SBarry Smith 47464a2ae208SSatish Balay #undef __FUNCT__ 4747cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4748cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4749cee9d6f2SShri Abhyankar { 4750cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4751ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4752cee9d6f2SShri Abhyankar PetscErrorCode ierr; 4753cee9d6f2SShri Abhyankar PetscInt jdx; 4754cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 4755cee9d6f2SShri Abhyankar PetscScalar 
*x,s1,s2,x1,x2; 4756cee9d6f2SShri Abhyankar const PetscScalar *b; 4757cee9d6f2SShri Abhyankar 4758cee9d6f2SShri Abhyankar PetscFunctionBegin; 4759cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4760cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4761cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 4762cee9d6f2SShri Abhyankar idx = 0; 4763cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4764cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 4765cee9d6f2SShri Abhyankar v = aa + 4*ai[i]; 4766cee9d6f2SShri Abhyankar vi = aj + ai[i]; 4767cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 4768cee9d6f2SShri Abhyankar idx = 2*i; 4769cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4770ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4771ce3d78c0SShri Abhyankar jdx = 2*vi[k]; 4772cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4773cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4774cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4775cee9d6f2SShri Abhyankar v += 4; 4776cee9d6f2SShri Abhyankar } 4777cee9d6f2SShri Abhyankar x[idx] = s1; 4778cee9d6f2SShri Abhyankar x[1+idx] = s2; 4779cee9d6f2SShri Abhyankar } 4780cee9d6f2SShri Abhyankar 4781cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 4782cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 4783cee9d6f2SShri Abhyankar v = aa + 4*ai[2*n-i]; 4784cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 4785cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 4786cee9d6f2SShri Abhyankar idt = 2*i; 4787cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4788ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4789ce3d78c0SShri Abhyankar idx = 2*vi[k]; 4790cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4791cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4792cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4793cee9d6f2SShri Abhyankar v += 4; 4794cee9d6f2SShri Abhyankar } 4795cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 4796cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + 
v[2]*s2; 4797cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4798cee9d6f2SShri Abhyankar } 4799cee9d6f2SShri Abhyankar 4800cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4801cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4802cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4803cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 4804cee9d6f2SShri Abhyankar } 4805cee9d6f2SShri Abhyankar 4806cee9d6f2SShri Abhyankar #undef __FUNCT__ 4807b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2" 4808b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4809b2b2dd24SShri Abhyankar { 4810b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4811b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4812b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4813b2b2dd24SShri Abhyankar PetscInt jdx; 4814b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4815b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 4816b2b2dd24SShri Abhyankar const PetscScalar *b; 4817b2b2dd24SShri Abhyankar 4818b2b2dd24SShri Abhyankar PetscFunctionBegin; 4819b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4820b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4821b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4822b2b2dd24SShri Abhyankar idx = 0; 4823b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4824b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4825b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 4826b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4827b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4828b2b2dd24SShri Abhyankar idx = 2*i; 4829b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4830b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4831b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 4832b2b2dd24SShri 
Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4833b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4834b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4835b2b2dd24SShri Abhyankar v += 4; 4836b2b2dd24SShri Abhyankar } 4837b2b2dd24SShri Abhyankar x[idx] = s1; 4838b2b2dd24SShri Abhyankar x[1+idx] = s2; 4839b2b2dd24SShri Abhyankar } 4840b2b2dd24SShri Abhyankar 4841b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4842b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4843b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 4844b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4845b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4846b2b2dd24SShri Abhyankar idt = 2*i; 4847b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4848b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4849b2b2dd24SShri Abhyankar idx = 2*vi[k]; 4850b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4851b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4852b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4853b2b2dd24SShri Abhyankar v += 4; 4854b2b2dd24SShri Abhyankar } 4855b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4856b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4857b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4858b2b2dd24SShri Abhyankar } 4859b2b2dd24SShri Abhyankar 4860b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4861b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4862b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4863b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4864b2b2dd24SShri Abhyankar } 4865b2b2dd24SShri Abhyankar 4866b2b2dd24SShri Abhyankar #undef __FUNCT__ 48674a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4868dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 48694e2b4712SSatish Balay { 48704e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 48714e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 
48726849ba73SBarry Smith PetscErrorCode ierr; 48735d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 48745d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 48753f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 487687828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 48774e2b4712SSatish Balay 48784e2b4712SSatish Balay PetscFunctionBegin; 48794e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 48804e2b4712SSatish Balay 48811ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 48821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4883f1af5d2fSBarry Smith t = a->solve_work; 48844e2b4712SSatish Balay 48854e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 48864e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 48874e2b4712SSatish Balay 48884e2b4712SSatish Balay /* forward solve the lower triangular */ 4889f1af5d2fSBarry Smith t[0] = b[*r++]; 48904e2b4712SSatish Balay for (i=1; i<n; i++) { 48914e2b4712SSatish Balay v = aa + ai[i]; 48924e2b4712SSatish Balay vi = aj + ai[i]; 48934e2b4712SSatish Balay nz = diag[i] - ai[i]; 4894f1af5d2fSBarry Smith s1 = b[*r++]; 48954e2b4712SSatish Balay while (nz--) { 4896f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 48974e2b4712SSatish Balay } 4898f1af5d2fSBarry Smith t[i] = s1; 48994e2b4712SSatish Balay } 49004e2b4712SSatish Balay /* backward solve the upper triangular */ 49014e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 49024e2b4712SSatish Balay v = aa + diag[i] + 1; 49034e2b4712SSatish Balay vi = aj + diag[i] + 1; 49044e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4905f1af5d2fSBarry Smith s1 = t[i]; 49064e2b4712SSatish Balay while (nz--) { 4907f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 49084e2b4712SSatish Balay } 4909f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 49104e2b4712SSatish Balay } 49114e2b4712SSatish Balay 49124e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 49134e2b4712SSatish Balay ierr 
= ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 49141ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 49151ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4916dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 49174e2b4712SSatish Balay PetscFunctionReturn(0); 49184e2b4712SSatish Balay } 491915091d37SBarry Smith /* 492015091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 492115091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 492215091d37SBarry Smith */ 49234a2ae208SSatish Balay #undef __FUNCT__ 49244a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4925dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 492615091d37SBarry Smith { 492715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4928690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4929dfbe8321SBarry Smith PetscErrorCode ierr; 4930690b6cddSBarry Smith PetscInt *diag = a->diag; 493115091d37SBarry Smith MatScalar *aa=a->a; 493287828ca2SBarry Smith PetscScalar *x,*b; 493387828ca2SBarry Smith PetscScalar s1,x1; 493415091d37SBarry Smith MatScalar *v; 4935690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 493615091d37SBarry Smith 493715091d37SBarry Smith PetscFunctionBegin; 49381ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 49391ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 494015091d37SBarry Smith 494115091d37SBarry Smith /* forward solve the lower triangular */ 494215091d37SBarry Smith idx = 0; 494315091d37SBarry Smith x[0] = b[0]; 494415091d37SBarry Smith for (i=1; i<n; i++) { 494515091d37SBarry Smith v = aa + ai[i]; 494615091d37SBarry Smith vi = aj + ai[i]; 494715091d37SBarry Smith nz = diag[i] - ai[i]; 494815091d37SBarry Smith idx += 1; 4949f1af5d2fSBarry Smith s1 = b[idx]; 495015091d37SBarry Smith while (nz--) { 495115091d37SBarry Smith jdx = *vi++; 
495215091d37SBarry Smith x1 = x[jdx]; 4953f1af5d2fSBarry Smith s1 -= v[0]*x1; 495415091d37SBarry Smith v += 1; 495515091d37SBarry Smith } 4956f1af5d2fSBarry Smith x[idx] = s1; 495715091d37SBarry Smith } 495815091d37SBarry Smith /* backward solve the upper triangular */ 495915091d37SBarry Smith for (i=n-1; i>=0; i--){ 496015091d37SBarry Smith v = aa + diag[i] + 1; 496115091d37SBarry Smith vi = aj + diag[i] + 1; 496215091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 496315091d37SBarry Smith idt = i; 4964f1af5d2fSBarry Smith s1 = x[idt]; 496515091d37SBarry Smith while (nz--) { 496615091d37SBarry Smith idx = *vi++; 496715091d37SBarry Smith x1 = x[idx]; 4968f1af5d2fSBarry Smith s1 -= v[0]*x1; 496915091d37SBarry Smith v += 1; 497015091d37SBarry Smith } 497115091d37SBarry Smith v = aa + diag[i]; 4972f1af5d2fSBarry Smith x[idt] = v[0]*s1; 497315091d37SBarry Smith } 49741ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 49751ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4976dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 497715091d37SBarry Smith PetscFunctionReturn(0); 497815091d37SBarry Smith } 49794e2b4712SSatish Balay 49804e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 498116a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 49826bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 49836bce7ff8SHong Zhang 49846bce7ff8SHong Zhang #undef __FUNCT__ 49856bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 49866bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 49876bce7ff8SHong Zhang { 49886bce7ff8SHong Zhang Mat C=B; 49896bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 49906bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 
49916bce7ff8SHong Zhang PetscErrorCode ierr; 49926bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 49936bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 49946bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4995b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4996914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4997914a18a2SHong Zhang MatScalar *v_work; 49986bce7ff8SHong Zhang 49996bce7ff8SHong Zhang PetscFunctionBegin; 50006bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 50016bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5002*fca92195SBarry Smith ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5003*fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 50046bce7ff8SHong Zhang ics = ic; 50056bce7ff8SHong Zhang 5006914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 5007*fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5008914a18a2SHong Zhang 50096bce7ff8SHong Zhang for (i=0; i<n; i++){ 50106bce7ff8SHong Zhang /* zero rtmp */ 50116bce7ff8SHong Zhang /* L part */ 50126bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 50136bce7ff8SHong Zhang bjtmp = bj + bi[i]; 5014914a18a2SHong Zhang for (j=0; j<nz; j++){ 5015914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5016914a18a2SHong Zhang } 50176bce7ff8SHong Zhang 50186bce7ff8SHong Zhang /* U part */ 50191a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 50201a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 50211a83e813SShri Abhyankar for (j=0; j<nz; j++){ 50221a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 50231a83e813SShri Abhyankar } 50241a83e813SShri Abhyankar 50251a83e813SShri Abhyankar /* load in initial (unfactored row) */ 50261a83e813SShri Abhyankar nz = 
ai[r[i]+1] - ai[r[i]]; 50271a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 50281a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 50291a83e813SShri Abhyankar for (j=0; j<nz; j++) { 50301a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 50311a83e813SShri Abhyankar } 50321a83e813SShri Abhyankar 50331a83e813SShri Abhyankar /* elimination */ 50341a83e813SShri Abhyankar bjtmp = bj + bi[i]; 50351a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 50361a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 50371a83e813SShri Abhyankar row = bjtmp[k]; 50381a83e813SShri Abhyankar pc = rtmp + bs2*row; 50391a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 50401a83e813SShri Abhyankar if (flg) { 50411a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 50421a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 50431a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 50441a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 50451a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 50461a83e813SShri Abhyankar for (j=0; j<nz; j++) { 50471a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 50481a83e813SShri Abhyankar } 50491a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 50501a83e813SShri Abhyankar } 50511a83e813SShri Abhyankar } 50521a83e813SShri Abhyankar 50531a83e813SShri Abhyankar /* finished row so stick it into b->a */ 50541a83e813SShri Abhyankar /* L part */ 50551a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 50561a83e813SShri Abhyankar pj = b->j + bi[i] ; 50571a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 50581a83e813SShri Abhyankar for (j=0; j<nz; j++) { 50591a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 
50601a83e813SShri Abhyankar } 50611a83e813SShri Abhyankar 50621a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 50631a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 50641a83e813SShri Abhyankar pj = b->j + bdiag[i]; 50651a83e813SShri Abhyankar /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 50661a83e813SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 50671a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 50681a83e813SShri Abhyankar 50691a83e813SShri Abhyankar /* U part */ 50701a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 50711a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 50721a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 50731a83e813SShri Abhyankar for (j=0; j<nz; j++){ 50741a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 50751a83e813SShri Abhyankar } 50761a83e813SShri Abhyankar } 50771a83e813SShri Abhyankar 50781a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 5079*fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 50801a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 50811a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 50821a83e813SShri Abhyankar 50831a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 50841a83e813SShri Abhyankar ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 50851a83e813SShri Abhyankar PetscFunctionReturn(0); 50861a83e813SShri Abhyankar } 50871a83e813SShri Abhyankar 50886bce7ff8SHong Zhang /* 50896bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 509016a2bf60SHong Zhang See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 509116a2bf60SHong Zhang because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 
50926bce7ff8SHong Zhang */ 5093c0c7eb62SShri Abhyankar 50946bce7ff8SHong Zhang #undef __FUNCT__ 50956bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 50966bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 50976bce7ff8SHong Zhang { 50986bce7ff8SHong Zhang 50996bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 51006bce7ff8SHong Zhang PetscErrorCode ierr; 510116a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 510235aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 510335aa4fcfSShri Abhyankar 510435aa4fcfSShri Abhyankar PetscFunctionBegin; 510535aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 510635aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 510735aa4fcfSShri Abhyankar 510835aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 510935aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 511035aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 511135aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 511235aa4fcfSShri Abhyankar if (!b->diag){ 511335aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 511435aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 511535aa4fcfSShri Abhyankar } 511635aa4fcfSShri Abhyankar bdiag = b->diag; 511735aa4fcfSShri Abhyankar 511835aa4fcfSShri Abhyankar if (n > 0) { 511935aa4fcfSShri Abhyankar ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 512035aa4fcfSShri Abhyankar } 512135aa4fcfSShri Abhyankar 512235aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 512335aa4fcfSShri Abhyankar bi = 
b->i; 512435aa4fcfSShri Abhyankar bj = b->j; 512535aa4fcfSShri Abhyankar 512635aa4fcfSShri Abhyankar /* L part */ 512735aa4fcfSShri Abhyankar bi[0] = 0; 512835aa4fcfSShri Abhyankar for (i=0; i<n; i++){ 512935aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 513035aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 513135aa4fcfSShri Abhyankar aj = a->j + ai[i]; 513235aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 513335aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 513435aa4fcfSShri Abhyankar } 513535aa4fcfSShri Abhyankar } 513635aa4fcfSShri Abhyankar 513735aa4fcfSShri Abhyankar /* U part */ 513835aa4fcfSShri Abhyankar bi_temp = bi[n]; 513935aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 514035aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 514135aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 514235aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 514335aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 514435aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 514535aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 514635aa4fcfSShri Abhyankar } 514735aa4fcfSShri Abhyankar /* diag[i] */ 514835aa4fcfSShri Abhyankar *bj = i; bj++; 514935aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 515035aa4fcfSShri Abhyankar } 515135aa4fcfSShri Abhyankar PetscFunctionReturn(0); 515235aa4fcfSShri Abhyankar } 515335aa4fcfSShri Abhyankar 515435aa4fcfSShri Abhyankar #undef __FUNCT__ 515516a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 515616a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 515716a2bf60SHong Zhang { 515816a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 515916a2bf60SHong Zhang IS isicol; 516016a2bf60SHong Zhang PetscErrorCode ierr; 516116a2bf60SHong Zhang const PetscInt *r,*ic; 51627fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 516316a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 516416a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 516516a2bf60SHong 
Zhang PetscInt i,levels,diagonal_fill; 51667fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 516716a2bf60SHong Zhang PetscReal f; 516816a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 516916a2bf60SHong Zhang PetscBT lnkbt; 517016a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 517116a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 517216a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 517316a2bf60SHong Zhang PetscTruth missing; 51747fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 517516a2bf60SHong Zhang 517616a2bf60SHong Zhang PetscFunctionBegin; 517716a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 517816a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 517916a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 518016a2bf60SHong Zhang 518116a2bf60SHong Zhang f = info->fill; 518216a2bf60SHong Zhang levels = (PetscInt)info->levels; 518316a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 518416a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 518516a2bf60SHong Zhang 518616a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 518716a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 51887fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 518916a2bf60SHong Zhang 51907fa3a6a0SHong Zhang if (!levels && both_identity) { 519116a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 519216a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 519316a2bf60SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 51947fa3a6a0SHong Zhang /* set MatSolve routines */ 
51957fa3a6a0SHong Zhang switch (bs){ 51967fa3a6a0SHong Zhang case 2: 519735aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2; 519835aa4fcfSShri Abhyankar break; 519935aa4fcfSShri Abhyankar case 3: 520035aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2; 520135aa4fcfSShri Abhyankar break; 520235aa4fcfSShri Abhyankar case 4: 520335aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2; 520435aa4fcfSShri Abhyankar break; 520535aa4fcfSShri Abhyankar case 5: 520635aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2; 520735aa4fcfSShri Abhyankar break; 520835aa4fcfSShri Abhyankar case 6: 520935aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2; 521035aa4fcfSShri Abhyankar break; 521135aa4fcfSShri Abhyankar case 7: 521235aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2; 521335aa4fcfSShri Abhyankar break; 521435aa4fcfSShri Abhyankar default: 521535aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2; 521635aa4fcfSShri Abhyankar break; 521735aa4fcfSShri Abhyankar } 521835aa4fcfSShri Abhyankar 521935aa4fcfSShri Abhyankar fact->factor = MAT_FACTOR_ILU; 522035aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 522135aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 522235aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 522335aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 522435aa4fcfSShri Abhyankar b->row = isrow; 522535aa4fcfSShri Abhyankar b->col = iscol; 522635aa4fcfSShri Abhyankar b->icol = isicol; 522735aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 522835aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 522935aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 523035aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 523135aa4fcfSShri Abhyankar PetscFunctionReturn(0); 523235aa4fcfSShri Abhyankar } 523335aa4fcfSShri Abhyankar 523435aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 523535aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 523635aa4fcfSShri Abhyankar 523735aa4fcfSShri Abhyankar /* get new row pointers */ 523835aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 523935aa4fcfSShri Abhyankar bi[0] = 0; 524035aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 524135aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 524235aa4fcfSShri Abhyankar bdiag[0] = 0; 524335aa4fcfSShri Abhyankar 5244*fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 524535aa4fcfSShri Abhyankar 524635aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active row */ 524735aa4fcfSShri Abhyankar nlnk = n + 1; 524835aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 524935aa4fcfSShri Abhyankar 525035aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 525135aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 525235aa4fcfSShri Abhyankar current_space = free_space; 525335aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 525435aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 525535aa4fcfSShri Abhyankar 525635aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 525735aa4fcfSShri Abhyankar nzi = 0; 525835aa4fcfSShri Abhyankar /* copy current row into linked list */ 525935aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 526035aa4fcfSShri Abhyankar if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original 
ordering %D in permuted ordering %D",r[i],i); 526135aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 526235aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 526335aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 526435aa4fcfSShri Abhyankar nzi += nlnk; 526535aa4fcfSShri Abhyankar 526635aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 526735aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 526835aa4fcfSShri Abhyankar fm = n; 526935aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 527035aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 527135aa4fcfSShri Abhyankar lnk[fm] = i; 527235aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 527335aa4fcfSShri Abhyankar nzi++; dcount++; 527435aa4fcfSShri Abhyankar } 527535aa4fcfSShri Abhyankar 527635aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 527735aa4fcfSShri Abhyankar nzbd = 0; 527835aa4fcfSShri Abhyankar prow = lnk[n]; 527935aa4fcfSShri Abhyankar while (prow < i) { 528035aa4fcfSShri Abhyankar nnz = bdiag[prow]; 528135aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 528235aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 528335aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 528435aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 528535aa4fcfSShri Abhyankar nzi += nlnk; 528635aa4fcfSShri Abhyankar prow = lnk[prow]; 528735aa4fcfSShri Abhyankar nzbd++; 528835aa4fcfSShri Abhyankar } 528935aa4fcfSShri Abhyankar bdiag[i] = nzbd; 529035aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 529135aa4fcfSShri Abhyankar 529235aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 529335aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 529435aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 529535aa4fcfSShri Abhyankar ierr = 
PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 529635aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 529735aa4fcfSShri Abhyankar reallocs++; 529835aa4fcfSShri Abhyankar } 529935aa4fcfSShri Abhyankar 530035aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 530135aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 530235aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 530335aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 530435aa4fcfSShri Abhyankar 530535aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 530635aa4fcfSShri Abhyankar if (*(bj_ptr[i]+bdiag[i]) != i) { 530735aa4fcfSShri Abhyankar SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 530835aa4fcfSShri Abhyankar try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 530935aa4fcfSShri Abhyankar } 531035aa4fcfSShri Abhyankar 531135aa4fcfSShri Abhyankar current_space->array += nzi; 531235aa4fcfSShri Abhyankar current_space->local_used += nzi; 531335aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 531435aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 531535aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 531635aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 531735aa4fcfSShri Abhyankar } 531835aa4fcfSShri Abhyankar 531935aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 532035aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 532135aa4fcfSShri Abhyankar 532235aa4fcfSShri Abhyankar /* destroy list of free space and other temporary arrays */ 532335aa4fcfSShri Abhyankar ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 532435aa4fcfSShri Abhyankar 532535aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new 
datastructure; */ 532635aa4fcfSShri Abhyankar ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 532735aa4fcfSShri Abhyankar 532835aa4fcfSShri Abhyankar ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 532935aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5330*fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 533135aa4fcfSShri Abhyankar 533235aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 533335aa4fcfSShri Abhyankar { 533435aa4fcfSShri Abhyankar PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 533535aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 533635aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 533735aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 533835aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 533935aa4fcfSShri Abhyankar if (diagonal_fill) { 534035aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 534135aa4fcfSShri Abhyankar } 534235aa4fcfSShri Abhyankar } 534335aa4fcfSShri Abhyankar #endif 534435aa4fcfSShri Abhyankar 534535aa4fcfSShri Abhyankar /* put together the new matrix */ 534635aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 534735aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 534835aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 534935aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 535035aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 535135aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 535235aa4fcfSShri Abhyankar ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 535335aa4fcfSShri Abhyankar b->j = bj; 535435aa4fcfSShri Abhyankar b->i = bi; 
535535aa4fcfSShri Abhyankar b->diag = bdiag; 535635aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 535735aa4fcfSShri Abhyankar b->ilen = 0; 535835aa4fcfSShri Abhyankar b->imax = 0; 535935aa4fcfSShri Abhyankar b->row = isrow; 536035aa4fcfSShri Abhyankar b->col = iscol; 536135aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 536235aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 536335aa4fcfSShri Abhyankar b->icol = isicol; 536435aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 536535aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 536635aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 536735aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 536835aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 536935aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = reallocs; 537035aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = f; 537135aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 537235aa4fcfSShri Abhyankar (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 537335aa4fcfSShri Abhyankar /* set MatSolve routines */ 537435aa4fcfSShri Abhyankar if (both_identity){ 537535aa4fcfSShri Abhyankar switch (bs){ 537635aa4fcfSShri Abhyankar case 2: 537735aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2; 537835aa4fcfSShri Abhyankar break; 537935aa4fcfSShri Abhyankar case 3: 538035aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2; 538135aa4fcfSShri Abhyankar break; 538235aa4fcfSShri Abhyankar case 4: 538335aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2; 538435aa4fcfSShri Abhyankar break; 538535aa4fcfSShri Abhyankar case 5: 
538635aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2; 538735aa4fcfSShri Abhyankar break; 538835aa4fcfSShri Abhyankar case 6: 538935aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2; 539035aa4fcfSShri Abhyankar break; 539135aa4fcfSShri Abhyankar case 7: 539235aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2; 539335aa4fcfSShri Abhyankar break; 539435aa4fcfSShri Abhyankar default: 539535aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2; 539635aa4fcfSShri Abhyankar break; 539735aa4fcfSShri Abhyankar } 539835aa4fcfSShri Abhyankar } else { 539935aa4fcfSShri Abhyankar switch (bs){ 540035aa4fcfSShri Abhyankar case 2: 540135aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct_v2; 540235aa4fcfSShri Abhyankar break; 540335aa4fcfSShri Abhyankar case 3: 540435aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct_v2; 540535aa4fcfSShri Abhyankar break; 540635aa4fcfSShri Abhyankar case 4: 540735aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct_v2; 540835aa4fcfSShri Abhyankar break; 540935aa4fcfSShri Abhyankar case 5: 541035aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct_v2; 541135aa4fcfSShri Abhyankar break; 541235aa4fcfSShri Abhyankar case 6: 541335aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct_v2; 541435aa4fcfSShri Abhyankar break; 541535aa4fcfSShri Abhyankar case 7: 541635aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct_v2; 541735aa4fcfSShri Abhyankar break; 541835aa4fcfSShri Abhyankar default: 541935aa4fcfSShri Abhyankar fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct_v2; 542035aa4fcfSShri Abhyankar break; 542135aa4fcfSShri Abhyankar } 542235aa4fcfSShri Abhyankar } 542335aa4fcfSShri Abhyankar PetscFunctionReturn(0); 542435aa4fcfSShri Abhyankar } 
542535aa4fcfSShri Abhyankar 542635aa4fcfSShri Abhyankar 54274e2b4712SSatish Balay /* 54284e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 54294e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 54304e2b4712SSatish Balay Not a good example of code reuse. 54314e2b4712SSatish Balay */ 54324a2ae208SSatish Balay #undef __FUNCT__ 54334a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 54340481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 54354e2b4712SSatish Balay { 54364e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 54374e2b4712SSatish Balay IS isicol; 54386849ba73SBarry Smith PetscErrorCode ierr; 54395d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 54405d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5441a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5442d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 544341df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 5444329f5518SBarry Smith PetscReal f; 5445c0c7eb62SShri Abhyankar PetscTruth newdatastruct = PETSC_FALSE; 54464e2b4712SSatish Balay 54474e2b4712SSatish Balay PetscFunctionBegin; 544816a2bf60SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 544916a2bf60SHong Zhang if (newdatastruct){ 545016a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 545116a2bf60SHong Zhang PetscFunctionReturn(0); 545216a2bf60SHong Zhang } 545316a2bf60SHong Zhang 54546bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 54556bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 
54566bce7ff8SHong Zhang 5457435faa5fSBarry Smith f = info->fill; 5458690b6cddSBarry Smith levels = (PetscInt)info->levels; 5459690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 54604c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 546116a2bf60SHong Zhang 5462667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5463667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 54647d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 5465309c388cSBarry Smith 546641df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 546716a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 54686bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 54696bce7ff8SHong Zhang 5470719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 5471719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 5472bb3d539aSBarry Smith b->row = isrow; 5473bb3d539aSBarry Smith b->col = iscol; 5474bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5475bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5476bb3d539aSBarry Smith b->icol = isicol; 5477bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5478b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 54796bce7ff8SHong Zhang PetscFunctionReturn(0); 54806bce7ff8SHong Zhang } 54816bce7ff8SHong Zhang 54826bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 54834e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 54844e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 54854e2b4712SSatish Balay 54864e2b4712SSatish Balay /* get new row pointers */ 5487690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 54884e2b4712SSatish Balay ainew[0] = 0; 54894e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 5490690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 5491690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 54924e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 5493690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 54944e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 5495690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 54964e2b4712SSatish Balay /* im is level for each filled value */ 5497690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 54984e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 5499690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 55004e2b4712SSatish Balay dloc[0] = 0; 55014e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 5502435faa5fSBarry Smith 5503435faa5fSBarry Smith /* copy prow into linked list */ 55044e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 55053b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 55064e2b4712SSatish Balay xi = aj + ai[r[prow]]; 
55074e2b4712SSatish Balay fill[n] = n; 5508435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 55094e2b4712SSatish Balay while (nz--) { 55104e2b4712SSatish Balay fm = n; 55114e2b4712SSatish Balay idx = ic[*xi++]; 55124e2b4712SSatish Balay do { 55134e2b4712SSatish Balay m = fm; 55144e2b4712SSatish Balay fm = fill[m]; 55154e2b4712SSatish Balay } while (fm < idx); 55164e2b4712SSatish Balay fill[m] = idx; 55174e2b4712SSatish Balay fill[idx] = fm; 55184e2b4712SSatish Balay im[idx] = 0; 55194e2b4712SSatish Balay } 5520435faa5fSBarry Smith 5521435faa5fSBarry Smith /* make sure diagonal entry is included */ 5522435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 5523435faa5fSBarry Smith fm = n; 5524435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 5525435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5526435faa5fSBarry Smith fill[fm] = prow; 5527435faa5fSBarry Smith im[prow] = 0; 5528435faa5fSBarry Smith nzf++; 5529335d9088SBarry Smith dcount++; 5530435faa5fSBarry Smith } 5531435faa5fSBarry Smith 55324e2b4712SSatish Balay nzi = 0; 55334e2b4712SSatish Balay row = fill[n]; 55344e2b4712SSatish Balay while (row < prow) { 55354e2b4712SSatish Balay incrlev = im[row] + 1; 55364e2b4712SSatish Balay nz = dloc[row]; 5537435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 55384e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 55394e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 55404e2b4712SSatish Balay fm = row; 55414e2b4712SSatish Balay while (nnz-- > 0) { 55424e2b4712SSatish Balay idx = *xi++; 55434e2b4712SSatish Balay if (*flev + incrlev > levels) { 55444e2b4712SSatish Balay flev++; 55454e2b4712SSatish Balay continue; 55464e2b4712SSatish Balay } 55474e2b4712SSatish Balay do { 55484e2b4712SSatish Balay m = fm; 55494e2b4712SSatish Balay fm = fill[m]; 55504e2b4712SSatish Balay } while (fm < idx); 55514e2b4712SSatish Balay if (fm != idx) { 55524e2b4712SSatish Balay im[idx] = *flev + 
incrlev; 55534e2b4712SSatish Balay fill[m] = idx; 55544e2b4712SSatish Balay fill[idx] = fm; 55554e2b4712SSatish Balay fm = idx; 55564e2b4712SSatish Balay nzf++; 5557ecf371e4SBarry Smith } else { 55584e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 55594e2b4712SSatish Balay } 55604e2b4712SSatish Balay flev++; 55614e2b4712SSatish Balay } 55624e2b4712SSatish Balay row = fill[row]; 55634e2b4712SSatish Balay nzi++; 55644e2b4712SSatish Balay } 55654e2b4712SSatish Balay /* copy new filled row into permanent storage */ 55664e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 55674e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 5568ecf371e4SBarry Smith 5569ecf371e4SBarry Smith /* estimate how much additional space we will need */ 5570ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5571ecf371e4SBarry Smith /* just double the memory each time */ 5572690b6cddSBarry Smith PetscInt maxadd = jmax; 5573ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 55744e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 55754e2b4712SSatish Balay jmax += maxadd; 5576ecf371e4SBarry Smith 5577ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 55785d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 55795d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5580606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 55815d0c19d7SBarry Smith ajnew = xitmp; 55825d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 55835d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5584606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 55855d0c19d7SBarry Smith ajfill = xitmp; 5586eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 55874e2b4712SSatish Balay } 55885d0c19d7SBarry Smith xitmp = 
ajnew + ainew[prow]; 55894e2b4712SSatish Balay flev = ajfill + ainew[prow]; 55904e2b4712SSatish Balay dloc[prow] = nzi; 55914e2b4712SSatish Balay fm = fill[n]; 55924e2b4712SSatish Balay while (nzf--) { 55935d0c19d7SBarry Smith *xitmp++ = fm; 55944e2b4712SSatish Balay *flev++ = im[fm]; 55954e2b4712SSatish Balay fm = fill[fm]; 55964e2b4712SSatish Balay } 5597435faa5fSBarry Smith /* make sure row has diagonal entry */ 5598435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 559977431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 56002401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5601435faa5fSBarry Smith } 56024e2b4712SSatish Balay } 5603606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 56044e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 56054e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5606606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 5607606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 56084e2b4712SSatish Balay 56096cf91177SBarry Smith #if defined(PETSC_USE_INFO) 56104e2b4712SSatish Balay { 5611329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5612ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5613ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5614ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5615ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5616335d9088SBarry Smith if (diagonal_fill) { 5617ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5618335d9088SBarry Smith } 56194e2b4712SSatish Balay } 562063ba0a88SBarry Smith #endif 56214e2b4712SSatish Balay 56224e2b4712SSatish 
Balay /* put together the new matrix */ 5623719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5624719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5625719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 5626e6b907acSBarry Smith b->free_a = PETSC_TRUE; 5627e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 56287c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 5629a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 56304e2b4712SSatish Balay b->j = ajnew; 56314e2b4712SSatish Balay b->i = ainew; 56324e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 56334e2b4712SSatish Balay b->diag = dloc; 56347f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 56354e2b4712SSatish Balay b->ilen = 0; 56364e2b4712SSatish Balay b->imax = 0; 56374e2b4712SSatish Balay b->row = isrow; 56384e2b4712SSatish Balay b->col = iscol; 5639bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5640c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5641c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5642e51c0b9cSSatish Balay b->icol = isicol; 564387828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 56444e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 
56454e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 5646719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 56474e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 56484e2b4712SSatish Balay 5649719d5645SBarry Smith (fact)->info.factor_mallocs = reallocate; 5650719d5645SBarry Smith (fact)->info.fill_ratio_given = f; 5651719d5645SBarry Smith (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 56526bce7ff8SHong Zhang 565341df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 56548661488fSKris Buschelman PetscFunctionReturn(0); 56558661488fSKris Buschelman } 56568661488fSKris Buschelman 5657732ee342SKris Buschelman #undef __FUNCT__ 56587e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5659dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 56607e7071cdSKris Buschelman { 566112272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 566212272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 56635a9542e3SKris Buschelman PetscFunctionBegin; 56647cf1b8d3SKris Buschelman /* Undo Column scaling */ 56657cf1b8d3SKris Buschelman /* while (nz--) { */ 56667cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 56677cf1b8d3SKris Buschelman /* } */ 5668c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 5669c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 56707cf1b8d3SKris Buschelman PetscFunctionReturn(0); 56717cf1b8d3SKris Buschelman } 56727cf1b8d3SKris Buschelman 56737cf1b8d3SKris Buschelman #undef __FUNCT__ 56747cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5675dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 56767cf1b8d3SKris Buschelman { 56777cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5678b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 56792aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 56805a9542e3SKris Buschelman PetscFunctionBegin; 56810b9da03eSKris Buschelman /* Is this really necessary? */ 568220235379SKris Buschelman while (nz--) { 56830b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 56847e7071cdSKris Buschelman } 5685c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 56867e7071cdSKris Buschelman PetscFunctionReturn(0); 56877e7071cdSKris Buschelman } 56887e7071cdSKris Buschelman 5689732ee342SKris Buschelman 5690