1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry Smith v = aa 
+ diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + 
diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120f1af5d2fSBarry Smith { 121f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122dfbe8321SBarry Smith PetscErrorCode ierr; 123690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 125f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12687828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12787828ca2SBarry Smith PetscScalar *x,*b; 
128f1af5d2fSBarry Smith 129f1af5d2fSBarry Smith PetscFunctionBegin; 130ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133f1af5d2fSBarry Smith 134f1af5d2fSBarry Smith /* forward solve the U^T */ 135f1af5d2fSBarry Smith idx = 0; 136f1af5d2fSBarry Smith for (i=0; i<n; i++) { 137f1af5d2fSBarry Smith 138f1af5d2fSBarry Smith v = aa + 9*diag[i]; 139f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 140ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144f1af5d2fSBarry Smith v += 9; 145f1af5d2fSBarry Smith 146f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 147f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 148f1af5d2fSBarry Smith while (nz--) { 149f1af5d2fSBarry Smith oidx = 3*(*vi++); 150f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153f1af5d2fSBarry Smith v += 9; 154f1af5d2fSBarry Smith } 155f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156f1af5d2fSBarry Smith idx += 3; 157f1af5d2fSBarry Smith } 158f1af5d2fSBarry Smith /* backward solve the L^T */ 159f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 160f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 161f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 162f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 163f1af5d2fSBarry Smith idt = 3*i; 164f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165f1af5d2fSBarry Smith while (nz--) { 166f1af5d2fSBarry Smith idx = 3*(*vi--); 167f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169f1af5d2fSBarry Smith x[idx+2] 
-= v[6]*s1 + v[7]*s2 + v[8]*s3; 170f1af5d2fSBarry Smith v -= 9; 171f1af5d2fSBarry Smith } 172f1af5d2fSBarry Smith } 1731ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176f1af5d2fSBarry Smith PetscFunctionReturn(0); 177f1af5d2fSBarry Smith } 178f1af5d2fSBarry Smith 1794a2ae208SSatish Balay #undef __FUNCT__ 1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182f1af5d2fSBarry Smith { 183f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184dfbe8321SBarry Smith PetscErrorCode ierr; 185690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 187f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18987828ca2SBarry Smith PetscScalar *x,*b; 190f1af5d2fSBarry Smith 191f1af5d2fSBarry Smith PetscFunctionBegin; 192ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1931ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195f1af5d2fSBarry Smith 196f1af5d2fSBarry Smith /* forward solve the U^T */ 197f1af5d2fSBarry Smith idx = 0; 198f1af5d2fSBarry Smith for (i=0; i<n; i++) { 199f1af5d2fSBarry Smith 200f1af5d2fSBarry Smith v = aa + 16*diag[i]; 201f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 202ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + 
v[14]*x3 + v[15]*x4; 207f1af5d2fSBarry Smith v += 16; 208f1af5d2fSBarry Smith 209f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 210f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 211f1af5d2fSBarry Smith while (nz--) { 212f1af5d2fSBarry Smith oidx = 4*(*vi++); 213f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217f1af5d2fSBarry Smith v += 16; 218f1af5d2fSBarry Smith } 219f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220f1af5d2fSBarry Smith idx += 4; 221f1af5d2fSBarry Smith } 222f1af5d2fSBarry Smith /* backward solve the L^T */ 223f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 224f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 225f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 226f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 227f1af5d2fSBarry Smith idt = 4*i; 228f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229f1af5d2fSBarry Smith while (nz--) { 230f1af5d2fSBarry Smith idx = 4*(*vi--); 231f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235f1af5d2fSBarry Smith v -= 16; 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith } 2381ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241f1af5d2fSBarry Smith PetscFunctionReturn(0); 242f1af5d2fSBarry Smith } 243f1af5d2fSBarry Smith 2444a2ae208SSatish Balay #undef __FUNCT__ 2454a2ae208SSatish Balay #define 
__FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247f1af5d2fSBarry Smith { 248f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249dfbe8321SBarry Smith PetscErrorCode ierr; 250690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 252f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 25387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25487828ca2SBarry Smith PetscScalar *x,*b; 255f1af5d2fSBarry Smith 256f1af5d2fSBarry Smith PetscFunctionBegin; 257ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2581ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2591ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260f1af5d2fSBarry Smith 261f1af5d2fSBarry Smith /* forward solve the U^T */ 262f1af5d2fSBarry Smith idx = 0; 263f1af5d2fSBarry Smith for (i=0; i<n; i++) { 264f1af5d2fSBarry Smith 265f1af5d2fSBarry Smith v = aa + 25*diag[i]; 266f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 267ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273f1af5d2fSBarry Smith v += 25; 274f1af5d2fSBarry Smith 275f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 276f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 277f1af5d2fSBarry Smith while (nz--) { 278f1af5d2fSBarry Smith oidx = 5*(*vi++); 279f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280f1af5d2fSBarry Smith x[oidx+1] 
-= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284f1af5d2fSBarry Smith v += 25; 285f1af5d2fSBarry Smith } 286f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287f1af5d2fSBarry Smith idx += 5; 288f1af5d2fSBarry Smith } 289f1af5d2fSBarry Smith /* backward solve the L^T */ 290f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 291f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 292f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 293f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 294f1af5d2fSBarry Smith idt = 5*i; 295f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296f1af5d2fSBarry Smith while (nz--) { 297f1af5d2fSBarry Smith idx = 5*(*vi--); 298f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303f1af5d2fSBarry Smith v -= 25; 304f1af5d2fSBarry Smith } 305f1af5d2fSBarry Smith } 3061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309f1af5d2fSBarry Smith PetscFunctionReturn(0); 310f1af5d2fSBarry Smith } 311f1af5d2fSBarry Smith 3124a2ae208SSatish Balay #undef __FUNCT__ 3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314dfbe8321SBarry Smith 
PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315f1af5d2fSBarry Smith { 316f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317dfbe8321SBarry Smith PetscErrorCode ierr; 318690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 320f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 32187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 32287828ca2SBarry Smith PetscScalar *x,*b; 323f1af5d2fSBarry Smith 324f1af5d2fSBarry Smith PetscFunctionBegin; 325ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328f1af5d2fSBarry Smith 329f1af5d2fSBarry Smith /* forward solve the U^T */ 330f1af5d2fSBarry Smith idx = 0; 331f1af5d2fSBarry Smith for (i=0; i<n; i++) { 332f1af5d2fSBarry Smith 333f1af5d2fSBarry Smith v = aa + 36*diag[i]; 334f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 335ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336ef66eb69SBarry Smith x6 = x[5+idx]; 337f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343f1af5d2fSBarry Smith v += 36; 344f1af5d2fSBarry Smith 345f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 346f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 347f1af5d2fSBarry Smith while (nz--) { 348f1af5d2fSBarry Smith oidx = 6*(*vi++); 
349f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355f1af5d2fSBarry Smith v += 36; 356f1af5d2fSBarry Smith } 357f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358f1af5d2fSBarry Smith x[5+idx] = s6; 359f1af5d2fSBarry Smith idx += 6; 360f1af5d2fSBarry Smith } 361f1af5d2fSBarry Smith /* backward solve the L^T */ 362f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 363f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 364f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 365f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 366f1af5d2fSBarry Smith idt = 6*i; 367f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368f1af5d2fSBarry Smith s6 = x[5+idt]; 369f1af5d2fSBarry Smith while (nz--) { 370f1af5d2fSBarry Smith idx = 6*(*vi--); 371f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 372f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377f1af5d2fSBarry Smith v -= 36; 378f1af5d2fSBarry Smith } 
379f1af5d2fSBarry Smith } 3801ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383f1af5d2fSBarry Smith PetscFunctionReturn(0); 384f1af5d2fSBarry Smith } 385f1af5d2fSBarry Smith 3864a2ae208SSatish Balay #undef __FUNCT__ 3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389f1af5d2fSBarry Smith { 390f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391dfbe8321SBarry Smith PetscErrorCode ierr; 392690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 394f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 39587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 39687828ca2SBarry Smith PetscScalar *x,*b; 397f1af5d2fSBarry Smith 398f1af5d2fSBarry Smith PetscFunctionBegin; 399ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4001ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402f1af5d2fSBarry Smith 403f1af5d2fSBarry Smith /* forward solve the U^T */ 404f1af5d2fSBarry Smith idx = 0; 405f1af5d2fSBarry Smith for (i=0; i<n; i++) { 406f1af5d2fSBarry Smith 407f1af5d2fSBarry Smith v = aa + 49*diag[i]; 408f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 409ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 411f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + 
v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 417f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418f1af5d2fSBarry Smith v += 49; 419f1af5d2fSBarry Smith 420f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 421f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 422f1af5d2fSBarry Smith while (nz--) { 423f1af5d2fSBarry Smith oidx = 7*(*vi++); 424f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431f1af5d2fSBarry Smith v += 49; 432f1af5d2fSBarry Smith } 433f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 435f1af5d2fSBarry Smith idx += 7; 436f1af5d2fSBarry Smith } 437f1af5d2fSBarry Smith /* backward solve the L^T */ 438f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 439f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 440f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 441f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 
442f1af5d2fSBarry Smith idt = 7*i; 443f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 445f1af5d2fSBarry Smith while (nz--) { 446f1af5d2fSBarry Smith idx = 7*(*vi--); 447f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454f1af5d2fSBarry Smith v -= 49; 455f1af5d2fSBarry Smith } 456f1af5d2fSBarry Smith } 4571ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4581ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460f1af5d2fSBarry Smith PetscFunctionReturn(0); 461f1af5d2fSBarry Smith } 462f1af5d2fSBarry Smith 463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4644a2ae208SSatish Balay #undef __FUNCT__ 4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467f1af5d2fSBarry Smith { 468f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 4706849ba73SBarry Smith PetscErrorCode ierr; 4715d0c19d7SBarry Smith const 
PetscInt *r,*c,*rout,*cout; 4725d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473690b6cddSBarry Smith PetscInt *diag = a->diag; 474f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 47587828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 476f1af5d2fSBarry Smith 477f1af5d2fSBarry Smith PetscFunctionBegin; 4781ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480f1af5d2fSBarry Smith t = a->solve_work; 481f1af5d2fSBarry Smith 482f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484f1af5d2fSBarry Smith 485f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 486f1af5d2fSBarry Smith for (i=0; i<n; i++) { 487f1af5d2fSBarry Smith t[i] = b[c[i]]; 488f1af5d2fSBarry Smith } 489f1af5d2fSBarry Smith 490f1af5d2fSBarry Smith /* forward solve the U^T */ 491f1af5d2fSBarry Smith for (i=0; i<n; i++) { 492f1af5d2fSBarry Smith 493f1af5d2fSBarry Smith v = aa + diag[i]; 494f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 495f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 496f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 497f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 498f1af5d2fSBarry Smith while (nz--) { 499f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 500f1af5d2fSBarry Smith } 501f1af5d2fSBarry Smith t[i] = s1; 502f1af5d2fSBarry Smith } 503f1af5d2fSBarry Smith /* backward solve the L^T */ 504f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 505f1af5d2fSBarry Smith v = aa + diag[i] - 1; 506f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 507f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 508f1af5d2fSBarry Smith s1 = t[i]; 509f1af5d2fSBarry Smith while (nz--) { 510f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 511f1af5d2fSBarry Smith } 512f1af5d2fSBarry Smith } 513f1af5d2fSBarry Smith 514f1af5d2fSBarry Smith /* copy t into x according to permutation */ 515f1af5d2fSBarry 
Smith for (i=0; i<n; i++) { 516f1af5d2fSBarry Smith x[r[i]] = t[i]; 517f1af5d2fSBarry Smith } 518f1af5d2fSBarry Smith 519f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5211ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5221ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 524f1af5d2fSBarry Smith PetscFunctionReturn(0); 525f1af5d2fSBarry Smith } 526f1af5d2fSBarry Smith 5274a2ae208SSatish Balay #undef __FUNCT__ 5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530f1af5d2fSBarry Smith { 531f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5336849ba73SBarry Smith PetscErrorCode ierr; 5345d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5355d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 537f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53887828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 53987828ca2SBarry Smith PetscScalar *x,*b,*t; 540f1af5d2fSBarry Smith 541f1af5d2fSBarry Smith PetscFunctionBegin; 5421ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5431ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544f1af5d2fSBarry Smith t = a->solve_work; 545f1af5d2fSBarry Smith 546f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548f1af5d2fSBarry Smith 549f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 550f1af5d2fSBarry Smith ii = 0; 551f1af5d2fSBarry Smith for (i=0; i<n; i++) { 552f1af5d2fSBarry Smith ic = 2*c[i]; 553f1af5d2fSBarry 
Smith t[ii] = b[ic]; 554f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 555f1af5d2fSBarry Smith ii += 2; 556f1af5d2fSBarry Smith } 557f1af5d2fSBarry Smith 558f1af5d2fSBarry Smith /* forward solve the U^T */ 559f1af5d2fSBarry Smith idx = 0; 560f1af5d2fSBarry Smith for (i=0; i<n; i++) { 561f1af5d2fSBarry Smith 562f1af5d2fSBarry Smith v = aa + 4*diag[i]; 563f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 564f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 565f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 566f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 567f1af5d2fSBarry Smith v += 4; 568f1af5d2fSBarry Smith 569f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 570f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 571f1af5d2fSBarry Smith while (nz--) { 572f1af5d2fSBarry Smith oidx = 2*(*vi++); 573f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 574f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 575f1af5d2fSBarry Smith v += 4; 576f1af5d2fSBarry Smith } 577f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 578f1af5d2fSBarry Smith idx += 2; 579f1af5d2fSBarry Smith } 580f1af5d2fSBarry Smith /* backward solve the L^T */ 581f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 582f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 583f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 584f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 585f1af5d2fSBarry Smith idt = 2*i; 586f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 587f1af5d2fSBarry Smith while (nz--) { 588f1af5d2fSBarry Smith idx = 2*(*vi--); 589f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 590f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 591f1af5d2fSBarry Smith v -= 4; 592f1af5d2fSBarry Smith } 593f1af5d2fSBarry Smith } 594f1af5d2fSBarry Smith 595f1af5d2fSBarry Smith /* copy t into x according to permutation */ 596f1af5d2fSBarry Smith ii = 0; 597f1af5d2fSBarry Smith for (i=0; i<n; i++) { 598f1af5d2fSBarry Smith ir = 2*r[i]; 599f1af5d2fSBarry Smith x[ir] = t[ii]; 600f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 601f1af5d2fSBarry Smith 
ii += 2; 602f1af5d2fSBarry Smith } 603f1af5d2fSBarry Smith 604f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609f1af5d2fSBarry Smith PetscFunctionReturn(0); 610f1af5d2fSBarry Smith } 611f1af5d2fSBarry Smith 6124a2ae208SSatish Balay #undef __FUNCT__ 6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615f1af5d2fSBarry Smith { 616f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6186849ba73SBarry Smith PetscErrorCode ierr; 6195d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 6205d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 622f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 62387828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 62487828ca2SBarry Smith PetscScalar *x,*b,*t; 625f1af5d2fSBarry Smith 626f1af5d2fSBarry Smith PetscFunctionBegin; 6271ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6281ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629f1af5d2fSBarry Smith t = a->solve_work; 630f1af5d2fSBarry Smith 631f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633f1af5d2fSBarry Smith 634f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 635f1af5d2fSBarry Smith ii = 0; 636f1af5d2fSBarry Smith for (i=0; i<n; i++) { 637f1af5d2fSBarry Smith ic = 3*c[i]; 638f1af5d2fSBarry Smith t[ii] = b[ic]; 639f1af5d2fSBarry Smith 
t[ii+1] = b[ic+1]; 640f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 641f1af5d2fSBarry Smith ii += 3; 642f1af5d2fSBarry Smith } 643f1af5d2fSBarry Smith 644f1af5d2fSBarry Smith /* forward solve the U^T */ 645f1af5d2fSBarry Smith idx = 0; 646f1af5d2fSBarry Smith for (i=0; i<n; i++) { 647f1af5d2fSBarry Smith 648f1af5d2fSBarry Smith v = aa + 9*diag[i]; 649f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 650f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654f1af5d2fSBarry Smith v += 9; 655f1af5d2fSBarry Smith 656f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 657f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 658f1af5d2fSBarry Smith while (nz--) { 659f1af5d2fSBarry Smith oidx = 3*(*vi++); 660f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663f1af5d2fSBarry Smith v += 9; 664f1af5d2fSBarry Smith } 665f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666f1af5d2fSBarry Smith idx += 3; 667f1af5d2fSBarry Smith } 668f1af5d2fSBarry Smith /* backward solve the L^T */ 669f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 670f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 671f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 672f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 673f1af5d2fSBarry Smith idt = 3*i; 674f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675f1af5d2fSBarry Smith while (nz--) { 676f1af5d2fSBarry Smith idx = 3*(*vi--); 677f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680f1af5d2fSBarry Smith v -= 9; 681f1af5d2fSBarry Smith } 682f1af5d2fSBarry Smith } 683f1af5d2fSBarry 
Smith 684f1af5d2fSBarry Smith /* copy t into x according to permutation */ 685f1af5d2fSBarry Smith ii = 0; 686f1af5d2fSBarry Smith for (i=0; i<n; i++) { 687f1af5d2fSBarry Smith ir = 3*r[i]; 688f1af5d2fSBarry Smith x[ir] = t[ii]; 689f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 690f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 691f1af5d2fSBarry Smith ii += 3; 692f1af5d2fSBarry Smith } 693f1af5d2fSBarry Smith 694f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6961ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699f1af5d2fSBarry Smith PetscFunctionReturn(0); 700f1af5d2fSBarry Smith } 701f1af5d2fSBarry Smith 7024a2ae208SSatish Balay #undef __FUNCT__ 7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705f1af5d2fSBarry Smith { 706f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7086849ba73SBarry Smith PetscErrorCode ierr; 7095d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 7105d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 712f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 71387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 71487828ca2SBarry Smith PetscScalar *x,*b,*t; 715f1af5d2fSBarry Smith 716f1af5d2fSBarry Smith PetscFunctionBegin; 7171ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719f1af5d2fSBarry Smith t = a->solve_work; 720f1af5d2fSBarry Smith 721f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722f1af5d2fSBarry Smith ierr = 
ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723f1af5d2fSBarry Smith 724f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 725f1af5d2fSBarry Smith ii = 0; 726f1af5d2fSBarry Smith for (i=0; i<n; i++) { 727f1af5d2fSBarry Smith ic = 4*c[i]; 728f1af5d2fSBarry Smith t[ii] = b[ic]; 729f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 730f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 731f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 732f1af5d2fSBarry Smith ii += 4; 733f1af5d2fSBarry Smith } 734f1af5d2fSBarry Smith 735f1af5d2fSBarry Smith /* forward solve the U^T */ 736f1af5d2fSBarry Smith idx = 0; 737f1af5d2fSBarry Smith for (i=0; i<n; i++) { 738f1af5d2fSBarry Smith 739f1af5d2fSBarry Smith v = aa + 16*diag[i]; 740f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 741f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746f1af5d2fSBarry Smith v += 16; 747f1af5d2fSBarry Smith 748f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 749f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 750f1af5d2fSBarry Smith while (nz--) { 751f1af5d2fSBarry Smith oidx = 4*(*vi++); 752f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756f1af5d2fSBarry Smith v += 16; 757f1af5d2fSBarry Smith } 758f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759f1af5d2fSBarry Smith idx += 4; 760f1af5d2fSBarry Smith } 761f1af5d2fSBarry Smith /* backward solve the L^T */ 762f1af5d2fSBarry Smith for (i=n-1; 
i>=0; i--){ 763f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 764f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 765f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 766f1af5d2fSBarry Smith idt = 4*i; 767f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768f1af5d2fSBarry Smith while (nz--) { 769f1af5d2fSBarry Smith idx = 4*(*vi--); 770f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774f1af5d2fSBarry Smith v -= 16; 775f1af5d2fSBarry Smith } 776f1af5d2fSBarry Smith } 777f1af5d2fSBarry Smith 778f1af5d2fSBarry Smith /* copy t into x according to permutation */ 779f1af5d2fSBarry Smith ii = 0; 780f1af5d2fSBarry Smith for (i=0; i<n; i++) { 781f1af5d2fSBarry Smith ir = 4*r[i]; 782f1af5d2fSBarry Smith x[ir] = t[ii]; 783f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 784f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 785f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 786f1af5d2fSBarry Smith ii += 4; 787f1af5d2fSBarry Smith } 788f1af5d2fSBarry Smith 789f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794f1af5d2fSBarry Smith PetscFunctionReturn(0); 795f1af5d2fSBarry Smith } 796f1af5d2fSBarry Smith 7974a2ae208SSatish Balay #undef __FUNCT__ 7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800f1af5d2fSBarry Smith { 801f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802f1af5d2fSBarry Smith IS 
iscol=a->col,isrow=a->row; 8036849ba73SBarry Smith PetscErrorCode ierr; 8045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 807f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 80887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 80987828ca2SBarry Smith PetscScalar *x,*b,*t; 810f1af5d2fSBarry Smith 811f1af5d2fSBarry Smith PetscFunctionBegin; 8121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814f1af5d2fSBarry Smith t = a->solve_work; 815f1af5d2fSBarry Smith 816f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818f1af5d2fSBarry Smith 819f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 820f1af5d2fSBarry Smith ii = 0; 821f1af5d2fSBarry Smith for (i=0; i<n; i++) { 822f1af5d2fSBarry Smith ic = 5*c[i]; 823f1af5d2fSBarry Smith t[ii] = b[ic]; 824f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 825f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 826f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 827f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 828f1af5d2fSBarry Smith ii += 5; 829f1af5d2fSBarry Smith } 830f1af5d2fSBarry Smith 831f1af5d2fSBarry Smith /* forward solve the U^T */ 832f1af5d2fSBarry Smith idx = 0; 833f1af5d2fSBarry Smith for (i=0; i<n; i++) { 834f1af5d2fSBarry Smith 835f1af5d2fSBarry Smith v = aa + 25*diag[i]; 836f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 837f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 
841f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843f1af5d2fSBarry Smith v += 25; 844f1af5d2fSBarry Smith 845f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 846f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 847f1af5d2fSBarry Smith while (nz--) { 848f1af5d2fSBarry Smith oidx = 5*(*vi++); 849f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854f1af5d2fSBarry Smith v += 25; 855f1af5d2fSBarry Smith } 856f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857f1af5d2fSBarry Smith idx += 5; 858f1af5d2fSBarry Smith } 859f1af5d2fSBarry Smith /* backward solve the L^T */ 860f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 861f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 862f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 863f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 864f1af5d2fSBarry Smith idt = 5*i; 865f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866f1af5d2fSBarry Smith while (nz--) { 867f1af5d2fSBarry Smith idx = 5*(*vi--); 868f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873f1af5d2fSBarry Smith v -= 25; 
874f1af5d2fSBarry Smith } 875f1af5d2fSBarry Smith } 876f1af5d2fSBarry Smith 877f1af5d2fSBarry Smith /* copy t into x according to permutation */ 878f1af5d2fSBarry Smith ii = 0; 879f1af5d2fSBarry Smith for (i=0; i<n; i++) { 880f1af5d2fSBarry Smith ir = 5*r[i]; 881f1af5d2fSBarry Smith x[ir] = t[ii]; 882f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 883f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 884f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 885f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 886f1af5d2fSBarry Smith ii += 5; 887f1af5d2fSBarry Smith } 888f1af5d2fSBarry Smith 889f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894f1af5d2fSBarry Smith PetscFunctionReturn(0); 895f1af5d2fSBarry Smith } 896f1af5d2fSBarry Smith 8974a2ae208SSatish Balay #undef __FUNCT__ 8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900f1af5d2fSBarry Smith { 901f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9036849ba73SBarry Smith PetscErrorCode ierr; 9045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 907f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 90887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 90987828ca2SBarry Smith PetscScalar *x,*b,*t; 910f1af5d2fSBarry Smith 911f1af5d2fSBarry Smith PetscFunctionBegin; 9121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
914f1af5d2fSBarry Smith t = a->solve_work; 915f1af5d2fSBarry Smith 916f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918f1af5d2fSBarry Smith 919f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 920f1af5d2fSBarry Smith ii = 0; 921f1af5d2fSBarry Smith for (i=0; i<n; i++) { 922f1af5d2fSBarry Smith ic = 6*c[i]; 923f1af5d2fSBarry Smith t[ii] = b[ic]; 924f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 925f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 926f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 927f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 928f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 929f1af5d2fSBarry Smith ii += 6; 930f1af5d2fSBarry Smith } 931f1af5d2fSBarry Smith 932f1af5d2fSBarry Smith /* forward solve the U^T */ 933f1af5d2fSBarry Smith idx = 0; 934f1af5d2fSBarry Smith for (i=0; i<n; i++) { 935f1af5d2fSBarry Smith 936f1af5d2fSBarry Smith v = aa + 36*diag[i]; 937f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 938f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939f1af5d2fSBarry Smith x6 = t[5+idx]; 940f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946f1af5d2fSBarry Smith v += 36; 947f1af5d2fSBarry Smith 948f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 949f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 950f1af5d2fSBarry Smith while (nz--) { 951f1af5d2fSBarry Smith oidx = 6*(*vi++); 
952f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958f1af5d2fSBarry Smith v += 36; 959f1af5d2fSBarry Smith } 960f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961f1af5d2fSBarry Smith t[5+idx] = s6; 962f1af5d2fSBarry Smith idx += 6; 963f1af5d2fSBarry Smith } 964f1af5d2fSBarry Smith /* backward solve the L^T */ 965f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 966f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 967f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 968f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 969f1af5d2fSBarry Smith idt = 6*i; 970f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971f1af5d2fSBarry Smith s6 = t[5+idt]; 972f1af5d2fSBarry Smith while (nz--) { 973f1af5d2fSBarry Smith idx = 6*(*vi--); 974f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980f1af5d2fSBarry Smith v -= 36; 981f1af5d2fSBarry Smith } 
982f1af5d2fSBarry Smith } 983f1af5d2fSBarry Smith 984f1af5d2fSBarry Smith /* copy t into x according to permutation */ 985f1af5d2fSBarry Smith ii = 0; 986f1af5d2fSBarry Smith for (i=0; i<n; i++) { 987f1af5d2fSBarry Smith ir = 6*r[i]; 988f1af5d2fSBarry Smith x[ir] = t[ii]; 989f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 990f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 991f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 992f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 993f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 994f1af5d2fSBarry Smith ii += 6; 995f1af5d2fSBarry Smith } 996f1af5d2fSBarry Smith 997f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9991ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002f1af5d2fSBarry Smith PetscFunctionReturn(0); 1003f1af5d2fSBarry Smith } 1004f1af5d2fSBarry Smith 10054a2ae208SSatish Balay #undef __FUNCT__ 10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008f1af5d2fSBarry Smith { 1009f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10116849ba73SBarry Smith PetscErrorCode ierr; 10125d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10135d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 101687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 101787828ca2SBarry Smith PetscScalar *x,*b,*t; 1018f1af5d2fSBarry Smith 1019f1af5d2fSBarry Smith PetscFunctionBegin; 10201ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10211ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 1022f1af5d2fSBarry Smith t = a->solve_work; 1023f1af5d2fSBarry Smith 1024f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026f1af5d2fSBarry Smith 1027f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1028f1af5d2fSBarry Smith ii = 0; 1029f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1030f1af5d2fSBarry Smith ic = 7*c[i]; 1031f1af5d2fSBarry Smith t[ii] = b[ic]; 1032f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1033f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1034f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1035f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1036f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1037f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1038f1af5d2fSBarry Smith ii += 7; 1039f1af5d2fSBarry Smith } 1040f1af5d2fSBarry Smith 1041f1af5d2fSBarry Smith /* forward solve the U^T */ 1042f1af5d2fSBarry Smith idx = 0; 1043f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1044f1af5d2fSBarry Smith 1045f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1046f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1047f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1049f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055f1af5d2fSBarry Smith s7 = v[42]*x1 + 
v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056f1af5d2fSBarry Smith v += 49; 1057f1af5d2fSBarry Smith 1058f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1059f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1060f1af5d2fSBarry Smith while (nz--) { 1061f1af5d2fSBarry Smith oidx = 7*(*vi++); 1062f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069f1af5d2fSBarry Smith v += 49; 1070f1af5d2fSBarry Smith } 1071f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1073f1af5d2fSBarry Smith idx += 7; 1074f1af5d2fSBarry Smith } 1075f1af5d2fSBarry Smith /* backward solve the L^T */ 1076f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1077f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1078f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1079f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1080f1af5d2fSBarry Smith idt = 7*i; 1081f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1083f1af5d2fSBarry Smith while (nz--) { 1084f1af5d2fSBarry Smith idx = 7*(*vi--); 1085f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 
1086f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092f1af5d2fSBarry Smith v -= 49; 1093f1af5d2fSBarry Smith } 1094f1af5d2fSBarry Smith } 1095f1af5d2fSBarry Smith 1096f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1097f1af5d2fSBarry Smith ii = 0; 1098f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1099f1af5d2fSBarry Smith ir = 7*r[i]; 1100f1af5d2fSBarry Smith x[ir] = t[ii]; 1101f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1102f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1103f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1104f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1105f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1106f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1107f1af5d2fSBarry Smith ii += 7; 1108f1af5d2fSBarry Smith } 1109f1af5d2fSBarry Smith 1110f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11121ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115f1af5d2fSBarry Smith PetscFunctionReturn(0); 1116f1af5d2fSBarry Smith } 1117f1af5d2fSBarry Smith 11184e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11194a2ae208SSatish Balay #undef __FUNCT__ 
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11224e2b4712SSatish Balay { 11234e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11244e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11256849ba73SBarry Smith PetscErrorCode ierr; 11265d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11275d0c19d7SBarry Smith PetscInt i,n=a->mbs; 11285d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11293f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 113087828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11314e2b4712SSatish Balay 11324e2b4712SSatish Balay PetscFunctionBegin; 11331ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135f1af5d2fSBarry Smith t = a->solve_work; 11364e2b4712SSatish Balay 11374e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11384e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11394e2b4712SSatish Balay 11404e2b4712SSatish Balay /* forward solve the lower triangular */ 114187828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11424e2b4712SSatish Balay for (i=1; i<n; i++) { 11434e2b4712SSatish Balay v = aa + bs2*ai[i]; 11444e2b4712SSatish Balay vi = aj + ai[i]; 11454e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1146f1af5d2fSBarry Smith s = t + bs*i; 114787828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11484e2b4712SSatish Balay while (nz--) { 1149f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11504e2b4712SSatish Balay v += bs2; 11514e2b4712SSatish Balay } 11524e2b4712SSatish Balay } 11534e2b4712SSatish Balay /* backward solve the upper triangular */ 1154d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 11554e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 
11564e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11574e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11584e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 115987828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11604e2b4712SSatish Balay while (nz--) { 1161f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11624e2b4712SSatish Balay v += bs2; 11634e2b4712SSatish Balay } 1164f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 116587828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11664e2b4712SSatish Balay } 11674e2b4712SSatish Balay 11684e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11694e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11701ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 11734e2b4712SSatish Balay PetscFunctionReturn(0); 11744e2b4712SSatish Balay } 11754e2b4712SSatish Balay 11764a2ae208SSatish Balay #undef __FUNCT__ 11774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1178dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11794e2b4712SSatish Balay { 11804e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11814e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11826849ba73SBarry Smith PetscErrorCode ierr; 11835d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 11845d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 11853f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 118687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 118787828ca2SBarry Smith PetscScalar *x,*b,*t; 11884e2b4712SSatish Balay 11894e2b4712SSatish Balay PetscFunctionBegin; 11901ebc52fbSHong 
Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1192f1af5d2fSBarry Smith t = a->solve_work; 11934e2b4712SSatish Balay 11944e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11954e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11964e2b4712SSatish Balay 11974e2b4712SSatish Balay /* forward solve the lower triangular */ 11984e2b4712SSatish Balay idx = 7*(*r++); 1199f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1200f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1201f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 12024e2b4712SSatish Balay 12034e2b4712SSatish Balay for (i=1; i<n; i++) { 12044e2b4712SSatish Balay v = aa + 49*ai[i]; 12054e2b4712SSatish Balay vi = aj + ai[i]; 12064e2b4712SSatish Balay nz = diag[i] - ai[i]; 12074e2b4712SSatish Balay idx = 7*(*r++); 1208f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1209f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 12104e2b4712SSatish Balay while (nz--) { 12114e2b4712SSatish Balay idx = 7*(*vi++); 1212f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1213f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1214f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1215f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1216f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1217f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1218f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1219f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1220f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1221f1af5d2fSBarry 
Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12224e2b4712SSatish Balay v += 49; 12234e2b4712SSatish Balay } 12244e2b4712SSatish Balay idx = 7*i; 1225f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1226f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1227f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12284e2b4712SSatish Balay } 12294e2b4712SSatish Balay /* backward solve the upper triangular */ 12304e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12314e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12324e2b4712SSatish Balay vi = aj + diag[i] + 1; 12334e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12344e2b4712SSatish Balay idt = 7*i; 1235f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1236f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1237f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12384e2b4712SSatish Balay while (nz--) { 12394e2b4712SSatish Balay idx = 7*(*vi++); 1240f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1241f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1243f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1244f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1245f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1246f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1247f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1248f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1249f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12504e2b4712SSatish Balay v += 49; 12514e2b4712SSatish Balay } 12524e2b4712SSatish Balay idc = 7*(*c--); 
12534e2b4712SSatish Balay v = aa + 49*diag[i]; 1254f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1255f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1256f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1257f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1258f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1259f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1260f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1261f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1262f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1263f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1264f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1265f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1266f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1267f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12684e2b4712SSatish Balay } 12694e2b4712SSatish Balay 12704e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12714e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12721ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12731ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1274dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 12754e2b4712SSatish Balay PetscFunctionReturn(0); 12764e2b4712SSatish Balay }

/*
   MatSolve_SeqBAIJ_7_newdatastruct - Forward and backward block-triangular
   solve for block size 7, applying the row index set a->row to the
   right-hand side and the column index set a->col to the solution.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a) holds the factor blocks
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector

   Notes:
   Each block is 49 scalars addressed as v[row + 7*col].
   The blocks of the lower factor for block row i are ai[i] .. ai[i+1]-1.
   The blocks of the upper factor for block row i are reached through
   k = 2*n-i: ai[k] .. ai[k+1]-1.  The last block of that range is not
   subtracted; it is the multiplier applied to the row at the end.
   The intermediate vector lives in a->solve_work.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
  PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* forward solve the lower triangular */
  /* block row 0 has nothing to subtract: copy the permuted right-hand side */
  idx  = 7*r[0];
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
  t[5] = b[5+idx]; t[6] = b[6+idx];

  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 7*r[i];
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
    for(m=0;m<nz;m++){
      idx = 7*vi[m];
      x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
      x4  = t[3+idx];x5 = t[4+idx];
      x6  = t[5+idx];x7 = t[6+idx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v += 49;
    }
    idx = 7*i;
    t[idx]   = s1;t[1+idx] = s2;
    t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
    t[5+idx] = s6;t[6+idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    k    = 2*n-i;
    v    = aa + 49*ai[k];
    vi   = aj + ai[k];
    nz   = ai[k+1] - ai[k] - 1;
    idt  = 7*i;
    s1 = t[idt];  s2 = t[1+idt];
    s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
    s6 = t[5+idt];s7 = t[6+idt];
    for(m=0;m<nz;m++){
      idx   = 7*vi[m];
      x1 = t[idx];   x2 = t[1+idx];
      x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
      x6 = t[5+idx]; x7 = t[6+idx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v += 49;
    }
    /* v now points at the last block of the range; apply it and scatter through c */
    idc = 7*c[i];
    x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
                          v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
    x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
                          v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
    x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
                          v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
    x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
                          v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
    x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
                          v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
    x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
                          v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
    x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
                          v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_7_NaturalOrdering - Forward and backward block-triangular
   solve for block size 7 with no row/column index sets: block row i of bb
   maps directly to block row i of xx, and xx itself is the work vector.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) holds the factor blocks
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector

   Notes:
   Each block is 49 scalars addressed as v[row + 7*col].
   For block row i the lower part is ai[i] .. diag[i]-1, the block at diag[i]
   is the multiplier applied at the end of the backward sweep, and the upper
   part is diag[i]+1 .. ai[i+1]-1.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscErrorCode    ierr;
  PetscInt          *diag = a->diag,jdx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
  x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
  x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 7*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
    s4  = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    s7  = b[6+idx];
    while (nz--) {
      jdx = 7*(*vi++);
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
      x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
      x7  = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v += 49;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 49*diag[i] + 49;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 7*i;
    s1  = x[idt];   s2 = x[1+idt];
    s3  = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    s7  = x[6+idt];
    while (nz--) {
      idx = 7*(*vi++);
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
      x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
      x7  = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v += 49;
    }
    /* apply the block stored at diag[i] */
    v        = aa + 49*diag[i];
    x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
                 + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
                 + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
                 + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
                 + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
                 + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
                 + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
                 + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* 7x7 blocks: 49 multiply-adds per block and 7 rows per block row,
     the same formula the other block-size-7 solves in this file log */
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct - Forward and backward
   block-triangular solve for block size 7 with no row/column index sets,
   reading the upper factor through a->i alone.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a) holds the factor blocks
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector (also used as the work vector)

   Notes:
   Block strides come from A->rmap->bs and a->bs2, but the arithmetic is
   unrolled for 7x7 blocks addressed as v[row + 7*col].
   The lower part of block row i is ai[i] .. ai[i+1]-1.  The upper part of
   block row i is ai[2*n-i] .. ai[2*n-i+1]-1; the last block of that range is
   the multiplier applied at the end (called inv_diagonal below).
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
  PetscErrorCode    ierr;
  PetscInt          idx,jdx,idt;
  PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
  x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = bs*i;
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
    for(k=0;k<nz;k++) {
      jdx = bs*vi[k];
      x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
      x5  = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v += bs2;
    }

    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + bs2*ai[2*n-i];
    vi  = aj + ai[2*n-i];
    nz  = ai[2*n-i +1] - ai[2*n-i]-1;
    idt = bs*i;
    s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
    s5  = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
    for(k=0;k<nz;k++) {
      idx = bs*vi[k];
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
      x5  = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v += bs2;
    }
    /* x = inv_diagonal*x */
    x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2 - Same solve as
   MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct, but the upper factor is
   located through a->diag instead of the upper half of a->i.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) holds the factor blocks
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector (also used as the work vector)

   Notes:
   The lower part of block row i is ai[i] .. ai[i+1]-1.  The upper part of
   block row i is adiag[i+1]+1 .. adiag[i]-1, so after the inner loop v sits
   on the block at adiag[i], which is the multiplier applied at the end
   (called inv_diagonal below).  adiag[n] is read when i = n-1.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
  PetscErrorCode    ierr;
  PetscInt          idx,jdx,idt;
  PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
  x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = bs*i;
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
    for(k=0;k<nz;k++) {
      jdx = bs*vi[k];
      x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
      x5  = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v += bs2;
    }

    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + bs2*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1]-1;
    idt = bs*i;
    s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
    s5  = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
    for(k=0;k<nz;k++) {
      idx = bs*vi[k];
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
      x5  = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v += bs2;
    }
    /* x = inv_diagonal*x */
    x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6 - Forward and backward block-triangular solve for block
   size 6, applying the row index set a->row to the right-hand side and the
   column index set a->col to the solution.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) holds the factor blocks
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector

   Notes:
   Each block is 36 scalars addressed as v[row + 6*col].
   For block row i the lower part is ai[i] .. diag[i]-1, the block at diag[i]
   is the multiplier applied at the end of the backward sweep, and the upper
   part is diag[i]+1 .. ai[i+1]-1.
   r walks the row indices forward; c starts at the last column index and
   walks backward in step with the backward sweep.
   The intermediate vector lives in a->solve_work.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6"
PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  const PetscInt    *r,*c,*rout,*cout;
  PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
  const PetscScalar *b;
  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  idx  = 6*(*r++);
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx];
  t[4] = b[4+idx]; t[5] = b[5+idx];
  for (i=1; i<n; i++) {
    v   = aa + 36*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 6*(*r++);
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx]; s6 = b[5+idx];
    while (nz--) {
      idx = 6*(*vi++);
      x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
      x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v += 36;
    }
    idx = 6*i;
    t[idx]   = s1;t[1+idx] = s2;
    t[2+idx] = s3;t[3+idx] = s4;
    t[4+idx] = s5;t[5+idx] = s6;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v    = aa + 36*diag[i] + 36;
    vi   = aj + diag[i] + 1;
    nz   = ai[i+1] - diag[i] - 1;
    idt  = 6*i;
    s1 = t[idt];  s2 = t[1+idt];
    s3 = t[2+idt];s4 = t[3+idt];
    s5 = t[4+idt];s6 = t[5+idt];
    while (nz--) {
      idx   = 6*(*vi++);
      x1    = t[idx];   x2 = t[1+idx];
      x3    = t[2+idx]; x4 = t[3+idx];
      x5    = t[4+idx]; x6 = t[5+idx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v += 36;
    }
    /* apply the block stored at diag[i] and scatter through the column indices */
    idc = 6*(*c--);
    v   = aa + 36*diag[i];
    x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
                          v[18]*s4+v[24]*s5+v[30]*s6;
    x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
                          v[19]*s4+v[25]*s5+v[31]*s6;
    x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
                          v[20]*s4+v[26]*s5+v[32]*s6;
    x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
                          v[21]*s4+v[27]*s5+v[33]*s6;
    x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
                          v[22]*s4+v[28]*s5+v[34]*s6;
    x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
                          v[23]*s4+v[29]*s5+v[35]*s6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6_newdatastruct - Forward and backward block-triangular
   solve for block size 6 with row/column index sets, reading the upper
   factor through a->i alone.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a) holds the factor blocks
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector

   Notes:
   Each block is 36 scalars addressed as v[row + 6*col].
   The lower part of block row i is ai[i] .. ai[i+1]-1.  The upper part of
   block row i is reached through k = 2*n-i: ai[k] .. ai[k+1]-1.  The last
   block of that range is not subtracted; it is the multiplier applied to
   the row at the end.
   The intermediate vector lives in a->solve_work.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  const PetscInt    *r,*c,*rout,*cout;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
  const PetscScalar *b;
  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* forward solve the lower triangular */
  idx  = 6*r[0];
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx];
  t[4] = b[4+idx]; t[5] = b[5+idx];
  for (i=1; i<n; i++) {
    v   = aa + 36*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 6*r[i];
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx]; s6 = b[5+idx];
    for(m=0;m<nz;m++){
      idx = 6*vi[m];
      x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
      x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v += 36;
    }
    idx = 6*i;
    t[idx]   = s1;t[1+idx] = s2;
    t[2+idx] = s3;t[3+idx] = s4;
    t[4+idx] = s5;t[5+idx] = s6;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    k    = 2*n-i;
    v    = aa + 36*ai[k];
    vi   = aj + ai[k];
    nz   = ai[k+1] - ai[k] - 1;
    idt  = 6*i;
    s1 = t[idt];  s2 = t[1+idt];
    s3 = t[2+idt];s4 = t[3+idt];
    s5 = t[4+idt];s6 = t[5+idt];
    for(m=0;m<nz;m++){
      idx   = 6*vi[m];
      x1    = t[idx];   x2 = t[1+idx];
      x3    = t[2+idx]; x4 = t[3+idx];
      x5    = t[4+idx]; x6 = t[5+idx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v += 36;
    }
    /* v now points at the last block of the range; apply it and scatter through c */
    idc = 6*c[i];
    x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
                          v[18]*s4+v[24]*s5+v[30]*s6;
    x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
                          v[19]*s4+v[25]*s5+v[31]*s6;
    x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
                          v[20]*s4+v[26]*s5+v[32]*s6;
    x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
                          v[21]*s4+v[27]*s5+v[33]*s6;
    x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
                          v[22]*s4+v[28]*s5+v[34]*s6;
    x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
                          v[23]*s4+v[29]*s5+v[35]*s6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

1843*6506fda5SShri Abhyankar #undef __FUNCT__ 1844*6506fda5SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2" 1845*6506fda5SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx) 1846*6506fda5SShri Abhyankar { 
1847*6506fda5SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1848*6506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 1849*6506fda5SShri Abhyankar PetscErrorCode ierr; 1850*6506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 1851*6506fda5SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 1852*6506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 1853*6506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1854*6506fda5SShri Abhyankar const PetscScalar *b; 1855*6506fda5SShri Abhyankar PetscFunctionBegin; 1856*6506fda5SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1857*6506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1858*6506fda5SShri Abhyankar t = a->solve_work; 1859*6506fda5SShri Abhyankar 1860*6506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1861*6506fda5SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1862*6506fda5SShri Abhyankar 1863*6506fda5SShri Abhyankar /* forward solve the lower triangular */ 1864*6506fda5SShri Abhyankar idx = 6*r[0]; 1865*6506fda5SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 1866*6506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 1867*6506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 1868*6506fda5SShri Abhyankar for (i=1; i<n; i++) { 1869*6506fda5SShri Abhyankar v = aa + 36*ai[i]; 1870*6506fda5SShri Abhyankar vi = aj + ai[i]; 1871*6506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 1872*6506fda5SShri Abhyankar idx = 6*r[i]; 1873*6506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1874*6506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 1875*6506fda5SShri Abhyankar for(m=0;m<nz;m++){ 1876*6506fda5SShri Abhyankar idx = 6*vi[m]; 1877*6506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1878*6506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1879*6506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + 
v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1880*6506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1881*6506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1882*6506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1883*6506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1884*6506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1885*6506fda5SShri Abhyankar v += 36; 1886*6506fda5SShri Abhyankar } 1887*6506fda5SShri Abhyankar idx = 6*i; 1888*6506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 1889*6506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 1890*6506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 1891*6506fda5SShri Abhyankar } 1892*6506fda5SShri Abhyankar /* backward solve the upper triangular */ 1893*6506fda5SShri Abhyankar for (i=n-1; i>=0; i--){ 1894*6506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 1895*6506fda5SShri Abhyankar vi = aj + adiag[i+1]+1; 1896*6506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 1897*6506fda5SShri Abhyankar idt = 6*i; 1898*6506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 1899*6506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 1900*6506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 1901*6506fda5SShri Abhyankar for(m=0;m<nz;m++){ 1902*6506fda5SShri Abhyankar idx = 6*vi[m]; 1903*6506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 1904*6506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 1905*6506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 1906*6506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1907*6506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1908*6506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1909*6506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + 
v[27]*x5 + v[33]*x6; 1910*6506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1911*6506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1912*6506fda5SShri Abhyankar v += 36; 1913*6506fda5SShri Abhyankar } 1914*6506fda5SShri Abhyankar idc = 6*c[i]; 1915*6506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1916*6506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 1917*6506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1918*6506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 1919*6506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1920*6506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 1921*6506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1922*6506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 1923*6506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1924*6506fda5SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 1925*6506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1926*6506fda5SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 1927*6506fda5SShri Abhyankar } 1928*6506fda5SShri Abhyankar 1929*6506fda5SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1930*6506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1931*6506fda5SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1932*6506fda5SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1933*6506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1934*6506fda5SShri Abhyankar PetscFunctionReturn(0); 1935*6506fda5SShri Abhyankar } 19368f690400SShri Abhyankar 19378f690400SShri Abhyankar #undef __FUNCT__ 19384a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1939dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 194015091d37SBarry Smith { 194115091d37SBarry 
Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1942690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1943dfbe8321SBarry Smith PetscErrorCode ierr; 1944690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1945d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1946d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1947d9fead3dSBarry Smith const PetscScalar *b; 194815091d37SBarry Smith 194915091d37SBarry Smith PetscFunctionBegin; 1950d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19511ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195215091d37SBarry Smith /* forward solve the lower triangular */ 195315091d37SBarry Smith idx = 0; 195415091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 195515091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 195615091d37SBarry Smith for (i=1; i<n; i++) { 195715091d37SBarry Smith v = aa + 36*ai[i]; 195815091d37SBarry Smith vi = aj + ai[i]; 195915091d37SBarry Smith nz = diag[i] - ai[i]; 196015091d37SBarry Smith idx = 6*i; 1961f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1962f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 196315091d37SBarry Smith while (nz--) { 196415091d37SBarry Smith jdx = 6*(*vi++); 196515091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 196615091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1967f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1968f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1969f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1970f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1971f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1972f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 
+ v[35]*x6; 197315091d37SBarry Smith v += 36; 197415091d37SBarry Smith } 1975f1af5d2fSBarry Smith x[idx] = s1; 1976f1af5d2fSBarry Smith x[1+idx] = s2; 1977f1af5d2fSBarry Smith x[2+idx] = s3; 1978f1af5d2fSBarry Smith x[3+idx] = s4; 1979f1af5d2fSBarry Smith x[4+idx] = s5; 1980f1af5d2fSBarry Smith x[5+idx] = s6; 198115091d37SBarry Smith } 198215091d37SBarry Smith /* backward solve the upper triangular */ 198315091d37SBarry Smith for (i=n-1; i>=0; i--){ 198415091d37SBarry Smith v = aa + 36*diag[i] + 36; 198515091d37SBarry Smith vi = aj + diag[i] + 1; 198615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 198715091d37SBarry Smith idt = 6*i; 1988f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1989f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1990f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 199115091d37SBarry Smith while (nz--) { 199215091d37SBarry Smith idx = 6*(*vi++); 199315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 199415091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1995f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1996f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1997f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1998f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1999f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2000f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 200115091d37SBarry Smith v += 36; 200215091d37SBarry Smith } 200315091d37SBarry Smith v = aa + 36*diag[i]; 2004f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2005f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2006f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 
2007f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2008f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2009f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 201015091d37SBarry Smith } 201115091d37SBarry Smith 2012d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2014dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 201515091d37SBarry Smith PetscFunctionReturn(0); 201615091d37SBarry Smith } 201715091d37SBarry Smith 20184a2ae208SSatish Balay #undef __FUNCT__ 2019cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2020cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2021cee9d6f2SShri Abhyankar { 2022cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 20236464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2024cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2025cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 2026cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2027cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2028cee9d6f2SShri Abhyankar PetscScalar *x; 2029cee9d6f2SShri Abhyankar const PetscScalar *b; 2030cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2031cee9d6f2SShri Abhyankar 2032cee9d6f2SShri Abhyankar PetscFunctionBegin; 2033cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2034cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2035cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2036cee9d6f2SShri Abhyankar idx = 0; 2037cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2038cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] 
= b[5+idx]; 2039cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2040cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 2041cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2042cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2043cee9d6f2SShri Abhyankar idx = bs*i; 2044cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2045cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 20466464896eSShri Abhyankar for(k=0;k<nz;k++){ 20476464896eSShri Abhyankar jdx = bs*vi[k]; 2048cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2049cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 2050cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2051cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2052cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2053cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2054cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2055cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2056cee9d6f2SShri Abhyankar v += bs2; 2057cee9d6f2SShri Abhyankar } 2058cee9d6f2SShri Abhyankar 2059cee9d6f2SShri Abhyankar x[idx] = s1; 2060cee9d6f2SShri Abhyankar x[1+idx] = s2; 2061cee9d6f2SShri Abhyankar x[2+idx] = s3; 2062cee9d6f2SShri Abhyankar x[3+idx] = s4; 2063cee9d6f2SShri Abhyankar x[4+idx] = s5; 2064cee9d6f2SShri Abhyankar x[5+idx] = s6; 2065cee9d6f2SShri Abhyankar } 2066cee9d6f2SShri Abhyankar 2067cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2068cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2069cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 2070cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2071cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2072cee9d6f2SShri Abhyankar idt = bs*i; 2073cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = 
x[2+idt];s4 = x[3+idt]; 2074cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 20756464896eSShri Abhyankar for(k=0;k<nz;k++){ 20766464896eSShri Abhyankar idx = bs*vi[k]; 2077cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2078cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 2079cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2080cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2081cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2082cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2083cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2084cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2085cee9d6f2SShri Abhyankar v += bs2; 2086cee9d6f2SShri Abhyankar } 2087cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2088cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2089cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2090cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2091cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2092cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2093cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2094cee9d6f2SShri Abhyankar } 2095cee9d6f2SShri Abhyankar 2096cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2097cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2098cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2099cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 
2100cee9d6f2SShri Abhyankar } 21018f690400SShri Abhyankar 2102cee9d6f2SShri Abhyankar #undef __FUNCT__ 210353cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2" 210453cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 210553cca76cSShri Abhyankar { 210653cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 210753cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 210853cca76cSShri Abhyankar PetscErrorCode ierr; 210953cca76cSShri Abhyankar PetscInt idx,jdx,idt; 211053cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 211153cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 211253cca76cSShri Abhyankar PetscScalar *x; 211353cca76cSShri Abhyankar const PetscScalar *b; 211453cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 211553cca76cSShri Abhyankar 211653cca76cSShri Abhyankar PetscFunctionBegin; 211753cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 211853cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 211953cca76cSShri Abhyankar /* forward solve the lower triangular */ 212053cca76cSShri Abhyankar idx = 0; 212153cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 212253cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 212353cca76cSShri Abhyankar for (i=1; i<n; i++) { 212453cca76cSShri Abhyankar v = aa + bs2*ai[i]; 212553cca76cSShri Abhyankar vi = aj + ai[i]; 212653cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 212753cca76cSShri Abhyankar idx = bs*i; 212853cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 212953cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 213053cca76cSShri Abhyankar for(k=0;k<nz;k++){ 213153cca76cSShri Abhyankar jdx = bs*vi[k]; 213253cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 213353cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 
213453cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 213553cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 213653cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 213753cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 213853cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 213953cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 214053cca76cSShri Abhyankar v += bs2; 214153cca76cSShri Abhyankar } 214253cca76cSShri Abhyankar 214353cca76cSShri Abhyankar x[idx] = s1; 214453cca76cSShri Abhyankar x[1+idx] = s2; 214553cca76cSShri Abhyankar x[2+idx] = s3; 214653cca76cSShri Abhyankar x[3+idx] = s4; 214753cca76cSShri Abhyankar x[4+idx] = s5; 214853cca76cSShri Abhyankar x[5+idx] = s6; 214953cca76cSShri Abhyankar } 215053cca76cSShri Abhyankar 215153cca76cSShri Abhyankar /* backward solve the upper triangular */ 215253cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 215353cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 215453cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 215553cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 215653cca76cSShri Abhyankar idt = bs*i; 215753cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 215853cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 215953cca76cSShri Abhyankar for(k=0;k<nz;k++){ 216053cca76cSShri Abhyankar idx = bs*vi[k]; 216153cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 216253cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 216353cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 216453cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 216553cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 
216653cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 216753cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 216853cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 216953cca76cSShri Abhyankar v += bs2; 217053cca76cSShri Abhyankar } 217153cca76cSShri Abhyankar /* x = inv_diagonal*x */ 217253cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 217353cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 217453cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 217553cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 217653cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 217753cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 217853cca76cSShri Abhyankar } 217953cca76cSShri Abhyankar 218053cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 218153cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 218253cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 218353cca76cSShri Abhyankar PetscFunctionReturn(0); 218453cca76cSShri Abhyankar } 218553cca76cSShri Abhyankar 218653cca76cSShri Abhyankar #undef __FUNCT__ 21874a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2188dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 21894e2b4712SSatish Balay { 21904e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21914e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 21926849ba73SBarry Smith PetscErrorCode ierr; 21935d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 21945d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 
2195d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2196d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2197d9fead3dSBarry Smith const PetscScalar *b; 21984e2b4712SSatish Balay 21994e2b4712SSatish Balay PetscFunctionBegin; 2200d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2202f1af5d2fSBarry Smith t = a->solve_work; 22034e2b4712SSatish Balay 22044e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 22054e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 22064e2b4712SSatish Balay 22074e2b4712SSatish Balay /* forward solve the lower triangular */ 22084e2b4712SSatish Balay idx = 5*(*r++); 2209f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2210f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 22114e2b4712SSatish Balay for (i=1; i<n; i++) { 22124e2b4712SSatish Balay v = aa + 25*ai[i]; 22134e2b4712SSatish Balay vi = aj + ai[i]; 22144e2b4712SSatish Balay nz = diag[i] - ai[i]; 22154e2b4712SSatish Balay idx = 5*(*r++); 2216f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2217f1af5d2fSBarry Smith s5 = b[4+idx]; 22184e2b4712SSatish Balay while (nz--) { 22194e2b4712SSatish Balay idx = 5*(*vi++); 2220f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2221f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2222f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2223f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2224f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2225f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2226f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 22274e2b4712SSatish Balay v += 25; 22284e2b4712SSatish Balay } 22294e2b4712SSatish Balay idx = 5*i; 2230f1af5d2fSBarry Smith 
t[idx] = s1;t[1+idx] = s2; 2231f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 22324e2b4712SSatish Balay } 22334e2b4712SSatish Balay /* backward solve the upper triangular */ 22344e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 22354e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 22364e2b4712SSatish Balay vi = aj + diag[i] + 1; 22374e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 22384e2b4712SSatish Balay idt = 5*i; 2239f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2240f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 22414e2b4712SSatish Balay while (nz--) { 22424e2b4712SSatish Balay idx = 5*(*vi++); 2243f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2244f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2245f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2246f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2247f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2248f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2249f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 22504e2b4712SSatish Balay v += 25; 22514e2b4712SSatish Balay } 22524e2b4712SSatish Balay idc = 5*(*c--); 22534e2b4712SSatish Balay v = aa + 25*diag[i]; 2254f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2255f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 2256f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2257f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 2258f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2259f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 2260f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2261f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 2262f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2263f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 22644e2b4712SSatish Balay } 22654e2b4712SSatish Balay 22664e2b4712SSatish Balay ierr = 
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22674e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2268d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22691ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2270dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 22714e2b4712SSatish Balay PetscFunctionReturn(0); 22724e2b4712SSatish Balay } 22734e2b4712SSatish Balay 22744a2ae208SSatish Balay #undef __FUNCT__ 22758f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 22768f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 22778f690400SShri Abhyankar { 22788f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 22798f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 22808f690400SShri Abhyankar PetscErrorCode ierr; 22818f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 228229b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 22838f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 22848f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 22858f690400SShri Abhyankar const PetscScalar *b; 22868f690400SShri Abhyankar 22878f690400SShri Abhyankar PetscFunctionBegin; 22888f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22898f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22908f690400SShri Abhyankar t = a->solve_work; 22918f690400SShri Abhyankar 22928f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 229329b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22948f690400SShri Abhyankar 22958f690400SShri Abhyankar /* forward solve the lower triangular */ 229629b92fc1SShri Abhyankar idx = 5*r[0]; 22978f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 22988f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 
22998f690400SShri Abhyankar for (i=1; i<n; i++) { 23008f690400SShri Abhyankar v = aa + 25*ai[i]; 23018f690400SShri Abhyankar vi = aj + ai[i]; 23028f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 230329b92fc1SShri Abhyankar idx = 5*r[i]; 23048f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 23058f690400SShri Abhyankar s5 = b[4+idx]; 230629b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 230729b92fc1SShri Abhyankar idx = 5*vi[m]; 23088f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 23098f690400SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 23108f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 23118f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 23128f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 23138f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 23148f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 23158f690400SShri Abhyankar v += 25; 23168f690400SShri Abhyankar } 23178f690400SShri Abhyankar idx = 5*i; 23188f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 23198f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 23208f690400SShri Abhyankar } 23218f690400SShri Abhyankar /* backward solve the upper triangular */ 23228f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 23238f690400SShri Abhyankar k = 2*n-i; 23248f690400SShri Abhyankar v = aa + 25*ai[k]; 23258f690400SShri Abhyankar vi = aj + ai[k]; 23268f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 23278f690400SShri Abhyankar idt = 5*i; 23288f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 23298f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 233029b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 233129b92fc1SShri Abhyankar idx = 5*vi[m]; 23328f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 23338f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 23348f690400SShri Abhyankar s1 -= 
v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 23358f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 23368f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 23378f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 23388f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 23398f690400SShri Abhyankar v += 25; 23408f690400SShri Abhyankar } 234129b92fc1SShri Abhyankar idc = 5*c[i]; 23428f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 23438f690400SShri Abhyankar v[15]*s4+v[20]*s5; 23448f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 23458f690400SShri Abhyankar v[16]*s4+v[21]*s5; 23468f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 23478f690400SShri Abhyankar v[17]*s4+v[22]*s5; 23488f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 23498f690400SShri Abhyankar v[18]*s4+v[23]*s5; 23508f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 23518f690400SShri Abhyankar v[19]*s4+v[24]*s5; 23528f690400SShri Abhyankar } 23538f690400SShri Abhyankar 23548f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 23558f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 23568f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23578f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 23588f690400SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 23598f690400SShri Abhyankar PetscFunctionReturn(0); 23608f690400SShri Abhyankar } 236178bb4007SShri Abhyankar 236278bb4007SShri Abhyankar #undef __FUNCT__ 236378bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2" 236478bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx) 236578bb4007SShri Abhyankar { 236678bb4007SShri Abhyankar 
Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 236778bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 236878bb4007SShri Abhyankar PetscErrorCode ierr; 236978bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 237078bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 237178bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 237278bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 237378bb4007SShri Abhyankar const PetscScalar *b; 237478bb4007SShri Abhyankar 237578bb4007SShri Abhyankar PetscFunctionBegin; 237678bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 237778bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 237878bb4007SShri Abhyankar t = a->solve_work; 237978bb4007SShri Abhyankar 238078bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 238178bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 238278bb4007SShri Abhyankar 238378bb4007SShri Abhyankar /* forward solve the lower triangular */ 238478bb4007SShri Abhyankar idx = 5*r[0]; 238578bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 238678bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 238778bb4007SShri Abhyankar for (i=1; i<n; i++) { 238878bb4007SShri Abhyankar v = aa + 25*ai[i]; 238978bb4007SShri Abhyankar vi = aj + ai[i]; 239078bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 239178bb4007SShri Abhyankar idx = 5*r[i]; 239278bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 239378bb4007SShri Abhyankar s5 = b[4+idx]; 239478bb4007SShri Abhyankar for(m=0;m<nz;m++){ 239578bb4007SShri Abhyankar idx = 5*vi[m]; 239678bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 239778bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 239878bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 239978bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 
240078bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 240178bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 240278bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 240378bb4007SShri Abhyankar v += 25; 240478bb4007SShri Abhyankar } 240578bb4007SShri Abhyankar idx = 5*i; 240678bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 240778bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 240878bb4007SShri Abhyankar } 240978bb4007SShri Abhyankar /* backward solve the upper triangular */ 241078bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 241178bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 241278bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 241378bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 241478bb4007SShri Abhyankar idt = 5*i; 241578bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 241678bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 241778bb4007SShri Abhyankar for(m=0;m<nz;m++){ 241878bb4007SShri Abhyankar idx = 5*vi[m]; 241978bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 242078bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 242178bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 242278bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 242378bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 242478bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 242578bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 242678bb4007SShri Abhyankar v += 25; 242778bb4007SShri Abhyankar } 242878bb4007SShri Abhyankar idc = 5*c[i]; 242978bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 243078bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 243178bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 243278bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 243378bb4007SShri 
Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 243478bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 243578bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 243678bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 243778bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 243878bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 243978bb4007SShri Abhyankar } 244078bb4007SShri Abhyankar 244178bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 244278bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 244378bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 244478bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 244578bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 244678bb4007SShri Abhyankar PetscFunctionReturn(0); 244778bb4007SShri Abhyankar } 244878bb4007SShri Abhyankar 24498f690400SShri Abhyankar #undef __FUNCT__ 24504a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2451dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 245215091d37SBarry Smith { 245315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2454690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2455dfbe8321SBarry Smith PetscErrorCode ierr; 2456690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2457d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2458d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2459d9fead3dSBarry Smith const PetscScalar *b; 246015091d37SBarry Smith 246115091d37SBarry Smith PetscFunctionBegin; 2462d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24631ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 246415091d37SBarry Smith /* forward solve the lower triangular */ 246515091d37SBarry Smith idx = 0; 246615091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] 
= b[3+idx];x[4] = b[4+idx]; 246715091d37SBarry Smith for (i=1; i<n; i++) { 246815091d37SBarry Smith v = aa + 25*ai[i]; 246915091d37SBarry Smith vi = aj + ai[i]; 247015091d37SBarry Smith nz = diag[i] - ai[i]; 247115091d37SBarry Smith idx = 5*i; 2472f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 247315091d37SBarry Smith while (nz--) { 247415091d37SBarry Smith jdx = 5*(*vi++); 247515091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2476f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2477f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2478f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2479f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2480f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 248115091d37SBarry Smith v += 25; 248215091d37SBarry Smith } 2483f1af5d2fSBarry Smith x[idx] = s1; 2484f1af5d2fSBarry Smith x[1+idx] = s2; 2485f1af5d2fSBarry Smith x[2+idx] = s3; 2486f1af5d2fSBarry Smith x[3+idx] = s4; 2487f1af5d2fSBarry Smith x[4+idx] = s5; 248815091d37SBarry Smith } 248915091d37SBarry Smith /* backward solve the upper triangular */ 249015091d37SBarry Smith for (i=n-1; i>=0; i--){ 249115091d37SBarry Smith v = aa + 25*diag[i] + 25; 249215091d37SBarry Smith vi = aj + diag[i] + 1; 249315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 249415091d37SBarry Smith idt = 5*i; 2495f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2496f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 249715091d37SBarry Smith while (nz--) { 249815091d37SBarry Smith idx = 5*(*vi++); 249915091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2500f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2501f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 
2502f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2503f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2504f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 250515091d37SBarry Smith v += 25; 250615091d37SBarry Smith } 250715091d37SBarry Smith v = aa + 25*diag[i]; 2508f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2509f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2510f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2511f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2512f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 251315091d37SBarry Smith } 251415091d37SBarry Smith 2515d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25161ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2517dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 251815091d37SBarry Smith PetscFunctionReturn(0); 251915091d37SBarry Smith } 252015091d37SBarry Smith 25214a2ae208SSatish Balay #undef __FUNCT__ 2522cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2523cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2524cee9d6f2SShri Abhyankar { 2525cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 25266464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2527cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2528cee9d6f2SShri Abhyankar PetscInt jdx; 2529cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2530cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2531cee9d6f2SShri Abhyankar const PetscScalar *b; 2532cee9d6f2SShri Abhyankar 2533cee9d6f2SShri Abhyankar PetscFunctionBegin; 2534cee9d6f2SShri 
Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2535cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2536cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2537cee9d6f2SShri Abhyankar idx = 0; 2538cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2539cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2540cee9d6f2SShri Abhyankar v = aa + 25*ai[i]; 2541cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2542cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2543cee9d6f2SShri Abhyankar idx = 5*i; 2544cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 25456464896eSShri Abhyankar for(k=0;k<nz;k++) { 25466464896eSShri Abhyankar jdx = 5*vi[k]; 2547cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2548cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2549cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2550cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2551cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2552cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2553cee9d6f2SShri Abhyankar v += 25; 2554cee9d6f2SShri Abhyankar } 2555cee9d6f2SShri Abhyankar x[idx] = s1; 2556cee9d6f2SShri Abhyankar x[1+idx] = s2; 2557cee9d6f2SShri Abhyankar x[2+idx] = s3; 2558cee9d6f2SShri Abhyankar x[3+idx] = s4; 2559cee9d6f2SShri Abhyankar x[4+idx] = s5; 2560cee9d6f2SShri Abhyankar } 2561cee9d6f2SShri Abhyankar 2562cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2563cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2564cee9d6f2SShri Abhyankar v = aa + 25*ai[2*n-i]; 2565cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2566cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2567cee9d6f2SShri Abhyankar idt = 5*i; 2568cee9d6f2SShri Abhyankar s1 = x[idt]; 
s2 = x[1+idt]; 2569cee9d6f2SShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 25706464896eSShri Abhyankar for(k=0;k<nz;k++){ 25716464896eSShri Abhyankar idx = 5*vi[k]; 2572cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2573cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2574cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2575cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2576cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2577cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2578cee9d6f2SShri Abhyankar v += 25; 2579cee9d6f2SShri Abhyankar } 2580cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2581cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2582cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2583cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2584cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2585cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2586cee9d6f2SShri Abhyankar } 2587cee9d6f2SShri Abhyankar 2588cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2589cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2590cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2591cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2592cee9d6f2SShri Abhyankar } 2593cee9d6f2SShri Abhyankar 2594cee9d6f2SShri Abhyankar #undef __FUNCT__ 259553cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2" 259653cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 259753cca76cSShri Abhyankar { 
259853cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 259953cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 260053cca76cSShri Abhyankar PetscErrorCode ierr; 260153cca76cSShri Abhyankar PetscInt jdx; 260253cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 260353cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 260453cca76cSShri Abhyankar const PetscScalar *b; 260553cca76cSShri Abhyankar 260653cca76cSShri Abhyankar PetscFunctionBegin; 260753cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 260853cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260953cca76cSShri Abhyankar /* forward solve the lower triangular */ 261053cca76cSShri Abhyankar idx = 0; 261153cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 261253cca76cSShri Abhyankar for (i=1; i<n; i++) { 261353cca76cSShri Abhyankar v = aa + 25*ai[i]; 261453cca76cSShri Abhyankar vi = aj + ai[i]; 261553cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 261653cca76cSShri Abhyankar idx = 5*i; 261753cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 261853cca76cSShri Abhyankar for(k=0;k<nz;k++) { 261953cca76cSShri Abhyankar jdx = 5*vi[k]; 262053cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 262153cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 262253cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 262353cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 262453cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 262553cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 262653cca76cSShri Abhyankar v += 25; 262753cca76cSShri Abhyankar } 262853cca76cSShri Abhyankar x[idx] = s1; 262953cca76cSShri Abhyankar x[1+idx] = s2; 263053cca76cSShri 
Abhyankar x[2+idx] = s3; 263153cca76cSShri Abhyankar x[3+idx] = s4; 263253cca76cSShri Abhyankar x[4+idx] = s5; 263353cca76cSShri Abhyankar } 263453cca76cSShri Abhyankar 263553cca76cSShri Abhyankar /* backward solve the upper triangular */ 263653cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 263753cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 263853cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 263953cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 264053cca76cSShri Abhyankar idt = 5*i; 264153cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 264253cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 264353cca76cSShri Abhyankar for(k=0;k<nz;k++){ 264453cca76cSShri Abhyankar idx = 5*vi[k]; 264553cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 264653cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 264753cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 264853cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 264953cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 265053cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 265153cca76cSShri Abhyankar v += 25; 265253cca76cSShri Abhyankar } 265353cca76cSShri Abhyankar /* x = inv_diagonal*x */ 265453cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 265553cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 265653cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 265753cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 265853cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 265953cca76cSShri Abhyankar } 266053cca76cSShri Abhyankar 266153cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 266253cca76cSShri Abhyankar ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 266353cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 266453cca76cSShri Abhyankar PetscFunctionReturn(0); 266553cca76cSShri Abhyankar } 266653cca76cSShri Abhyankar 266753cca76cSShri Abhyankar #undef __FUNCT__ 26684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2669dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 26704e2b4712SSatish Balay { 26714e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 26724e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 26736849ba73SBarry Smith PetscErrorCode ierr; 26745d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 26755d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2676d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2677d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2678d9fead3dSBarry Smith const PetscScalar *b; 26794e2b4712SSatish Balay 26804e2b4712SSatish Balay PetscFunctionBegin; 2681d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2683f1af5d2fSBarry Smith t = a->solve_work; 26844e2b4712SSatish Balay 26854e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 26864e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 26874e2b4712SSatish Balay 26884e2b4712SSatish Balay /* forward solve the lower triangular */ 26894e2b4712SSatish Balay idx = 4*(*r++); 2690f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2691f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 26924e2b4712SSatish Balay for (i=1; i<n; i++) { 26934e2b4712SSatish Balay v = aa + 16*ai[i]; 26944e2b4712SSatish Balay vi = aj + ai[i]; 26954e2b4712SSatish Balay nz = diag[i] - ai[i]; 26964e2b4712SSatish Balay idx = 4*(*r++); 2697f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 26984e2b4712SSatish 
Balay while (nz--) { 26994e2b4712SSatish Balay idx = 4*(*vi++); 2700f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2701f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2702f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2703f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2704f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 27054e2b4712SSatish Balay v += 16; 27064e2b4712SSatish Balay } 27074e2b4712SSatish Balay idx = 4*i; 2708f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2709f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 27104e2b4712SSatish Balay } 27114e2b4712SSatish Balay /* backward solve the upper triangular */ 27124e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 27134e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 27144e2b4712SSatish Balay vi = aj + diag[i] + 1; 27154e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 27164e2b4712SSatish Balay idt = 4*i; 2717f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2718f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 27194e2b4712SSatish Balay while (nz--) { 27204e2b4712SSatish Balay idx = 4*(*vi++); 2721f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2722f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2723f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2724f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2725f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2726f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 27274e2b4712SSatish Balay v += 16; 27284e2b4712SSatish Balay } 27294e2b4712SSatish Balay idc = 4*(*c--); 27304e2b4712SSatish Balay v = aa + 16*diag[i]; 2731f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2732f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2733f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2734f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = 
v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 27354e2b4712SSatish Balay } 27364e2b4712SSatish Balay 27374e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 27384e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2739d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27401ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2741dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 27424e2b4712SSatish Balay PetscFunctionReturn(0); 27434e2b4712SSatish Balay } 2744f26ec98cSKris Buschelman 2745f26ec98cSKris Buschelman #undef __FUNCT__ 27468f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 27478f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 27488f690400SShri Abhyankar { 27498f690400SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 27508f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 27518f690400SShri Abhyankar PetscErrorCode ierr; 275229b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 27538f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 27548f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 27558f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 27568f690400SShri Abhyankar const PetscScalar *b; 27578f690400SShri Abhyankar 27588f690400SShri Abhyankar PetscFunctionBegin; 27598f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27608f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 27618f690400SShri Abhyankar t = a->solve_work; 27628f690400SShri Abhyankar 27638f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 276429b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 27658f690400SShri Abhyankar 27668f690400SShri Abhyankar /* forward solve the lower triangular */ 276729b92fc1SShri Abhyankar idx = 4*r[0]; 27688f690400SShri 
Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 27698f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 27708f690400SShri Abhyankar for (i=1; i<n; i++) { 27718f690400SShri Abhyankar v = aa + 16*ai[i]; 27728f690400SShri Abhyankar vi = aj + ai[i]; 27738f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 277429b92fc1SShri Abhyankar idx = 4*r[i]; 27758f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 277629b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 277729b92fc1SShri Abhyankar idx = 4*vi[m]; 27788f690400SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 27798f690400SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 27808f690400SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 27818f690400SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 27828f690400SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 27838f690400SShri Abhyankar v += 16; 27848f690400SShri Abhyankar } 27858f690400SShri Abhyankar idx = 4*i; 27868f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 27878f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 27888f690400SShri Abhyankar } 27898f690400SShri Abhyankar /* backward solve the upper triangular */ 27908f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 27918f690400SShri Abhyankar k = 2*n-i; 27928f690400SShri Abhyankar v = aa + 16*ai[k]; 27938f690400SShri Abhyankar vi = aj + ai[k]; 27948f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 27958f690400SShri Abhyankar idt = 4*i; 27968f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 27978f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 279829b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 279929b92fc1SShri Abhyankar idx = 4*vi[m]; 28008f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 28018f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 28028f690400SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 28038f690400SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 28048f690400SShri Abhyankar s3 -= v[2]*x1 + 
v[6]*x2 + v[10]*x3 + v[14]*x4; 28058f690400SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 28068f690400SShri Abhyankar v += 16; 28078f690400SShri Abhyankar } 280829b92fc1SShri Abhyankar idc = 4*c[i]; 28098f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 28108f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 28118f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 28128f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 28138f690400SShri Abhyankar } 28148f690400SShri Abhyankar 28158f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28168f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 28178f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28188f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 28198f690400SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 28208f690400SShri Abhyankar PetscFunctionReturn(0); 28218f690400SShri Abhyankar } 28228f690400SShri Abhyankar 28238f690400SShri Abhyankar #undef __FUNCT__ 282478bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2" 282578bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx) 282678bb4007SShri Abhyankar { 282778bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 282878bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 282978bb4007SShri Abhyankar PetscErrorCode ierr; 283078bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 283178bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 283278bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 283378bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 283478bb4007SShri Abhyankar const PetscScalar *b; 283578bb4007SShri Abhyankar 283678bb4007SShri Abhyankar PetscFunctionBegin; 
283778bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 283878bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 283978bb4007SShri Abhyankar t = a->solve_work; 284078bb4007SShri Abhyankar 284178bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 284278bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 284378bb4007SShri Abhyankar 284478bb4007SShri Abhyankar /* forward solve the lower triangular */ 284578bb4007SShri Abhyankar idx = 4*r[0]; 284678bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 284778bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 284878bb4007SShri Abhyankar for (i=1; i<n; i++) { 284978bb4007SShri Abhyankar v = aa + 16*ai[i]; 285078bb4007SShri Abhyankar vi = aj + ai[i]; 285178bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 285278bb4007SShri Abhyankar idx = 4*r[i]; 285378bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 285478bb4007SShri Abhyankar for(m=0;m<nz;m++){ 285578bb4007SShri Abhyankar idx = 4*vi[m]; 285678bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 285778bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 285878bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 285978bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 286078bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 286178bb4007SShri Abhyankar v += 16; 286278bb4007SShri Abhyankar } 286378bb4007SShri Abhyankar idx = 4*i; 286478bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 286578bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 286678bb4007SShri Abhyankar } 286778bb4007SShri Abhyankar /* backward solve the upper triangular */ 286878bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 286978bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 287078bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 287178bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 
287278bb4007SShri Abhyankar idt = 4*i; 287378bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 287478bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 287578bb4007SShri Abhyankar for(m=0;m<nz;m++){ 287678bb4007SShri Abhyankar idx = 4*vi[m]; 287778bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 287878bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 287978bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 288078bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 288178bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 288278bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 288378bb4007SShri Abhyankar v += 16; 288478bb4007SShri Abhyankar } 288578bb4007SShri Abhyankar idc = 4*c[i]; 288678bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 288778bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 288878bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 288978bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 289078bb4007SShri Abhyankar } 289178bb4007SShri Abhyankar 289278bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 289378bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 289478bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 289578bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 289678bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 289778bb4007SShri Abhyankar PetscFunctionReturn(0); 289878bb4007SShri Abhyankar } 289978bb4007SShri Abhyankar 290078bb4007SShri Abhyankar #undef __FUNCT__ 2901f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2902dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2903f26ec98cSKris Buschelman { 2904f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 
2905f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 29066849ba73SBarry Smith PetscErrorCode ierr; 29075d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 29085d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2909d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2910d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2911d9fead3dSBarry Smith PetscScalar *x; 2912d9fead3dSBarry Smith const PetscScalar *b; 2913f26ec98cSKris Buschelman 2914f26ec98cSKris Buschelman PetscFunctionBegin; 2915d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29161ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2917f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 2918f26ec98cSKris Buschelman 2919f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2920f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2921f26ec98cSKris Buschelman 2922f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2923f26ec98cSKris Buschelman idx = 4*(*r++); 2924f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 2925f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 2926f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 2927f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 2928f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2929f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2930f26ec98cSKris Buschelman vi = aj + ai[i]; 2931f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2932f26ec98cSKris Buschelman idx = 4*(*r++); 2933f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2934f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2935f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2936f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2937f26ec98cSKris Buschelman while (nz--) { 2938f26ec98cSKris Buschelman idx = 4*(*vi++); 2939f26ec98cSKris Buschelman x1 = t[idx]; 2940f26ec98cSKris Buschelman x2 = t[1+idx]; 2941f26ec98cSKris Buschelman 
x3 = t[2+idx]; 2942f26ec98cSKris Buschelman x4 = t[3+idx]; 2943f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2944f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2945f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2946f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2947f26ec98cSKris Buschelman v += 16; 2948f26ec98cSKris Buschelman } 2949f26ec98cSKris Buschelman idx = 4*i; 2950f26ec98cSKris Buschelman t[idx] = s1; 2951f26ec98cSKris Buschelman t[1+idx] = s2; 2952f26ec98cSKris Buschelman t[2+idx] = s3; 2953f26ec98cSKris Buschelman t[3+idx] = s4; 2954f26ec98cSKris Buschelman } 2955f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2956f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2957f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 2958f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2959f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2960f26ec98cSKris Buschelman idt = 4*i; 2961f26ec98cSKris Buschelman s1 = t[idt]; 2962f26ec98cSKris Buschelman s2 = t[1+idt]; 2963f26ec98cSKris Buschelman s3 = t[2+idt]; 2964f26ec98cSKris Buschelman s4 = t[3+idt]; 2965f26ec98cSKris Buschelman while (nz--) { 2966f26ec98cSKris Buschelman idx = 4*(*vi++); 2967f26ec98cSKris Buschelman x1 = t[idx]; 2968f26ec98cSKris Buschelman x2 = t[1+idx]; 2969f26ec98cSKris Buschelman x3 = t[2+idx]; 2970f26ec98cSKris Buschelman x4 = t[3+idx]; 2971f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2972f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2973f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2974f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2975f26ec98cSKris Buschelman v += 16; 2976f26ec98cSKris Buschelman } 2977f26ec98cSKris Buschelman idc = 4*(*c--); 2978f26ec98cSKris Buschelman v = aa + 16*diag[i]; 2979f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2980f26ec98cSKris 
Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2981f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2982f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2983f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 2984f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 2985f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 2986f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 2987f26ec98cSKris Buschelman } 2988f26ec98cSKris Buschelman 2989f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2990f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2991d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2993dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2994f26ec98cSKris Buschelman PetscFunctionReturn(0); 2995f26ec98cSKris Buschelman } 2996f26ec98cSKris Buschelman 299724c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 299824c233c2SKris Buschelman 299924c233c2SKris Buschelman #include PETSC_HAVE_SSE 300024c233c2SKris Buschelman 300124c233c2SKris Buschelman #undef __FUNCT__ 300224c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3003dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 300424c233c2SKris Buschelman { 300524c233c2SKris Buschelman /* 300624c233c2SKris Buschelman Note: This code uses demotion of double 300724c233c2SKris Buschelman to float when performing the mixed-mode computation. 300824c233c2SKris Buschelman This may not be numerically reasonable for all applications. 
300924c233c2SKris Buschelman */ 301024c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 301124c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 30126849ba73SBarry Smith PetscErrorCode ierr; 30135d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 30145d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 301524c233c2SKris Buschelman MatScalar *aa=a->a,*v; 301687828ca2SBarry Smith PetscScalar *x,*b,*t; 301724c233c2SKris Buschelman 301824c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 301924c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 302024c233c2SKris Buschelman unsigned long offset; 302124c233c2SKris Buschelman 302224c233c2SKris Buschelman PetscFunctionBegin; 302324c233c2SKris Buschelman SSE_SCOPE_BEGIN; 302424c233c2SKris Buschelman 302524c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 302624c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 302724c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 302824c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 302924c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 303024c233c2SKris Buschelman 30311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 30321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 303324c233c2SKris Buschelman t = a->solve_work; 303424c233c2SKris Buschelman 303524c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 303624c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 303724c233c2SKris Buschelman 303824c233c2SKris Buschelman /* forward solve the lower triangular */ 303924c233c2SKris Buschelman idx = 4*(*r++); 304024c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 304124c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 304224c233c2SKris Buschelman v = aa + 16*ai[1]; 304324c233c2SKris Buschelman 304424c233c2SKris Buschelman for (i=1; i<n;) { 
304524c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 304624c233c2SKris Buschelman vi = aj + ai[i]; 304724c233c2SKris Buschelman nz = diag[i] - ai[i]; 304824c233c2SKris Buschelman idx = 4*(*r++); 304924c233c2SKris Buschelman 305024c233c2SKris Buschelman /* Demote sum from double to float */ 305124c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 305224c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 305324c233c2SKris Buschelman 305424c233c2SKris Buschelman while (nz--) { 305524c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 305624c233c2SKris Buschelman idx = 4*(*vi++); 305724c233c2SKris Buschelman 305824c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 305924c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 306024c233c2SKris Buschelman 306124c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 306224c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 306324c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 306424c233c2SKris Buschelman 306524c233c2SKris Buschelman /* First Column */ 306624c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 306724c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 306824c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 306924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 307024c233c2SKris Buschelman 307124c233c2SKris Buschelman /* Second Column */ 307224c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 307324c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 307424c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 307524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 307624c233c2SKris Buschelman 307724c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 307824c233c2SKris Buschelman 307924c233c2SKris Buschelman /* Third Column */ 308024c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 308124c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 308224c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 308324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 
308424c233c2SKris Buschelman 308524c233c2SKris Buschelman /* Fourth Column */ 308624c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 308724c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 308824c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 308924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 309024c233c2SKris Buschelman SSE_INLINE_END_2 309124c233c2SKris Buschelman 309224c233c2SKris Buschelman v += 16; 309324c233c2SKris Buschelman } 309424c233c2SKris Buschelman idx = 4*i; 309524c233c2SKris Buschelman v = aa + 16*ai[++i]; 309624c233c2SKris Buschelman PREFETCH_NTA(v); 309724c233c2SKris Buschelman STORE_PS(tmps,XMM7); 309824c233c2SKris Buschelman 309924c233c2SKris Buschelman /* Promote result from float to double */ 310024c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 310124c233c2SKris Buschelman } 310224c233c2SKris Buschelman /* backward solve the upper triangular */ 310324c233c2SKris Buschelman idt = 4*(n-1); 310424c233c2SKris Buschelman ai16 = 16*diag[n-1]; 310524c233c2SKris Buschelman v = aa + ai16 + 16; 310624c233c2SKris Buschelman for (i=n-1; i>=0;){ 310724c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 310824c233c2SKris Buschelman vi = aj + diag[i] + 1; 310924c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 311024c233c2SKris Buschelman 311124c233c2SKris Buschelman /* Demote accumulator from double to float */ 311224c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 311324c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 311424c233c2SKris Buschelman 311524c233c2SKris Buschelman while (nz--) { 311624c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 311724c233c2SKris Buschelman idx = 4*(*vi++); 311824c233c2SKris Buschelman 311924c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 312024c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 312124c233c2SKris Buschelman 312224c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 312324c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 
312424c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 312524c233c2SKris Buschelman 312624c233c2SKris Buschelman /* First Column */ 312724c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 312824c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 312924c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 313024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 313124c233c2SKris Buschelman 313224c233c2SKris Buschelman /* Second Column */ 313324c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 313424c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 313524c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 313624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 313724c233c2SKris Buschelman 313824c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 313924c233c2SKris Buschelman 314024c233c2SKris Buschelman /* Third Column */ 314124c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 314224c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 314324c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 314424c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 314524c233c2SKris Buschelman 314624c233c2SKris Buschelman /* Fourth Column */ 314724c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 314824c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 314924c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 315024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 315124c233c2SKris Buschelman SSE_INLINE_END_2 315224c233c2SKris Buschelman v += 16; 315324c233c2SKris Buschelman } 315424c233c2SKris Buschelman v = aa + ai16; 315524c233c2SKris Buschelman ai16 = 16*diag[--i]; 315624c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 315724c233c2SKris Buschelman /* 315824c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 315924c233c2SKris Buschelman which was inverted as part of the factorization 316024c233c2SKris Buschelman */ 316124c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 316224c233c2SKris Buschelman /* First Column */ 316324c233c2SKris Buschelman 
SSE_COPY_PS(XMM0,XMM7) 316424c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 316524c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 316624c233c2SKris Buschelman 316724c233c2SKris Buschelman /* Second Column */ 316824c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 316924c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 317024c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 317124c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 317224c233c2SKris Buschelman 317324c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 317424c233c2SKris Buschelman 317524c233c2SKris Buschelman /* Third Column */ 317624c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 317724c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 317824c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 317924c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 318024c233c2SKris Buschelman 318124c233c2SKris Buschelman /* Fourth Column */ 318224c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 318324c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 318424c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 318524c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 318624c233c2SKris Buschelman 318724c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 318824c233c2SKris Buschelman SSE_INLINE_END_3 318924c233c2SKris Buschelman 319024c233c2SKris Buschelman /* Promote solution from float to double */ 319124c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 319224c233c2SKris Buschelman 319324c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 319424c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 319524c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! 
*/ 319624c233c2SKris Buschelman idc = 4*(*c--); 319724c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 319824c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 319924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 320024c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 320124c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 320224c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 320324c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 320424c233c2SKris Buschelman SSE_INLINE_END_2 320524c233c2SKris Buschelman v = aa + ai16 + 16; 320624c233c2SKris Buschelman idt -= 4; 320724c233c2SKris Buschelman } 320824c233c2SKris Buschelman 320924c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 321024c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 32111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 32121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3213dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 321424c233c2SKris Buschelman SSE_SCOPE_END; 321524c233c2SKris Buschelman PetscFunctionReturn(0); 321624c233c2SKris Buschelman } 321724c233c2SKris Buschelman 321824c233c2SKris Buschelman #endif 32190ef38995SBarry Smith 32200ef38995SBarry Smith 32214e2b4712SSatish Balay /* 32224e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 32234e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 
32244e2b4712SSatish Balay */ 32254a2ae208SSatish Balay #undef __FUNCT__ 32264a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3227dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 32284e2b4712SSatish Balay { 32294e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3230356650c2SBarry Smith PetscInt n=a->mbs; 3231356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 3232dfbe8321SBarry Smith PetscErrorCode ierr; 3233356650c2SBarry Smith const PetscInt *diag = a->diag; 3234d9fead3dSBarry Smith const MatScalar *aa=a->a; 3235d9fead3dSBarry Smith PetscScalar *x; 3236d9fead3dSBarry Smith const PetscScalar *b; 32374e2b4712SSatish Balay 32384e2b4712SSatish Balay PetscFunctionBegin; 3239d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 32401ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 32414e2b4712SSatish Balay 3242aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 32432853dc0eSBarry Smith { 324487828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 32452853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 32462853dc0eSBarry Smith } 3247aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 32482853dc0eSBarry Smith { 324987828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 32502853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 32512853dc0eSBarry Smith } 3252aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 32532853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3254e1293385SBarry Smith #else 325530d4dcafSBarry Smith { 325687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3257d9fead3dSBarry Smith const MatScalar *v; 3258356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 3259356650c2SBarry Smith const PetscInt *vi; 3260e1293385SBarry Smith 32614e2b4712SSatish Balay /* forward solve the lower 
triangular */ 32624e2b4712SSatish Balay idx = 0; 3263e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 32644e2b4712SSatish Balay for (i=1; i<n; i++) { 32654e2b4712SSatish Balay v = aa + 16*ai[i]; 32664e2b4712SSatish Balay vi = aj + ai[i]; 32674e2b4712SSatish Balay nz = diag[i] - ai[i]; 3268e1293385SBarry Smith idx += 4; 3269f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 32704e2b4712SSatish Balay while (nz--) { 32714e2b4712SSatish Balay jdx = 4*(*vi++); 32724e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3273f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3274f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3275f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3276f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 32774e2b4712SSatish Balay v += 16; 32784e2b4712SSatish Balay } 3279f1af5d2fSBarry Smith x[idx] = s1; 3280f1af5d2fSBarry Smith x[1+idx] = s2; 3281f1af5d2fSBarry Smith x[2+idx] = s3; 3282f1af5d2fSBarry Smith x[3+idx] = s4; 32834e2b4712SSatish Balay } 32844e2b4712SSatish Balay /* backward solve the upper triangular */ 32854e555682SBarry Smith idt = 4*(n-1); 32864e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 32874e555682SBarry Smith ai16 = 16*diag[i]; 32884e555682SBarry Smith v = aa + ai16 + 16; 32894e2b4712SSatish Balay vi = aj + diag[i] + 1; 32904e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3291f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3292f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 32934e2b4712SSatish Balay while (nz--) { 32944e2b4712SSatish Balay idx = 4*(*vi++); 32954e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3296f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3297f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3298f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3299f1af5d2fSBarry Smith s4 -= 
v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 33004e2b4712SSatish Balay v += 16; 33014e2b4712SSatish Balay } 33024e555682SBarry Smith v = aa + ai16; 3303f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3304f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3305f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3306f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3307329f5518SBarry Smith idt -= 4; 33084e2b4712SSatish Balay } 330930d4dcafSBarry Smith } 3310e1293385SBarry Smith #endif 33114e2b4712SSatish Balay 3312d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 33131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3314dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 33154e2b4712SSatish Balay PetscFunctionReturn(0); 33164e2b4712SSatish Balay } 33174e2b4712SSatish Balay 3318f26ec98cSKris Buschelman #undef __FUNCT__ 3319cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3320cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3321cee9d6f2SShri Abhyankar { 3322cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33236464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3324cee9d6f2SShri Abhyankar PetscErrorCode ierr; 3325cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 3326cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3327cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 3328cee9d6f2SShri Abhyankar PetscScalar *x; 3329cee9d6f2SShri Abhyankar const PetscScalar *b; 3330cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3331cee9d6f2SShri Abhyankar 3332cee9d6f2SShri Abhyankar PetscFunctionBegin; 3333cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3334cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
3335cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 3336cee9d6f2SShri Abhyankar idx = 0; 3337cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3338cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 3339cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 3340cee9d6f2SShri Abhyankar vi = aj + ai[i]; 3341cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 3342cee9d6f2SShri Abhyankar idx = bs*i; 3343cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 33446464896eSShri Abhyankar for(k=0;k<nz;k++) { 33456464896eSShri Abhyankar jdx = bs*vi[k]; 3346cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3347cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3348cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3349cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3350cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3351cee9d6f2SShri Abhyankar 3352cee9d6f2SShri Abhyankar v += bs2; 3353cee9d6f2SShri Abhyankar } 3354cee9d6f2SShri Abhyankar 3355cee9d6f2SShri Abhyankar x[idx] = s1; 3356cee9d6f2SShri Abhyankar x[1+idx] = s2; 3357cee9d6f2SShri Abhyankar x[2+idx] = s3; 3358cee9d6f2SShri Abhyankar x[3+idx] = s4; 3359cee9d6f2SShri Abhyankar } 3360cee9d6f2SShri Abhyankar 3361cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 3362cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 3363cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 3364cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 3365cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 3366cee9d6f2SShri Abhyankar idt = bs*i; 3367cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3368cee9d6f2SShri Abhyankar 33696464896eSShri Abhyankar for(k=0;k<nz;k++){ 33706464896eSShri Abhyankar idx = bs*vi[k]; 3371cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3372cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + 
v[4]*x2 + v[8]*x3 + v[12]*x4; 3373cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3374cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3375cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3376cee9d6f2SShri Abhyankar 3377cee9d6f2SShri Abhyankar v += bs2; 3378cee9d6f2SShri Abhyankar } 3379cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 3380cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3381cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3382cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3383cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3384cee9d6f2SShri Abhyankar 3385cee9d6f2SShri Abhyankar } 3386cee9d6f2SShri Abhyankar 3387cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3388cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3389cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3390cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 3391cee9d6f2SShri Abhyankar } 3392cee9d6f2SShri Abhyankar 3393b2b2dd24SShri Abhyankar #undef __FUNCT__ 3394b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2" 3395b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 3396b2b2dd24SShri Abhyankar { 3397b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3398b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3399b2b2dd24SShri Abhyankar PetscErrorCode ierr; 3400b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 3401b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3402b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 3403b2b2dd24SShri Abhyankar PetscScalar *x; 3404b2b2dd24SShri Abhyankar const PetscScalar *b; 3405b2b2dd24SShri Abhyankar PetscScalar 
s1,s2,s3,s4,x1,x2,x3,x4; 3406cee9d6f2SShri Abhyankar 3407b2b2dd24SShri Abhyankar PetscFunctionBegin; 3408b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3409b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3410b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 3411b2b2dd24SShri Abhyankar idx = 0; 3412b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3413b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 3414b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 3415b2b2dd24SShri Abhyankar vi = aj + ai[i]; 3416b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 3417b2b2dd24SShri Abhyankar idx = bs*i; 3418b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3419b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 3420b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 3421b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3422b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3423b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3424b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3425b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3426b2b2dd24SShri Abhyankar 3427b2b2dd24SShri Abhyankar v += bs2; 3428b2b2dd24SShri Abhyankar } 3429b2b2dd24SShri Abhyankar 3430b2b2dd24SShri Abhyankar x[idx] = s1; 3431b2b2dd24SShri Abhyankar x[1+idx] = s2; 3432b2b2dd24SShri Abhyankar x[2+idx] = s3; 3433b2b2dd24SShri Abhyankar x[3+idx] = s4; 3434b2b2dd24SShri Abhyankar } 3435b2b2dd24SShri Abhyankar 3436b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 3437b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 3438b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 3439b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 3440b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 3441b2b2dd24SShri Abhyankar idt = bs*i; 3442b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = 
x[3+idt]; 3443b2b2dd24SShri Abhyankar 3444b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 3445b2b2dd24SShri Abhyankar idx = bs*vi[k]; 3446b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3447b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3448b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3449b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3450b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3451b2b2dd24SShri Abhyankar 3452b2b2dd24SShri Abhyankar v += bs2; 3453b2b2dd24SShri Abhyankar } 3454b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 3455b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3456b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3457b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3458b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3459b2b2dd24SShri Abhyankar 3460b2b2dd24SShri Abhyankar } 3461b2b2dd24SShri Abhyankar 3462b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3463b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3464b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3465b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 3466b2b2dd24SShri Abhyankar } 3467cee9d6f2SShri Abhyankar 3468cee9d6f2SShri Abhyankar #undef __FUNCT__ 3469f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3470dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3471f26ec98cSKris Buschelman { 3472f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3473690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3474dfbe8321SBarry Smith PetscErrorCode ierr; 3475690b6cddSBarry Smith PetscInt *diag = a->diag; 3476f26ec98cSKris Buschelman MatScalar *aa=a->a; 
3477f26ec98cSKris Buschelman PetscScalar *x,*b; 3478f26ec98cSKris Buschelman 3479f26ec98cSKris Buschelman PetscFunctionBegin; 34801ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 34811ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3482f26ec98cSKris Buschelman 3483f26ec98cSKris Buschelman { 3484f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3485f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 3486690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3487f26ec98cSKris Buschelman 3488f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3489f26ec98cSKris Buschelman idx = 0; 3490f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 3491f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 3492f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 3493f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 3494f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3495f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3496f26ec98cSKris Buschelman vi = aj + ai[i]; 3497f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3498f26ec98cSKris Buschelman idx += 4; 3499f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3500f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3501f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3502f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3503f26ec98cSKris Buschelman while (nz--) { 3504f26ec98cSKris Buschelman jdx = 4*(*vi++); 3505f26ec98cSKris Buschelman x1 = t[jdx]; 3506f26ec98cSKris Buschelman x2 = t[1+jdx]; 3507f26ec98cSKris Buschelman x3 = t[2+jdx]; 3508f26ec98cSKris Buschelman x4 = t[3+jdx]; 3509f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3510f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3511f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3512f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3513f26ec98cSKris Buschelman v += 16; 3514f26ec98cSKris Buschelman } 3515f26ec98cSKris Buschelman t[idx] = s1; 3516f26ec98cSKris 
Buschelman t[1+idx] = s2; 3517f26ec98cSKris Buschelman t[2+idx] = s3; 3518f26ec98cSKris Buschelman t[3+idx] = s4; 3519f26ec98cSKris Buschelman } 3520f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3521f26ec98cSKris Buschelman idt = 4*(n-1); 3522f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3523f26ec98cSKris Buschelman ai16 = 16*diag[i]; 3524f26ec98cSKris Buschelman v = aa + ai16 + 16; 3525f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3526f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3527f26ec98cSKris Buschelman s1 = t[idt]; 3528f26ec98cSKris Buschelman s2 = t[1+idt]; 3529f26ec98cSKris Buschelman s3 = t[2+idt]; 3530f26ec98cSKris Buschelman s4 = t[3+idt]; 3531f26ec98cSKris Buschelman while (nz--) { 3532f26ec98cSKris Buschelman idx = 4*(*vi++); 3533f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 3534f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 3535f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 3536f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 3537f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3538f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3539f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3540f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3541f26ec98cSKris Buschelman v += 16; 3542f26ec98cSKris Buschelman } 3543f26ec98cSKris Buschelman v = aa + ai16; 3544f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3545f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3546f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3547f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3548f26ec98cSKris Buschelman idt -= 4; 3549f26ec98cSKris Buschelman } 3550f26ec98cSKris Buschelman } 3551f26ec98cSKris Buschelman 35521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 
35531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3554dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3555f26ec98cSKris Buschelman PetscFunctionReturn(0); 3556f26ec98cSKris Buschelman } 3557f26ec98cSKris Buschelman 35583660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 35593660e330SKris Buschelman 35603660e330SKris Buschelman #include PETSC_HAVE_SSE 35613660e330SKris Buschelman #undef __FUNCT__ 35627cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3563dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 35643660e330SKris Buschelman { 35653660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 35662aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 3567dfbe8321SBarry Smith PetscErrorCode ierr; 3568dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 35693660e330SKris Buschelman MatScalar *aa=a->a; 357087828ca2SBarry Smith PetscScalar *x,*b; 35713660e330SKris Buschelman 35723660e330SKris Buschelman PetscFunctionBegin; 35733660e330SKris Buschelman SSE_SCOPE_BEGIN; 35743660e330SKris Buschelman /* 35753660e330SKris Buschelman Note: This code currently uses demotion of double 35763660e330SKris Buschelman to float when performing the mixed-mode computation. 35773660e330SKris Buschelman This may not be numerically reasonable for all applications. 
35783660e330SKris Buschelman */ 35793660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 35803660e330SKris Buschelman 35811ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 35821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 35833660e330SKris Buschelman { 3584eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 3585eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 35862aa5897fSKris Buschelman int nz,i,idt,ai16; 35872aa5897fSKris Buschelman unsigned int jdx,idx; 35882aa5897fSKris Buschelman unsigned short *vi; 3589eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 35903660e330SKris Buschelman 3591eb05f457SKris Buschelman /* First block is the identity. */ 35923660e330SKris Buschelman idx = 0; 3593eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 35942aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 35953660e330SKris Buschelman 35963660e330SKris Buschelman for (i=1; i<n;) { 35973660e330SKris Buschelman PREFETCH_NTA(&v[8]); 35983660e330SKris Buschelman vi = aj + ai[i]; 35993660e330SKris Buschelman nz = diag[i] - ai[i]; 36003660e330SKris Buschelman idx += 4; 36013660e330SKris Buschelman 3602eb05f457SKris Buschelman /* Demote RHS from double to float. 
*/ 3603eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3604eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 36053660e330SKris Buschelman 36063660e330SKris Buschelman while (nz--) { 36073660e330SKris Buschelman PREFETCH_NTA(&v[16]); 36082aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 36093660e330SKris Buschelman 36103660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 3611eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 36123660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 36133660e330SKris Buschelman 36143660e330SKris Buschelman /* First Column */ 36153660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 36163660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 36173660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 36183660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 36193660e330SKris Buschelman 36203660e330SKris Buschelman /* Second Column */ 36213660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 36223660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 36233660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 36243660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 36253660e330SKris Buschelman 36263660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 36273660e330SKris Buschelman 36283660e330SKris Buschelman /* Third Column */ 36293660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 36303660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 36313660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 36323660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 36333660e330SKris Buschelman 36343660e330SKris Buschelman /* Fourth Column */ 36353660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 36363660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 36373660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 36383660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 36393660e330SKris Buschelman SSE_INLINE_END_2 36403660e330SKris Buschelman 36413660e330SKris Buschelman v += 16; 36423660e330SKris 
Buschelman } 36433660e330SKris Buschelman v = aa + 16*ai[++i]; 36443660e330SKris Buschelman PREFETCH_NTA(v); 3645eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 36463660e330SKris Buschelman } 3647eb05f457SKris Buschelman 3648eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 3649eb05f457SKris Buschelman 36503660e330SKris Buschelman idt = 4*(n-1); 36513660e330SKris Buschelman ai16 = 16*diag[n-1]; 36523660e330SKris Buschelman v = aa + ai16 + 16; 36533660e330SKris Buschelman for (i=n-1; i>=0;){ 36543660e330SKris Buschelman PREFETCH_NTA(&v[8]); 36553660e330SKris Buschelman vi = aj + diag[i] + 1; 36563660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 36573660e330SKris Buschelman 3658eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 36593660e330SKris Buschelman 36603660e330SKris Buschelman while (nz--) { 36613660e330SKris Buschelman PREFETCH_NTA(&v[16]); 36622aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 36633660e330SKris Buschelman 36643660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 3665eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 36663660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 36673660e330SKris Buschelman 36683660e330SKris Buschelman /* First Column */ 36693660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 36703660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 36713660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 36723660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 36733660e330SKris Buschelman 36743660e330SKris Buschelman /* Second Column */ 36753660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 36763660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 36773660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 36783660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 36793660e330SKris Buschelman 36803660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 36813660e330SKris Buschelman 36823660e330SKris Buschelman /* Third Column */ 36833660e330SKris Buschelman 
SSE_COPY_PS(XMM2,XMM6) 36843660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 36853660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 36863660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 36873660e330SKris Buschelman 36883660e330SKris Buschelman /* Fourth Column */ 36893660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 36903660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 36913660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 36923660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 36933660e330SKris Buschelman SSE_INLINE_END_2 36943660e330SKris Buschelman v += 16; 36953660e330SKris Buschelman } 36963660e330SKris Buschelman v = aa + ai16; 36973660e330SKris Buschelman ai16 = 16*diag[--i]; 36983660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 36993660e330SKris Buschelman /* 37003660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 37013660e330SKris Buschelman which was inverted as part of the factorization 37023660e330SKris Buschelman */ 3703eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 37043660e330SKris Buschelman /* First Column */ 37053660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 37063660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 37073660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 37083660e330SKris Buschelman 37093660e330SKris Buschelman /* Second Column */ 37103660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 37113660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 37123660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 37133660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 37143660e330SKris Buschelman 37153660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 37163660e330SKris Buschelman 37173660e330SKris Buschelman /* Third Column */ 37183660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 37193660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 37203660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 37213660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 37223660e330SKris 
Buschelman 37233660e330SKris Buschelman /* Fourth Column */ 37243660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 37253660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 37263660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 37273660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 37283660e330SKris Buschelman 37293660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 37303660e330SKris Buschelman SSE_INLINE_END_3 37313660e330SKris Buschelman 37323660e330SKris Buschelman v = aa + ai16 + 16; 37333660e330SKris Buschelman idt -= 4; 37343660e330SKris Buschelman } 3735eb05f457SKris Buschelman 3736eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 3737eb05f457SKris Buschelman idt = 4*(n-1); 3738eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 3739eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3740eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3741eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 3742eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 3743eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 3744eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 3745eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 3746eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 374754693613SKris Buschelman idt -= 4; 37483660e330SKris Buschelman } 3749eb05f457SKris Buschelman 3750eb05f457SKris Buschelman } /* End of artificial scope. 
*/ 37511ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 37521ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3753dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 37543660e330SKris Buschelman SSE_SCOPE_END; 37553660e330SKris Buschelman PetscFunctionReturn(0); 37563660e330SKris Buschelman } 37573660e330SKris Buschelman 37587cf1b8d3SKris Buschelman #undef __FUNCT__ 37597cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3760dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 37617cf1b8d3SKris Buschelman { 37627cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 37637cf1b8d3SKris Buschelman int *aj=a->j; 3764dfbe8321SBarry Smith PetscErrorCode ierr; 3765dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 37667cf1b8d3SKris Buschelman MatScalar *aa=a->a; 37677cf1b8d3SKris Buschelman PetscScalar *x,*b; 37687cf1b8d3SKris Buschelman 37697cf1b8d3SKris Buschelman PetscFunctionBegin; 37707cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 37717cf1b8d3SKris Buschelman /* 37727cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 37737cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 37747cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
37757cf1b8d3SKris Buschelman */ 37767cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 37777cf1b8d3SKris Buschelman 37781ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 37791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 37807cf1b8d3SKris Buschelman { 37817cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 37827cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 37837cf1b8d3SKris Buschelman int nz,i,idt,ai16; 37847cf1b8d3SKris Buschelman int jdx,idx; 37857cf1b8d3SKris Buschelman int *vi; 37867cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 37877cf1b8d3SKris Buschelman 37887cf1b8d3SKris Buschelman /* First block is the identity. */ 37897cf1b8d3SKris Buschelman idx = 0; 37907cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 37917cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 37927cf1b8d3SKris Buschelman 37937cf1b8d3SKris Buschelman for (i=1; i<n;) { 37947cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 37957cf1b8d3SKris Buschelman vi = aj + ai[i]; 37967cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 37977cf1b8d3SKris Buschelman idx += 4; 37987cf1b8d3SKris Buschelman 37997cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 38007cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 38017cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 38027cf1b8d3SKris Buschelman 38037cf1b8d3SKris Buschelman while (nz--) { 38047cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 38057cf1b8d3SKris Buschelman jdx = 4*(*vi++); 38067cf1b8d3SKris Buschelman /* jdx = *vi++; */ 38077cf1b8d3SKris Buschelman 38087cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 38097cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 38107cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 38117cf1b8d3SKris Buschelman 38127cf1b8d3SKris Buschelman /* First Column */ 38137cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 38147cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 38157cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 38167cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 38177cf1b8d3SKris Buschelman 38187cf1b8d3SKris Buschelman /* Second Column */ 38197cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 38207cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 38217cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 38227cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 38237cf1b8d3SKris Buschelman 38247cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 38257cf1b8d3SKris Buschelman 38267cf1b8d3SKris Buschelman /* Third Column */ 38277cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 38287cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 38297cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 38307cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 38317cf1b8d3SKris Buschelman 38327cf1b8d3SKris Buschelman /* Fourth Column */ 38337cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 38347cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 38357cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 38367cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 38377cf1b8d3SKris Buschelman SSE_INLINE_END_2 38387cf1b8d3SKris Buschelman 38397cf1b8d3SKris 
Buschelman v += 16; 38407cf1b8d3SKris Buschelman } 38417cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 38427cf1b8d3SKris Buschelman PREFETCH_NTA(v); 38437cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 38447cf1b8d3SKris Buschelman } 38457cf1b8d3SKris Buschelman 38467cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 38477cf1b8d3SKris Buschelman 38487cf1b8d3SKris Buschelman idt = 4*(n-1); 38497cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 38507cf1b8d3SKris Buschelman v = aa + ai16 + 16; 38517cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 38527cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 38537cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 38547cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 38557cf1b8d3SKris Buschelman 38567cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 38577cf1b8d3SKris Buschelman 38587cf1b8d3SKris Buschelman while (nz--) { 38597cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 38607cf1b8d3SKris Buschelman idx = 4*(*vi++); 38617cf1b8d3SKris Buschelman /* idx = *vi++; */ 38627cf1b8d3SKris Buschelman 38637cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 38647cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 38657cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 38667cf1b8d3SKris Buschelman 38677cf1b8d3SKris Buschelman /* First Column */ 38687cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 38697cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 38707cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 38717cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 38727cf1b8d3SKris Buschelman 38737cf1b8d3SKris Buschelman /* Second Column */ 38747cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 38757cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 38767cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 38777cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 38787cf1b8d3SKris Buschelman 38797cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 38807cf1b8d3SKris Buschelman 
38817cf1b8d3SKris Buschelman /* Third Column */ 38827cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 38837cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 38847cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 38857cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 38867cf1b8d3SKris Buschelman 38877cf1b8d3SKris Buschelman /* Fourth Column */ 38887cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 38897cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 38907cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 38917cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 38927cf1b8d3SKris Buschelman SSE_INLINE_END_2 38937cf1b8d3SKris Buschelman v += 16; 38947cf1b8d3SKris Buschelman } 38957cf1b8d3SKris Buschelman v = aa + ai16; 38967cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 38977cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 38987cf1b8d3SKris Buschelman /* 38997cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 39007cf1b8d3SKris Buschelman which was inverted as part of the factorization 39017cf1b8d3SKris Buschelman */ 39027cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 39037cf1b8d3SKris Buschelman /* First Column */ 39047cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 39057cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 39067cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 39077cf1b8d3SKris Buschelman 39087cf1b8d3SKris Buschelman /* Second Column */ 39097cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 39107cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 39117cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 39127cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 39137cf1b8d3SKris Buschelman 39147cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 39157cf1b8d3SKris Buschelman 39167cf1b8d3SKris Buschelman /* Third Column */ 39177cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 39187cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 39197cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 39207cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 39217cf1b8d3SKris Buschelman 39227cf1b8d3SKris Buschelman /* Fourth Column */ 39237cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 39247cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 39257cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 39267cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 39277cf1b8d3SKris Buschelman 39287cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 39297cf1b8d3SKris Buschelman SSE_INLINE_END_3 39307cf1b8d3SKris Buschelman 39317cf1b8d3SKris Buschelman v = aa + ai16 + 16; 39327cf1b8d3SKris Buschelman idt -= 4; 39337cf1b8d3SKris Buschelman } 39347cf1b8d3SKris Buschelman 39357cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 39367cf1b8d3SKris Buschelman idt = 4*(n-1); 39377cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 39387cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 39397cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 39407cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 39417cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 39427cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 39437cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 39447cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 39457cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 39467cf1b8d3SKris Buschelman idt -= 4; 39477cf1b8d3SKris Buschelman } 39487cf1b8d3SKris Buschelman 39497cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 39501ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 39511ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3952dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 39537cf1b8d3SKris Buschelman SSE_SCOPE_END; 39547cf1b8d3SKris Buschelman PetscFunctionReturn(0); 39557cf1b8d3SKris Buschelman } 39567cf1b8d3SKris Buschelman 39573660e330SKris Buschelman #endif 39588f690400SShri Abhyankar 39594a2ae208SSatish Balay #undef __FUNCT__ 39604a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3961dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 39624e2b4712SSatish Balay { 39634e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 39644e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 39656849ba73SBarry Smith PetscErrorCode ierr; 39665d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 39675d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3968d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3969d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3970d9fead3dSBarry Smith const PetscScalar *b; 39714e2b4712SSatish Balay 39724e2b4712SSatish Balay PetscFunctionBegin; 3973d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39741ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3975f1af5d2fSBarry Smith t = a->solve_work; 39764e2b4712SSatish Balay 39774e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 39784e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 39794e2b4712SSatish Balay 39804e2b4712SSatish Balay /* forward solve the lower triangular */ 39814e2b4712SSatish Balay idx = 3*(*r++); 3982f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 39834e2b4712SSatish Balay for (i=1; i<n; i++) { 39844e2b4712SSatish Balay v = aa + 9*ai[i]; 39854e2b4712SSatish Balay vi = aj + ai[i]; 39864e2b4712SSatish Balay nz = 
diag[i] - ai[i]; 39874e2b4712SSatish Balay idx = 3*(*r++); 3988f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 39894e2b4712SSatish Balay while (nz--) { 39904e2b4712SSatish Balay idx = 3*(*vi++); 3991f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3992f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3993f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3994f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 39954e2b4712SSatish Balay v += 9; 39964e2b4712SSatish Balay } 39974e2b4712SSatish Balay idx = 3*i; 3998f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 39994e2b4712SSatish Balay } 40004e2b4712SSatish Balay /* backward solve the upper triangular */ 40014e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 40024e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 40034e2b4712SSatish Balay vi = aj + diag[i] + 1; 40044e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 40054e2b4712SSatish Balay idt = 3*i; 4006f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 40074e2b4712SSatish Balay while (nz--) { 40084e2b4712SSatish Balay idx = 3*(*vi++); 4009f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4010f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4011f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4012f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 40134e2b4712SSatish Balay v += 9; 40144e2b4712SSatish Balay } 40154e2b4712SSatish Balay idc = 3*(*c--); 40164e2b4712SSatish Balay v = aa + 9*diag[i]; 4017f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4018f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4019f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 40204e2b4712SSatish Balay } 40214e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 40224e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4023d9fead3dSBarry Smith ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 40241ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4025dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 40264e2b4712SSatish Balay PetscFunctionReturn(0); 40274e2b4712SSatish Balay } 40284e2b4712SSatish Balay 40298f690400SShri Abhyankar #undef __FUNCT__ 40308f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 40318f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 40328f690400SShri Abhyankar { 40338f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 40348f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 40358f690400SShri Abhyankar PetscErrorCode ierr; 403629b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 40378f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 40388f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 40398f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 40408f690400SShri Abhyankar const PetscScalar *b; 40418f690400SShri Abhyankar 40428f690400SShri Abhyankar PetscFunctionBegin; 40438f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 40448f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 40458f690400SShri Abhyankar t = a->solve_work; 40468f690400SShri Abhyankar 40478f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 404829b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 40498f690400SShri Abhyankar 40508f690400SShri Abhyankar /* forward solve the lower triangular */ 405129b92fc1SShri Abhyankar idx = 3*r[0]; 40528f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 40538f690400SShri Abhyankar for (i=1; i<n; i++) { 40548f690400SShri Abhyankar v = aa + 9*ai[i]; 40558f690400SShri Abhyankar vi = aj + ai[i]; 40568f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 405729b92fc1SShri Abhyankar idx = 3*r[i]; 
40588f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 405929b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 406029b92fc1SShri Abhyankar idx = 3*vi[m]; 40618f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 40628f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 40638f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 40648f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 40658f690400SShri Abhyankar v += 9; 40668f690400SShri Abhyankar } 40678f690400SShri Abhyankar idx = 3*i; 40688f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 40698f690400SShri Abhyankar } 40708f690400SShri Abhyankar /* backward solve the upper triangular */ 40718f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 40728f690400SShri Abhyankar k = 2*n-i; 40738f690400SShri Abhyankar v = aa + 9*ai[k]; 40748f690400SShri Abhyankar vi = aj + ai[k]; 40758f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 40768f690400SShri Abhyankar idt = 3*i; 40778f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 407829b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 407929b92fc1SShri Abhyankar idx = 3*vi[m]; 40808f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 40818f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 40828f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 40838f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 40848f690400SShri Abhyankar v += 9; 40858f690400SShri Abhyankar } 408629b92fc1SShri Abhyankar idc = 3*c[i]; 40878f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 40888f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 40898f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 40908f690400SShri Abhyankar } 40918f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 40928f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 40938f690400SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 40948f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 40958f690400SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 40968f690400SShri Abhyankar PetscFunctionReturn(0); 40978f690400SShri Abhyankar } 40988f690400SShri Abhyankar 40990c4413a7SShri Abhyankar #undef __FUNCT__ 41000c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2" 41010c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx) 41020c4413a7SShri Abhyankar { 41030c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 41040c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 41050c4413a7SShri Abhyankar PetscErrorCode ierr; 41060c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 41070c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 41080c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 41090c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 41100c4413a7SShri Abhyankar const PetscScalar *b; 41110c4413a7SShri Abhyankar 41120c4413a7SShri Abhyankar PetscFunctionBegin; 41130c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41140c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 41150c4413a7SShri Abhyankar t = a->solve_work; 41160c4413a7SShri Abhyankar 41170c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 41180c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 41190c4413a7SShri Abhyankar 41200c4413a7SShri Abhyankar /* forward solve the lower triangular */ 41210c4413a7SShri Abhyankar idx = 3*r[0]; 41220c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 41230c4413a7SShri Abhyankar for (i=1; i<n; i++) { 41240c4413a7SShri Abhyankar v = aa + 9*ai[i]; 41250c4413a7SShri Abhyankar vi = aj + ai[i]; 41260c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 
41270c4413a7SShri Abhyankar idx = 3*r[i]; 41280c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 41290c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 41300c4413a7SShri Abhyankar idx = 3*vi[m]; 41310c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 41320c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 41330c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 41340c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 41350c4413a7SShri Abhyankar v += 9; 41360c4413a7SShri Abhyankar } 41370c4413a7SShri Abhyankar idx = 3*i; 41380c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 41390c4413a7SShri Abhyankar } 41400c4413a7SShri Abhyankar /* backward solve the upper triangular */ 41410c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 41420c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 41430c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 41440c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 41450c4413a7SShri Abhyankar idt = 3*i; 41460c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 41470c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 41480c4413a7SShri Abhyankar idx = 3*vi[m]; 41490c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 41500c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 41510c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 41520c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 41530c4413a7SShri Abhyankar v += 9; 41540c4413a7SShri Abhyankar } 41550c4413a7SShri Abhyankar idc = 3*c[i]; 41560c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 41570c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 41580c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 41590c4413a7SShri Abhyankar } 41600c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 41610c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 41620c4413a7SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41630c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 41640c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 41650c4413a7SShri Abhyankar PetscFunctionReturn(0); 41660c4413a7SShri Abhyankar } 41670c4413a7SShri Abhyankar 416815091d37SBarry Smith /* 416915091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 417015091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 417115091d37SBarry Smith */ 41724a2ae208SSatish Balay #undef __FUNCT__ 41734a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4174dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 417515091d37SBarry Smith { 417615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4177690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4178dfbe8321SBarry Smith PetscErrorCode ierr; 4179690b6cddSBarry Smith PetscInt *diag = a->diag; 4180d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4181d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4182d9fead3dSBarry Smith const PetscScalar *b; 4183690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 418415091d37SBarry Smith 418515091d37SBarry Smith PetscFunctionBegin; 4186d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41871ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 418815091d37SBarry Smith 418915091d37SBarry Smith /* forward solve the lower triangular */ 419015091d37SBarry Smith idx = 0; 419115091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 419215091d37SBarry Smith for (i=1; i<n; i++) { 419315091d37SBarry Smith v = aa + 9*ai[i]; 419415091d37SBarry Smith vi = aj + ai[i]; 419515091d37SBarry Smith nz = diag[i] - ai[i]; 419615091d37SBarry Smith idx += 3; 4197f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 419815091d37SBarry Smith while (nz--) { 
419915091d37SBarry Smith jdx = 3*(*vi++); 420015091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4201f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4202f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4203f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 420415091d37SBarry Smith v += 9; 420515091d37SBarry Smith } 4206f1af5d2fSBarry Smith x[idx] = s1; 4207f1af5d2fSBarry Smith x[1+idx] = s2; 4208f1af5d2fSBarry Smith x[2+idx] = s3; 420915091d37SBarry Smith } 421015091d37SBarry Smith /* backward solve the upper triangular */ 421115091d37SBarry Smith for (i=n-1; i>=0; i--){ 421215091d37SBarry Smith v = aa + 9*diag[i] + 9; 421315091d37SBarry Smith vi = aj + diag[i] + 1; 421415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 421515091d37SBarry Smith idt = 3*i; 4216f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4217f1af5d2fSBarry Smith s3 = x[2+idt]; 421815091d37SBarry Smith while (nz--) { 421915091d37SBarry Smith idx = 3*(*vi++); 422015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4221f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4222f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4223f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 422415091d37SBarry Smith v += 9; 422515091d37SBarry Smith } 422615091d37SBarry Smith v = aa + 9*diag[i]; 4227f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4228f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4229f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 423015091d37SBarry Smith } 423115091d37SBarry Smith 4232d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4234dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 423515091d37SBarry Smith PetscFunctionReturn(0); 423615091d37SBarry Smith } 423715091d37SBarry Smith 42384a2ae208SSatish Balay #undef __FUNCT__ 4239cee9d6f2SShri Abhyankar #define 
__FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4240cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4241cee9d6f2SShri Abhyankar { 4242cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4243ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4244cee9d6f2SShri Abhyankar PetscErrorCode ierr; 4245cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 4246cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4247cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 4248cee9d6f2SShri Abhyankar PetscScalar *x; 4249cee9d6f2SShri Abhyankar const PetscScalar *b; 4250cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4251cee9d6f2SShri Abhyankar 4252cee9d6f2SShri Abhyankar PetscFunctionBegin; 4253cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4254cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4255cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 4256cee9d6f2SShri Abhyankar idx = 0; 4257cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4258cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 4259cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 4260cee9d6f2SShri Abhyankar vi = aj + ai[i]; 4261cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 4262cee9d6f2SShri Abhyankar idx = bs*i; 4263cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4264ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4265ce3d78c0SShri Abhyankar jdx = bs*vi[k]; 4266cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4267cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4268cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4269cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4270cee9d6f2SShri Abhyankar 4271cee9d6f2SShri Abhyankar v += bs2; 4272cee9d6f2SShri Abhyankar } 4273cee9d6f2SShri Abhyankar 4274cee9d6f2SShri Abhyankar x[idx] = s1; 4275cee9d6f2SShri Abhyankar x[1+idx] = s2; 
4276cee9d6f2SShri Abhyankar x[2+idx] = s3; 4277cee9d6f2SShri Abhyankar } 4278cee9d6f2SShri Abhyankar 4279cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 4280cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 4281cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 4282cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 4283cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 4284cee9d6f2SShri Abhyankar idt = bs*i; 4285cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4286cee9d6f2SShri Abhyankar 4287ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4288ce3d78c0SShri Abhyankar idx = bs*vi[k]; 4289cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4290cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4291cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4292cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4293cee9d6f2SShri Abhyankar 4294cee9d6f2SShri Abhyankar v += bs2; 4295cee9d6f2SShri Abhyankar } 4296cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 4297cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4298cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4299cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4300cee9d6f2SShri Abhyankar 4301cee9d6f2SShri Abhyankar } 4302cee9d6f2SShri Abhyankar 4303cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4304cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4305cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4306cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 4307cee9d6f2SShri Abhyankar } 4308cee9d6f2SShri Abhyankar 4309cee9d6f2SShri Abhyankar #undef __FUNCT__ 4310b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2" 4311b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4312b2b2dd24SShri Abhyankar { 4313b2b2dd24SShri Abhyankar 
Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4314b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4315b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4316b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 4317b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4318b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4319b2b2dd24SShri Abhyankar PetscScalar *x; 4320b2b2dd24SShri Abhyankar const PetscScalar *b; 4321b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4322b2b2dd24SShri Abhyankar 4323b2b2dd24SShri Abhyankar PetscFunctionBegin; 4324b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4325b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4326b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4327b2b2dd24SShri Abhyankar idx = 0; 4328b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4329b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4330b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4331b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4332b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4333b2b2dd24SShri Abhyankar idx = bs*i; 4334b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4335b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4336b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4337b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4338b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4339b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4340b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4341b2b2dd24SShri Abhyankar 4342b2b2dd24SShri Abhyankar v += bs2; 4343b2b2dd24SShri Abhyankar } 4344b2b2dd24SShri Abhyankar 4345b2b2dd24SShri Abhyankar x[idx] = s1; 4346b2b2dd24SShri Abhyankar x[1+idx] = s2; 4347b2b2dd24SShri Abhyankar x[2+idx] = s3; 4348b2b2dd24SShri Abhyankar } 4349b2b2dd24SShri Abhyankar 4350b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4351b2b2dd24SShri Abhyankar for (i=n-1; i>=0; 
i--){ 4352b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4353b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4354b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4355b2b2dd24SShri Abhyankar idt = bs*i; 4356b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4357b2b2dd24SShri Abhyankar 4358b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4359b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4360b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4361b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4362b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4363b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4364b2b2dd24SShri Abhyankar 4365b2b2dd24SShri Abhyankar v += bs2; 4366b2b2dd24SShri Abhyankar } 4367b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4368b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4369b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4370b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4371b2b2dd24SShri Abhyankar 4372b2b2dd24SShri Abhyankar } 4373b2b2dd24SShri Abhyankar 4374b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4375b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4376b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4377b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4378b2b2dd24SShri Abhyankar } 4379b2b2dd24SShri Abhyankar 4380b2b2dd24SShri Abhyankar #undef __FUNCT__ 43814a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4382dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 43834e2b4712SSatish Balay { 43844e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 43854e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 43866849ba73SBarry Smith PetscErrorCode ierr; 43875d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 43885d0c19d7SBarry Smith const PetscInt *r,*c,*diag = 
a->diag,*rout,*cout; 4389d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4390d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 4391d9fead3dSBarry Smith const PetscScalar *b; 43924e2b4712SSatish Balay 43934e2b4712SSatish Balay PetscFunctionBegin; 4394d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43951ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4396f1af5d2fSBarry Smith t = a->solve_work; 43974e2b4712SSatish Balay 43984e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 43994e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 44004e2b4712SSatish Balay 44014e2b4712SSatish Balay /* forward solve the lower triangular */ 44024e2b4712SSatish Balay idx = 2*(*r++); 4403f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 44044e2b4712SSatish Balay for (i=1; i<n; i++) { 44054e2b4712SSatish Balay v = aa + 4*ai[i]; 44064e2b4712SSatish Balay vi = aj + ai[i]; 44074e2b4712SSatish Balay nz = diag[i] - ai[i]; 44084e2b4712SSatish Balay idx = 2*(*r++); 4409f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 44104e2b4712SSatish Balay while (nz--) { 44114e2b4712SSatish Balay idx = 2*(*vi++); 4412f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4413f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4414f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 44154e2b4712SSatish Balay v += 4; 44164e2b4712SSatish Balay } 44174e2b4712SSatish Balay idx = 2*i; 4418f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 44194e2b4712SSatish Balay } 44204e2b4712SSatish Balay /* backward solve the upper triangular */ 44214e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 44224e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 44234e2b4712SSatish Balay vi = aj + diag[i] + 1; 44244e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 44254e2b4712SSatish Balay idt = 2*i; 4426f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 44274e2b4712SSatish Balay while (nz--) { 44284e2b4712SSatish Balay idx = 2*(*vi++); 4429f1af5d2fSBarry Smith 
x1 = t[idx]; x2 = t[1+idx]; 4430f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4431f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 44324e2b4712SSatish Balay v += 4; 44334e2b4712SSatish Balay } 44344e2b4712SSatish Balay idc = 2*(*c--); 44354e2b4712SSatish Balay v = aa + 4*diag[i]; 4436f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4437f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 44384e2b4712SSatish Balay } 44394e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 44404e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4441d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 44421ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4443dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 44444e2b4712SSatish Balay PetscFunctionReturn(0); 44454e2b4712SSatish Balay } 44464e2b4712SSatish Balay 44478f690400SShri Abhyankar #undef __FUNCT__ 44488f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 44498f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 44508f690400SShri Abhyankar { 44518f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 44528f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 44538f690400SShri Abhyankar PetscErrorCode ierr; 445429b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 44558f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 44568f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 44578f690400SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 44588f690400SShri Abhyankar const PetscScalar *b; 44598f690400SShri Abhyankar 44608f690400SShri Abhyankar PetscFunctionBegin; 44618f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 44628f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 44638f690400SShri Abhyankar t = a->solve_work; 44648f690400SShri Abhyankar 
44658f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 446629b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 44678f690400SShri Abhyankar 44688f690400SShri Abhyankar /* forward solve the lower triangular */ 446929b92fc1SShri Abhyankar idx = 2*r[0]; 44708f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 44718f690400SShri Abhyankar for (i=1; i<n; i++) { 44728f690400SShri Abhyankar v = aa + 4*ai[i]; 44738f690400SShri Abhyankar vi = aj + ai[i]; 44748f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 447529b92fc1SShri Abhyankar idx = 2*r[i]; 44768f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 447729b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 447829b92fc1SShri Abhyankar jdx = 2*vi[m]; 44798f690400SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 44808f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 44818f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 44828f690400SShri Abhyankar v += 4; 44838f690400SShri Abhyankar } 44848f690400SShri Abhyankar idx = 2*i; 44858f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 44868f690400SShri Abhyankar } 44878f690400SShri Abhyankar /* backward solve the upper triangular */ 44888f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 44898f690400SShri Abhyankar k = 2*n-i; 44908f690400SShri Abhyankar v = aa + 4*ai[k]; 44918f690400SShri Abhyankar vi = aj + ai[k]; 44928f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 44938f690400SShri Abhyankar idt = 2*i; 44948f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 449529b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 449629b92fc1SShri Abhyankar idx = 2*vi[m]; 44978f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 44988f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 44998f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 45008f690400SShri Abhyankar v += 4; 45018f690400SShri Abhyankar } 450229b92fc1SShri Abhyankar idc = 2*c[i]; 45038f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 45048f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 
45058f690400SShri Abhyankar } 45068f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 45078f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 45088f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45098f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 45108f690400SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 45118f690400SShri Abhyankar PetscFunctionReturn(0); 45128f690400SShri Abhyankar } 45138f690400SShri Abhyankar 45140c4413a7SShri Abhyankar #undef __FUNCT__ 45150c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2" 45160c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx) 45170c4413a7SShri Abhyankar { 45180c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 45190c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 45200c4413a7SShri Abhyankar PetscErrorCode ierr; 45210c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 45220c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 45230c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 45240c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 45250c4413a7SShri Abhyankar const PetscScalar *b; 45260c4413a7SShri Abhyankar 45270c4413a7SShri Abhyankar PetscFunctionBegin; 45280c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45290c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 45300c4413a7SShri Abhyankar t = a->solve_work; 45310c4413a7SShri Abhyankar 45320c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 45330c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 45340c4413a7SShri Abhyankar 45350c4413a7SShri Abhyankar /* forward solve the lower triangular */ 45360c4413a7SShri Abhyankar idx = 2*r[0]; 45370c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 
45380c4413a7SShri Abhyankar for (i=1; i<n; i++) { 45390c4413a7SShri Abhyankar v = aa + 4*ai[i]; 45400c4413a7SShri Abhyankar vi = aj + ai[i]; 45410c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 45420c4413a7SShri Abhyankar idx = 2*r[i]; 45430c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 45440c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 45450c4413a7SShri Abhyankar jdx = 2*vi[m]; 45460c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 45470c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 45480c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 45490c4413a7SShri Abhyankar v += 4; 45500c4413a7SShri Abhyankar } 45510c4413a7SShri Abhyankar idx = 2*i; 45520c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 45530c4413a7SShri Abhyankar } 45540c4413a7SShri Abhyankar /* backward solve the upper triangular */ 45550c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 45560c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 45570c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 45580c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 45590c4413a7SShri Abhyankar idt = 2*i; 45600c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 45610c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 45620c4413a7SShri Abhyankar idx = 2*vi[m]; 45630c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 45640c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 45650c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 45660c4413a7SShri Abhyankar v += 4; 45670c4413a7SShri Abhyankar } 45680c4413a7SShri Abhyankar idc = 2*c[i]; 45690c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 45700c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 45710c4413a7SShri Abhyankar } 45720c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 45730c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 45740c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45750c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 45760c4413a7SShri Abhyankar ierr = 
PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 45770c4413a7SShri Abhyankar PetscFunctionReturn(0); 45780c4413a7SShri Abhyankar } 45798f690400SShri Abhyankar 458015091d37SBarry Smith /* 458115091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 458215091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 458315091d37SBarry Smith */ 45844a2ae208SSatish Balay #undef __FUNCT__ 45854a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4586dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 458715091d37SBarry Smith { 458815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4589690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4590dfbe8321SBarry Smith PetscErrorCode ierr; 4591690b6cddSBarry Smith PetscInt *diag = a->diag; 4592d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4593d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 4594d9fead3dSBarry Smith const PetscScalar *b; 4595690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 459615091d37SBarry Smith 459715091d37SBarry Smith PetscFunctionBegin; 4598d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45991ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 460015091d37SBarry Smith 460115091d37SBarry Smith /* forward solve the lower triangular */ 460215091d37SBarry Smith idx = 0; 460315091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 460415091d37SBarry Smith for (i=1; i<n; i++) { 460515091d37SBarry Smith v = aa + 4*ai[i]; 460615091d37SBarry Smith vi = aj + ai[i]; 460715091d37SBarry Smith nz = diag[i] - ai[i]; 460815091d37SBarry Smith idx += 2; 4609f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 461015091d37SBarry Smith while (nz--) { 461115091d37SBarry Smith jdx = 2*(*vi++); 461215091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 4613f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4614f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 
461515091d37SBarry Smith v += 4; 461615091d37SBarry Smith } 4617f1af5d2fSBarry Smith x[idx] = s1; 4618f1af5d2fSBarry Smith x[1+idx] = s2; 461915091d37SBarry Smith } 462015091d37SBarry Smith /* backward solve the upper triangular */ 462115091d37SBarry Smith for (i=n-1; i>=0; i--){ 462215091d37SBarry Smith v = aa + 4*diag[i] + 4; 462315091d37SBarry Smith vi = aj + diag[i] + 1; 462415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 462515091d37SBarry Smith idt = 2*i; 4626f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 462715091d37SBarry Smith while (nz--) { 462815091d37SBarry Smith idx = 2*(*vi++); 462915091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 4630f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4631f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 463215091d37SBarry Smith v += 4; 463315091d37SBarry Smith } 463415091d37SBarry Smith v = aa + 4*diag[i]; 4635f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 4636f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 463715091d37SBarry Smith } 463815091d37SBarry Smith 4639d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46401ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4641dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 464215091d37SBarry Smith PetscFunctionReturn(0); 464315091d37SBarry Smith } 464415091d37SBarry Smith 46454a2ae208SSatish Balay #undef __FUNCT__ 4646cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4647cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4648cee9d6f2SShri Abhyankar { 4649cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4650ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4651cee9d6f2SShri Abhyankar PetscErrorCode ierr; 4652cee9d6f2SShri Abhyankar PetscInt jdx; 4653cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 4654cee9d6f2SShri Abhyankar PetscScalar 
*x,s1,s2,x1,x2; 4655cee9d6f2SShri Abhyankar const PetscScalar *b; 4656cee9d6f2SShri Abhyankar 4657cee9d6f2SShri Abhyankar PetscFunctionBegin; 4658cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4659cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4660cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 4661cee9d6f2SShri Abhyankar idx = 0; 4662cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4663cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 4664cee9d6f2SShri Abhyankar v = aa + 4*ai[i]; 4665cee9d6f2SShri Abhyankar vi = aj + ai[i]; 4666cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 4667cee9d6f2SShri Abhyankar idx = 2*i; 4668cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4669ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4670ce3d78c0SShri Abhyankar jdx = 2*vi[k]; 4671cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4672cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4673cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4674cee9d6f2SShri Abhyankar v += 4; 4675cee9d6f2SShri Abhyankar } 4676cee9d6f2SShri Abhyankar x[idx] = s1; 4677cee9d6f2SShri Abhyankar x[1+idx] = s2; 4678cee9d6f2SShri Abhyankar } 4679cee9d6f2SShri Abhyankar 4680cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 4681cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 4682cee9d6f2SShri Abhyankar v = aa + 4*ai[2*n-i]; 4683cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 4684cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 4685cee9d6f2SShri Abhyankar idt = 2*i; 4686cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4687ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4688ce3d78c0SShri Abhyankar idx = 2*vi[k]; 4689cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4690cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4691cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4692cee9d6f2SShri Abhyankar v += 4; 4693cee9d6f2SShri Abhyankar } 4694cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 4695cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + 
v[2]*s2; 4696cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4697cee9d6f2SShri Abhyankar } 4698cee9d6f2SShri Abhyankar 4699cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4700cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4701cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4702cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 4703cee9d6f2SShri Abhyankar } 4704cee9d6f2SShri Abhyankar 4705cee9d6f2SShri Abhyankar #undef __FUNCT__ 4706b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2" 4707b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4708b2b2dd24SShri Abhyankar { 4709b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4710b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4711b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4712b2b2dd24SShri Abhyankar PetscInt jdx; 4713b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4714b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 4715b2b2dd24SShri Abhyankar const PetscScalar *b; 4716b2b2dd24SShri Abhyankar 4717b2b2dd24SShri Abhyankar PetscFunctionBegin; 4718b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4719b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4720b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4721b2b2dd24SShri Abhyankar idx = 0; 4722b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4723b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4724b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 4725b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4726b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4727b2b2dd24SShri Abhyankar idx = 2*i; 4728b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4729b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4730b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 4731b2b2dd24SShri 
Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4732b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4733b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4734b2b2dd24SShri Abhyankar v += 4; 4735b2b2dd24SShri Abhyankar } 4736b2b2dd24SShri Abhyankar x[idx] = s1; 4737b2b2dd24SShri Abhyankar x[1+idx] = s2; 4738b2b2dd24SShri Abhyankar } 4739b2b2dd24SShri Abhyankar 4740b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4741b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4742b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 4743b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4744b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4745b2b2dd24SShri Abhyankar idt = 2*i; 4746b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4747b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4748b2b2dd24SShri Abhyankar idx = 2*vi[k]; 4749b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4750b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4751b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4752b2b2dd24SShri Abhyankar v += 4; 4753b2b2dd24SShri Abhyankar } 4754b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4755b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4756b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4757b2b2dd24SShri Abhyankar } 4758b2b2dd24SShri Abhyankar 4759b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4760b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4761b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4762b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4763b2b2dd24SShri Abhyankar } 4764b2b2dd24SShri Abhyankar 4765b2b2dd24SShri Abhyankar #undef __FUNCT__ 47664a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4767dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 47684e2b4712SSatish Balay { 47694e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 47704e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 
47716849ba73SBarry Smith PetscErrorCode ierr; 47725d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 47735d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 47743f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 477587828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 47764e2b4712SSatish Balay 47774e2b4712SSatish Balay PetscFunctionBegin; 47784e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 47794e2b4712SSatish Balay 47801ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 47811ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4782f1af5d2fSBarry Smith t = a->solve_work; 47834e2b4712SSatish Balay 47844e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 47854e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 47864e2b4712SSatish Balay 47874e2b4712SSatish Balay /* forward solve the lower triangular */ 4788f1af5d2fSBarry Smith t[0] = b[*r++]; 47894e2b4712SSatish Balay for (i=1; i<n; i++) { 47904e2b4712SSatish Balay v = aa + ai[i]; 47914e2b4712SSatish Balay vi = aj + ai[i]; 47924e2b4712SSatish Balay nz = diag[i] - ai[i]; 4793f1af5d2fSBarry Smith s1 = b[*r++]; 47944e2b4712SSatish Balay while (nz--) { 4795f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 47964e2b4712SSatish Balay } 4797f1af5d2fSBarry Smith t[i] = s1; 47984e2b4712SSatish Balay } 47994e2b4712SSatish Balay /* backward solve the upper triangular */ 48004e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 48014e2b4712SSatish Balay v = aa + diag[i] + 1; 48024e2b4712SSatish Balay vi = aj + diag[i] + 1; 48034e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4804f1af5d2fSBarry Smith s1 = t[i]; 48054e2b4712SSatish Balay while (nz--) { 4806f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 48074e2b4712SSatish Balay } 4808f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 48094e2b4712SSatish Balay } 48104e2b4712SSatish Balay 48114e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 48124e2b4712SSatish Balay ierr 
= ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 48131ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 48141ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4815dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 48164e2b4712SSatish Balay PetscFunctionReturn(0); 48174e2b4712SSatish Balay } 481815091d37SBarry Smith /* 481915091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 482015091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 482115091d37SBarry Smith */ 48224a2ae208SSatish Balay #undef __FUNCT__ 48234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4824dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 482515091d37SBarry Smith { 482615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4827690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4828dfbe8321SBarry Smith PetscErrorCode ierr; 4829690b6cddSBarry Smith PetscInt *diag = a->diag; 483015091d37SBarry Smith MatScalar *aa=a->a; 483187828ca2SBarry Smith PetscScalar *x,*b; 483287828ca2SBarry Smith PetscScalar s1,x1; 483315091d37SBarry Smith MatScalar *v; 4834690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 483515091d37SBarry Smith 483615091d37SBarry Smith PetscFunctionBegin; 48371ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 48381ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 483915091d37SBarry Smith 484015091d37SBarry Smith /* forward solve the lower triangular */ 484115091d37SBarry Smith idx = 0; 484215091d37SBarry Smith x[0] = b[0]; 484315091d37SBarry Smith for (i=1; i<n; i++) { 484415091d37SBarry Smith v = aa + ai[i]; 484515091d37SBarry Smith vi = aj + ai[i]; 484615091d37SBarry Smith nz = diag[i] - ai[i]; 484715091d37SBarry Smith idx += 1; 4848f1af5d2fSBarry Smith s1 = b[idx]; 484915091d37SBarry Smith while (nz--) { 485015091d37SBarry Smith jdx = *vi++; 
485115091d37SBarry Smith x1 = x[jdx]; 4852f1af5d2fSBarry Smith s1 -= v[0]*x1; 485315091d37SBarry Smith v += 1; 485415091d37SBarry Smith } 4855f1af5d2fSBarry Smith x[idx] = s1; 485615091d37SBarry Smith } 485715091d37SBarry Smith /* backward solve the upper triangular */ 485815091d37SBarry Smith for (i=n-1; i>=0; i--){ 485915091d37SBarry Smith v = aa + diag[i] + 1; 486015091d37SBarry Smith vi = aj + diag[i] + 1; 486115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 486215091d37SBarry Smith idt = i; 4863f1af5d2fSBarry Smith s1 = x[idt]; 486415091d37SBarry Smith while (nz--) { 486515091d37SBarry Smith idx = *vi++; 486615091d37SBarry Smith x1 = x[idx]; 4867f1af5d2fSBarry Smith s1 -= v[0]*x1; 486815091d37SBarry Smith v += 1; 486915091d37SBarry Smith } 487015091d37SBarry Smith v = aa + diag[i]; 4871f1af5d2fSBarry Smith x[idt] = v[0]*s1; 487215091d37SBarry Smith } 48731ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 48741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4875dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 487615091d37SBarry Smith PetscFunctionReturn(0); 487715091d37SBarry Smith } 48784e2b4712SSatish Balay 48794e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 488016a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 48816bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 48826bce7ff8SHong Zhang 48836bce7ff8SHong Zhang #undef __FUNCT__ 48846bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 48856bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 48866bce7ff8SHong Zhang { 48876bce7ff8SHong Zhang Mat C=B; 48886bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 48896bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 
48906bce7ff8SHong Zhang PetscErrorCode ierr; 48916bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 48926bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 48936bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4894b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4895914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4896914a18a2SHong Zhang MatScalar *v_work; 48976bce7ff8SHong Zhang 48986bce7ff8SHong Zhang PetscFunctionBegin; 48996bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 49006bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4901914a18a2SHong Zhang ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4902914a18a2SHong Zhang ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 49036bce7ff8SHong Zhang ics = ic; 49046bce7ff8SHong Zhang 4905914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 4906914a18a2SHong Zhang ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 4907b588c5a2SHong Zhang mwork = v_work + bs; 4908b588c5a2SHong Zhang v_pivots = (PetscInt*)(mwork + bs2); 4909914a18a2SHong Zhang 49106bce7ff8SHong Zhang for (i=0; i<n; i++){ 49116bce7ff8SHong Zhang /* zero rtmp */ 49126bce7ff8SHong Zhang /* L part */ 49136bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 49146bce7ff8SHong Zhang bjtmp = bj + bi[i]; 4915914a18a2SHong Zhang for (j=0; j<nz; j++){ 4916914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4917914a18a2SHong Zhang } 49186bce7ff8SHong Zhang 49196bce7ff8SHong Zhang /* U part */ 49206bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i]; 49216bce7ff8SHong Zhang bjtmp = bj + bi[2*n-i]; 4922914a18a2SHong Zhang for (j=0; j<nz; j++){ 4923914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4924914a18a2SHong Zhang } 49256bce7ff8SHong Zhang 49266bce7ff8SHong Zhang /* 
load in initial (unfactored row) */ 49276bce7ff8SHong Zhang nz = ai[r[i]+1] - ai[r[i]]; 49286bce7ff8SHong Zhang ajtmp = aj + ai[r[i]]; 4929914a18a2SHong Zhang v = aa + bs2*ai[r[i]]; 49306bce7ff8SHong Zhang for (j=0; j<nz; j++) { 4931914a18a2SHong Zhang ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 49326bce7ff8SHong Zhang } 49336bce7ff8SHong Zhang 49346bce7ff8SHong Zhang /* elimination */ 49356bce7ff8SHong Zhang bjtmp = bj + bi[i]; 49366bce7ff8SHong Zhang nzL = bi[i+1] - bi[i]; 4937b1646270SShri Abhyankar for(k=0;k < nzL;k++) { 4938b1646270SShri Abhyankar row = bjtmp[k]; 4939914a18a2SHong Zhang pc = rtmp + bs2*row; 4940914a18a2SHong Zhang for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 4941914a18a2SHong Zhang if (flg) { 4942914a18a2SHong Zhang pv = b->a + bs2*bdiag[row]; 4943b588c5a2SHong Zhang Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 49446bce7ff8SHong Zhang pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 4945914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-row]; 49466bce7ff8SHong Zhang nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 4947914a18a2SHong Zhang for (j=0; j<nz; j++) { 4948914a18a2SHong Zhang Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 4949914a18a2SHong Zhang } 4950b588c5a2SHong Zhang ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 49516bce7ff8SHong Zhang } 49526bce7ff8SHong Zhang } 49536bce7ff8SHong Zhang 49546bce7ff8SHong Zhang /* finished row so stick it into b->a */ 49556bce7ff8SHong Zhang /* L part */ 4956914a18a2SHong Zhang pv = b->a + bs2*bi[i] ; 49576bce7ff8SHong Zhang pj = b->j + bi[i] ; 49586bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 49596bce7ff8SHong Zhang for (j=0; j<nz; j++) { 4960914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 49616bce7ff8SHong Zhang } 49626bce7ff8SHong Zhang 49636bce7ff8SHong Zhang /* 
Mark diagonal and invert diagonal for simplier triangular solves */ 4964914a18a2SHong Zhang pv = b->a + bs2*bdiag[i]; 49656bce7ff8SHong Zhang pj = b->j + bdiag[i]; 4966914a18a2SHong Zhang /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 4967914a18a2SHong Zhang ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4968914a18a2SHong Zhang ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 49696bce7ff8SHong Zhang 49706bce7ff8SHong Zhang /* U part */ 4971914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-i]; 49726bce7ff8SHong Zhang pj = b->j + bi[2*n-i]; 49736bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i] - 1; 4974914a18a2SHong Zhang for (j=0; j<nz; j++){ 4975914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4976914a18a2SHong Zhang } 49776bce7ff8SHong Zhang } 49786bce7ff8SHong Zhang 49796bce7ff8SHong Zhang ierr = PetscFree(rtmp);CHKERRQ(ierr); 49806bce7ff8SHong Zhang ierr = PetscFree(v_work);CHKERRQ(ierr); 49816bce7ff8SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 49826bce7ff8SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 498327019359SHong Zhang 49846bce7ff8SHong Zhang C->assembled = PETSC_TRUE; 4985914a18a2SHong Zhang ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 49866bce7ff8SHong Zhang PetscFunctionReturn(0); 49876bce7ff8SHong Zhang } 49886bce7ff8SHong Zhang 49891a83e813SShri Abhyankar #undef __FUNCT__ 49901a83e813SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2" 49911a83e813SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info) 49921a83e813SShri Abhyankar { 49931a83e813SShri Abhyankar Mat C=B; 49941a83e813SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 49951a83e813SShri Abhyankar IS isrow = b->row,isicol = b->icol; 49961a83e813SShri Abhyankar PetscErrorCode ierr; 
49971a83e813SShri Abhyankar const PetscInt *r,*ic,*ics; 49981a83e813SShri Abhyankar PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 49991a83e813SShri Abhyankar PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 50001a83e813SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 50011a83e813SShri Abhyankar PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 50021a83e813SShri Abhyankar MatScalar *v_work; 50031a83e813SShri Abhyankar 50041a83e813SShri Abhyankar PetscFunctionBegin; 50051a83e813SShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 50061a83e813SShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 50071a83e813SShri Abhyankar ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 50081a83e813SShri Abhyankar ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 50091a83e813SShri Abhyankar ics = ic; 50101a83e813SShri Abhyankar 50111a83e813SShri Abhyankar /* generate work space needed by dense LU factorization */ 50121a83e813SShri Abhyankar ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 50131a83e813SShri Abhyankar mwork = v_work + bs; 50141a83e813SShri Abhyankar v_pivots = (PetscInt*)(mwork + bs2); 50151a83e813SShri Abhyankar 50161a83e813SShri Abhyankar for (i=0; i<n; i++){ 50171a83e813SShri Abhyankar /* zero rtmp */ 50181a83e813SShri Abhyankar /* L part */ 50191a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 50201a83e813SShri Abhyankar bjtmp = bj + bi[i]; 50211a83e813SShri Abhyankar for (j=0; j<nz; j++){ 50221a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 50231a83e813SShri Abhyankar } 50241a83e813SShri Abhyankar 50251a83e813SShri Abhyankar /* U part */ 50261a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 50271a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 50281a83e813SShri Abhyankar for (j=0; j<nz; j++){ 50291a83e813SShri Abhyankar ierr = 
PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 50301a83e813SShri Abhyankar } 50311a83e813SShri Abhyankar 50321a83e813SShri Abhyankar /* load in initial (unfactored row) */ 50331a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 50341a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 50351a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 50361a83e813SShri Abhyankar for (j=0; j<nz; j++) { 50371a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 50381a83e813SShri Abhyankar } 50391a83e813SShri Abhyankar 50401a83e813SShri Abhyankar /* elimination */ 50411a83e813SShri Abhyankar bjtmp = bj + bi[i]; 50421a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 50431a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 50441a83e813SShri Abhyankar row = bjtmp[k]; 50451a83e813SShri Abhyankar pc = rtmp + bs2*row; 50461a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 50471a83e813SShri Abhyankar if (flg) { 50481a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 50491a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 50501a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 50511a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 50521a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 50531a83e813SShri Abhyankar for (j=0; j<nz; j++) { 50541a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 50551a83e813SShri Abhyankar } 50561a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 50571a83e813SShri Abhyankar } 50581a83e813SShri Abhyankar } 50591a83e813SShri Abhyankar 50601a83e813SShri Abhyankar /* finished row so stick it into b->a */ 50611a83e813SShri Abhyankar /* L part */ 50621a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 50631a83e813SShri Abhyankar pj = b->j + bi[i] 
; 50641a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 50651a83e813SShri Abhyankar for (j=0; j<nz; j++) { 50661a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 50671a83e813SShri Abhyankar } 50681a83e813SShri Abhyankar 50691a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 50701a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 50711a83e813SShri Abhyankar pj = b->j + bdiag[i]; 50721a83e813SShri Abhyankar /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 50731a83e813SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 50741a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 50751a83e813SShri Abhyankar 50761a83e813SShri Abhyankar /* U part */ 50771a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 50781a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 50791a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 50801a83e813SShri Abhyankar for (j=0; j<nz; j++){ 50811a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 50821a83e813SShri Abhyankar } 50831a83e813SShri Abhyankar } 50841a83e813SShri Abhyankar 50851a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 50861a83e813SShri Abhyankar ierr = PetscFree(v_work);CHKERRQ(ierr); 50871a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 50881a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 50891a83e813SShri Abhyankar 50901a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 50911a83e813SShri Abhyankar ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 50921a83e813SShri Abhyankar PetscFunctionReturn(0); 50931a83e813SShri Abhyankar } 50941a83e813SShri Abhyankar 50956bce7ff8SHong Zhang /* 50966bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 
509716a2bf60SHong Zhang See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 509816a2bf60SHong Zhang because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 50996bce7ff8SHong Zhang */ 51006bce7ff8SHong Zhang #undef __FUNCT__ 51016bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 51026bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 51036bce7ff8SHong Zhang { 51046bce7ff8SHong Zhang 51056bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 51066bce7ff8SHong Zhang PetscErrorCode ierr; 510716a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 510816a2bf60SHong Zhang PetscInt i,j,nz,*bi,*bj,*bdiag; 51096bce7ff8SHong Zhang 51106bce7ff8SHong Zhang PetscFunctionBegin; 511116a2bf60SHong Zhang /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */ 511216a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 51136bce7ff8SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 511416a2bf60SHong Zhang 511516a2bf60SHong Zhang /* allocate matrix arrays for new data structure */ 511616a2bf60SHong Zhang ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr); 511716a2bf60SHong Zhang ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr); 511816a2bf60SHong Zhang b->singlemalloc = PETSC_TRUE; 511916a2bf60SHong Zhang if (!b->diag){ 512016a2bf60SHong Zhang ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 512116a2bf60SHong Zhang } 5122914a18a2SHong Zhang bdiag = b->diag; 51236bce7ff8SHong Zhang 512416a2bf60SHong Zhang if (n > 0) { 512516a2bf60SHong Zhang ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 51266bce7ff8SHong Zhang } 51276bce7ff8SHong Zhang 
51286bce7ff8SHong Zhang /* set bi and bj with new data structure */ 51296bce7ff8SHong Zhang bi = b->i; 51306bce7ff8SHong Zhang bj = b->j; 51316bce7ff8SHong Zhang 51326bce7ff8SHong Zhang /* L part */ 51336bce7ff8SHong Zhang bi[0] = 0; 513416a2bf60SHong Zhang for (i=0; i<n; i++){ 51356bce7ff8SHong Zhang nz = adiag[i] - ai[i]; 5136914a18a2SHong Zhang bi[i+1] = bi[i] + nz; 51376bce7ff8SHong Zhang aj = a->j + ai[i]; 51386bce7ff8SHong Zhang for (j=0; j<nz; j++){ 51396bce7ff8SHong Zhang *bj = aj[j]; bj++; 51406bce7ff8SHong Zhang } 51416bce7ff8SHong Zhang } 51426bce7ff8SHong Zhang 51436bce7ff8SHong Zhang /* U part */ 514416a2bf60SHong Zhang bi[n+1] = bi[n]; 514516a2bf60SHong Zhang for (i=n-1; i>=0; i--){ 51466bce7ff8SHong Zhang nz = ai[i+1] - adiag[i] - 1; 514716a2bf60SHong Zhang bi[2*n-i+1] = bi[2*n-i] + nz + 1; 51486bce7ff8SHong Zhang aj = a->j + adiag[i] + 1; 51496bce7ff8SHong Zhang for (j=0; j<nz; j++){ 51506bce7ff8SHong Zhang *bj = aj[j]; bj++; 51516bce7ff8SHong Zhang } 51526bce7ff8SHong Zhang /* diag[i] */ 51536bce7ff8SHong Zhang *bj = i; bj++; 515416a2bf60SHong Zhang bdiag[i] = bi[2*n-i+1]-1; 51556bce7ff8SHong Zhang } 51566bce7ff8SHong Zhang PetscFunctionReturn(0); 51576bce7ff8SHong Zhang } 51586bce7ff8SHong Zhang 515916a2bf60SHong Zhang #undef __FUNCT__ 516016a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 516116a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 516216a2bf60SHong Zhang { 516316a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 516416a2bf60SHong Zhang IS isicol; 516516a2bf60SHong Zhang PetscErrorCode ierr; 516616a2bf60SHong Zhang const PetscInt *r,*ic; 51677fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 516816a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 516916a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 517016a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 51717fa3a6a0SHong Zhang 
PetscTruth col_identity,row_identity,both_identity; 517216a2bf60SHong Zhang PetscReal f; 517316a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 517416a2bf60SHong Zhang PetscBT lnkbt; 517516a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 517616a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 517716a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 517816a2bf60SHong Zhang PetscTruth missing; 51797fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 518016a2bf60SHong Zhang 518116a2bf60SHong Zhang PetscFunctionBegin; 518216a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 518316a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 518416a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 518516a2bf60SHong Zhang 518616a2bf60SHong Zhang f = info->fill; 518716a2bf60SHong Zhang levels = (PetscInt)info->levels; 518816a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 518916a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 519016a2bf60SHong Zhang 519116a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 519216a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 51937fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 519416a2bf60SHong Zhang 51957fa3a6a0SHong Zhang if (!levels && both_identity) { 519616a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 519716a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 519816a2bf60SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 51997fa3a6a0SHong Zhang /* set MatSolve routines */ 52007fa3a6a0SHong Zhang switch (bs){ 52017fa3a6a0SHong Zhang 
case 2: 52027fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 52037fa3a6a0SHong Zhang break; 52047fa3a6a0SHong Zhang case 3: 52057fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 52067fa3a6a0SHong Zhang break; 52077fa3a6a0SHong Zhang case 4: 52087fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 52097fa3a6a0SHong Zhang break; 52107fa3a6a0SHong Zhang case 5: 52117fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 52127fa3a6a0SHong Zhang break; 52137fa3a6a0SHong Zhang case 6: 52147fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 52157fa3a6a0SHong Zhang break; 52167fa3a6a0SHong Zhang case 7: 52177fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 52187fa3a6a0SHong Zhang break; 52197fa3a6a0SHong Zhang default: 52207fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 52217fa3a6a0SHong Zhang break; 52227fa3a6a0SHong Zhang } 522316a2bf60SHong Zhang 522416a2bf60SHong Zhang fact->factor = MAT_FACTOR_ILU; 522516a2bf60SHong Zhang (fact)->info.factor_mallocs = 0; 522616a2bf60SHong Zhang (fact)->info.fill_ratio_given = info->fill; 522716a2bf60SHong Zhang (fact)->info.fill_ratio_needed = 1.0; 522816a2bf60SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 522916a2bf60SHong Zhang b->row = isrow; 523016a2bf60SHong Zhang b->col = iscol; 523116a2bf60SHong Zhang b->icol = isicol; 523216a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 523316a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 523416a2bf60SHong Zhang b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5235b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 523616a2bf60SHong Zhang PetscFunctionReturn(0); 523716a2bf60SHong Zhang } 523816a2bf60SHong Zhang 523916a2bf60SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 524016a2bf60SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 524116a2bf60SHong Zhang 524216a2bf60SHong Zhang /* get new row pointers */ 524316a2bf60SHong Zhang ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 524416a2bf60SHong Zhang bi[0] = 0; 524516a2bf60SHong Zhang /* bdiag is location of diagonal in factor */ 524616a2bf60SHong Zhang ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 524716a2bf60SHong Zhang bdiag[0] = 0; 524816a2bf60SHong Zhang 524916a2bf60SHong Zhang ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr); 525016a2bf60SHong Zhang bjlvl_ptr = (PetscInt**)(bj_ptr + n); 525116a2bf60SHong Zhang 525216a2bf60SHong Zhang /* create a linked list for storing column indices of the active row */ 525316a2bf60SHong Zhang nlnk = n + 1; 525416a2bf60SHong Zhang ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 525516a2bf60SHong Zhang 525616a2bf60SHong Zhang /* initial FreeSpace size is f*(ai[n]+1) */ 525716a2bf60SHong Zhang ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 525816a2bf60SHong Zhang current_space = free_space; 525916a2bf60SHong Zhang ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 526016a2bf60SHong Zhang current_space_lvl = free_space_lvl; 526116a2bf60SHong Zhang 526216a2bf60SHong Zhang for (i=0; i<n; i++) { 526316a2bf60SHong Zhang nzi = 0; 526416a2bf60SHong Zhang /* copy current row into linked list */ 526516a2bf60SHong Zhang nnz = ai[r[i]+1] - ai[r[i]]; 526616a2bf60SHong Zhang if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 526716a2bf60SHong 
Zhang cols = aj + ai[r[i]]; 526816a2bf60SHong Zhang lnk[i] = -1; /* marker to indicate if diagonal exists */ 526916a2bf60SHong Zhang ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 527016a2bf60SHong Zhang nzi += nlnk; 527116a2bf60SHong Zhang 527216a2bf60SHong Zhang /* make sure diagonal entry is included */ 527316a2bf60SHong Zhang if (diagonal_fill && lnk[i] == -1) { 527416a2bf60SHong Zhang fm = n; 527516a2bf60SHong Zhang while (lnk[fm] < i) fm = lnk[fm]; 527616a2bf60SHong Zhang lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 527716a2bf60SHong Zhang lnk[fm] = i; 527816a2bf60SHong Zhang lnk_lvl[i] = 0; 527916a2bf60SHong Zhang nzi++; dcount++; 528016a2bf60SHong Zhang } 528116a2bf60SHong Zhang 528216a2bf60SHong Zhang /* add pivot rows into the active row */ 528316a2bf60SHong Zhang nzbd = 0; 528416a2bf60SHong Zhang prow = lnk[n]; 528516a2bf60SHong Zhang while (prow < i) { 528616a2bf60SHong Zhang nnz = bdiag[prow]; 528716a2bf60SHong Zhang cols = bj_ptr[prow] + nnz + 1; 528816a2bf60SHong Zhang cols_lvl = bjlvl_ptr[prow] + nnz + 1; 528916a2bf60SHong Zhang nnz = bi[prow+1] - bi[prow] - nnz - 1; 529016a2bf60SHong Zhang ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 529116a2bf60SHong Zhang nzi += nlnk; 529216a2bf60SHong Zhang prow = lnk[prow]; 529316a2bf60SHong Zhang nzbd++; 529416a2bf60SHong Zhang } 529516a2bf60SHong Zhang bdiag[i] = nzbd; 529616a2bf60SHong Zhang bi[i+1] = bi[i] + nzi; 529716a2bf60SHong Zhang 529816a2bf60SHong Zhang /* if free space is not available, make more free space */ 529916a2bf60SHong Zhang if (current_space->local_remaining<nzi) { 530016a2bf60SHong Zhang nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 530116a2bf60SHong Zhang ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 530216a2bf60SHong Zhang ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 530316a2bf60SHong Zhang reallocs++; 530416a2bf60SHong Zhang } 
530516a2bf60SHong Zhang 530616a2bf60SHong Zhang /* copy data into free_space and free_space_lvl, then initialize lnk */ 530716a2bf60SHong Zhang ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 530816a2bf60SHong Zhang bj_ptr[i] = current_space->array; 530916a2bf60SHong Zhang bjlvl_ptr[i] = current_space_lvl->array; 531016a2bf60SHong Zhang 531116a2bf60SHong Zhang /* make sure the active row i has diagonal entry */ 531216a2bf60SHong Zhang if (*(bj_ptr[i]+bdiag[i]) != i) { 531316a2bf60SHong Zhang SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 531416a2bf60SHong Zhang try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 531516a2bf60SHong Zhang } 531616a2bf60SHong Zhang 531716a2bf60SHong Zhang current_space->array += nzi; 531816a2bf60SHong Zhang current_space->local_used += nzi; 531916a2bf60SHong Zhang current_space->local_remaining -= nzi; 532016a2bf60SHong Zhang current_space_lvl->array += nzi; 532116a2bf60SHong Zhang current_space_lvl->local_used += nzi; 532216a2bf60SHong Zhang current_space_lvl->local_remaining -= nzi; 532316a2bf60SHong Zhang } 532416a2bf60SHong Zhang 532516a2bf60SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 532616a2bf60SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 532716a2bf60SHong Zhang 532816a2bf60SHong Zhang /* destroy list of free space and other temporary arrays */ 532916a2bf60SHong Zhang ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 533016a2bf60SHong Zhang 533116a2bf60SHong Zhang /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5332783ef271SHong Zhang ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 533316a2bf60SHong Zhang 533416a2bf60SHong Zhang ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 533516a2bf60SHong Zhang ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 
533616a2bf60SHong Zhang ierr = PetscFree(bj_ptr);CHKERRQ(ierr); 533716a2bf60SHong Zhang 533816a2bf60SHong Zhang #if defined(PETSC_USE_INFO) 533916a2bf60SHong Zhang { 534016a2bf60SHong Zhang PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 534116a2bf60SHong Zhang ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 534216a2bf60SHong Zhang ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 534316a2bf60SHong Zhang ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 534416a2bf60SHong Zhang ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 534516a2bf60SHong Zhang if (diagonal_fill) { 534616a2bf60SHong Zhang ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 534716a2bf60SHong Zhang } 534816a2bf60SHong Zhang } 534916a2bf60SHong Zhang #endif 535016a2bf60SHong Zhang 535116a2bf60SHong Zhang /* put together the new matrix */ 535216a2bf60SHong Zhang ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 535316a2bf60SHong Zhang ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 535416a2bf60SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 535516a2bf60SHong Zhang b->free_a = PETSC_TRUE; 535616a2bf60SHong Zhang b->free_ij = PETSC_TRUE; 535716a2bf60SHong Zhang b->singlemalloc = PETSC_FALSE; 53587fa3a6a0SHong Zhang ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 535916a2bf60SHong Zhang b->j = bj; 536016a2bf60SHong Zhang b->i = bi; 536116a2bf60SHong Zhang b->diag = bdiag; 53627f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 536316a2bf60SHong Zhang b->ilen = 0; 536416a2bf60SHong Zhang b->imax = 0; 536516a2bf60SHong Zhang b->row = isrow; 536616a2bf60SHong Zhang b->col = iscol; 536716a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 536816a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 536916a2bf60SHong Zhang 
b->icol = isicol; 53707fa3a6a0SHong Zhang ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 537116a2bf60SHong Zhang /* In b structure: Free imax, ilen, old a, old j. 537216a2bf60SHong Zhang Allocate bdiag, solve_work, new a, new j */ 53737fa3a6a0SHong Zhang ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 537416a2bf60SHong Zhang b->maxnz = b->nz = bi[2*n+1] ; 537516a2bf60SHong Zhang (fact)->info.factor_mallocs = reallocs; 537616a2bf60SHong Zhang (fact)->info.fill_ratio_given = f; 537716a2bf60SHong Zhang (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]); 537816a2bf60SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 53797fa3a6a0SHong Zhang /* set MatSolve routines */ 53807fa3a6a0SHong Zhang if (both_identity){ 53817fa3a6a0SHong Zhang switch (bs){ 53827fa3a6a0SHong Zhang case 2: 53837fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 53847fa3a6a0SHong Zhang break; 53857fa3a6a0SHong Zhang case 3: 53867fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 53877fa3a6a0SHong Zhang break; 53887fa3a6a0SHong Zhang case 4: 53897fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 53907fa3a6a0SHong Zhang break; 53917fa3a6a0SHong Zhang case 5: 53927fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 53937fa3a6a0SHong Zhang break; 53947fa3a6a0SHong Zhang case 6: 53957fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 53967fa3a6a0SHong Zhang break; 53977fa3a6a0SHong Zhang case 7: 53987fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 53997fa3a6a0SHong Zhang break; 54007fa3a6a0SHong Zhang default: 54017fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 54027fa3a6a0SHong Zhang break; 
54037fa3a6a0SHong Zhang } 54047fa3a6a0SHong Zhang } else { 54057fa3a6a0SHong Zhang switch (bs){ 54067fa3a6a0SHong Zhang case 2: 54077fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct; 54087fa3a6a0SHong Zhang break; 54097fa3a6a0SHong Zhang case 3: 54107fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct; 54117fa3a6a0SHong Zhang break; 54127fa3a6a0SHong Zhang case 4: 54137fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct; 54147fa3a6a0SHong Zhang break; 54157fa3a6a0SHong Zhang case 5: 54167fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct; 54177fa3a6a0SHong Zhang break; 54187fa3a6a0SHong Zhang case 6: 54197fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct; 54207fa3a6a0SHong Zhang break; 54217fa3a6a0SHong Zhang case 7: 54227fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct; 54237fa3a6a0SHong Zhang break; 54247fa3a6a0SHong Zhang default: 54257fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 54267fa3a6a0SHong Zhang break; 54277fa3a6a0SHong Zhang } 54287fa3a6a0SHong Zhang } 542916a2bf60SHong Zhang PetscFunctionReturn(0); 543016a2bf60SHong Zhang } 543116a2bf60SHong Zhang 54324e2b4712SSatish Balay /* 54334e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 54344e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 54354e2b4712SSatish Balay Not a good example of code reuse. 
54364e2b4712SSatish Balay */ 54374a2ae208SSatish Balay #undef __FUNCT__ 54384a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 54390481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 54404e2b4712SSatish Balay { 54414e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 54424e2b4712SSatish Balay IS isicol; 54436849ba73SBarry Smith PetscErrorCode ierr; 54445d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 54455d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5446a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5447d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 544841df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 5449329f5518SBarry Smith PetscReal f; 545016a2bf60SHong Zhang PetscTruth newdatastruct=PETSC_FALSE; 54514e2b4712SSatish Balay 54524e2b4712SSatish Balay PetscFunctionBegin; 545316a2bf60SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 545416a2bf60SHong Zhang if (newdatastruct){ 545516a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 545616a2bf60SHong Zhang PetscFunctionReturn(0); 545716a2bf60SHong Zhang } 545816a2bf60SHong Zhang 54596bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 54606bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 54616bce7ff8SHong Zhang 5462435faa5fSBarry Smith f = info->fill; 5463690b6cddSBarry Smith levels = (PetscInt)info->levels; 5464690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 54654c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 546616a2bf60SHong Zhang 5467667159a5SBarry Smith 
ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5468667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 54697d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 5470309c388cSBarry Smith 547141df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 547216a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 54736bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 54746bce7ff8SHong Zhang 5475719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 5476719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 5477bb3d539aSBarry Smith b->row = isrow; 5478bb3d539aSBarry Smith b->col = iscol; 5479bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5480bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5481bb3d539aSBarry Smith b->icol = isicol; 5482bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5483b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 54846bce7ff8SHong Zhang PetscFunctionReturn(0); 54856bce7ff8SHong Zhang } 54866bce7ff8SHong Zhang 54876bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 54884e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 54894e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 54904e2b4712SSatish Balay 54914e2b4712SSatish Balay /* get new row pointers */ 5492690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 54934e2b4712SSatish Balay ainew[0] = 0; 54944e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 5495690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 5496690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 54974e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 5498690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 54994e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 5500690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 55014e2b4712SSatish Balay /* im is level for each filled value */ 5502690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 55034e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 5504690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 55054e2b4712SSatish Balay dloc[0] = 0; 55064e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 5507435faa5fSBarry Smith 5508435faa5fSBarry Smith /* copy prow into linked list */ 55094e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 55103b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 55114e2b4712SSatish Balay xi = aj + ai[r[prow]]; 
55124e2b4712SSatish Balay fill[n] = n; 5513435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 55144e2b4712SSatish Balay while (nz--) { 55154e2b4712SSatish Balay fm = n; 55164e2b4712SSatish Balay idx = ic[*xi++]; 55174e2b4712SSatish Balay do { 55184e2b4712SSatish Balay m = fm; 55194e2b4712SSatish Balay fm = fill[m]; 55204e2b4712SSatish Balay } while (fm < idx); 55214e2b4712SSatish Balay fill[m] = idx; 55224e2b4712SSatish Balay fill[idx] = fm; 55234e2b4712SSatish Balay im[idx] = 0; 55244e2b4712SSatish Balay } 5525435faa5fSBarry Smith 5526435faa5fSBarry Smith /* make sure diagonal entry is included */ 5527435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 5528435faa5fSBarry Smith fm = n; 5529435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 5530435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5531435faa5fSBarry Smith fill[fm] = prow; 5532435faa5fSBarry Smith im[prow] = 0; 5533435faa5fSBarry Smith nzf++; 5534335d9088SBarry Smith dcount++; 5535435faa5fSBarry Smith } 5536435faa5fSBarry Smith 55374e2b4712SSatish Balay nzi = 0; 55384e2b4712SSatish Balay row = fill[n]; 55394e2b4712SSatish Balay while (row < prow) { 55404e2b4712SSatish Balay incrlev = im[row] + 1; 55414e2b4712SSatish Balay nz = dloc[row]; 5542435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 55434e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 55444e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 55454e2b4712SSatish Balay fm = row; 55464e2b4712SSatish Balay while (nnz-- > 0) { 55474e2b4712SSatish Balay idx = *xi++; 55484e2b4712SSatish Balay if (*flev + incrlev > levels) { 55494e2b4712SSatish Balay flev++; 55504e2b4712SSatish Balay continue; 55514e2b4712SSatish Balay } 55524e2b4712SSatish Balay do { 55534e2b4712SSatish Balay m = fm; 55544e2b4712SSatish Balay fm = fill[m]; 55554e2b4712SSatish Balay } while (fm < idx); 55564e2b4712SSatish Balay if (fm != idx) { 55574e2b4712SSatish Balay im[idx] = *flev + 
incrlev; 55584e2b4712SSatish Balay fill[m] = idx; 55594e2b4712SSatish Balay fill[idx] = fm; 55604e2b4712SSatish Balay fm = idx; 55614e2b4712SSatish Balay nzf++; 5562ecf371e4SBarry Smith } else { 55634e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 55644e2b4712SSatish Balay } 55654e2b4712SSatish Balay flev++; 55664e2b4712SSatish Balay } 55674e2b4712SSatish Balay row = fill[row]; 55684e2b4712SSatish Balay nzi++; 55694e2b4712SSatish Balay } 55704e2b4712SSatish Balay /* copy new filled row into permanent storage */ 55714e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 55724e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 5573ecf371e4SBarry Smith 5574ecf371e4SBarry Smith /* estimate how much additional space we will need */ 5575ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5576ecf371e4SBarry Smith /* just double the memory each time */ 5577690b6cddSBarry Smith PetscInt maxadd = jmax; 5578ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 55794e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 55804e2b4712SSatish Balay jmax += maxadd; 5581ecf371e4SBarry Smith 5582ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 55835d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 55845d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5585606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 55865d0c19d7SBarry Smith ajnew = xitmp; 55875d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 55885d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5589606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 55905d0c19d7SBarry Smith ajfill = xitmp; 5591eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 55924e2b4712SSatish Balay } 55935d0c19d7SBarry Smith xitmp = 
ajnew + ainew[prow]; 55944e2b4712SSatish Balay flev = ajfill + ainew[prow]; 55954e2b4712SSatish Balay dloc[prow] = nzi; 55964e2b4712SSatish Balay fm = fill[n]; 55974e2b4712SSatish Balay while (nzf--) { 55985d0c19d7SBarry Smith *xitmp++ = fm; 55994e2b4712SSatish Balay *flev++ = im[fm]; 56004e2b4712SSatish Balay fm = fill[fm]; 56014e2b4712SSatish Balay } 5602435faa5fSBarry Smith /* make sure row has diagonal entry */ 5603435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 560477431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 56052401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5606435faa5fSBarry Smith } 56074e2b4712SSatish Balay } 5608606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 56094e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 56104e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5611606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 5612606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 56134e2b4712SSatish Balay 56146cf91177SBarry Smith #if defined(PETSC_USE_INFO) 56154e2b4712SSatish Balay { 5616329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5617ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5618ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5619ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5620ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5621335d9088SBarry Smith if (diagonal_fill) { 5622ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5623335d9088SBarry Smith } 56244e2b4712SSatish Balay } 562563ba0a88SBarry Smith #endif 56264e2b4712SSatish Balay 56274e2b4712SSatish 
Balay /* put together the new matrix */ 5628719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5629719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5630719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 5631e6b907acSBarry Smith b->free_a = PETSC_TRUE; 5632e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 56337c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 5634a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 56354e2b4712SSatish Balay b->j = ajnew; 56364e2b4712SSatish Balay b->i = ainew; 56374e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 56384e2b4712SSatish Balay b->diag = dloc; 56397f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 56404e2b4712SSatish Balay b->ilen = 0; 56414e2b4712SSatish Balay b->imax = 0; 56424e2b4712SSatish Balay b->row = isrow; 56434e2b4712SSatish Balay b->col = iscol; 5644bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5645c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5646c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5647e51c0b9cSSatish Balay b->icol = isicol; 564887828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 56494e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 
56504e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 5651719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 56524e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 56534e2b4712SSatish Balay 5654719d5645SBarry Smith (fact)->info.factor_mallocs = reallocate; 5655719d5645SBarry Smith (fact)->info.fill_ratio_given = f; 5656719d5645SBarry Smith (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 56576bce7ff8SHong Zhang 565841df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 56598661488fSKris Buschelman PetscFunctionReturn(0); 56608661488fSKris Buschelman } 56618661488fSKris Buschelman 5662732ee342SKris Buschelman #undef __FUNCT__ 56637e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5664dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 56657e7071cdSKris Buschelman { 566612272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 566712272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 56685a9542e3SKris Buschelman PetscFunctionBegin; 56697cf1b8d3SKris Buschelman /* Undo Column scaling */ 56707cf1b8d3SKris Buschelman /* while (nz--) { */ 56717cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 56727cf1b8d3SKris Buschelman /* } */ 5673c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 5674c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 56757cf1b8d3SKris Buschelman PetscFunctionReturn(0); 56767cf1b8d3SKris Buschelman } 56777cf1b8d3SKris Buschelman 56787cf1b8d3SKris Buschelman #undef __FUNCT__ 56797cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5680dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 56817cf1b8d3SKris Buschelman { 56827cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5683b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 56842aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 56855a9542e3SKris Buschelman PetscFunctionBegin; 56860b9da03eSKris Buschelman /* Is this really necessary? */ 568720235379SKris Buschelman while (nz--) { 56880b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 56897e7071cdSKris Buschelman } 5690c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 56917e7071cdSKris Buschelman PetscFunctionReturn(0); 56927e7071cdSKris Buschelman } 56937e7071cdSKris Buschelman 5694732ee342SKris Buschelman 5695