1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry Smith v = aa 
+ diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + 
diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120f1af5d2fSBarry Smith { 121f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122dfbe8321SBarry Smith PetscErrorCode ierr; 123690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 125f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12687828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12787828ca2SBarry Smith PetscScalar *x,*b; 
128f1af5d2fSBarry Smith 129f1af5d2fSBarry Smith PetscFunctionBegin; 130ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133f1af5d2fSBarry Smith 134f1af5d2fSBarry Smith /* forward solve the U^T */ 135f1af5d2fSBarry Smith idx = 0; 136f1af5d2fSBarry Smith for (i=0; i<n; i++) { 137f1af5d2fSBarry Smith 138f1af5d2fSBarry Smith v = aa + 9*diag[i]; 139f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 140ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144f1af5d2fSBarry Smith v += 9; 145f1af5d2fSBarry Smith 146f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 147f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 148f1af5d2fSBarry Smith while (nz--) { 149f1af5d2fSBarry Smith oidx = 3*(*vi++); 150f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153f1af5d2fSBarry Smith v += 9; 154f1af5d2fSBarry Smith } 155f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156f1af5d2fSBarry Smith idx += 3; 157f1af5d2fSBarry Smith } 158f1af5d2fSBarry Smith /* backward solve the L^T */ 159f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 160f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 161f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 162f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 163f1af5d2fSBarry Smith idt = 3*i; 164f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165f1af5d2fSBarry Smith while (nz--) { 166f1af5d2fSBarry Smith idx = 3*(*vi--); 167f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169f1af5d2fSBarry Smith x[idx+2] 
-= v[6]*s1 + v[7]*s2 + v[8]*s3; 170f1af5d2fSBarry Smith v -= 9; 171f1af5d2fSBarry Smith } 172f1af5d2fSBarry Smith } 1731ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176f1af5d2fSBarry Smith PetscFunctionReturn(0); 177f1af5d2fSBarry Smith } 178f1af5d2fSBarry Smith 1794a2ae208SSatish Balay #undef __FUNCT__ 1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182f1af5d2fSBarry Smith { 183f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184dfbe8321SBarry Smith PetscErrorCode ierr; 185690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 187f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18987828ca2SBarry Smith PetscScalar *x,*b; 190f1af5d2fSBarry Smith 191f1af5d2fSBarry Smith PetscFunctionBegin; 192ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1931ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195f1af5d2fSBarry Smith 196f1af5d2fSBarry Smith /* forward solve the U^T */ 197f1af5d2fSBarry Smith idx = 0; 198f1af5d2fSBarry Smith for (i=0; i<n; i++) { 199f1af5d2fSBarry Smith 200f1af5d2fSBarry Smith v = aa + 16*diag[i]; 201f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 202ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + 
v[14]*x3 + v[15]*x4; 207f1af5d2fSBarry Smith v += 16; 208f1af5d2fSBarry Smith 209f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 210f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 211f1af5d2fSBarry Smith while (nz--) { 212f1af5d2fSBarry Smith oidx = 4*(*vi++); 213f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217f1af5d2fSBarry Smith v += 16; 218f1af5d2fSBarry Smith } 219f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220f1af5d2fSBarry Smith idx += 4; 221f1af5d2fSBarry Smith } 222f1af5d2fSBarry Smith /* backward solve the L^T */ 223f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 224f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 225f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 226f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 227f1af5d2fSBarry Smith idt = 4*i; 228f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229f1af5d2fSBarry Smith while (nz--) { 230f1af5d2fSBarry Smith idx = 4*(*vi--); 231f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235f1af5d2fSBarry Smith v -= 16; 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith } 2381ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241f1af5d2fSBarry Smith PetscFunctionReturn(0); 242f1af5d2fSBarry Smith } 243f1af5d2fSBarry Smith 2444a2ae208SSatish Balay #undef __FUNCT__ 2454a2ae208SSatish Balay #define 
__FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247f1af5d2fSBarry Smith { 248f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249dfbe8321SBarry Smith PetscErrorCode ierr; 250690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 252f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 25387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25487828ca2SBarry Smith PetscScalar *x,*b; 255f1af5d2fSBarry Smith 256f1af5d2fSBarry Smith PetscFunctionBegin; 257ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2581ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2591ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260f1af5d2fSBarry Smith 261f1af5d2fSBarry Smith /* forward solve the U^T */ 262f1af5d2fSBarry Smith idx = 0; 263f1af5d2fSBarry Smith for (i=0; i<n; i++) { 264f1af5d2fSBarry Smith 265f1af5d2fSBarry Smith v = aa + 25*diag[i]; 266f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 267ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273f1af5d2fSBarry Smith v += 25; 274f1af5d2fSBarry Smith 275f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 276f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 277f1af5d2fSBarry Smith while (nz--) { 278f1af5d2fSBarry Smith oidx = 5*(*vi++); 279f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280f1af5d2fSBarry Smith x[oidx+1] 
-= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284f1af5d2fSBarry Smith v += 25; 285f1af5d2fSBarry Smith } 286f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287f1af5d2fSBarry Smith idx += 5; 288f1af5d2fSBarry Smith } 289f1af5d2fSBarry Smith /* backward solve the L^T */ 290f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 291f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 292f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 293f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 294f1af5d2fSBarry Smith idt = 5*i; 295f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296f1af5d2fSBarry Smith while (nz--) { 297f1af5d2fSBarry Smith idx = 5*(*vi--); 298f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303f1af5d2fSBarry Smith v -= 25; 304f1af5d2fSBarry Smith } 305f1af5d2fSBarry Smith } 3061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309f1af5d2fSBarry Smith PetscFunctionReturn(0); 310f1af5d2fSBarry Smith } 311f1af5d2fSBarry Smith 3124a2ae208SSatish Balay #undef __FUNCT__ 3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314dfbe8321SBarry Smith 
PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315f1af5d2fSBarry Smith { 316f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317dfbe8321SBarry Smith PetscErrorCode ierr; 318690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 320f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 32187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 32287828ca2SBarry Smith PetscScalar *x,*b; 323f1af5d2fSBarry Smith 324f1af5d2fSBarry Smith PetscFunctionBegin; 325ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328f1af5d2fSBarry Smith 329f1af5d2fSBarry Smith /* forward solve the U^T */ 330f1af5d2fSBarry Smith idx = 0; 331f1af5d2fSBarry Smith for (i=0; i<n; i++) { 332f1af5d2fSBarry Smith 333f1af5d2fSBarry Smith v = aa + 36*diag[i]; 334f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 335ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336ef66eb69SBarry Smith x6 = x[5+idx]; 337f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343f1af5d2fSBarry Smith v += 36; 344f1af5d2fSBarry Smith 345f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 346f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 347f1af5d2fSBarry Smith while (nz--) { 348f1af5d2fSBarry Smith oidx = 6*(*vi++); 
349f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355f1af5d2fSBarry Smith v += 36; 356f1af5d2fSBarry Smith } 357f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358f1af5d2fSBarry Smith x[5+idx] = s6; 359f1af5d2fSBarry Smith idx += 6; 360f1af5d2fSBarry Smith } 361f1af5d2fSBarry Smith /* backward solve the L^T */ 362f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 363f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 364f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 365f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 366f1af5d2fSBarry Smith idt = 6*i; 367f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368f1af5d2fSBarry Smith s6 = x[5+idt]; 369f1af5d2fSBarry Smith while (nz--) { 370f1af5d2fSBarry Smith idx = 6*(*vi--); 371f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 372f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377f1af5d2fSBarry Smith v -= 36; 378f1af5d2fSBarry Smith } 
379f1af5d2fSBarry Smith } 3801ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383f1af5d2fSBarry Smith PetscFunctionReturn(0); 384f1af5d2fSBarry Smith } 385f1af5d2fSBarry Smith 3864a2ae208SSatish Balay #undef __FUNCT__ 3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389f1af5d2fSBarry Smith { 390f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391dfbe8321SBarry Smith PetscErrorCode ierr; 392690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 394f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 39587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 39687828ca2SBarry Smith PetscScalar *x,*b; 397f1af5d2fSBarry Smith 398f1af5d2fSBarry Smith PetscFunctionBegin; 399ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4001ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402f1af5d2fSBarry Smith 403f1af5d2fSBarry Smith /* forward solve the U^T */ 404f1af5d2fSBarry Smith idx = 0; 405f1af5d2fSBarry Smith for (i=0; i<n; i++) { 406f1af5d2fSBarry Smith 407f1af5d2fSBarry Smith v = aa + 49*diag[i]; 408f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 409ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 411f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + 
v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 417f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418f1af5d2fSBarry Smith v += 49; 419f1af5d2fSBarry Smith 420f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 421f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 422f1af5d2fSBarry Smith while (nz--) { 423f1af5d2fSBarry Smith oidx = 7*(*vi++); 424f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431f1af5d2fSBarry Smith v += 49; 432f1af5d2fSBarry Smith } 433f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 435f1af5d2fSBarry Smith idx += 7; 436f1af5d2fSBarry Smith } 437f1af5d2fSBarry Smith /* backward solve the L^T */ 438f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 439f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 440f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 441f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 
442f1af5d2fSBarry Smith idt = 7*i; 443f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 445f1af5d2fSBarry Smith while (nz--) { 446f1af5d2fSBarry Smith idx = 7*(*vi--); 447f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454f1af5d2fSBarry Smith v -= 49; 455f1af5d2fSBarry Smith } 456f1af5d2fSBarry Smith } 4571ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4581ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460f1af5d2fSBarry Smith PetscFunctionReturn(0); 461f1af5d2fSBarry Smith } 462f1af5d2fSBarry Smith 463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4644a2ae208SSatish Balay #undef __FUNCT__ 4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467f1af5d2fSBarry Smith { 468f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 4706849ba73SBarry Smith PetscErrorCode ierr; 4715d0c19d7SBarry Smith const 
PetscInt *r,*c,*rout,*cout; 4725d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473690b6cddSBarry Smith PetscInt *diag = a->diag; 474f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 47587828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 476f1af5d2fSBarry Smith 477f1af5d2fSBarry Smith PetscFunctionBegin; 4781ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480f1af5d2fSBarry Smith t = a->solve_work; 481f1af5d2fSBarry Smith 482f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484f1af5d2fSBarry Smith 485f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 486f1af5d2fSBarry Smith for (i=0; i<n; i++) { 487f1af5d2fSBarry Smith t[i] = b[c[i]]; 488f1af5d2fSBarry Smith } 489f1af5d2fSBarry Smith 490f1af5d2fSBarry Smith /* forward solve the U^T */ 491f1af5d2fSBarry Smith for (i=0; i<n; i++) { 492f1af5d2fSBarry Smith 493f1af5d2fSBarry Smith v = aa + diag[i]; 494f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 495f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 496f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 497f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 498f1af5d2fSBarry Smith while (nz--) { 499f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 500f1af5d2fSBarry Smith } 501f1af5d2fSBarry Smith t[i] = s1; 502f1af5d2fSBarry Smith } 503f1af5d2fSBarry Smith /* backward solve the L^T */ 504f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 505f1af5d2fSBarry Smith v = aa + diag[i] - 1; 506f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 507f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 508f1af5d2fSBarry Smith s1 = t[i]; 509f1af5d2fSBarry Smith while (nz--) { 510f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 511f1af5d2fSBarry Smith } 512f1af5d2fSBarry Smith } 513f1af5d2fSBarry Smith 514f1af5d2fSBarry Smith /* copy t into x according to permutation */ 515f1af5d2fSBarry 
Smith for (i=0; i<n; i++) { 516f1af5d2fSBarry Smith x[r[i]] = t[i]; 517f1af5d2fSBarry Smith } 518f1af5d2fSBarry Smith 519f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5211ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5221ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 524f1af5d2fSBarry Smith PetscFunctionReturn(0); 525f1af5d2fSBarry Smith } 526f1af5d2fSBarry Smith 5274a2ae208SSatish Balay #undef __FUNCT__ 5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530f1af5d2fSBarry Smith { 531f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5336849ba73SBarry Smith PetscErrorCode ierr; 5345d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5355d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 537f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53887828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 53987828ca2SBarry Smith PetscScalar *x,*b,*t; 540f1af5d2fSBarry Smith 541f1af5d2fSBarry Smith PetscFunctionBegin; 5421ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5431ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544f1af5d2fSBarry Smith t = a->solve_work; 545f1af5d2fSBarry Smith 546f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548f1af5d2fSBarry Smith 549f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 550f1af5d2fSBarry Smith ii = 0; 551f1af5d2fSBarry Smith for (i=0; i<n; i++) { 552f1af5d2fSBarry Smith ic = 2*c[i]; 553f1af5d2fSBarry 
Smith t[ii] = b[ic]; 554f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 555f1af5d2fSBarry Smith ii += 2; 556f1af5d2fSBarry Smith } 557f1af5d2fSBarry Smith 558f1af5d2fSBarry Smith /* forward solve the U^T */ 559f1af5d2fSBarry Smith idx = 0; 560f1af5d2fSBarry Smith for (i=0; i<n; i++) { 561f1af5d2fSBarry Smith 562f1af5d2fSBarry Smith v = aa + 4*diag[i]; 563f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 564f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 565f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 566f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 567f1af5d2fSBarry Smith v += 4; 568f1af5d2fSBarry Smith 569f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 570f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 571f1af5d2fSBarry Smith while (nz--) { 572f1af5d2fSBarry Smith oidx = 2*(*vi++); 573f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 574f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 575f1af5d2fSBarry Smith v += 4; 576f1af5d2fSBarry Smith } 577f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 578f1af5d2fSBarry Smith idx += 2; 579f1af5d2fSBarry Smith } 580f1af5d2fSBarry Smith /* backward solve the L^T */ 581f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 582f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 583f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 584f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 585f1af5d2fSBarry Smith idt = 2*i; 586f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 587f1af5d2fSBarry Smith while (nz--) { 588f1af5d2fSBarry Smith idx = 2*(*vi--); 589f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 590f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 591f1af5d2fSBarry Smith v -= 4; 592f1af5d2fSBarry Smith } 593f1af5d2fSBarry Smith } 594f1af5d2fSBarry Smith 595f1af5d2fSBarry Smith /* copy t into x according to permutation */ 596f1af5d2fSBarry Smith ii = 0; 597f1af5d2fSBarry Smith for (i=0; i<n; i++) { 598f1af5d2fSBarry Smith ir = 2*r[i]; 599f1af5d2fSBarry Smith x[ir] = t[ii]; 600f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 601f1af5d2fSBarry Smith 
ii += 2; 602f1af5d2fSBarry Smith } 603f1af5d2fSBarry Smith 604f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609f1af5d2fSBarry Smith PetscFunctionReturn(0); 610f1af5d2fSBarry Smith } 611f1af5d2fSBarry Smith 6124a2ae208SSatish Balay #undef __FUNCT__ 6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615f1af5d2fSBarry Smith { 616f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6186849ba73SBarry Smith PetscErrorCode ierr; 6195d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 6205d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 622f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 62387828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 62487828ca2SBarry Smith PetscScalar *x,*b,*t; 625f1af5d2fSBarry Smith 626f1af5d2fSBarry Smith PetscFunctionBegin; 6271ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6281ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629f1af5d2fSBarry Smith t = a->solve_work; 630f1af5d2fSBarry Smith 631f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633f1af5d2fSBarry Smith 634f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 635f1af5d2fSBarry Smith ii = 0; 636f1af5d2fSBarry Smith for (i=0; i<n; i++) { 637f1af5d2fSBarry Smith ic = 3*c[i]; 638f1af5d2fSBarry Smith t[ii] = b[ic]; 639f1af5d2fSBarry Smith 
t[ii+1] = b[ic+1]; 640f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 641f1af5d2fSBarry Smith ii += 3; 642f1af5d2fSBarry Smith } 643f1af5d2fSBarry Smith 644f1af5d2fSBarry Smith /* forward solve the U^T */ 645f1af5d2fSBarry Smith idx = 0; 646f1af5d2fSBarry Smith for (i=0; i<n; i++) { 647f1af5d2fSBarry Smith 648f1af5d2fSBarry Smith v = aa + 9*diag[i]; 649f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 650f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654f1af5d2fSBarry Smith v += 9; 655f1af5d2fSBarry Smith 656f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 657f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 658f1af5d2fSBarry Smith while (nz--) { 659f1af5d2fSBarry Smith oidx = 3*(*vi++); 660f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663f1af5d2fSBarry Smith v += 9; 664f1af5d2fSBarry Smith } 665f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666f1af5d2fSBarry Smith idx += 3; 667f1af5d2fSBarry Smith } 668f1af5d2fSBarry Smith /* backward solve the L^T */ 669f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 670f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 671f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 672f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 673f1af5d2fSBarry Smith idt = 3*i; 674f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675f1af5d2fSBarry Smith while (nz--) { 676f1af5d2fSBarry Smith idx = 3*(*vi--); 677f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680f1af5d2fSBarry Smith v -= 9; 681f1af5d2fSBarry Smith } 682f1af5d2fSBarry Smith } 683f1af5d2fSBarry 
Smith 684f1af5d2fSBarry Smith /* copy t into x according to permutation */ 685f1af5d2fSBarry Smith ii = 0; 686f1af5d2fSBarry Smith for (i=0; i<n; i++) { 687f1af5d2fSBarry Smith ir = 3*r[i]; 688f1af5d2fSBarry Smith x[ir] = t[ii]; 689f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 690f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 691f1af5d2fSBarry Smith ii += 3; 692f1af5d2fSBarry Smith } 693f1af5d2fSBarry Smith 694f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6961ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699f1af5d2fSBarry Smith PetscFunctionReturn(0); 700f1af5d2fSBarry Smith } 701f1af5d2fSBarry Smith 7024a2ae208SSatish Balay #undef __FUNCT__ 7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705f1af5d2fSBarry Smith { 706f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7086849ba73SBarry Smith PetscErrorCode ierr; 7095d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 7105d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 712f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 71387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 71487828ca2SBarry Smith PetscScalar *x,*b,*t; 715f1af5d2fSBarry Smith 716f1af5d2fSBarry Smith PetscFunctionBegin; 7171ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719f1af5d2fSBarry Smith t = a->solve_work; 720f1af5d2fSBarry Smith 721f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722f1af5d2fSBarry Smith ierr = 
ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723f1af5d2fSBarry Smith 724f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 725f1af5d2fSBarry Smith ii = 0; 726f1af5d2fSBarry Smith for (i=0; i<n; i++) { 727f1af5d2fSBarry Smith ic = 4*c[i]; 728f1af5d2fSBarry Smith t[ii] = b[ic]; 729f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 730f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 731f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 732f1af5d2fSBarry Smith ii += 4; 733f1af5d2fSBarry Smith } 734f1af5d2fSBarry Smith 735f1af5d2fSBarry Smith /* forward solve the U^T */ 736f1af5d2fSBarry Smith idx = 0; 737f1af5d2fSBarry Smith for (i=0; i<n; i++) { 738f1af5d2fSBarry Smith 739f1af5d2fSBarry Smith v = aa + 16*diag[i]; 740f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 741f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746f1af5d2fSBarry Smith v += 16; 747f1af5d2fSBarry Smith 748f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 749f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 750f1af5d2fSBarry Smith while (nz--) { 751f1af5d2fSBarry Smith oidx = 4*(*vi++); 752f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756f1af5d2fSBarry Smith v += 16; 757f1af5d2fSBarry Smith } 758f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759f1af5d2fSBarry Smith idx += 4; 760f1af5d2fSBarry Smith } 761f1af5d2fSBarry Smith /* backward solve the L^T */ 762f1af5d2fSBarry Smith for (i=n-1; 
i>=0; i--){ 763f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 764f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 765f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 766f1af5d2fSBarry Smith idt = 4*i; 767f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768f1af5d2fSBarry Smith while (nz--) { 769f1af5d2fSBarry Smith idx = 4*(*vi--); 770f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774f1af5d2fSBarry Smith v -= 16; 775f1af5d2fSBarry Smith } 776f1af5d2fSBarry Smith } 777f1af5d2fSBarry Smith 778f1af5d2fSBarry Smith /* copy t into x according to permutation */ 779f1af5d2fSBarry Smith ii = 0; 780f1af5d2fSBarry Smith for (i=0; i<n; i++) { 781f1af5d2fSBarry Smith ir = 4*r[i]; 782f1af5d2fSBarry Smith x[ir] = t[ii]; 783f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 784f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 785f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 786f1af5d2fSBarry Smith ii += 4; 787f1af5d2fSBarry Smith } 788f1af5d2fSBarry Smith 789f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794f1af5d2fSBarry Smith PetscFunctionReturn(0); 795f1af5d2fSBarry Smith } 796f1af5d2fSBarry Smith 7974a2ae208SSatish Balay #undef __FUNCT__ 7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800f1af5d2fSBarry Smith { 801f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802f1af5d2fSBarry Smith IS 
iscol=a->col,isrow=a->row; 8036849ba73SBarry Smith PetscErrorCode ierr; 8045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 807f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 80887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 80987828ca2SBarry Smith PetscScalar *x,*b,*t; 810f1af5d2fSBarry Smith 811f1af5d2fSBarry Smith PetscFunctionBegin; 8121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814f1af5d2fSBarry Smith t = a->solve_work; 815f1af5d2fSBarry Smith 816f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818f1af5d2fSBarry Smith 819f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 820f1af5d2fSBarry Smith ii = 0; 821f1af5d2fSBarry Smith for (i=0; i<n; i++) { 822f1af5d2fSBarry Smith ic = 5*c[i]; 823f1af5d2fSBarry Smith t[ii] = b[ic]; 824f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 825f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 826f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 827f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 828f1af5d2fSBarry Smith ii += 5; 829f1af5d2fSBarry Smith } 830f1af5d2fSBarry Smith 831f1af5d2fSBarry Smith /* forward solve the U^T */ 832f1af5d2fSBarry Smith idx = 0; 833f1af5d2fSBarry Smith for (i=0; i<n; i++) { 834f1af5d2fSBarry Smith 835f1af5d2fSBarry Smith v = aa + 25*diag[i]; 836f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 837f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 
841f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843f1af5d2fSBarry Smith v += 25; 844f1af5d2fSBarry Smith 845f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 846f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 847f1af5d2fSBarry Smith while (nz--) { 848f1af5d2fSBarry Smith oidx = 5*(*vi++); 849f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854f1af5d2fSBarry Smith v += 25; 855f1af5d2fSBarry Smith } 856f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857f1af5d2fSBarry Smith idx += 5; 858f1af5d2fSBarry Smith } 859f1af5d2fSBarry Smith /* backward solve the L^T */ 860f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 861f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 862f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 863f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 864f1af5d2fSBarry Smith idt = 5*i; 865f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866f1af5d2fSBarry Smith while (nz--) { 867f1af5d2fSBarry Smith idx = 5*(*vi--); 868f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873f1af5d2fSBarry Smith v -= 25; 
874f1af5d2fSBarry Smith } 875f1af5d2fSBarry Smith } 876f1af5d2fSBarry Smith 877f1af5d2fSBarry Smith /* copy t into x according to permutation */ 878f1af5d2fSBarry Smith ii = 0; 879f1af5d2fSBarry Smith for (i=0; i<n; i++) { 880f1af5d2fSBarry Smith ir = 5*r[i]; 881f1af5d2fSBarry Smith x[ir] = t[ii]; 882f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 883f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 884f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 885f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 886f1af5d2fSBarry Smith ii += 5; 887f1af5d2fSBarry Smith } 888f1af5d2fSBarry Smith 889f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894f1af5d2fSBarry Smith PetscFunctionReturn(0); 895f1af5d2fSBarry Smith } 896f1af5d2fSBarry Smith 8974a2ae208SSatish Balay #undef __FUNCT__ 8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900f1af5d2fSBarry Smith { 901f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9036849ba73SBarry Smith PetscErrorCode ierr; 9045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 907f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 90887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 90987828ca2SBarry Smith PetscScalar *x,*b,*t; 910f1af5d2fSBarry Smith 911f1af5d2fSBarry Smith PetscFunctionBegin; 9121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
914f1af5d2fSBarry Smith t = a->solve_work; 915f1af5d2fSBarry Smith 916f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918f1af5d2fSBarry Smith 919f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 920f1af5d2fSBarry Smith ii = 0; 921f1af5d2fSBarry Smith for (i=0; i<n; i++) { 922f1af5d2fSBarry Smith ic = 6*c[i]; 923f1af5d2fSBarry Smith t[ii] = b[ic]; 924f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 925f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 926f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 927f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 928f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 929f1af5d2fSBarry Smith ii += 6; 930f1af5d2fSBarry Smith } 931f1af5d2fSBarry Smith 932f1af5d2fSBarry Smith /* forward solve the U^T */ 933f1af5d2fSBarry Smith idx = 0; 934f1af5d2fSBarry Smith for (i=0; i<n; i++) { 935f1af5d2fSBarry Smith 936f1af5d2fSBarry Smith v = aa + 36*diag[i]; 937f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 938f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939f1af5d2fSBarry Smith x6 = t[5+idx]; 940f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946f1af5d2fSBarry Smith v += 36; 947f1af5d2fSBarry Smith 948f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 949f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 950f1af5d2fSBarry Smith while (nz--) { 951f1af5d2fSBarry Smith oidx = 6*(*vi++); 
952f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958f1af5d2fSBarry Smith v += 36; 959f1af5d2fSBarry Smith } 960f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961f1af5d2fSBarry Smith t[5+idx] = s6; 962f1af5d2fSBarry Smith idx += 6; 963f1af5d2fSBarry Smith } 964f1af5d2fSBarry Smith /* backward solve the L^T */ 965f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 966f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 967f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 968f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 969f1af5d2fSBarry Smith idt = 6*i; 970f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971f1af5d2fSBarry Smith s6 = t[5+idt]; 972f1af5d2fSBarry Smith while (nz--) { 973f1af5d2fSBarry Smith idx = 6*(*vi--); 974f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980f1af5d2fSBarry Smith v -= 36; 981f1af5d2fSBarry Smith } 
982f1af5d2fSBarry Smith } 983f1af5d2fSBarry Smith 984f1af5d2fSBarry Smith /* copy t into x according to permutation */ 985f1af5d2fSBarry Smith ii = 0; 986f1af5d2fSBarry Smith for (i=0; i<n; i++) { 987f1af5d2fSBarry Smith ir = 6*r[i]; 988f1af5d2fSBarry Smith x[ir] = t[ii]; 989f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 990f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 991f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 992f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 993f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 994f1af5d2fSBarry Smith ii += 6; 995f1af5d2fSBarry Smith } 996f1af5d2fSBarry Smith 997f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9991ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002f1af5d2fSBarry Smith PetscFunctionReturn(0); 1003f1af5d2fSBarry Smith } 1004f1af5d2fSBarry Smith 10054a2ae208SSatish Balay #undef __FUNCT__ 10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008f1af5d2fSBarry Smith { 1009f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10116849ba73SBarry Smith PetscErrorCode ierr; 10125d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10135d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 101687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 101787828ca2SBarry Smith PetscScalar *x,*b,*t; 1018f1af5d2fSBarry Smith 1019f1af5d2fSBarry Smith PetscFunctionBegin; 10201ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10211ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 1022f1af5d2fSBarry Smith t = a->solve_work; 1023f1af5d2fSBarry Smith 1024f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026f1af5d2fSBarry Smith 1027f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1028f1af5d2fSBarry Smith ii = 0; 1029f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1030f1af5d2fSBarry Smith ic = 7*c[i]; 1031f1af5d2fSBarry Smith t[ii] = b[ic]; 1032f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1033f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1034f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1035f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1036f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1037f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1038f1af5d2fSBarry Smith ii += 7; 1039f1af5d2fSBarry Smith } 1040f1af5d2fSBarry Smith 1041f1af5d2fSBarry Smith /* forward solve the U^T */ 1042f1af5d2fSBarry Smith idx = 0; 1043f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1044f1af5d2fSBarry Smith 1045f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1046f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1047f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1049f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055f1af5d2fSBarry Smith s7 = v[42]*x1 + 
v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056f1af5d2fSBarry Smith v += 49; 1057f1af5d2fSBarry Smith 1058f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1059f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1060f1af5d2fSBarry Smith while (nz--) { 1061f1af5d2fSBarry Smith oidx = 7*(*vi++); 1062f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069f1af5d2fSBarry Smith v += 49; 1070f1af5d2fSBarry Smith } 1071f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1073f1af5d2fSBarry Smith idx += 7; 1074f1af5d2fSBarry Smith } 1075f1af5d2fSBarry Smith /* backward solve the L^T */ 1076f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1077f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1078f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1079f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1080f1af5d2fSBarry Smith idt = 7*i; 1081f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1083f1af5d2fSBarry Smith while (nz--) { 1084f1af5d2fSBarry Smith idx = 7*(*vi--); 1085f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 
1086f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092f1af5d2fSBarry Smith v -= 49; 1093f1af5d2fSBarry Smith } 1094f1af5d2fSBarry Smith } 1095f1af5d2fSBarry Smith 1096f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1097f1af5d2fSBarry Smith ii = 0; 1098f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1099f1af5d2fSBarry Smith ir = 7*r[i]; 1100f1af5d2fSBarry Smith x[ir] = t[ii]; 1101f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1102f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1103f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1104f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1105f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1106f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1107f1af5d2fSBarry Smith ii += 7; 1108f1af5d2fSBarry Smith } 1109f1af5d2fSBarry Smith 1110f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11121ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115f1af5d2fSBarry Smith PetscFunctionReturn(0); 1116f1af5d2fSBarry Smith } 1117f1af5d2fSBarry Smith 11184e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11194a2ae208SSatish Balay #undef __FUNCT__ 
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11224e2b4712SSatish Balay { 11234e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11244e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11256849ba73SBarry Smith PetscErrorCode ierr; 11265d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11275d0c19d7SBarry Smith PetscInt i,n=a->mbs; 11285d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11293f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 113087828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11314e2b4712SSatish Balay 11324e2b4712SSatish Balay PetscFunctionBegin; 11331ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135f1af5d2fSBarry Smith t = a->solve_work; 11364e2b4712SSatish Balay 11374e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11384e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11394e2b4712SSatish Balay 11404e2b4712SSatish Balay /* forward solve the lower triangular */ 114187828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11424e2b4712SSatish Balay for (i=1; i<n; i++) { 11434e2b4712SSatish Balay v = aa + bs2*ai[i]; 11444e2b4712SSatish Balay vi = aj + ai[i]; 11454e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1146f1af5d2fSBarry Smith s = t + bs*i; 114787828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11484e2b4712SSatish Balay while (nz--) { 1149f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11504e2b4712SSatish Balay v += bs2; 11514e2b4712SSatish Balay } 11524e2b4712SSatish Balay } 11534e2b4712SSatish Balay /* backward solve the upper triangular */ 1154d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 11554e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 
11564e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11574e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11584e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 115987828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11604e2b4712SSatish Balay while (nz--) { 1161f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11624e2b4712SSatish Balay v += bs2; 11634e2b4712SSatish Balay } 1164f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 116587828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11664e2b4712SSatish Balay } 11674e2b4712SSatish Balay 11684e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11694e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11701ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 11734e2b4712SSatish Balay PetscFunctionReturn(0); 11744e2b4712SSatish Balay } 11754e2b4712SSatish Balay 1176*5c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 1177*5c42ef9dSBarry Smith #undef __FUNCT__ 1178*5c42ef9dSBarry Smith #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 1179*5c42ef9dSBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1180*5c42ef9dSBarry Smith { 1181*5c42ef9dSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1182*5c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 1183*5c42ef9dSBarry Smith PetscErrorCode ierr; 1184*5c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1185*5c42ef9dSBarry Smith PetscInt i,n=a->mbs,j; 1186*5c42ef9dSBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1187*5c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 1188*5c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 
1189*5c42ef9dSBarry Smith const PetscScalar *b; 1190*5c42ef9dSBarry Smith PetscFunctionBegin; 1191*5c42ef9dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1192*5c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1193*5c42ef9dSBarry Smith t = a->solve_work; 1194*5c42ef9dSBarry Smith 1195*5c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1196*5c42ef9dSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1197*5c42ef9dSBarry Smith 1198*5c42ef9dSBarry Smith /* copy the b into temp work space according to permutation */ 1199*5c42ef9dSBarry Smith for (i=0; i<n; i++) { 1200*5c42ef9dSBarry Smith for (j=0; j<bs; j++) { 1201*5c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 1202*5c42ef9dSBarry Smith } 1203*5c42ef9dSBarry Smith } 1204*5c42ef9dSBarry Smith 1205*5c42ef9dSBarry Smith 1206*5c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 1207*5c42ef9dSBarry Smith ls = a->solve_work + A->cmap->n; 1208*5c42ef9dSBarry Smith for (i=0; i<n; i++){ 1209*5c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1210*5c42ef9dSBarry Smith Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1211*5c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 1212*5c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 1213*5c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 1214*5c42ef9dSBarry Smith while (nz--) { 1215*5c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 1216*5c42ef9dSBarry Smith v += bs2; 1217*5c42ef9dSBarry Smith } 1218*5c42ef9dSBarry Smith } 1219*5c42ef9dSBarry Smith 1220*5c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 1221*5c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 1222*5c42ef9dSBarry Smith v = aa + bs2*ai[i]; 1223*5c42ef9dSBarry Smith vi = aj + ai[i]; 1224*5c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 1225*5c42ef9dSBarry Smith while (nz--) { 1226*5c42ef9dSBarry Smith 
Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 1227*5c42ef9dSBarry Smith v += bs2; 1228*5c42ef9dSBarry Smith } 1229*5c42ef9dSBarry Smith } 1230*5c42ef9dSBarry Smith 1231*5c42ef9dSBarry Smith /* copy t into x according to permutation */ 1232*5c42ef9dSBarry Smith for (i=0; i<n; i++) { 1233*5c42ef9dSBarry Smith for (j=0; j<bs; j++) { 1234*5c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 1235*5c42ef9dSBarry Smith } 1236*5c42ef9dSBarry Smith } 1237*5c42ef9dSBarry Smith 1238*5c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1239*5c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1240*5c42ef9dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1241*5c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1242*5c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1243*5c42ef9dSBarry Smith PetscFunctionReturn(0); 1244*5c42ef9dSBarry Smith } 1245*5c42ef9dSBarry Smith

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7"
/*
   MatSolve_SeqBAIJ_7 - Triangular solves with a factored SeqBAIJ matrix whose
   blocks are 7x7, using the row/column permutations stored in a->row / a->col.

   Input:
     A  - factored matrix; a->i / a->j / a->a hold the block rows, a->diag[i]
          is the position of the diagonal block of block row i
     bb - right-hand side
   Output:
     xx - solution

   Each block is 49 scalars addressed as v[row + 7*col].  The diagonal blocks
   are applied by multiplication (the "x = inv_diagonal*x" step), i.e. they are
   used as already-inverted blocks.  The intermediate result lives in
   a->solve_work.

   Returns 0 through PetscFunctionReturn(); failures of the PETSc calls are
   passed to CHKERRQ().

   NOTE(review): block row 0 is loaded unconditionally, so this presumably
   expects n >= 1 - confirm against the callers.
*/
PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
  PetscInt       i,n=a->mbs,nz,idx,idt,idc;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  /* r walks the row permutation forwards, c walks the column permutation backwards */
  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  idx  = 7*(*r++);
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
  t[5] = b[5+idx]; t[6] = b[6+idx];

  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];        /* blocks strictly left of the diagonal */
    idx = 7*(*r++);
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
    while (nz--) {
      idx = 7*(*vi++);
      x1  = t[idx];   x2 = t[1+idx];x3 = t[2+idx];
      x4  = t[3+idx];x5 = t[4+idx];
      x6  = t[5+idx];x7 = t[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    idx = 7*i;
    t[idx]   = s1;t[1+idx] = s2;
    t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
    t[5+idx] = s6;t[6+idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 49*diag[i] + 49;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;  /* blocks strictly right of the diagonal */
    idt = 7*i;
    s1  = t[idt];  s2 = t[1+idt];
    s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
    s6  = t[5+idt];s7 = t[6+idt];
    while (nz--) {
      idx = 7*(*vi++);
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
      x6  = t[5+idx]; x7 = t[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    idc = 7*(*c--);
    /* apply the diagonal block; result goes to the permuted slot of x and back into t */
    v   = aa + 49*diag[i];
    x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
    x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
    x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
    x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
    x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
    x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
    x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
/*
   MatSolve_SeqBAIJ_7_newdatastruct - Same permuted 7x7-block solve as
   MatSolve_SeqBAIJ_7, but reading the factor through a->i only:

     - block row i of the lower factor occupies ai[i] .. ai[i+1]-1
     - block row i of the upper factor occupies ai[k] .. ai[k+1]-1 with
       k = 2*n-i; its last block is the diagonal block, which is why the
       off-diagonal loop runs over one block fewer and v is left pointing at
       the diagonal when the loop finishes.

   Input:  A (factored matrix), bb (right-hand side).  Output: xx (solution).
   Returns 0 through PetscFunctionReturn(); PETSc call failures go to CHKERRQ().
*/
PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
  PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* forward solve the lower triangular */
  idx  = 7*r[0];
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
  t[5] = b[5+idx]; t[6] = b[6+idx];

  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 7*r[i];
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
    for(m=0;m<nz;m++){
      idx = 7*vi[m];
      x1  = t[idx];   x2 = t[1+idx];x3 = t[2+idx];
      x4  = t[3+idx];x5 = t[4+idx];
      x6  = t[5+idx];x7 = t[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    idx = 7*i;
    t[idx]   = s1;t[1+idx] = s2;
    t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
    t[5+idx] = s6;t[6+idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    k   = 2*n-i;
    v   = aa + 49*ai[k];
    vi  = aj + ai[k];
    nz  = ai[k+1] - ai[k] - 1;    /* all blocks of the row except the trailing diagonal */
    idt = 7*i;
    s1  = t[idt];  s2 = t[1+idt];
    s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
    s6  = t[5+idt];s7 = t[6+idt];
    for(m=0;m<nz;m++){
      idx = 7*vi[m];
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
      x6  = t[5+idx]; x7 = t[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    idc = 7*c[i];
    /* v now addresses the diagonal block of row i */
    x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
    x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
    x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
    x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
    x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
    x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
    x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct_v2"
/*
   MatSolve_SeqBAIJ_7_newdatastruct_v2 - Permuted 7x7-block solve that locates
   the upper factor through a->diag instead of a->i:

     - block row i of the lower factor occupies ai[i] .. ai[i+1]-1
     - the off-diagonal blocks of row i of the upper factor occupy
       adiag[i+1]+1 .. adiag[i]-1 and the diagonal block sits at adiag[i],
       so v points at the diagonal once the off-diagonal loop has finished.

   Input:  A (factored matrix), bb (right-hand side).  Output: xx (solution).
   Returns 0 through PetscFunctionReturn(); PETSc call failures go to CHKERRQ().
*/
PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct_v2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
  PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
  MatScalar      *aa=a->a,*v;
  PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  PetscScalar    *x,*b,*t;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* forward solve the lower triangular */
  idx  = 7*r[0];
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
  t[5] = b[5+idx]; t[6] = b[6+idx];

  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 7*r[i];
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
    for(m=0;m<nz;m++){
      idx = 7*vi[m];
      x1  = t[idx];   x2 = t[1+idx];x3 = t[2+idx];
      x4  = t[3+idx];x5 = t[4+idx];
      x6  = t[5+idx];x7 = t[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    idx = 7*i;
    t[idx]   = s1;t[1+idx] = s2;
    t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
    t[5+idx] = s6;t[6+idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 49*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1] - 1;
    idt = 7*i;
    s1  = t[idt];  s2 = t[1+idt];
    s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
    s6  = t[5+idt];s7 = t[6+idt];
    for(m=0;m<nz;m++){
      idx = 7*vi[m];
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
      x6  = t[5+idx]; x7 = t[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    idc = 7*c[i];
    /* v now addresses the diagonal block of row i */
    x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
    x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
    x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
    x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
    x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
    x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
    x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering - 7x7-block triangular solves without any
   row/column permutation: block row i of bb maps straight to block row i of
   xx, and xx itself is used as the work array (no a->solve_work).

   Input:  A (factored matrix; a->diag[i] is the diagonal block of row i),
           bb (right-hand side).
   Output: xx (solution).
   Returns 0 through PetscFunctionReturn(); PETSc call failures go to CHKERRQ().

   NOTE(review): block row 0 is copied unconditionally, so this presumably
   expects n >= 1 - confirm against the callers.
*/
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscErrorCode    ierr;
  PetscInt          *diag = a->diag,jdx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
  x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
  x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];        /* blocks strictly left of the diagonal */
    idx = 7*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
    s4  = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    s7  = b[6+idx];
    while (nz--) {
      jdx = 7*(*vi++);
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
      x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
      x7  = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 49*diag[i] + 49;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;  /* blocks strictly right of the diagonal */
    idt = 7*i;
    s1  = x[idt];   s2 = x[1+idt];
    s3  = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    s7  = x[6+idt];
    while (nz--) {
      idx = 7*(*vi++);
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
      x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
      x7  = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    /* apply the diagonal block of row i */
    v        = aa + 49*diag[i];
    x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
                       + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
                       + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
                       + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
                       + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
                       + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
                       + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
                       + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* Blocks are 7x7 here: 2*49 flops per stored block, 7 per block row.  The
     constants used to be 2.0*36 and 6.0, which match a 6x6 kernel and
     under-reported the work done by this routine. */
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct - Unpermuted 7x7-block
   solve that reads the factor through a->i only:

     - block row i of the lower factor occupies ai[i] .. ai[i+1]-1
     - block row i of the upper factor occupies ai[2*n-i] .. ai[2*n-i+1]-1 and
       ends with the diagonal block, so v points at the diagonal once the
       off-diagonal loop has finished.

   The strides come from bs = A->rmap->bs and bs2 = a->bs2, while the unrolled
   arithmetic is written for 7 rows and 7 columns per block.

   Input:  A (factored matrix), bb (right-hand side).  Output: xx (solution;
   also used as the work array).
   Returns 0 through PetscFunctionReturn(); PETSc call failures go to CHKERRQ().
*/
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
  PetscErrorCode    ierr;
  PetscInt          idx,jdx,idt;
  PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
  x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = bs*i;
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
    for(k=0;k<nz;k++) {
      jdx = bs*vi[k];
      x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
      x5  = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }

    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + bs2*ai[2*n-i];
    vi  = aj + ai[2*n-i];
    nz  = ai[2*n-i +1] - ai[2*n-i]-1;   /* all blocks of the row except the trailing diagonal */
    idt = bs*i;
    s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
    s5  = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
    for(k=0;k<nz;k++) {
      idx = bs*vi[k];
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
      x5  = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }
    /* x = inv_diagonal*x */
    x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2 - Unpermuted 7x7-block
   solve that locates the upper factor through a->diag:

     - block row i of the lower factor occupies ai[i] .. ai[i+1]-1
     - the off-diagonal blocks of row i of the upper factor occupy
       adiag[i+1]+1 .. adiag[i]-1 and the diagonal block sits at adiag[i],
       so v points at the diagonal once the off-diagonal loop has finished.

   The strides come from bs = A->rmap->bs and bs2 = a->bs2, while the unrolled
   arithmetic is written for 7 rows and 7 columns per block.

   Input:  A (factored matrix), bb (right-hand side).  Output: xx (solution;
   also used as the work array).
   Returns 0 through PetscFunctionReturn(); PETSc call failures go to CHKERRQ().
*/
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
  PetscErrorCode    ierr;
  PetscInt          idx,jdx,idt;
  PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
  x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = bs*i;
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
    for(k=0;k<nz;k++) {
      jdx = bs*vi[k];
      x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
      x5  = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }

    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + bs2*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1]-1;
    idt = bs*i;
    s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
    s5  = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
    for(k=0;k<nz;k++) {
      idx = bs*vi[k];
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
      x5  = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }
    /* x = inv_diagonal*x */
    x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

182453cca76cSShri Abhyankar #undef __FUNCT__ 18254a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1826dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 182715091d37SBarry Smith { 182815091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182915091d37SBarry Smith IS iscol=a->col,isrow=a->row; 18306849ba73SBarry Smith PetscErrorCode ierr; 18315d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 18325d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1833d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1834d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1835d9fead3dSBarry Smith const PetscScalar *b; 183615091d37SBarry Smith PetscFunctionBegin; 1837d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18381ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1839f1af5d2fSBarry Smith t = a->solve_work; 184015091d37SBarry Smith 184115091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 184215091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 184315091d37SBarry Smith 184415091d37SBarry Smith /* forward solve the lower triangular */ 184515091d37SBarry Smith idx = 6*(*r++); 1846f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1847f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1848f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 184915091d37SBarry Smith for (i=1; i<n; i++) {
185015091d37SBarry Smith v = aa + 36*ai[i]; 185115091d37SBarry Smith vi = aj + ai[i]; 185215091d37SBarry Smith nz = diag[i] - ai[i]; 185315091d37SBarry Smith idx = 6*(*r++); 1854f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1855f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 185615091d37SBarry Smith while (nz--) { 185715091d37SBarry Smith idx = 6*(*vi++); 1858f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1859f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1860f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1861f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1862f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1863f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1864f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1865f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 186615091d37SBarry Smith v += 36; 186715091d37SBarry Smith } 186815091d37SBarry Smith idx = 6*i; 1869f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1870f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1871f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 187215091d37SBarry Smith } 187315091d37SBarry Smith /* backward solve the upper triangular */ 187415091d37SBarry Smith for (i=n-1; i>=0; i--){ 187515091d37SBarry Smith v = aa + 36*diag[i] + 36; 187615091d37SBarry Smith vi = aj + diag[i] + 1; 187715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 187815091d37SBarry Smith idt = 6*i; 1879f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1880f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1881f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 188215091d37SBarry Smith while (nz--) { 188315091d37SBarry Smith idx = 6*(*vi++); 1884f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1885f1af5d2fSBarry 
Smith x3 = t[2+idx]; x4 = t[3+idx]; 1886f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1887f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1888f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1889f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1890f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1891f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1892f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 189315091d37SBarry Smith v += 36; 189415091d37SBarry Smith } 189515091d37SBarry Smith idc = 6*(*c--); 189615091d37SBarry Smith v = aa + 36*diag[i]; 1897f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1898f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1899f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1900f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1901f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1902f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1903f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1904f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1905f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1906f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1907f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1908f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 190915091d37SBarry Smith } 191015091d37SBarry Smith 191115091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 191215091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1913d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19141ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1915dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 
6.0*A->cmap->n);CHKERRQ(ierr); 191615091d37SBarry Smith PetscFunctionReturn(0); 191715091d37SBarry Smith } 191815091d37SBarry Smith 19194a2ae208SSatish Balay #undef __FUNCT__ 19208f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 19218f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 19228f690400SShri Abhyankar { 19238f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 19248f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 19258f690400SShri Abhyankar PetscErrorCode ierr; 19268f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 192729b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 19288f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 19298f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 19308f690400SShri Abhyankar const PetscScalar *b; 19318f690400SShri Abhyankar PetscFunctionBegin; 19328f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19338f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 19348f690400SShri Abhyankar t = a->solve_work; 19358f690400SShri Abhyankar 19368f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 193729b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 19388f690400SShri Abhyankar 19398f690400SShri Abhyankar /* forward solve the lower triangular */ 194029b92fc1SShri Abhyankar idx = 6*r[0]; 19418f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 19428f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 19438f690400SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 19448f690400SShri Abhyankar for (i=1; i<n; i++) { 19458f690400SShri Abhyankar v = aa + 36*ai[i]; 19468f690400SShri Abhyankar vi = aj + ai[i]; 19478f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 194829b92fc1SShri Abhyankar idx = 6*r[i]; 19498f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 
19508f690400SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 195129b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 195229b92fc1SShri Abhyankar idx = 6*vi[m]; 19538f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 19548f690400SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 19558f690400SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 19568f690400SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 19578f690400SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 19588f690400SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 19598f690400SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 19608f690400SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 19618f690400SShri Abhyankar v += 36; 19628f690400SShri Abhyankar } 19638f690400SShri Abhyankar idx = 6*i; 19648f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 19658f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 19668f690400SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 19678f690400SShri Abhyankar } 19688f690400SShri Abhyankar /* backward solve the upper triangular */ 19698f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 19708f690400SShri Abhyankar k = 2*n-i; 19718f690400SShri Abhyankar v = aa + 36*ai[k]; 19728f690400SShri Abhyankar vi = aj + ai[k]; 19738f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 19748f690400SShri Abhyankar idt = 6*i; 19758f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 19768f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 19778f690400SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 197829b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 197929b92fc1SShri Abhyankar idx = 6*vi[m]; 19808f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 19818f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 19828f690400SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 19838f690400SShri Abhyankar s1 -= 
v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 19848f690400SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 19858f690400SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 19868f690400SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 19878f690400SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 19888f690400SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 19898f690400SShri Abhyankar v += 36; 19908f690400SShri Abhyankar } 199129b92fc1SShri Abhyankar idc = 6*c[i]; 19928f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 19938f690400SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 19948f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 19958f690400SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 19968f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 19978f690400SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 19988f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 19998f690400SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 20008f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 20018f690400SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 20028f690400SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 20038f690400SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 20048f690400SShri Abhyankar } 20058f690400SShri Abhyankar 20068f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 20078f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 20088f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20098f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 20108f690400SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 20118f690400SShri Abhyankar PetscFunctionReturn(0); 20128f690400SShri 
Abhyankar } 20138f690400SShri Abhyankar 20146506fda5SShri Abhyankar #undef __FUNCT__ 20156506fda5SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2" 20166506fda5SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx) 20176506fda5SShri Abhyankar { 20186506fda5SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 20196506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 20206506fda5SShri Abhyankar PetscErrorCode ierr; 20216506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 20226506fda5SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 20236506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 20246506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 20256506fda5SShri Abhyankar const PetscScalar *b; 20266506fda5SShri Abhyankar PetscFunctionBegin; 20276506fda5SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20286506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 20296506fda5SShri Abhyankar t = a->solve_work; 20306506fda5SShri Abhyankar 20316506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 20326506fda5SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 20336506fda5SShri Abhyankar 20346506fda5SShri Abhyankar /* forward solve the lower triangular */ 20356506fda5SShri Abhyankar idx = 6*r[0]; 20366506fda5SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 20376506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 20386506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 20396506fda5SShri Abhyankar for (i=1; i<n; i++) { 20406506fda5SShri Abhyankar v = aa + 36*ai[i]; 20416506fda5SShri Abhyankar vi = aj + ai[i]; 20426506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 20436506fda5SShri Abhyankar idx = 6*r[i]; 20446506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 20456506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 20466506fda5SShri 
Abhyankar for(m=0;m<nz;m++){ 20476506fda5SShri Abhyankar idx = 6*vi[m]; 20486506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 20496506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 20506506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 20516506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 20526506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 20536506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 20546506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 20556506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 20566506fda5SShri Abhyankar v += 36; 20576506fda5SShri Abhyankar } 20586506fda5SShri Abhyankar idx = 6*i; 20596506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 20606506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 20616506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 20626506fda5SShri Abhyankar } 20636506fda5SShri Abhyankar /* backward solve the upper triangular */ 20646506fda5SShri Abhyankar for (i=n-1; i>=0; i--){ 20656506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 20666506fda5SShri Abhyankar vi = aj + adiag[i+1]+1; 20676506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 20686506fda5SShri Abhyankar idt = 6*i; 20696506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 20706506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 20716506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 20726506fda5SShri Abhyankar for(m=0;m<nz;m++){ 20736506fda5SShri Abhyankar idx = 6*vi[m]; 20746506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 20756506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 20766506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 20776506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 20786506fda5SShri Abhyankar s2 -= 
v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 20796506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 20806506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 20816506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 20826506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 20836506fda5SShri Abhyankar v += 36; 20846506fda5SShri Abhyankar } 20856506fda5SShri Abhyankar idc = 6*c[i]; 20866506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 20876506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 20886506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 20896506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 20906506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 20916506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 20926506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 20936506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 20946506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 20956506fda5SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 20966506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 20976506fda5SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 20986506fda5SShri Abhyankar } 20996506fda5SShri Abhyankar 21006506fda5SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21016506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21026506fda5SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21036506fda5SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 21046506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 21056506fda5SShri Abhyankar PetscFunctionReturn(0); 21066506fda5SShri Abhyankar } 21078f690400SShri Abhyankar 21088f690400SShri Abhyankar #undef __FUNCT__ 
21094a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 2110dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 211115091d37SBarry Smith { 211215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2113690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2114dfbe8321SBarry Smith PetscErrorCode ierr; 2115690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2116d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2117d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2118d9fead3dSBarry Smith const PetscScalar *b; 211915091d37SBarry Smith 212015091d37SBarry Smith PetscFunctionBegin; 2121d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21221ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 212315091d37SBarry Smith /* forward solve the lower triangular */ 212415091d37SBarry Smith idx = 0; 212515091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 212615091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 212715091d37SBarry Smith for (i=1; i<n; i++) { 212815091d37SBarry Smith v = aa + 36*ai[i]; 212915091d37SBarry Smith vi = aj + ai[i]; 213015091d37SBarry Smith nz = diag[i] - ai[i]; 213115091d37SBarry Smith idx = 6*i; 2132f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2133f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 213415091d37SBarry Smith while (nz--) { 213515091d37SBarry Smith jdx = 6*(*vi++); 213615091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 213715091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2138f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2139f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2140f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2141f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 
+ v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2142f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2143f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 214415091d37SBarry Smith v += 36; 214515091d37SBarry Smith } 2146f1af5d2fSBarry Smith x[idx] = s1; 2147f1af5d2fSBarry Smith x[1+idx] = s2; 2148f1af5d2fSBarry Smith x[2+idx] = s3; 2149f1af5d2fSBarry Smith x[3+idx] = s4; 2150f1af5d2fSBarry Smith x[4+idx] = s5; 2151f1af5d2fSBarry Smith x[5+idx] = s6; 215215091d37SBarry Smith } 215315091d37SBarry Smith /* backward solve the upper triangular */ 215415091d37SBarry Smith for (i=n-1; i>=0; i--){ 215515091d37SBarry Smith v = aa + 36*diag[i] + 36; 215615091d37SBarry Smith vi = aj + diag[i] + 1; 215715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 215815091d37SBarry Smith idt = 6*i; 2159f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2160f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 2161f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 216215091d37SBarry Smith while (nz--) { 216315091d37SBarry Smith idx = 6*(*vi++); 216415091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 216515091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2166f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2167f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2168f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2169f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2170f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2171f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 217215091d37SBarry Smith v += 36; 217315091d37SBarry Smith } 217415091d37SBarry Smith v = aa + 36*diag[i]; 2175f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + 
v[24]*s5 + v[30]*s6; 2176f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2177f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2178f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2179f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2180f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 218115091d37SBarry Smith } 218215091d37SBarry Smith 2183d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21841ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2185dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 218615091d37SBarry Smith PetscFunctionReturn(0); 218715091d37SBarry Smith } 218815091d37SBarry Smith 21894a2ae208SSatish Balay #undef __FUNCT__ 2190cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2191cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2192cee9d6f2SShri Abhyankar { 2193cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 21946464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2195cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2196cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 2197cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2198cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2199cee9d6f2SShri Abhyankar PetscScalar *x; 2200cee9d6f2SShri Abhyankar const PetscScalar *b; 2201cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2202cee9d6f2SShri Abhyankar 2203cee9d6f2SShri Abhyankar PetscFunctionBegin; 2204cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2205cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2206cee9d6f2SShri Abhyankar 
/* forward solve the lower triangular */ 2207cee9d6f2SShri Abhyankar idx = 0; 2208cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2209cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 2210cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2211cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 2212cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2213cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2214cee9d6f2SShri Abhyankar idx = bs*i; 2215cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2216cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 22176464896eSShri Abhyankar for(k=0;k<nz;k++){ 22186464896eSShri Abhyankar jdx = bs*vi[k]; 2219cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2220cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 2221cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2222cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2223cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2224cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2225cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2226cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2227cee9d6f2SShri Abhyankar v += bs2; 2228cee9d6f2SShri Abhyankar } 2229cee9d6f2SShri Abhyankar 2230cee9d6f2SShri Abhyankar x[idx] = s1; 2231cee9d6f2SShri Abhyankar x[1+idx] = s2; 2232cee9d6f2SShri Abhyankar x[2+idx] = s3; 2233cee9d6f2SShri Abhyankar x[3+idx] = s4; 2234cee9d6f2SShri Abhyankar x[4+idx] = s5; 2235cee9d6f2SShri Abhyankar x[5+idx] = s6; 2236cee9d6f2SShri Abhyankar } 2237cee9d6f2SShri Abhyankar 2238cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2239cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2240cee9d6f2SShri Abhyankar v = aa + 
bs2*ai[2*n-i]; 2241cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2242cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2243cee9d6f2SShri Abhyankar idt = bs*i; 2244cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2245cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 22466464896eSShri Abhyankar for(k=0;k<nz;k++){ 22476464896eSShri Abhyankar idx = bs*vi[k]; 2248cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2249cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 2250cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2251cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2252cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2253cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2254cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2255cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2256cee9d6f2SShri Abhyankar v += bs2; 2257cee9d6f2SShri Abhyankar } 2258cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2259cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2260cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2261cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2262cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2263cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2264cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2265cee9d6f2SShri Abhyankar } 2266cee9d6f2SShri Abhyankar 2267cee9d6f2SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2268cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2269cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2270cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2271cee9d6f2SShri Abhyankar } 22728f690400SShri Abhyankar 2273cee9d6f2SShri Abhyankar #undef __FUNCT__ 227453cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2" 227553cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 227653cca76cSShri Abhyankar { 227753cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 227853cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 227953cca76cSShri Abhyankar PetscErrorCode ierr; 228053cca76cSShri Abhyankar PetscInt idx,jdx,idt; 228153cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 228253cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 228353cca76cSShri Abhyankar PetscScalar *x; 228453cca76cSShri Abhyankar const PetscScalar *b; 228553cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 228653cca76cSShri Abhyankar 228753cca76cSShri Abhyankar PetscFunctionBegin; 228853cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 228953cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 229053cca76cSShri Abhyankar /* forward solve the lower triangular */ 229153cca76cSShri Abhyankar idx = 0; 229253cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 229353cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 229453cca76cSShri Abhyankar for (i=1; i<n; i++) { 229553cca76cSShri Abhyankar v = aa + bs2*ai[i]; 229653cca76cSShri Abhyankar vi = aj + ai[i]; 229753cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 229853cca76cSShri Abhyankar idx = bs*i; 229953cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 
230053cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 230153cca76cSShri Abhyankar for(k=0;k<nz;k++){ 230253cca76cSShri Abhyankar jdx = bs*vi[k]; 230353cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 230453cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 230553cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 230653cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 230753cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 230853cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 230953cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 231053cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 231153cca76cSShri Abhyankar v += bs2; 231253cca76cSShri Abhyankar } 231353cca76cSShri Abhyankar 231453cca76cSShri Abhyankar x[idx] = s1; 231553cca76cSShri Abhyankar x[1+idx] = s2; 231653cca76cSShri Abhyankar x[2+idx] = s3; 231753cca76cSShri Abhyankar x[3+idx] = s4; 231853cca76cSShri Abhyankar x[4+idx] = s5; 231953cca76cSShri Abhyankar x[5+idx] = s6; 232053cca76cSShri Abhyankar } 232153cca76cSShri Abhyankar 232253cca76cSShri Abhyankar /* backward solve the upper triangular */ 232353cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 232453cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 232553cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 232653cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 232753cca76cSShri Abhyankar idt = bs*i; 232853cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 232953cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 233053cca76cSShri Abhyankar for(k=0;k<nz;k++){ 233153cca76cSShri Abhyankar idx = bs*vi[k]; 233253cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 233353cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 
233453cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 233553cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 233653cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 233753cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 233853cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 233953cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 234053cca76cSShri Abhyankar v += bs2; 234153cca76cSShri Abhyankar } 234253cca76cSShri Abhyankar /* x = inv_diagonal*x */ 234353cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 234453cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 234553cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 234653cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 234753cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 234853cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 234953cca76cSShri Abhyankar } 235053cca76cSShri Abhyankar 235153cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 235253cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 235353cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 235453cca76cSShri Abhyankar PetscFunctionReturn(0); 235553cca76cSShri Abhyankar } 235653cca76cSShri Abhyankar 235753cca76cSShri Abhyankar #undef __FUNCT__ 23584a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2359dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 23604e2b4712SSatish Balay { 23614e2b4712SSatish Balay Mat_SeqBAIJ 
*a=(Mat_SeqBAIJ *)A->data; 23624e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 23636849ba73SBarry Smith PetscErrorCode ierr; 23645d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 23655d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2366d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2367d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2368d9fead3dSBarry Smith const PetscScalar *b; 23694e2b4712SSatish Balay 23704e2b4712SSatish Balay PetscFunctionBegin; 2371d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23721ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2373f1af5d2fSBarry Smith t = a->solve_work; 23744e2b4712SSatish Balay 23754e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 23764e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 23774e2b4712SSatish Balay 23784e2b4712SSatish Balay /* forward solve the lower triangular */ 23794e2b4712SSatish Balay idx = 5*(*r++); 2380f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2381f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 23824e2b4712SSatish Balay for (i=1; i<n; i++) { 23834e2b4712SSatish Balay v = aa + 25*ai[i]; 23844e2b4712SSatish Balay vi = aj + ai[i]; 23854e2b4712SSatish Balay nz = diag[i] - ai[i]; 23864e2b4712SSatish Balay idx = 5*(*r++); 2387f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2388f1af5d2fSBarry Smith s5 = b[4+idx]; 23894e2b4712SSatish Balay while (nz--) { 23904e2b4712SSatish Balay idx = 5*(*vi++); 2391f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2392f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2393f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2394f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2395f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 
2396f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2397f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 23984e2b4712SSatish Balay v += 25; 23994e2b4712SSatish Balay } 24004e2b4712SSatish Balay idx = 5*i; 2401f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2402f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 24034e2b4712SSatish Balay } 24044e2b4712SSatish Balay /* backward solve the upper triangular */ 24054e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 24064e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 24074e2b4712SSatish Balay vi = aj + diag[i] + 1; 24084e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 24094e2b4712SSatish Balay idt = 5*i; 2410f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2411f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 24124e2b4712SSatish Balay while (nz--) { 24134e2b4712SSatish Balay idx = 5*(*vi++); 2414f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2415f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2416f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2417f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2418f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2419f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2420f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 24214e2b4712SSatish Balay v += 25; 24224e2b4712SSatish Balay } 24234e2b4712SSatish Balay idc = 5*(*c--); 24244e2b4712SSatish Balay v = aa + 25*diag[i]; 2425f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2426f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 2427f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2428f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 2429f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2430f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 2431f1af5d2fSBarry Smith 
x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2432f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 2433f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2434f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 24354e2b4712SSatish Balay } 24364e2b4712SSatish Balay 24374e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 24384e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2439d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24401ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2441dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 24424e2b4712SSatish Balay PetscFunctionReturn(0); 24434e2b4712SSatish Balay } 24444e2b4712SSatish Balay 24454a2ae208SSatish Balay #undef __FUNCT__ 24468f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 24478f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 24488f690400SShri Abhyankar { 24498f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 24508f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 24518f690400SShri Abhyankar PetscErrorCode ierr; 24528f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 245329b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 24548f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 24558f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 24568f690400SShri Abhyankar const PetscScalar *b; 24578f690400SShri Abhyankar 24588f690400SShri Abhyankar PetscFunctionBegin; 24598f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24608f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24618f690400SShri Abhyankar t = a->solve_work; 24628f690400SShri Abhyankar 24638f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 246429b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); 
c = cout; 24658f690400SShri Abhyankar 24668f690400SShri Abhyankar /* forward solve the lower triangular */ 246729b92fc1SShri Abhyankar idx = 5*r[0]; 24688f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 24698f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 24708f690400SShri Abhyankar for (i=1; i<n; i++) { 24718f690400SShri Abhyankar v = aa + 25*ai[i]; 24728f690400SShri Abhyankar vi = aj + ai[i]; 24738f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 247429b92fc1SShri Abhyankar idx = 5*r[i]; 24758f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 24768f690400SShri Abhyankar s5 = b[4+idx]; 247729b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 247829b92fc1SShri Abhyankar idx = 5*vi[m]; 24798f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 24808f690400SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 24818f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 24828f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 24838f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 24848f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 24858f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 24868f690400SShri Abhyankar v += 25; 24878f690400SShri Abhyankar } 24888f690400SShri Abhyankar idx = 5*i; 24898f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 24908f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 24918f690400SShri Abhyankar } 24928f690400SShri Abhyankar /* backward solve the upper triangular */ 24938f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 24948f690400SShri Abhyankar k = 2*n-i; 24958f690400SShri Abhyankar v = aa + 25*ai[k]; 24968f690400SShri Abhyankar vi = aj + ai[k]; 24978f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 24988f690400SShri Abhyankar idt = 5*i; 24998f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 25008f690400SShri Abhyankar s3 = 
t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 250129b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 250229b92fc1SShri Abhyankar idx = 5*vi[m]; 25038f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 25048f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 25058f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 25068f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 25078f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 25088f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 25098f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 25108f690400SShri Abhyankar v += 25; 25118f690400SShri Abhyankar } 251229b92fc1SShri Abhyankar idc = 5*c[i]; 25138f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 25148f690400SShri Abhyankar v[15]*s4+v[20]*s5; 25158f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 25168f690400SShri Abhyankar v[16]*s4+v[21]*s5; 25178f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 25188f690400SShri Abhyankar v[17]*s4+v[22]*s5; 25198f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 25208f690400SShri Abhyankar v[18]*s4+v[23]*s5; 25218f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 25228f690400SShri Abhyankar v[19]*s4+v[24]*s5; 25238f690400SShri Abhyankar } 25248f690400SShri Abhyankar 25258f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 25268f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 25278f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25288f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 25298f690400SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 25308f690400SShri Abhyankar PetscFunctionReturn(0); 25318f690400SShri Abhyankar } 253278bb4007SShri Abhyankar 
253378bb4007SShri Abhyankar #undef __FUNCT__ 253478bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2" 253578bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx) 253678bb4007SShri Abhyankar { 253778bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 253878bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 253978bb4007SShri Abhyankar PetscErrorCode ierr; 254078bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 254178bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 254278bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 254378bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 254478bb4007SShri Abhyankar const PetscScalar *b; 254578bb4007SShri Abhyankar 254678bb4007SShri Abhyankar PetscFunctionBegin; 254778bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 254878bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 254978bb4007SShri Abhyankar t = a->solve_work; 255078bb4007SShri Abhyankar 255178bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 255278bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 255378bb4007SShri Abhyankar 255478bb4007SShri Abhyankar /* forward solve the lower triangular */ 255578bb4007SShri Abhyankar idx = 5*r[0]; 255678bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 255778bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 255878bb4007SShri Abhyankar for (i=1; i<n; i++) { 255978bb4007SShri Abhyankar v = aa + 25*ai[i]; 256078bb4007SShri Abhyankar vi = aj + ai[i]; 256178bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 256278bb4007SShri Abhyankar idx = 5*r[i]; 256378bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 256478bb4007SShri Abhyankar s5 = b[4+idx]; 256578bb4007SShri Abhyankar for(m=0;m<nz;m++){ 256678bb4007SShri Abhyankar idx = 5*vi[m]; 
256778bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 256878bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 256978bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 257078bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 257178bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 257278bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 257378bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 257478bb4007SShri Abhyankar v += 25; 257578bb4007SShri Abhyankar } 257678bb4007SShri Abhyankar idx = 5*i; 257778bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 257878bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 257978bb4007SShri Abhyankar } 258078bb4007SShri Abhyankar /* backward solve the upper triangular */ 258178bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 258278bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 258378bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 258478bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 258578bb4007SShri Abhyankar idt = 5*i; 258678bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 258778bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 258878bb4007SShri Abhyankar for(m=0;m<nz;m++){ 258978bb4007SShri Abhyankar idx = 5*vi[m]; 259078bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 259178bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 259278bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 259378bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 259478bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 259578bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 259678bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 259778bb4007SShri Abhyankar v += 25; 259878bb4007SShri Abhyankar } 
259978bb4007SShri Abhyankar idc = 5*c[i]; 260078bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 260178bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 260278bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 260378bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 260478bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 260578bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 260678bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 260778bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 260878bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 260978bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 261078bb4007SShri Abhyankar } 261178bb4007SShri Abhyankar 261278bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 261378bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 261478bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 261578bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 261678bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 261778bb4007SShri Abhyankar PetscFunctionReturn(0); 261878bb4007SShri Abhyankar } 261978bb4007SShri Abhyankar 26208f690400SShri Abhyankar #undef __FUNCT__ 26214a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2622dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 262315091d37SBarry Smith { 262415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2625690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2626dfbe8321SBarry Smith PetscErrorCode ierr; 2627690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2628d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2629d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2630d9fead3dSBarry Smith const PetscScalar *b; 263115091d37SBarry Smith 263215091d37SBarry Smith PetscFunctionBegin; 2633d9fead3dSBarry 
Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 263515091d37SBarry Smith /* forward solve the lower triangular */ 263615091d37SBarry Smith idx = 0; 263715091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 263815091d37SBarry Smith for (i=1; i<n; i++) { 263915091d37SBarry Smith v = aa + 25*ai[i]; 264015091d37SBarry Smith vi = aj + ai[i]; 264115091d37SBarry Smith nz = diag[i] - ai[i]; 264215091d37SBarry Smith idx = 5*i; 2643f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 264415091d37SBarry Smith while (nz--) { 264515091d37SBarry Smith jdx = 5*(*vi++); 264615091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2647f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2648f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2649f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2650f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2651f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 265215091d37SBarry Smith v += 25; 265315091d37SBarry Smith } 2654f1af5d2fSBarry Smith x[idx] = s1; 2655f1af5d2fSBarry Smith x[1+idx] = s2; 2656f1af5d2fSBarry Smith x[2+idx] = s3; 2657f1af5d2fSBarry Smith x[3+idx] = s4; 2658f1af5d2fSBarry Smith x[4+idx] = s5; 265915091d37SBarry Smith } 266015091d37SBarry Smith /* backward solve the upper triangular */ 266115091d37SBarry Smith for (i=n-1; i>=0; i--){ 266215091d37SBarry Smith v = aa + 25*diag[i] + 25; 266315091d37SBarry Smith vi = aj + diag[i] + 1; 266415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 266515091d37SBarry Smith idt = 5*i; 2666f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2667f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 266815091d37SBarry Smith while (nz--) { 
266915091d37SBarry Smith idx = 5*(*vi++); 267015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2671f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2672f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2673f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2674f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2675f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 267615091d37SBarry Smith v += 25; 267715091d37SBarry Smith } 267815091d37SBarry Smith v = aa + 25*diag[i]; 2679f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2680f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2681f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2682f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2683f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 268415091d37SBarry Smith } 268515091d37SBarry Smith 2686d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26871ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2688dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 268915091d37SBarry Smith PetscFunctionReturn(0); 269015091d37SBarry Smith } 269115091d37SBarry Smith 26924a2ae208SSatish Balay #undef __FUNCT__ 2693cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2694cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2695cee9d6f2SShri Abhyankar { 2696cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 26976464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2698cee9d6f2SShri Abhyankar PetscErrorCode ierr; 
2699cee9d6f2SShri Abhyankar PetscInt jdx; 2700cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2701cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2702cee9d6f2SShri Abhyankar const PetscScalar *b; 2703cee9d6f2SShri Abhyankar 2704cee9d6f2SShri Abhyankar PetscFunctionBegin; 2705cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2706cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2707cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2708cee9d6f2SShri Abhyankar idx = 0; 2709cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2710cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2711cee9d6f2SShri Abhyankar v = aa + 25*ai[i]; 2712cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2713cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2714cee9d6f2SShri Abhyankar idx = 5*i; 2715cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 27166464896eSShri Abhyankar for(k=0;k<nz;k++) { 27176464896eSShri Abhyankar jdx = 5*vi[k]; 2718cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2719cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2720cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2721cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2722cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2723cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2724cee9d6f2SShri Abhyankar v += 25; 2725cee9d6f2SShri Abhyankar } 2726cee9d6f2SShri Abhyankar x[idx] = s1; 2727cee9d6f2SShri Abhyankar x[1+idx] = s2; 2728cee9d6f2SShri Abhyankar x[2+idx] = s3; 2729cee9d6f2SShri Abhyankar x[3+idx] = s4; 2730cee9d6f2SShri Abhyankar x[4+idx] = s5; 2731cee9d6f2SShri Abhyankar } 2732cee9d6f2SShri Abhyankar 2733cee9d6f2SShri Abhyankar /* backward solve 
the upper triangular */ 2734cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2735cee9d6f2SShri Abhyankar v = aa + 25*ai[2*n-i]; 2736cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2737cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2738cee9d6f2SShri Abhyankar idt = 5*i; 2739cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 2740cee9d6f2SShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 27416464896eSShri Abhyankar for(k=0;k<nz;k++){ 27426464896eSShri Abhyankar idx = 5*vi[k]; 2743cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2744cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2745cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2746cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2747cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2748cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2749cee9d6f2SShri Abhyankar v += 25; 2750cee9d6f2SShri Abhyankar } 2751cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2752cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2753cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2754cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2755cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2756cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2757cee9d6f2SShri Abhyankar } 2758cee9d6f2SShri Abhyankar 2759cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2760cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2761cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2762cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2763cee9d6f2SShri Abhyankar } 
2764cee9d6f2SShri Abhyankar 2765cee9d6f2SShri Abhyankar #undef __FUNCT__ 276653cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2" 276753cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 276853cca76cSShri Abhyankar { 276953cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 277053cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 277153cca76cSShri Abhyankar PetscErrorCode ierr; 277253cca76cSShri Abhyankar PetscInt jdx; 277353cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 277453cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 277553cca76cSShri Abhyankar const PetscScalar *b; 277653cca76cSShri Abhyankar 277753cca76cSShri Abhyankar PetscFunctionBegin; 277853cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 277953cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 278053cca76cSShri Abhyankar /* forward solve the lower triangular */ 278153cca76cSShri Abhyankar idx = 0; 278253cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 278353cca76cSShri Abhyankar for (i=1; i<n; i++) { 278453cca76cSShri Abhyankar v = aa + 25*ai[i]; 278553cca76cSShri Abhyankar vi = aj + ai[i]; 278653cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 278753cca76cSShri Abhyankar idx = 5*i; 278853cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 278953cca76cSShri Abhyankar for(k=0;k<nz;k++) { 279053cca76cSShri Abhyankar jdx = 5*vi[k]; 279153cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 279253cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 279353cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 279453cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 279553cca76cSShri 
Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 279653cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 279753cca76cSShri Abhyankar v += 25; 279853cca76cSShri Abhyankar } 279953cca76cSShri Abhyankar x[idx] = s1; 280053cca76cSShri Abhyankar x[1+idx] = s2; 280153cca76cSShri Abhyankar x[2+idx] = s3; 280253cca76cSShri Abhyankar x[3+idx] = s4; 280353cca76cSShri Abhyankar x[4+idx] = s5; 280453cca76cSShri Abhyankar } 280553cca76cSShri Abhyankar 280653cca76cSShri Abhyankar /* backward solve the upper triangular */ 280753cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 280853cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 280953cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 281053cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 281153cca76cSShri Abhyankar idt = 5*i; 281253cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 281353cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 281453cca76cSShri Abhyankar for(k=0;k<nz;k++){ 281553cca76cSShri Abhyankar idx = 5*vi[k]; 281653cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 281753cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 281853cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 281953cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 282053cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 282153cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 282253cca76cSShri Abhyankar v += 25; 282353cca76cSShri Abhyankar } 282453cca76cSShri Abhyankar /* x = inv_diagonal*x */ 282553cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 282653cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 282753cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 282853cca76cSShri Abhyankar x[3+idt] = 
v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 282953cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 283053cca76cSShri Abhyankar } 283153cca76cSShri Abhyankar 283253cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 283353cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 283453cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 283553cca76cSShri Abhyankar PetscFunctionReturn(0); 283653cca76cSShri Abhyankar }

/*
   MatSolve_SeqBAIJ_4 - forward/backward block-triangular solve for 4x4 blocks,
   with row and column permutations.

   Input:   A  - matrix whose Mat_SeqBAIJ data holds the factors (a->a, a->i, a->j,
                 a->diag), the index sets a->row / a->col and the work array
                 a->solve_work
            bb - right-hand side
   Output:  xx - solution

   Each 4x4 block is addressed as v[r + 4*c] (column by column).  The forward
   sweep gathers b through the row index set into the work array t; the
   backward sweep multiplies by the block stored at diag[i] and scatters the
   result into x through the column index set, walked from its last entry.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4"
PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
  const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); /* consumed back to front */

  /* forward solve the lower triangular */
  idx  = 4*(*r++);
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + 16*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];          /* blocks strictly left of the diagonal */
    idx = 4*(*r++);
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    while (nz--) {
      idx = 4*(*vi++);
      x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    idx      = 4*i;
    t[idx]   = s1; t[1+idx] = s2;
    t[2+idx] = s3; t[3+idx] = s4;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 16*diag[i] + 16;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;    /* blocks strictly right of the diagonal */
    idt = 4*i;
    s1  = t[idt];   s2 = t[1+idt];
    s3  = t[2+idt]; s4 = t[3+idt];
    while (nz--) {
      idx = 4*(*vi++);
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    idc      = 4*(*c--);
    v        = aa + 16*diag[i];     /* block at the diagonal position */
    x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
    x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
    x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
    x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_4_newdatastruct - same solve as MatSolve_SeqBAIJ_4 for a
   different factor layout: the lower-triangular part of block row i is
   ai[i]..ai[i+1]-1, and the upper-triangular part of block row i is addressed
   through ai[2*n-i], with the diagonal block stored last in that row (the
   inner loop stops one block short and leaves v pointing at it).
   The index sets are accessed by subscript (r[i], c[i]) instead of by walking
   pointers.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
  const PetscInt    *r,*c,*rout,*cout;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* forward solve the lower triangular */
  idx  = 4*r[0];
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + 16*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 4*r[i];
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    for (m=0; m<nz; m++){
      idx = 4*vi[m];
      x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    idx      = 4*i;
    t[idx]   = s1; t[1+idx] = s2;
    t[2+idx] = s3; t[3+idx] = s4;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    k   = 2*n-i;                    /* position of row i's upper part in ai[] */
    v   = aa + 16*ai[k];
    vi  = aj + ai[k];
    nz  = ai[k+1] - ai[k] - 1;      /* all but the last block of the row */
    idt = 4*i;
    s1  = t[idt];   s2 = t[1+idt];
    s3  = t[2+idt]; s4 = t[3+idt];
    for (m=0; m<nz; m++){
      idx = 4*vi[m];
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    /* v now points at the last block of the row, applied as the diagonal factor */
    idc      = 4*c[i];
    x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
    x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
    x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
    x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_4_newdatastruct_v2 - same solve as
   MatSolve_SeqBAIJ_4_newdatastruct, but the upper-triangular part of block
   row i is located through a->diag: its off-diagonal blocks are
   adiag[i+1]+1 .. adiag[i]-1 and the block applied as the diagonal factor
   sits at adiag[i] (where v ends up after the inner loop).
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2"
PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
  const PetscInt    *r,*c,*rout,*cout;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* forward solve the lower triangular */
  idx  = 4*r[0];
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + 16*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 4*r[i];
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    for (m=0; m<nz; m++){
      idx = 4*vi[m];
      x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    idx      = 4*i;
    t[idx]   = s1; t[1+idx] = s2;
    t[2+idx] = s3; t[3+idx] = s4;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 16*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1] - 1;
    idt = 4*i;
    s1  = t[idt];   s2 = t[1+idt];
    s3  = t[2+idt]; s4 = t[3+idt];
    for (m=0; m<nz; m++){
      idx = 4*vi[m];
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    /* v now points at block adiag[i] */
    idc      = 4*c[i];
    x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
    x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
    x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
    x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_4_Demotion - same algorithm and factor layout as
   MatSolve_SeqBAIJ_4, but every accumulator and the work array are held as
   MatScalar: right-hand-side entries are cast to MatScalar on load and the
   result is cast back to PetscScalar when stored into x.  a->solve_work is
   reinterpreted as a MatScalar array.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
  const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
  const MatScalar   *aa=a->a,*v;
  MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = (MatScalar *)a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  idx  = 4*(*r++);
  t[0] = (MatScalar)b[idx];
  t[1] = (MatScalar)b[1+idx];
  t[2] = (MatScalar)b[2+idx];
  t[3] = (MatScalar)b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + 16*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 4*(*r++);
    s1  = (MatScalar)b[idx];
    s2  = (MatScalar)b[1+idx];
    s3  = (MatScalar)b[2+idx];
    s4  = (MatScalar)b[3+idx];
    while (nz--) {
      idx = 4*(*vi++);
      x1  = t[idx];
      x2  = t[1+idx];
      x3  = t[2+idx];
      x4  = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    idx      = 4*i;
    t[idx]   = s1;
    t[1+idx] = s2;
    t[2+idx] = s3;
    t[3+idx] = s4;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 16*diag[i] + 16;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 4*i;
    s1  = t[idt];
    s2  = t[1+idt];
    s3  = t[2+idt];
    s4  = t[3+idt];
    while (nz--) {
      idx = 4*(*vi++);
      x1  = t[idx];
      x2  = t[1+idx];
      x3  = t[2+idx];
      x4  = t[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += 16;
    }
    idc      = 4*(*c--);
    v        = aa + 16*diag[i];
    t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
    t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
    t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
    t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
    x[idc]   = (PetscScalar)t[idt];
    x[1+idc] = (PetscScalar)t[1+idt];
    x[2+idc] = (PetscScalar)t[2+idt];
    x[3+idc] = (PetscScalar)t[3+idt];
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - SSE version of the permuted 4x4 block
   solve.  The running sum for a block row is kept in XMM7; operands are
   demoted to float into two 16-byte aligned stack slots (tmps, tmpx) before
   each 4x4 product and promoted back when stored into the work array t.

   The forward sweep reads previously solved rows from the work array t (the
   array it stores them into), matching the backward sweep and
   MatSolve_SeqBAIJ_4_Demotion.  x is only written, at the very end of each
   backward step, through the column permutation.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  /*
     Note: This code uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
  const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
  MatScalar      *aa=a->a,*v;
  PetscScalar    *x,*b,*t;

  /* Make space in temp stack for 16 Byte Aligned arrays */
  /* 11 floats = up to 3 floats of alignment padding + 4 (tmps) + 4 (tmpx) */
  float          ssealignedspace[11],*tmps,*tmpx;
  unsigned long  offset;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  offset = (unsigned long)ssealignedspace % 16;
  if (offset) offset = (16 - offset)/4;
  tmps = &ssealignedspace[offset];
  tmpx = &ssealignedspace[offset+4];
  PREFETCH_NTA(aa+16*ai[1]);

  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  idx  = 4*(*r++);
  t[0] = b[idx];   t[1] = b[1+idx];
  t[2] = b[2+idx]; t[3] = b[3+idx];
  v    = aa + 16*ai[1];

  for (i=1; i<n;) {
    PREFETCH_NTA(&v[8]);
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 4*(*r++);

    /* Demote sum from double to float */
    CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
    LOAD_PS(tmps,XMM7);

    while (nz--) {
      PREFETCH_NTA(&v[16]);
      idx = 4*(*vi++);

      /* Demote solution (so far) from double to float */
      /* The partial solution lives in t, not in x: x has not been written yet */
      CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

      /* 4x4 Matrix-Vector product with negative accumulation: */
      SSE_INLINE_BEGIN_2(tmpx,v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
      SSE_INLINE_END_2

      v += 16;
    }
    idx = 4*i;
    v   = aa + 16*ai[++i];          /* advances the loop; ai[n] is still a valid entry */
    PREFETCH_NTA(v);
    STORE_PS(tmps,XMM7);

    /* Promote result from float to double */
    CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
  }
  /* backward solve the upper triangular */
  idt  = 4*(n-1);
  ai16 = 16*diag[n-1];
  v    = aa + ai16 + 16;
  for (i=n-1; i>=0;){
    PREFETCH_NTA(&v[8]);
    vi = aj + diag[i] + 1;
    nz = ai[i+1] - diag[i] - 1;

    /* Demote accumulator from double to float */
    CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
    LOAD_PS(tmps,XMM7);

    while (nz--) {
      PREFETCH_NTA(&v[16]);
      idx = 4*(*vi++);

      /* Demote solution (so far) from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

      /* 4x4 Matrix-Vector Product with negative accumulation: */
      SSE_INLINE_BEGIN_2(tmpx,v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
      SSE_INLINE_END_2
      v += 16;
    }
    v = aa + ai16;                  /* diagonal block of the current row */
    --i;
    /*
       Look ahead to the previous row's diagonal block for prefetching only.
       After the last row (i < 0) there is no such block: keep the current
       offset instead of reading diag[-1].
    */
    if (i >= 0) ai16 = 16*diag[i];
    PREFETCH_NTA(aa+ai16+16);
    /*
       Scale the result by the diagonal 4x4 block,
       which was inverted as part of the factorization
    */
    SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
    SSE_INLINE_END_3

    /* Promote solution from float to double */
    CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

    /* Apply reordering to t and stream into x.    */
    /* This way, x doesn't pollute the cache.      */
    /* Be careful with size: 2 doubles = 4 floats! */
    idc = 4*(*c--);
    SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
      /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
      SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
      /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
      SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
    SSE_INLINE_END_2
    v    = aa + ai16 + 16;
    idt -= 4;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif


/*
     Special case where the matrix was ILU(0) factored in the natural
   ordering. This eliminates the need for the column and row permutation.
33954e2b4712SSatish Balay */ 33964a2ae208SSatish Balay #undef __FUNCT__ 33974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3398dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 33994e2b4712SSatish Balay { 34004e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3401356650c2SBarry Smith PetscInt n=a->mbs; 3402356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 3403dfbe8321SBarry Smith PetscErrorCode ierr; 3404356650c2SBarry Smith const PetscInt *diag = a->diag; 3405d9fead3dSBarry Smith const MatScalar *aa=a->a; 3406d9fead3dSBarry Smith PetscScalar *x; 3407d9fead3dSBarry Smith const PetscScalar *b; 34084e2b4712SSatish Balay 34094e2b4712SSatish Balay PetscFunctionBegin; 3410d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 34111ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 34124e2b4712SSatish Balay 3413aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 34142853dc0eSBarry Smith { 341587828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 34162853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 34172853dc0eSBarry Smith } 3418aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 34192853dc0eSBarry Smith { 342087828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 34212853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 34222853dc0eSBarry Smith } 3423aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 34242853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3425e1293385SBarry Smith #else 342630d4dcafSBarry Smith { 342787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3428d9fead3dSBarry Smith const MatScalar *v; 3429356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 3430356650c2SBarry Smith const PetscInt *vi; 3431e1293385SBarry Smith 34324e2b4712SSatish Balay /* forward solve the lower 
triangular */ 34334e2b4712SSatish Balay idx = 0; 3434e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 34354e2b4712SSatish Balay for (i=1; i<n; i++) { 34364e2b4712SSatish Balay v = aa + 16*ai[i]; 34374e2b4712SSatish Balay vi = aj + ai[i]; 34384e2b4712SSatish Balay nz = diag[i] - ai[i]; 3439e1293385SBarry Smith idx += 4; 3440f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 34414e2b4712SSatish Balay while (nz--) { 34424e2b4712SSatish Balay jdx = 4*(*vi++); 34434e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3444f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3445f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3446f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3447f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 34484e2b4712SSatish Balay v += 16; 34494e2b4712SSatish Balay } 3450f1af5d2fSBarry Smith x[idx] = s1; 3451f1af5d2fSBarry Smith x[1+idx] = s2; 3452f1af5d2fSBarry Smith x[2+idx] = s3; 3453f1af5d2fSBarry Smith x[3+idx] = s4; 34544e2b4712SSatish Balay } 34554e2b4712SSatish Balay /* backward solve the upper triangular */ 34564e555682SBarry Smith idt = 4*(n-1); 34574e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 34584e555682SBarry Smith ai16 = 16*diag[i]; 34594e555682SBarry Smith v = aa + ai16 + 16; 34604e2b4712SSatish Balay vi = aj + diag[i] + 1; 34614e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3462f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3463f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 34644e2b4712SSatish Balay while (nz--) { 34654e2b4712SSatish Balay idx = 4*(*vi++); 34664e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3467f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3468f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3469f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3470f1af5d2fSBarry Smith s4 -= 
v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 34714e2b4712SSatish Balay v += 16; 34724e2b4712SSatish Balay } 34734e555682SBarry Smith v = aa + ai16; 3474f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3475f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3476f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3477f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3478329f5518SBarry Smith idt -= 4; 34794e2b4712SSatish Balay } 348030d4dcafSBarry Smith } 3481e1293385SBarry Smith #endif 34824e2b4712SSatish Balay 3483d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 34841ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3485dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 34864e2b4712SSatish Balay PetscFunctionReturn(0); 34874e2b4712SSatish Balay } 34884e2b4712SSatish Balay 3489f26ec98cSKris Buschelman #undef __FUNCT__ 3490cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3491cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3492cee9d6f2SShri Abhyankar { 3493cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 34946464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3495cee9d6f2SShri Abhyankar PetscErrorCode ierr; 3496cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 3497cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3498cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 3499cee9d6f2SShri Abhyankar PetscScalar *x; 3500cee9d6f2SShri Abhyankar const PetscScalar *b; 3501cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3502cee9d6f2SShri Abhyankar 3503cee9d6f2SShri Abhyankar PetscFunctionBegin; 3504cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3505cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
3506cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 3507cee9d6f2SShri Abhyankar idx = 0; 3508cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3509cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 3510cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 3511cee9d6f2SShri Abhyankar vi = aj + ai[i]; 3512cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 3513cee9d6f2SShri Abhyankar idx = bs*i; 3514cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 35156464896eSShri Abhyankar for(k=0;k<nz;k++) { 35166464896eSShri Abhyankar jdx = bs*vi[k]; 3517cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3518cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3519cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3520cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3521cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3522cee9d6f2SShri Abhyankar 3523cee9d6f2SShri Abhyankar v += bs2; 3524cee9d6f2SShri Abhyankar } 3525cee9d6f2SShri Abhyankar 3526cee9d6f2SShri Abhyankar x[idx] = s1; 3527cee9d6f2SShri Abhyankar x[1+idx] = s2; 3528cee9d6f2SShri Abhyankar x[2+idx] = s3; 3529cee9d6f2SShri Abhyankar x[3+idx] = s4; 3530cee9d6f2SShri Abhyankar } 3531cee9d6f2SShri Abhyankar 3532cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 3533cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 3534cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 3535cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 3536cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 3537cee9d6f2SShri Abhyankar idt = bs*i; 3538cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3539cee9d6f2SShri Abhyankar 35406464896eSShri Abhyankar for(k=0;k<nz;k++){ 35416464896eSShri Abhyankar idx = bs*vi[k]; 3542cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3543cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + 
v[4]*x2 + v[8]*x3 + v[12]*x4; 3544cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3545cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3546cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3547cee9d6f2SShri Abhyankar 3548cee9d6f2SShri Abhyankar v += bs2; 3549cee9d6f2SShri Abhyankar } 3550cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 3551cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3552cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3553cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3554cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3555cee9d6f2SShri Abhyankar 3556cee9d6f2SShri Abhyankar } 3557cee9d6f2SShri Abhyankar 3558cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3559cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3560cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3561cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 3562cee9d6f2SShri Abhyankar } 3563cee9d6f2SShri Abhyankar 3564b2b2dd24SShri Abhyankar #undef __FUNCT__ 3565b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2" 3566b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 3567b2b2dd24SShri Abhyankar { 3568b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3569b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3570b2b2dd24SShri Abhyankar PetscErrorCode ierr; 3571b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 3572b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3573b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 3574b2b2dd24SShri Abhyankar PetscScalar *x; 3575b2b2dd24SShri Abhyankar const PetscScalar *b; 3576b2b2dd24SShri Abhyankar PetscScalar 
s1,s2,s3,s4,x1,x2,x3,x4; 3577cee9d6f2SShri Abhyankar 3578b2b2dd24SShri Abhyankar PetscFunctionBegin; 3579b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3580b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3581b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 3582b2b2dd24SShri Abhyankar idx = 0; 3583b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3584b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 3585b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 3586b2b2dd24SShri Abhyankar vi = aj + ai[i]; 3587b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 3588b2b2dd24SShri Abhyankar idx = bs*i; 3589b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3590b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 3591b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 3592b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3593b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3594b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3595b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3596b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3597b2b2dd24SShri Abhyankar 3598b2b2dd24SShri Abhyankar v += bs2; 3599b2b2dd24SShri Abhyankar } 3600b2b2dd24SShri Abhyankar 3601b2b2dd24SShri Abhyankar x[idx] = s1; 3602b2b2dd24SShri Abhyankar x[1+idx] = s2; 3603b2b2dd24SShri Abhyankar x[2+idx] = s3; 3604b2b2dd24SShri Abhyankar x[3+idx] = s4; 3605b2b2dd24SShri Abhyankar } 3606b2b2dd24SShri Abhyankar 3607b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 3608b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 3609b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 3610b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 3611b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 3612b2b2dd24SShri Abhyankar idt = bs*i; 3613b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = 
x[3+idt]; 3614b2b2dd24SShri Abhyankar 3615b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 3616b2b2dd24SShri Abhyankar idx = bs*vi[k]; 3617b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3618b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3619b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3620b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3621b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3622b2b2dd24SShri Abhyankar 3623b2b2dd24SShri Abhyankar v += bs2; 3624b2b2dd24SShri Abhyankar } 3625b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 3626b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3627b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3628b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3629b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3630b2b2dd24SShri Abhyankar 3631b2b2dd24SShri Abhyankar } 3632b2b2dd24SShri Abhyankar 3633b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3634b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3635b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3636b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 3637b2b2dd24SShri Abhyankar } 3638cee9d6f2SShri Abhyankar 3639cee9d6f2SShri Abhyankar #undef __FUNCT__ 3640f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3641dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3642f26ec98cSKris Buschelman { 3643f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3644690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3645dfbe8321SBarry Smith PetscErrorCode ierr; 3646690b6cddSBarry Smith PetscInt *diag = a->diag; 3647f26ec98cSKris Buschelman MatScalar *aa=a->a; 
3648f26ec98cSKris Buschelman PetscScalar *x,*b; 3649f26ec98cSKris Buschelman 3650f26ec98cSKris Buschelman PetscFunctionBegin; 36511ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 36521ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3653f26ec98cSKris Buschelman 3654f26ec98cSKris Buschelman { 3655f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3656f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 3657690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3658f26ec98cSKris Buschelman 3659f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3660f26ec98cSKris Buschelman idx = 0; 3661f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 3662f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 3663f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 3664f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 3665f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3666f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3667f26ec98cSKris Buschelman vi = aj + ai[i]; 3668f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3669f26ec98cSKris Buschelman idx += 4; 3670f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3671f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3672f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3673f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3674f26ec98cSKris Buschelman while (nz--) { 3675f26ec98cSKris Buschelman jdx = 4*(*vi++); 3676f26ec98cSKris Buschelman x1 = t[jdx]; 3677f26ec98cSKris Buschelman x2 = t[1+jdx]; 3678f26ec98cSKris Buschelman x3 = t[2+jdx]; 3679f26ec98cSKris Buschelman x4 = t[3+jdx]; 3680f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3681f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3682f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3683f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3684f26ec98cSKris Buschelman v += 16; 3685f26ec98cSKris Buschelman } 3686f26ec98cSKris Buschelman t[idx] = s1; 3687f26ec98cSKris 
Buschelman t[1+idx] = s2; 3688f26ec98cSKris Buschelman t[2+idx] = s3; 3689f26ec98cSKris Buschelman t[3+idx] = s4; 3690f26ec98cSKris Buschelman } 3691f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3692f26ec98cSKris Buschelman idt = 4*(n-1); 3693f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3694f26ec98cSKris Buschelman ai16 = 16*diag[i]; 3695f26ec98cSKris Buschelman v = aa + ai16 + 16; 3696f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3697f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3698f26ec98cSKris Buschelman s1 = t[idt]; 3699f26ec98cSKris Buschelman s2 = t[1+idt]; 3700f26ec98cSKris Buschelman s3 = t[2+idt]; 3701f26ec98cSKris Buschelman s4 = t[3+idt]; 3702f26ec98cSKris Buschelman while (nz--) { 3703f26ec98cSKris Buschelman idx = 4*(*vi++); 3704f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 3705f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 3706f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 3707f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 3708f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3709f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3710f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3711f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3712f26ec98cSKris Buschelman v += 16; 3713f26ec98cSKris Buschelman } 3714f26ec98cSKris Buschelman v = aa + ai16; 3715f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3716f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3717f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3718f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3719f26ec98cSKris Buschelman idt -= 4; 3720f26ec98cSKris Buschelman } 3721f26ec98cSKris Buschelman } 3722f26ec98cSKris Buschelman 37231ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 
37241ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3725dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3726f26ec98cSKris Buschelman PetscFunctionReturn(0); 3727f26ec98cSKris Buschelman } 3728f26ec98cSKris Buschelman 37293660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 37303660e330SKris Buschelman 37313660e330SKris Buschelman #include PETSC_HAVE_SSE 37323660e330SKris Buschelman #undef __FUNCT__ 37337cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3734dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 37353660e330SKris Buschelman { 37363660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 37372aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 3738dfbe8321SBarry Smith PetscErrorCode ierr; 3739dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 37403660e330SKris Buschelman MatScalar *aa=a->a; 374187828ca2SBarry Smith PetscScalar *x,*b; 37423660e330SKris Buschelman 37433660e330SKris Buschelman PetscFunctionBegin; 37443660e330SKris Buschelman SSE_SCOPE_BEGIN; 37453660e330SKris Buschelman /* 37463660e330SKris Buschelman Note: This code currently uses demotion of double 37473660e330SKris Buschelman to float when performing the mixed-mode computation. 37483660e330SKris Buschelman This may not be numerically reasonable for all applications. 
37493660e330SKris Buschelman */ 37503660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 37513660e330SKris Buschelman 37521ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 37531ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 37543660e330SKris Buschelman { 3755eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 3756eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 37572aa5897fSKris Buschelman int nz,i,idt,ai16; 37582aa5897fSKris Buschelman unsigned int jdx,idx; 37592aa5897fSKris Buschelman unsigned short *vi; 3760eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 37613660e330SKris Buschelman 3762eb05f457SKris Buschelman /* First block is the identity. */ 37633660e330SKris Buschelman idx = 0; 3764eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 37652aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 37663660e330SKris Buschelman 37673660e330SKris Buschelman for (i=1; i<n;) { 37683660e330SKris Buschelman PREFETCH_NTA(&v[8]); 37693660e330SKris Buschelman vi = aj + ai[i]; 37703660e330SKris Buschelman nz = diag[i] - ai[i]; 37713660e330SKris Buschelman idx += 4; 37723660e330SKris Buschelman 3773eb05f457SKris Buschelman /* Demote RHS from double to float. 
*/ 3774eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3775eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 37763660e330SKris Buschelman 37773660e330SKris Buschelman while (nz--) { 37783660e330SKris Buschelman PREFETCH_NTA(&v[16]); 37792aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 37803660e330SKris Buschelman 37813660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 3782eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 37833660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 37843660e330SKris Buschelman 37853660e330SKris Buschelman /* First Column */ 37863660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 37873660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 37883660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 37893660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 37903660e330SKris Buschelman 37913660e330SKris Buschelman /* Second Column */ 37923660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 37933660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 37943660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 37953660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 37963660e330SKris Buschelman 37973660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 37983660e330SKris Buschelman 37993660e330SKris Buschelman /* Third Column */ 38003660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 38013660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 38023660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 38033660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 38043660e330SKris Buschelman 38053660e330SKris Buschelman /* Fourth Column */ 38063660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 38073660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 38083660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 38093660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 38103660e330SKris Buschelman SSE_INLINE_END_2 38113660e330SKris Buschelman 38123660e330SKris Buschelman v += 16; 38133660e330SKris 
Buschelman } 38143660e330SKris Buschelman v = aa + 16*ai[++i]; 38153660e330SKris Buschelman PREFETCH_NTA(v); 3816eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 38173660e330SKris Buschelman } 3818eb05f457SKris Buschelman 3819eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 3820eb05f457SKris Buschelman 38213660e330SKris Buschelman idt = 4*(n-1); 38223660e330SKris Buschelman ai16 = 16*diag[n-1]; 38233660e330SKris Buschelman v = aa + ai16 + 16; 38243660e330SKris Buschelman for (i=n-1; i>=0;){ 38253660e330SKris Buschelman PREFETCH_NTA(&v[8]); 38263660e330SKris Buschelman vi = aj + diag[i] + 1; 38273660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 38283660e330SKris Buschelman 3829eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 38303660e330SKris Buschelman 38313660e330SKris Buschelman while (nz--) { 38323660e330SKris Buschelman PREFETCH_NTA(&v[16]); 38332aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 38343660e330SKris Buschelman 38353660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 3836eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 38373660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 38383660e330SKris Buschelman 38393660e330SKris Buschelman /* First Column */ 38403660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 38413660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 38423660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 38433660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 38443660e330SKris Buschelman 38453660e330SKris Buschelman /* Second Column */ 38463660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 38473660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 38483660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 38493660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 38503660e330SKris Buschelman 38513660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 38523660e330SKris Buschelman 38533660e330SKris Buschelman /* Third Column */ 38543660e330SKris Buschelman 
SSE_COPY_PS(XMM2,XMM6) 38553660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 38563660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 38573660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 38583660e330SKris Buschelman 38593660e330SKris Buschelman /* Fourth Column */ 38603660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 38613660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 38623660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 38633660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 38643660e330SKris Buschelman SSE_INLINE_END_2 38653660e330SKris Buschelman v += 16; 38663660e330SKris Buschelman } 38673660e330SKris Buschelman v = aa + ai16; 38683660e330SKris Buschelman ai16 = 16*diag[--i]; 38693660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 38703660e330SKris Buschelman /* 38713660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 38723660e330SKris Buschelman which was inverted as part of the factorization 38733660e330SKris Buschelman */ 3874eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 38753660e330SKris Buschelman /* First Column */ 38763660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 38773660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 38783660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 38793660e330SKris Buschelman 38803660e330SKris Buschelman /* Second Column */ 38813660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 38823660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 38833660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 38843660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 38853660e330SKris Buschelman 38863660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 38873660e330SKris Buschelman 38883660e330SKris Buschelman /* Third Column */ 38893660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 38903660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 38913660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 38923660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 38933660e330SKris 
Buschelman 38943660e330SKris Buschelman /* Fourth Column */ 38953660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 38963660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 38973660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 38983660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 38993660e330SKris Buschelman 39003660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 39013660e330SKris Buschelman SSE_INLINE_END_3 39023660e330SKris Buschelman 39033660e330SKris Buschelman v = aa + ai16 + 16; 39043660e330SKris Buschelman idt -= 4; 39053660e330SKris Buschelman } 3906eb05f457SKris Buschelman 3907eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 3908eb05f457SKris Buschelman idt = 4*(n-1); 3909eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 3910eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3911eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3912eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 3913eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 3914eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 3915eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 3916eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 3917eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 391854693613SKris Buschelman idt -= 4; 39193660e330SKris Buschelman } 3920eb05f457SKris Buschelman 3921eb05f457SKris Buschelman } /* End of artificial scope. 
*/ 39221ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 39231ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3924dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 39253660e330SKris Buschelman SSE_SCOPE_END; 39263660e330SKris Buschelman PetscFunctionReturn(0); 39273660e330SKris Buschelman } 39283660e330SKris Buschelman 39297cf1b8d3SKris Buschelman #undef __FUNCT__ 39307cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3931dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 39327cf1b8d3SKris Buschelman { 39337cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 39347cf1b8d3SKris Buschelman int *aj=a->j; 3935dfbe8321SBarry Smith PetscErrorCode ierr; 3936dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 39377cf1b8d3SKris Buschelman MatScalar *aa=a->a; 39387cf1b8d3SKris Buschelman PetscScalar *x,*b; 39397cf1b8d3SKris Buschelman 39407cf1b8d3SKris Buschelman PetscFunctionBegin; 39417cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 39427cf1b8d3SKris Buschelman /* 39437cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 39447cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 39457cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
39467cf1b8d3SKris Buschelman */ 39477cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 39487cf1b8d3SKris Buschelman 39491ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 39501ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 39517cf1b8d3SKris Buschelman { 39527cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 39537cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 39547cf1b8d3SKris Buschelman int nz,i,idt,ai16; 39557cf1b8d3SKris Buschelman int jdx,idx; 39567cf1b8d3SKris Buschelman int *vi; 39577cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 39587cf1b8d3SKris Buschelman 39597cf1b8d3SKris Buschelman /* First block is the identity. */ 39607cf1b8d3SKris Buschelman idx = 0; 39617cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 39627cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 39637cf1b8d3SKris Buschelman 39647cf1b8d3SKris Buschelman for (i=1; i<n;) { 39657cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 39667cf1b8d3SKris Buschelman vi = aj + ai[i]; 39677cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 39687cf1b8d3SKris Buschelman idx += 4; 39697cf1b8d3SKris Buschelman 39707cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 39717cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 39727cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 39737cf1b8d3SKris Buschelman 39747cf1b8d3SKris Buschelman while (nz--) { 39757cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 39767cf1b8d3SKris Buschelman jdx = 4*(*vi++); 39777cf1b8d3SKris Buschelman /* jdx = *vi++; */ 39787cf1b8d3SKris Buschelman 39797cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 39807cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 39817cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 39827cf1b8d3SKris Buschelman 39837cf1b8d3SKris Buschelman /* First Column */ 39847cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 39857cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 39867cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 39877cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 39887cf1b8d3SKris Buschelman 39897cf1b8d3SKris Buschelman /* Second Column */ 39907cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 39917cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 39927cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 39937cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 39947cf1b8d3SKris Buschelman 39957cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 39967cf1b8d3SKris Buschelman 39977cf1b8d3SKris Buschelman /* Third Column */ 39987cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 39997cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 40007cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 40017cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 40027cf1b8d3SKris Buschelman 40037cf1b8d3SKris Buschelman /* Fourth Column */ 40047cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 40057cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 40067cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 40077cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 40087cf1b8d3SKris Buschelman SSE_INLINE_END_2 40097cf1b8d3SKris Buschelman 40107cf1b8d3SKris 
Buschelman v += 16; 40117cf1b8d3SKris Buschelman } 40127cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 40137cf1b8d3SKris Buschelman PREFETCH_NTA(v); 40147cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 40157cf1b8d3SKris Buschelman } 40167cf1b8d3SKris Buschelman 40177cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 40187cf1b8d3SKris Buschelman 40197cf1b8d3SKris Buschelman idt = 4*(n-1); 40207cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 40217cf1b8d3SKris Buschelman v = aa + ai16 + 16; 40227cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 40237cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 40247cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 40257cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 40267cf1b8d3SKris Buschelman 40277cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 40287cf1b8d3SKris Buschelman 40297cf1b8d3SKris Buschelman while (nz--) { 40307cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 40317cf1b8d3SKris Buschelman idx = 4*(*vi++); 40327cf1b8d3SKris Buschelman /* idx = *vi++; */ 40337cf1b8d3SKris Buschelman 40347cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 40357cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 40367cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 40377cf1b8d3SKris Buschelman 40387cf1b8d3SKris Buschelman /* First Column */ 40397cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 40407cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 40417cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 40427cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 40437cf1b8d3SKris Buschelman 40447cf1b8d3SKris Buschelman /* Second Column */ 40457cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 40467cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 40477cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 40487cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 40497cf1b8d3SKris Buschelman 40507cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 40517cf1b8d3SKris Buschelman 
40527cf1b8d3SKris Buschelman /* Third Column */ 40537cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 40547cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 40557cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 40567cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 40577cf1b8d3SKris Buschelman 40587cf1b8d3SKris Buschelman /* Fourth Column */ 40597cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 40607cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 40617cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 40627cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 40637cf1b8d3SKris Buschelman SSE_INLINE_END_2 40647cf1b8d3SKris Buschelman v += 16; 40657cf1b8d3SKris Buschelman } 40667cf1b8d3SKris Buschelman v = aa + ai16; 40677cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 40687cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 40697cf1b8d3SKris Buschelman /* 40707cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 40717cf1b8d3SKris Buschelman which was inverted as part of the factorization 40727cf1b8d3SKris Buschelman */ 40737cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 40747cf1b8d3SKris Buschelman /* First Column */ 40757cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 40767cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 40777cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 40787cf1b8d3SKris Buschelman 40797cf1b8d3SKris Buschelman /* Second Column */ 40807cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 40817cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 40827cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 40837cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 40847cf1b8d3SKris Buschelman 40857cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 40867cf1b8d3SKris Buschelman 40877cf1b8d3SKris Buschelman /* Third Column */ 40887cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 40897cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 40907cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 40917cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 40927cf1b8d3SKris Buschelman 40937cf1b8d3SKris Buschelman /* Fourth Column */ 40947cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 40957cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 40967cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 40977cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 40987cf1b8d3SKris Buschelman 40997cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 41007cf1b8d3SKris Buschelman SSE_INLINE_END_3 41017cf1b8d3SKris Buschelman 41027cf1b8d3SKris Buschelman v = aa + ai16 + 16; 41037cf1b8d3SKris Buschelman idt -= 4; 41047cf1b8d3SKris Buschelman } 41057cf1b8d3SKris Buschelman 41067cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 41077cf1b8d3SKris Buschelman idt = 4*(n-1); 41087cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 41097cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 41107cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 41117cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 41127cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 41137cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 41147cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 41157cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 41167cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 41177cf1b8d3SKris Buschelman idt -= 4; 41187cf1b8d3SKris Buschelman } 41197cf1b8d3SKris Buschelman 41207cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 41211ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 41221ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4123dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 41247cf1b8d3SKris Buschelman SSE_SCOPE_END; 41257cf1b8d3SKris Buschelman PetscFunctionReturn(0); 41267cf1b8d3SKris Buschelman } 41277cf1b8d3SKris Buschelman 41283660e330SKris Buschelman #endif 41298f690400SShri Abhyankar 41304a2ae208SSatish Balay #undef __FUNCT__ 41314a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4132dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 41334e2b4712SSatish Balay { 41344e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 41354e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 41366849ba73SBarry Smith PetscErrorCode ierr; 41375d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 41385d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4139d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4140d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4141d9fead3dSBarry Smith const PetscScalar *b; 41424e2b4712SSatish Balay 41434e2b4712SSatish Balay PetscFunctionBegin; 4144d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41451ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4146f1af5d2fSBarry Smith t = a->solve_work; 41474e2b4712SSatish Balay 41484e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 41494e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 41504e2b4712SSatish Balay 41514e2b4712SSatish Balay /* forward solve the lower triangular */ 41524e2b4712SSatish Balay idx = 3*(*r++); 4153f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 41544e2b4712SSatish Balay for (i=1; i<n; i++) { 41554e2b4712SSatish Balay v = aa + 9*ai[i]; 41564e2b4712SSatish Balay vi = aj + ai[i]; 41574e2b4712SSatish Balay nz = 
diag[i] - ai[i]; 41584e2b4712SSatish Balay idx = 3*(*r++); 4159f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 41604e2b4712SSatish Balay while (nz--) { 41614e2b4712SSatish Balay idx = 3*(*vi++); 4162f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4163f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4164f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4165f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 41664e2b4712SSatish Balay v += 9; 41674e2b4712SSatish Balay } 41684e2b4712SSatish Balay idx = 3*i; 4169f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 41704e2b4712SSatish Balay } 41714e2b4712SSatish Balay /* backward solve the upper triangular */ 41724e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 41734e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 41744e2b4712SSatish Balay vi = aj + diag[i] + 1; 41754e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 41764e2b4712SSatish Balay idt = 3*i; 4177f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 41784e2b4712SSatish Balay while (nz--) { 41794e2b4712SSatish Balay idx = 3*(*vi++); 4180f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4181f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4182f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4183f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 41844e2b4712SSatish Balay v += 9; 41854e2b4712SSatish Balay } 41864e2b4712SSatish Balay idc = 3*(*c--); 41874e2b4712SSatish Balay v = aa + 9*diag[i]; 4188f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4189f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4190f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 41914e2b4712SSatish Balay } 41924e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 41934e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4194d9fead3dSBarry Smith ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41951ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4196dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 41974e2b4712SSatish Balay PetscFunctionReturn(0); 41984e2b4712SSatish Balay } 41994e2b4712SSatish Balay 42008f690400SShri Abhyankar #undef __FUNCT__ 42018f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 42028f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 42038f690400SShri Abhyankar { 42048f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 42058f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 42068f690400SShri Abhyankar PetscErrorCode ierr; 420729b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 42088f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 42098f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 42108f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 42118f690400SShri Abhyankar const PetscScalar *b; 42128f690400SShri Abhyankar 42138f690400SShri Abhyankar PetscFunctionBegin; 42148f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42158f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 42168f690400SShri Abhyankar t = a->solve_work; 42178f690400SShri Abhyankar 42188f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 421929b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 42208f690400SShri Abhyankar 42218f690400SShri Abhyankar /* forward solve the lower triangular */ 422229b92fc1SShri Abhyankar idx = 3*r[0]; 42238f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 42248f690400SShri Abhyankar for (i=1; i<n; i++) { 42258f690400SShri Abhyankar v = aa + 9*ai[i]; 42268f690400SShri Abhyankar vi = aj + ai[i]; 42278f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 422829b92fc1SShri Abhyankar idx = 3*r[i]; 
42298f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 423029b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 423129b92fc1SShri Abhyankar idx = 3*vi[m]; 42328f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 42338f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 42348f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 42358f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 42368f690400SShri Abhyankar v += 9; 42378f690400SShri Abhyankar } 42388f690400SShri Abhyankar idx = 3*i; 42398f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 42408f690400SShri Abhyankar } 42418f690400SShri Abhyankar /* backward solve the upper triangular */ 42428f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 42438f690400SShri Abhyankar k = 2*n-i; 42448f690400SShri Abhyankar v = aa + 9*ai[k]; 42458f690400SShri Abhyankar vi = aj + ai[k]; 42468f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 42478f690400SShri Abhyankar idt = 3*i; 42488f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 424929b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 425029b92fc1SShri Abhyankar idx = 3*vi[m]; 42518f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 42528f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 42538f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 42548f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 42558f690400SShri Abhyankar v += 9; 42568f690400SShri Abhyankar } 425729b92fc1SShri Abhyankar idc = 3*c[i]; 42588f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 42598f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 42608f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 42618f690400SShri Abhyankar } 42628f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 42638f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 42648f690400SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42658f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 42668f690400SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 42678f690400SShri Abhyankar PetscFunctionReturn(0); 42688f690400SShri Abhyankar } 42698f690400SShri Abhyankar 42700c4413a7SShri Abhyankar #undef __FUNCT__ 42710c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2" 42720c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx) 42730c4413a7SShri Abhyankar { 42740c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 42750c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 42760c4413a7SShri Abhyankar PetscErrorCode ierr; 42770c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 42780c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 42790c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 42800c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 42810c4413a7SShri Abhyankar const PetscScalar *b; 42820c4413a7SShri Abhyankar 42830c4413a7SShri Abhyankar PetscFunctionBegin; 42840c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42850c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 42860c4413a7SShri Abhyankar t = a->solve_work; 42870c4413a7SShri Abhyankar 42880c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 42890c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 42900c4413a7SShri Abhyankar 42910c4413a7SShri Abhyankar /* forward solve the lower triangular */ 42920c4413a7SShri Abhyankar idx = 3*r[0]; 42930c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 42940c4413a7SShri Abhyankar for (i=1; i<n; i++) { 42950c4413a7SShri Abhyankar v = aa + 9*ai[i]; 42960c4413a7SShri Abhyankar vi = aj + ai[i]; 42970c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 
42980c4413a7SShri Abhyankar idx = 3*r[i]; 42990c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 43000c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 43010c4413a7SShri Abhyankar idx = 3*vi[m]; 43020c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 43030c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 43040c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 43050c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 43060c4413a7SShri Abhyankar v += 9; 43070c4413a7SShri Abhyankar } 43080c4413a7SShri Abhyankar idx = 3*i; 43090c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 43100c4413a7SShri Abhyankar } 43110c4413a7SShri Abhyankar /* backward solve the upper triangular */ 43120c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 43130c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 43140c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 43150c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 43160c4413a7SShri Abhyankar idt = 3*i; 43170c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 43180c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 43190c4413a7SShri Abhyankar idx = 3*vi[m]; 43200c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 43210c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 43220c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 43230c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 43240c4413a7SShri Abhyankar v += 9; 43250c4413a7SShri Abhyankar } 43260c4413a7SShri Abhyankar idc = 3*c[i]; 43270c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 43280c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 43290c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 43300c4413a7SShri Abhyankar } 43310c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 43320c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 43330c4413a7SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43340c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 43350c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 43360c4413a7SShri Abhyankar PetscFunctionReturn(0); 43370c4413a7SShri Abhyankar } 43380c4413a7SShri Abhyankar 433915091d37SBarry Smith /* 434015091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 434115091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 434215091d37SBarry Smith */ 43434a2ae208SSatish Balay #undef __FUNCT__ 43444a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4345dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 434615091d37SBarry Smith { 434715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4348690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4349dfbe8321SBarry Smith PetscErrorCode ierr; 4350690b6cddSBarry Smith PetscInt *diag = a->diag; 4351d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4352d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4353d9fead3dSBarry Smith const PetscScalar *b; 4354690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 435515091d37SBarry Smith 435615091d37SBarry Smith PetscFunctionBegin; 4357d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43581ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 435915091d37SBarry Smith 436015091d37SBarry Smith /* forward solve the lower triangular */ 436115091d37SBarry Smith idx = 0; 436215091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 436315091d37SBarry Smith for (i=1; i<n; i++) { 436415091d37SBarry Smith v = aa + 9*ai[i]; 436515091d37SBarry Smith vi = aj + ai[i]; 436615091d37SBarry Smith nz = diag[i] - ai[i]; 436715091d37SBarry Smith idx += 3; 4368f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 436915091d37SBarry Smith while (nz--) { 
437015091d37SBarry Smith jdx = 3*(*vi++); 437115091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4372f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4373f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4374f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 437515091d37SBarry Smith v += 9; 437615091d37SBarry Smith } 4377f1af5d2fSBarry Smith x[idx] = s1; 4378f1af5d2fSBarry Smith x[1+idx] = s2; 4379f1af5d2fSBarry Smith x[2+idx] = s3; 438015091d37SBarry Smith } 438115091d37SBarry Smith /* backward solve the upper triangular */ 438215091d37SBarry Smith for (i=n-1; i>=0; i--){ 438315091d37SBarry Smith v = aa + 9*diag[i] + 9; 438415091d37SBarry Smith vi = aj + diag[i] + 1; 438515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 438615091d37SBarry Smith idt = 3*i; 4387f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4388f1af5d2fSBarry Smith s3 = x[2+idt]; 438915091d37SBarry Smith while (nz--) { 439015091d37SBarry Smith idx = 3*(*vi++); 439115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4392f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4393f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4394f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 439515091d37SBarry Smith v += 9; 439615091d37SBarry Smith } 439715091d37SBarry Smith v = aa + 9*diag[i]; 4398f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4399f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4400f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 440115091d37SBarry Smith } 440215091d37SBarry Smith 4403d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 44041ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4405dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 440615091d37SBarry Smith PetscFunctionReturn(0); 440715091d37SBarry Smith } 440815091d37SBarry Smith 44094a2ae208SSatish Balay #undef __FUNCT__ 4410cee9d6f2SShri Abhyankar #define 
__FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4411cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4412cee9d6f2SShri Abhyankar { 4413cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4414ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4415cee9d6f2SShri Abhyankar PetscErrorCode ierr; 4416cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 4417cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4418cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 4419cee9d6f2SShri Abhyankar PetscScalar *x; 4420cee9d6f2SShri Abhyankar const PetscScalar *b; 4421cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4422cee9d6f2SShri Abhyankar 4423cee9d6f2SShri Abhyankar PetscFunctionBegin; 4424cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4425cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4426cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 4427cee9d6f2SShri Abhyankar idx = 0; 4428cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4429cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 4430cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 4431cee9d6f2SShri Abhyankar vi = aj + ai[i]; 4432cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 4433cee9d6f2SShri Abhyankar idx = bs*i; 4434cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4435ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4436ce3d78c0SShri Abhyankar jdx = bs*vi[k]; 4437cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4438cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4439cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4440cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4441cee9d6f2SShri Abhyankar 4442cee9d6f2SShri Abhyankar v += bs2; 4443cee9d6f2SShri Abhyankar } 4444cee9d6f2SShri Abhyankar 4445cee9d6f2SShri Abhyankar x[idx] = s1; 4446cee9d6f2SShri Abhyankar x[1+idx] = s2; 
4447cee9d6f2SShri Abhyankar x[2+idx] = s3; 4448cee9d6f2SShri Abhyankar } 4449cee9d6f2SShri Abhyankar 4450cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 4451cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 4452cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 4453cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 4454cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 4455cee9d6f2SShri Abhyankar idt = bs*i; 4456cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4457cee9d6f2SShri Abhyankar 4458ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4459ce3d78c0SShri Abhyankar idx = bs*vi[k]; 4460cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4461cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4462cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4463cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4464cee9d6f2SShri Abhyankar 4465cee9d6f2SShri Abhyankar v += bs2; 4466cee9d6f2SShri Abhyankar } 4467cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 4468cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4469cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4470cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4471cee9d6f2SShri Abhyankar 4472cee9d6f2SShri Abhyankar } 4473cee9d6f2SShri Abhyankar 4474cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4475cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4476cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4477cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 4478cee9d6f2SShri Abhyankar } 4479cee9d6f2SShri Abhyankar 4480cee9d6f2SShri Abhyankar #undef __FUNCT__ 4481b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2" 4482b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4483b2b2dd24SShri Abhyankar { 4484b2b2dd24SShri Abhyankar 
Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4485b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4486b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4487b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 4488b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4489b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4490b2b2dd24SShri Abhyankar PetscScalar *x; 4491b2b2dd24SShri Abhyankar const PetscScalar *b; 4492b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4493b2b2dd24SShri Abhyankar 4494b2b2dd24SShri Abhyankar PetscFunctionBegin; 4495b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4496b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4497b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4498b2b2dd24SShri Abhyankar idx = 0; 4499b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4500b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4501b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4502b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4503b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4504b2b2dd24SShri Abhyankar idx = bs*i; 4505b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4506b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4507b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4508b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4509b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4510b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4511b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4512b2b2dd24SShri Abhyankar 4513b2b2dd24SShri Abhyankar v += bs2; 4514b2b2dd24SShri Abhyankar } 4515b2b2dd24SShri Abhyankar 4516b2b2dd24SShri Abhyankar x[idx] = s1; 4517b2b2dd24SShri Abhyankar x[1+idx] = s2; 4518b2b2dd24SShri Abhyankar x[2+idx] = s3; 4519b2b2dd24SShri Abhyankar } 4520b2b2dd24SShri Abhyankar 4521b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4522b2b2dd24SShri Abhyankar for (i=n-1; i>=0; 
i--){ 4523b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4524b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4525b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4526b2b2dd24SShri Abhyankar idt = bs*i; 4527b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4528b2b2dd24SShri Abhyankar 4529b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4530b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4531b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4532b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4533b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4534b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4535b2b2dd24SShri Abhyankar 4536b2b2dd24SShri Abhyankar v += bs2; 4537b2b2dd24SShri Abhyankar } 4538b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4539b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4540b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4541b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4542b2b2dd24SShri Abhyankar 4543b2b2dd24SShri Abhyankar } 4544b2b2dd24SShri Abhyankar 4545b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4546b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4547b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4548b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4549b2b2dd24SShri Abhyankar } 4550b2b2dd24SShri Abhyankar 4551b2b2dd24SShri Abhyankar #undef __FUNCT__ 45524a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4553dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 45544e2b4712SSatish Balay { 45554e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 45564e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 45576849ba73SBarry Smith PetscErrorCode ierr; 45585d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 45595d0c19d7SBarry Smith const PetscInt *r,*c,*diag = 
a->diag,*rout,*cout; 4560d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4561d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 4562d9fead3dSBarry Smith const PetscScalar *b; 45634e2b4712SSatish Balay 45644e2b4712SSatish Balay PetscFunctionBegin; 4565d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45661ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4567f1af5d2fSBarry Smith t = a->solve_work; 45684e2b4712SSatish Balay 45694e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 45704e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 45714e2b4712SSatish Balay 45724e2b4712SSatish Balay /* forward solve the lower triangular */ 45734e2b4712SSatish Balay idx = 2*(*r++); 4574f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 45754e2b4712SSatish Balay for (i=1; i<n; i++) { 45764e2b4712SSatish Balay v = aa + 4*ai[i]; 45774e2b4712SSatish Balay vi = aj + ai[i]; 45784e2b4712SSatish Balay nz = diag[i] - ai[i]; 45794e2b4712SSatish Balay idx = 2*(*r++); 4580f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 45814e2b4712SSatish Balay while (nz--) { 45824e2b4712SSatish Balay idx = 2*(*vi++); 4583f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4584f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4585f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 45864e2b4712SSatish Balay v += 4; 45874e2b4712SSatish Balay } 45884e2b4712SSatish Balay idx = 2*i; 4589f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 45904e2b4712SSatish Balay } 45914e2b4712SSatish Balay /* backward solve the upper triangular */ 45924e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 45934e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 45944e2b4712SSatish Balay vi = aj + diag[i] + 1; 45954e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 45964e2b4712SSatish Balay idt = 2*i; 4597f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 45984e2b4712SSatish Balay while (nz--) { 45994e2b4712SSatish Balay idx = 2*(*vi++); 4600f1af5d2fSBarry Smith 
x1 = t[idx]; x2 = t[1+idx]; 4601f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4602f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 46034e2b4712SSatish Balay v += 4; 46044e2b4712SSatish Balay } 46054e2b4712SSatish Balay idc = 2*(*c--); 46064e2b4712SSatish Balay v = aa + 4*diag[i]; 4607f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4608f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 46094e2b4712SSatish Balay } 46104e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 46114e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4612d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4614dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 46154e2b4712SSatish Balay PetscFunctionReturn(0); 46164e2b4712SSatish Balay } 46174e2b4712SSatish Balay 46188f690400SShri Abhyankar #undef __FUNCT__ 46198f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 46208f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 46218f690400SShri Abhyankar { 46228f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 46238f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 46248f690400SShri Abhyankar PetscErrorCode ierr; 462529b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 46268f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 46278f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 46288f690400SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 46298f690400SShri Abhyankar const PetscScalar *b; 46308f690400SShri Abhyankar 46318f690400SShri Abhyankar PetscFunctionBegin; 46328f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46338f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 46348f690400SShri Abhyankar t = a->solve_work; 46358f690400SShri Abhyankar 
46368f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 463729b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 46388f690400SShri Abhyankar 46398f690400SShri Abhyankar /* forward solve the lower triangular */ 464029b92fc1SShri Abhyankar idx = 2*r[0]; 46418f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 46428f690400SShri Abhyankar for (i=1; i<n; i++) { 46438f690400SShri Abhyankar v = aa + 4*ai[i]; 46448f690400SShri Abhyankar vi = aj + ai[i]; 46458f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 464629b92fc1SShri Abhyankar idx = 2*r[i]; 46478f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 464829b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 464929b92fc1SShri Abhyankar jdx = 2*vi[m]; 46508f690400SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 46518f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 46528f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 46538f690400SShri Abhyankar v += 4; 46548f690400SShri Abhyankar } 46558f690400SShri Abhyankar idx = 2*i; 46568f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 46578f690400SShri Abhyankar } 46588f690400SShri Abhyankar /* backward solve the upper triangular */ 46598f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 46608f690400SShri Abhyankar k = 2*n-i; 46618f690400SShri Abhyankar v = aa + 4*ai[k]; 46628f690400SShri Abhyankar vi = aj + ai[k]; 46638f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 46648f690400SShri Abhyankar idt = 2*i; 46658f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 466629b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 466729b92fc1SShri Abhyankar idx = 2*vi[m]; 46688f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 46698f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 46708f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 46718f690400SShri Abhyankar v += 4; 46728f690400SShri Abhyankar } 467329b92fc1SShri Abhyankar idc = 2*c[i]; 46748f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 46758f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 
46768f690400SShri Abhyankar } 46778f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 46788f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 46798f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46808f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 46818f690400SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 46828f690400SShri Abhyankar PetscFunctionReturn(0); 46838f690400SShri Abhyankar } 46848f690400SShri Abhyankar

/*
   MatSolve_SeqBAIJ_2_newdatastruct_v2 - Forward and backward block solve with
   2x2 blocks, using the row index set a->row and column index set a->col.

   Input:   A  - factored matrix; reads a->i, a->j, a->a, a->diag, a->solve_work
            bb - right-hand side, read through the row indices
   Output:  xx - solution, written through the column indices

   Layout used by the loops below: the rows of L are addressed through a->i;
   for row i the off-diagonal blocks of U start at a->diag[i+1]+1 and the block
   at a->diag[i] is multiplied into the result last.

   Returns 0 without touching the vectors when the matrix has no block rows.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2"
PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
  const PetscInt    *r,*c,*rout,*cout;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,x1,x2,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  /* nothing to solve; also avoids reading r[0] and b[] for an empty matrix */
  if (!n) PetscFunctionReturn(0);

  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* forward solve the lower triangular */
  idx  = 2*r[0];
  t[0] = b[idx]; t[1] = b[1+idx];
  for (i=1; i<n; i++) {
    v   = aa + 4*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 2*r[i];
    s1  = b[idx]; s2 = b[1+idx];
    for (m=0; m<nz; m++) {
      jdx = 2*vi[m];
      x1  = t[jdx]; x2 = t[1+jdx];
      s1 -= v[0]*x1 + v[2]*x2;
      s2 -= v[1]*x1 + v[3]*x2;
      v  += 4;
    }
    idx      = 2*i;
    t[idx]   = s1; t[1+idx] = s2;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 4*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1] - 1;
    idt = 2*i;
    s1  = t[idt]; s2 = t[1+idt];
    for (m=0; m<nz; m++) {
      idx = 2*vi[m];
      x1  = t[idx]; x2 = t[1+idx];
      s1 -= v[0]*x1 + v[2]*x2;
      s2 -= v[1]*x1 + v[3]*x2;
      v  += 4;
    }
    /* v has advanced past the nz off-diagonal blocks and now addresses block adiag[i] */
    idc      = 2*c[i];
    x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
    x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
      Special case where the matrix was ILU(0) factored in the natural
   ordering. This eliminates the need for the column and row permutation.

   2x2 blocks; the diagonal block of row i is located through a->diag[i] and
   the U part of row i runs from a->diag[i]+1 up to a->i[i+1].  The solve is
   carried out directly in the array of xx.

   Returns 0 without touching the vectors when the matrix has no block rows.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
  PetscErrorCode    ierr;
  PetscInt          *diag = a->diag;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,x1,x2;
  const PetscScalar *b;
  PetscInt          jdx,idt,idx,nz,*vi,i;

  PetscFunctionBegin;
  /* nothing to solve; also avoids x[0] = b[0] on empty arrays */
  if (!n) PetscFunctionReturn(0);

  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[0]; x[1] = b[1];
  for (i=1; i<n; i++) {
    v    = aa + 4*ai[i];
    vi   = aj + ai[i];
    nz   = diag[i] - ai[i];
    idx += 2;
    s1   = b[idx]; s2 = b[1+idx];
    while (nz--) {
      jdx = 2*(*vi++);
      x1  = x[jdx]; x2 = x[1+jdx];
      s1 -= v[0]*x1 + v[2]*x2;
      s2 -= v[1]*x1 + v[3]*x2;
      v  += 4;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 4*diag[i] + 4;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 2*i;
    s1  = x[idt]; s2 = x[1+idt];
    while (nz--) {
      idx = 2*(*vi++);
      x1  = x[idx]; x2 = x[1+idx];
      s1 -= v[0]*x1 + v[2]*x2;
      s2 -= v[1]*x1 + v[3]*x2;
      v  += 4;
    }
    /* finish the row by multiplying with the block stored at diag[i] */
    v        = aa + 4*diag[i];
    x[idt]   = v[0]*s1 + v[2]*s2;
    x[1+idt] = v[1]*s1 + v[3]*s2;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct - 2x2 block solve without
   permutations in which both triangular parts are addressed through a->i:
   the L part of row i through a->i[i], the U part of row i through
   a->i[2*n-i].  The last block of each U row is multiplied into the result
   (the original comment calls it inv_diagonal).

   Returns 0 without touching the vectors when the matrix has no block rows.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscErrorCode    ierr;
  PetscInt          jdx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,x1,x2;
  const PetscScalar *b;

  PetscFunctionBegin;
  /* nothing to solve; also avoids x[0] = b[0] on empty arrays */
  if (!n) PetscFunctionReturn(0);

  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx];
  for (i=1; i<n; i++) {
    v   = aa + 4*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 2*i;
    s1  = b[idx]; s2 = b[1+idx];
    for (k=0; k<nz; k++) {
      jdx = 2*vi[k];
      x1  = x[jdx]; x2 = x[1+jdx];
      s1 -= v[0]*x1 + v[2]*x2;
      s2 -= v[1]*x1 + v[3]*x2;
      v  += 4;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 4*ai[2*n-i];
    vi  = aj + ai[2*n-i];
    nz  = ai[2*n-i +1] - ai[2*n-i]-1;
    idt = 2*i;
    s1  = x[idt]; s2 = x[1+idt];
    for (k=0; k<nz; k++) {
      idx = 2*vi[k];
      x1  = x[idx]; x2 = x[1+idx];
      s1 -= v[0]*x1 + v[2]*x2;
      s2 -= v[1]*x1 + v[3]*x2;
      v  += 4;
    }
    /* x = inv_diagonal*x */
    x[idt]   = v[0]*s1 + v[2]*s2;
    x[1+idt] = v[1]*s1 + v[3]*s2;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2 - 2x2 block solve without
   permutations; the L part of row i is addressed through a->i, the U part of
   row i through a->diag (off-diagonal blocks start at a->diag[i+1]+1, and the
   block at a->diag[i] is multiplied into the result last).

   Returns 0 without touching the vectors when the matrix has no block rows.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
  PetscErrorCode    ierr;
  PetscInt          jdx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,x1,x2;
  const PetscScalar *b;

  PetscFunctionBegin;
  /* nothing to solve; also avoids x[0] = b[0] on empty arrays */
  if (!n) PetscFunctionReturn(0);

  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx];
  for (i=1; i<n; i++) {
    v   = aa + 4*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 2*i;
    s1  = b[idx]; s2 = b[1+idx];
    for (k=0; k<nz; k++) {
      jdx = 2*vi[k];
      x1  = x[jdx]; x2 = x[1+jdx];
      s1 -= v[0]*x1 + v[2]*x2;
      s2 -= v[1]*x1 + v[3]*x2;
      v  += 4;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 4*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1]-1;
    idt = 2*i;
    s1  = x[idt]; s2 = x[1+idt];
    for (k=0; k<nz; k++) {
      idx = 2*vi[k];
      x1  = x[idx]; x2 = x[1+idx];
      s1 -= v[0]*x1 + v[2]*x2;
      s2 -= v[1]*x1 + v[3]*x2;
      v  += 4;
    }
    /* x = inv_diagonal*x */
    x[idt]   = v[0]*s1 + v[2]*s2;
    x[1+idt] = v[1]*s1 + v[3]*s2;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_1 - Forward and backward solve for block size 1, using the
   row index set a->row and column index set a->col.

   The right-hand side is read through the row indices into a->solve_work; the
   solution is written to xx through the column indices, which are walked from
   the last entry to the first during the backward sweep.  The entry at
   a->diag[i] is multiplied into row i at the end of the backward sweep.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_1"
PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
  const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
  MatScalar      *aa=a->a,*v;
  PetscScalar    *x,*b,s1,*t;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);

  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  t[0] = b[*r++];
  for (i=1; i<n; i++) {
    v  = aa + ai[i];
    vi = aj + ai[i];
    nz = diag[i] - ai[i];
    s1 = b[*r++];
    while (nz--) {
      s1 -= (*v++)*t[*vi++];
    }
    t[i] = s1;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v  = aa + diag[i] + 1;
    vi = aj + diag[i] + 1;
    nz = ai[i+1] - diag[i] - 1;
    s1 = t[i];
    while (nz--) {
      s1 -= (*v++)*t[*vi++];
    }
    x[*c--] = t[i] = aa[diag[i]]*s1;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
      Special case where the matrix was ILU(0) factored in the natural
   ordering. This eliminates the need for the column and row permutation.

   Block size 1; the solve is carried out directly in the array of xx.
   Returns 0 without touching the vectors when the matrix has no rows.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
  PetscErrorCode ierr;
  PetscInt       *diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;
  PetscScalar    s1,x1;
  MatScalar      *v;
  PetscInt       jdx,idt,idx,nz,*vi,i;

  PetscFunctionBegin;
  /* nothing to solve; also avoids x[0] = b[0] on empty arrays */
  if (!n) PetscFunctionReturn(0);

  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[0];
  for (i=1; i<n; i++) {
    v    = aa + ai[i];
    vi   = aj + ai[i];
    nz   = diag[i] - ai[i];
    idx += 1;
    s1   = b[idx];
    while (nz--) {
      jdx = *vi++;
      x1  = x[jdx];
      s1 -= v[0]*x1;
      v  += 1;
    }
    x[idx] = s1;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + diag[i] + 1;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = i;
    s1  = x[idt];
    while (nz--) {
      idx = *vi++;
      x1  = x[idx];
      s1 -= v[0]*x1;
      v  += 1;
    }
    /* finish the row by multiplying with the entry stored at diag[i] */
    v      = aa + diag[i];
    x[idt] = v[0]*s1;
  }

  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* ----------------------------------------------------------------*/
EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth);

/*
   MatLUFactorNumeric_SeqBAIJ_N_newdatastruct - Numeric factorization for
   arbitrary block size, writing into the bi/bj/bdiag layout of the factor B.

   Input:   B    - factor matrix whose b->i, b->j, b->diag, b->row and b->icol
                   are already set; b->a receives the numerical values
            A    - matrix to factor
            info - not read by this routine

   For every row i the routine
     1. zeroes the work row rtmp at the block columns listed for the factor,
     2. loads row r[i] of A, mapping each block column through ic[],
     3. eliminates with every earlier row listed in the L part whose block in
        rtmp is not entirely zero,
     4. copies the L part, the diagonal block (which is inverted in place) and
        the U part back into b->a.
   Afterwards C->ops->solve is set according to whether both index sets are
   the identity.
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
  MatScalar      *v_work;
  PetscTruth     col_identity,row_identity,both_identity;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  /* dense work row: one bs2-sized block per block column */
  ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  /* generate work space needed by dense LU factorization */
  ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      /* skip the update when the whole block is zero */
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
        }
        ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2*bdiag[i];
    pj = b->j + bdiag[i];
    /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* pick the solve routine: the natural-ordering variant needs both index sets to be the identity */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
  both_identity = (PetscTruth) (row_identity && col_identity);
  if (both_identity) {
    C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2;
  } else {
    C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct_v2;
  }

  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
   ilu(0) with natural ordering under new data structure.
   See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
   because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().

   The factor reuses the nonzero pattern of A.  b->j is filled with the L part
   of rows 0..n-1 first, followed by the U part of rows n-1..0; within each U
   row the off-diagonal columns come first and the diagonal column last.
   bdiag[i] is the position of the diagonal of row i, and bdiag[n] = bi[n]-1.
   isrow, iscol and info are not read here.
*/

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{

  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
  PetscErrorCode ierr;
  PetscInt       n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
  PetscInt       i,j,nz,*bi,*bj,*bdiag,bi_temp;

  PetscFunctionBegin;
  ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
  b    = (Mat_SeqBAIJ*)(fact)->data;

  /* allocate matrix arrays for new data structure */
  /* NOTE(review): b->a is sized with PetscScalar here but zeroed with sizeof(MatScalar) below -
     confirm the two types have the same size in every build configuration */
  ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
  ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
  b->singlemalloc = PETSC_TRUE;
  if (!b->diag) {
    ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
    ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
  }
  bdiag = b->diag;

  if (n > 0) {
    ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
  }

  /* set bi and bj with new data structure */
  bi = b->i;
  bj = b->j;

  /* L part */
  bi[0] = 0;
  for (i=0; i<n; i++) {
    nz      = adiag[i] - ai[i];
    bi[i+1] = bi[i] + nz;
    aj      = a->j + ai[i];
    for (j=0; j<nz; j++) {
      *bj = aj[j]; bj++;
    }
  }

  /* U part */
  bi_temp  = bi[n];
  bdiag[n] = bi[n]-1;
  for (i=n-1; i>=0; i--) {
    nz      = ai[i+1] - adiag[i] - 1;
    bi_temp = bi_temp + nz + 1;
    aj      = a->j + adiag[i] + 1;
    for (j=0; j<nz; j++) {
      *bj = aj[j]; bj++;
    }
    /* diag[i] */
    *bj      = i; bj++;
    bdiag[i] = bi_temp - 1;
  }
  PetscFunctionReturn(0);
}

523635aa4fcfSShri Abhyankar #undef __FUNCT__ 523716a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 523816a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 523916a2bf60SHong Zhang { 524016a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 524116a2bf60SHong Zhang IS isicol; 524216a2bf60SHong Zhang PetscErrorCode ierr; 524316a2bf60SHong Zhang const PetscInt *r,*ic; 52447fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 524516a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 524616a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 524716a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 52487fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 524916a2bf60SHong Zhang PetscReal f; 525016a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 525116a2bf60SHong Zhang PetscBT lnkbt; 525216a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 525316a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 525416a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 525516a2bf60SHong Zhang PetscTruth missing; 52567fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 525716a2bf60SHong Zhang 525816a2bf60SHong Zhang PetscFunctionBegin; 525916a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 526016a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 526116a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 526216a2bf60SHong Zhang 526316a2bf60SHong Zhang f = info->fill; 526416a2bf60SHong Zhang levels = (PetscInt)info->levels; 526516a2bf60SHong Zhang diagonal_fill
= (PetscInt)info->diagonal_fill; 526616a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 526716a2bf60SHong Zhang 526816a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 526916a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 52707fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 527116a2bf60SHong Zhang 52727fa3a6a0SHong Zhang if (!levels && both_identity) { 527316a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 527416a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5275ae3d28f0SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 527635aa4fcfSShri Abhyankar 527735aa4fcfSShri Abhyankar fact->factor = MAT_FACTOR_ILU; 527835aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 527935aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 528035aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 528135aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 528235aa4fcfSShri Abhyankar b->row = isrow; 528335aa4fcfSShri Abhyankar b->col = iscol; 528435aa4fcfSShri Abhyankar b->icol = isicol; 528535aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 528635aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 528735aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 528835aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 528935aa4fcfSShri Abhyankar PetscFunctionReturn(0); 529035aa4fcfSShri Abhyankar } 529135aa4fcfSShri Abhyankar 529235aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 529335aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 529435aa4fcfSShri Abhyankar 529535aa4fcfSShri Abhyankar /* get new row pointers */ 529635aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 529735aa4fcfSShri Abhyankar bi[0] = 0; 529835aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 529935aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 530035aa4fcfSShri Abhyankar bdiag[0] = 0; 530135aa4fcfSShri Abhyankar 5302fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 530335aa4fcfSShri Abhyankar 530435aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active row */ 530535aa4fcfSShri Abhyankar nlnk = n + 1; 530635aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 530735aa4fcfSShri Abhyankar 530835aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 530935aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 531035aa4fcfSShri Abhyankar current_space = free_space; 531135aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 531235aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 531335aa4fcfSShri Abhyankar 531435aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 531535aa4fcfSShri Abhyankar nzi = 0; 531635aa4fcfSShri Abhyankar /* copy current row into linked list */ 531735aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 531835aa4fcfSShri Abhyankar if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original 
ordering %D in permuted ordering %D",r[i],i); 531935aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 532035aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 532135aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 532235aa4fcfSShri Abhyankar nzi += nlnk; 532335aa4fcfSShri Abhyankar 532435aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 532535aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 532635aa4fcfSShri Abhyankar fm = n; 532735aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 532835aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 532935aa4fcfSShri Abhyankar lnk[fm] = i; 533035aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 533135aa4fcfSShri Abhyankar nzi++; dcount++; 533235aa4fcfSShri Abhyankar } 533335aa4fcfSShri Abhyankar 533435aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 533535aa4fcfSShri Abhyankar nzbd = 0; 533635aa4fcfSShri Abhyankar prow = lnk[n]; 533735aa4fcfSShri Abhyankar while (prow < i) { 533835aa4fcfSShri Abhyankar nnz = bdiag[prow]; 533935aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 534035aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 534135aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 534235aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 534335aa4fcfSShri Abhyankar nzi += nlnk; 534435aa4fcfSShri Abhyankar prow = lnk[prow]; 534535aa4fcfSShri Abhyankar nzbd++; 534635aa4fcfSShri Abhyankar } 534735aa4fcfSShri Abhyankar bdiag[i] = nzbd; 534835aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 534935aa4fcfSShri Abhyankar 535035aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 535135aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 535235aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 535335aa4fcfSShri Abhyankar ierr = 
PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 535435aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 535535aa4fcfSShri Abhyankar reallocs++; 535635aa4fcfSShri Abhyankar } 535735aa4fcfSShri Abhyankar 535835aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 535935aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 536035aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 536135aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 536235aa4fcfSShri Abhyankar 536335aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 536435aa4fcfSShri Abhyankar if (*(bj_ptr[i]+bdiag[i]) != i) { 536535aa4fcfSShri Abhyankar SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 536635aa4fcfSShri Abhyankar try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 536735aa4fcfSShri Abhyankar } 536835aa4fcfSShri Abhyankar 536935aa4fcfSShri Abhyankar current_space->array += nzi; 537035aa4fcfSShri Abhyankar current_space->local_used += nzi; 537135aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 537235aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 537335aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 537435aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 537535aa4fcfSShri Abhyankar } 537635aa4fcfSShri Abhyankar 537735aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 537835aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 537935aa4fcfSShri Abhyankar 538035aa4fcfSShri Abhyankar /* destroy list of free space and other temporary arrays */ 538135aa4fcfSShri Abhyankar ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 538235aa4fcfSShri Abhyankar 538335aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new 
datastructure; */ 538435aa4fcfSShri Abhyankar ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 538535aa4fcfSShri Abhyankar 538635aa4fcfSShri Abhyankar ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 538735aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5388fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 538935aa4fcfSShri Abhyankar 539035aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 539135aa4fcfSShri Abhyankar { 539235aa4fcfSShri Abhyankar PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 539335aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 539435aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 539535aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 539635aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 539735aa4fcfSShri Abhyankar if (diagonal_fill) { 539835aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 539935aa4fcfSShri Abhyankar } 540035aa4fcfSShri Abhyankar } 540135aa4fcfSShri Abhyankar #endif 540235aa4fcfSShri Abhyankar 540335aa4fcfSShri Abhyankar /* put together the new matrix */ 540435aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 540535aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 540635aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 540735aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 540835aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 540935aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 541035aa4fcfSShri Abhyankar ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 541135aa4fcfSShri Abhyankar b->j = bj; 541235aa4fcfSShri Abhyankar b->i = bi; 
541335aa4fcfSShri Abhyankar b->diag = bdiag; 541435aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 541535aa4fcfSShri Abhyankar b->ilen = 0; 541635aa4fcfSShri Abhyankar b->imax = 0; 541735aa4fcfSShri Abhyankar b->row = isrow; 541835aa4fcfSShri Abhyankar b->col = iscol; 541935aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 542035aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 542135aa4fcfSShri Abhyankar b->icol = isicol; 542235aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 542335aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 542435aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 542535aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 542635aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 5427ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 5428ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 5429ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5430ae3d28f0SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 543135aa4fcfSShri Abhyankar PetscFunctionReturn(0); 543235aa4fcfSShri Abhyankar } 543335aa4fcfSShri Abhyankar 543435aa4fcfSShri Abhyankar 54354e2b4712SSatish Balay /* 54364e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 54374e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 54384e2b4712SSatish Balay Not a good example of code reuse. 
54394e2b4712SSatish Balay */ 54404a2ae208SSatish Balay #undef __FUNCT__ 54414a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 54420481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 54434e2b4712SSatish Balay { 54444e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 54454e2b4712SSatish Balay IS isicol; 54466849ba73SBarry Smith PetscErrorCode ierr; 54475d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 54485d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5449a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5450d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 545141df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 5452329f5518SBarry Smith PetscReal f; 5453c0c7eb62SShri Abhyankar PetscTruth newdatastruct = PETSC_FALSE; 54544e2b4712SSatish Balay 54554e2b4712SSatish Balay PetscFunctionBegin; 545616a2bf60SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 545716a2bf60SHong Zhang if (newdatastruct){ 545816a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 545916a2bf60SHong Zhang PetscFunctionReturn(0); 546016a2bf60SHong Zhang } 546116a2bf60SHong Zhang 54626bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 54636bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 54646bce7ff8SHong Zhang 5465435faa5fSBarry Smith f = info->fill; 5466690b6cddSBarry Smith levels = (PetscInt)info->levels; 5467690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 54684c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 546916a2bf60SHong Zhang 5470667159a5SBarry 
Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5471667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 54727d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 5473309c388cSBarry Smith 547441df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 547516a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 54766bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 54776bce7ff8SHong Zhang 5478719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 5479ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5480bb3d539aSBarry Smith b->row = isrow; 5481bb3d539aSBarry Smith b->col = iscol; 5482bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5483bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5484bb3d539aSBarry Smith b->icol = isicol; 5485bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5486b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 54876bce7ff8SHong Zhang PetscFunctionReturn(0); 54886bce7ff8SHong Zhang } 54896bce7ff8SHong Zhang 54906bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 54914e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 54924e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 54934e2b4712SSatish Balay 54944e2b4712SSatish Balay /* get new row pointers */ 5495690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 54964e2b4712SSatish Balay ainew[0] = 0; 54974e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 5498690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 5499690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 55004e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 5501690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 55024e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 5503690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 55044e2b4712SSatish Balay /* im is level for each filled value */ 5505690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 55064e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 5507690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 55084e2b4712SSatish Balay dloc[0] = 0; 55094e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 5510435faa5fSBarry Smith 5511435faa5fSBarry Smith /* copy prow into linked list */ 55124e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 55133b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 55144e2b4712SSatish Balay xi = aj + ai[r[prow]]; 
55154e2b4712SSatish Balay fill[n] = n; 5516435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 55174e2b4712SSatish Balay while (nz--) { 55184e2b4712SSatish Balay fm = n; 55194e2b4712SSatish Balay idx = ic[*xi++]; 55204e2b4712SSatish Balay do { 55214e2b4712SSatish Balay m = fm; 55224e2b4712SSatish Balay fm = fill[m]; 55234e2b4712SSatish Balay } while (fm < idx); 55244e2b4712SSatish Balay fill[m] = idx; 55254e2b4712SSatish Balay fill[idx] = fm; 55264e2b4712SSatish Balay im[idx] = 0; 55274e2b4712SSatish Balay } 5528435faa5fSBarry Smith 5529435faa5fSBarry Smith /* make sure diagonal entry is included */ 5530435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 5531435faa5fSBarry Smith fm = n; 5532435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 5533435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5534435faa5fSBarry Smith fill[fm] = prow; 5535435faa5fSBarry Smith im[prow] = 0; 5536435faa5fSBarry Smith nzf++; 5537335d9088SBarry Smith dcount++; 5538435faa5fSBarry Smith } 5539435faa5fSBarry Smith 55404e2b4712SSatish Balay nzi = 0; 55414e2b4712SSatish Balay row = fill[n]; 55424e2b4712SSatish Balay while (row < prow) { 55434e2b4712SSatish Balay incrlev = im[row] + 1; 55444e2b4712SSatish Balay nz = dloc[row]; 5545435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 55464e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 55474e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 55484e2b4712SSatish Balay fm = row; 55494e2b4712SSatish Balay while (nnz-- > 0) { 55504e2b4712SSatish Balay idx = *xi++; 55514e2b4712SSatish Balay if (*flev + incrlev > levels) { 55524e2b4712SSatish Balay flev++; 55534e2b4712SSatish Balay continue; 55544e2b4712SSatish Balay } 55554e2b4712SSatish Balay do { 55564e2b4712SSatish Balay m = fm; 55574e2b4712SSatish Balay fm = fill[m]; 55584e2b4712SSatish Balay } while (fm < idx); 55594e2b4712SSatish Balay if (fm != idx) { 55604e2b4712SSatish Balay im[idx] = *flev + 
incrlev; 55614e2b4712SSatish Balay fill[m] = idx; 55624e2b4712SSatish Balay fill[idx] = fm; 55634e2b4712SSatish Balay fm = idx; 55644e2b4712SSatish Balay nzf++; 5565ecf371e4SBarry Smith } else { 55664e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 55674e2b4712SSatish Balay } 55684e2b4712SSatish Balay flev++; 55694e2b4712SSatish Balay } 55704e2b4712SSatish Balay row = fill[row]; 55714e2b4712SSatish Balay nzi++; 55724e2b4712SSatish Balay } 55734e2b4712SSatish Balay /* copy new filled row into permanent storage */ 55744e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 55754e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 5576ecf371e4SBarry Smith 5577ecf371e4SBarry Smith /* estimate how much additional space we will need */ 5578ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5579ecf371e4SBarry Smith /* just double the memory each time */ 5580690b6cddSBarry Smith PetscInt maxadd = jmax; 5581ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 55824e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 55834e2b4712SSatish Balay jmax += maxadd; 5584ecf371e4SBarry Smith 5585ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 55865d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 55875d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5588606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 55895d0c19d7SBarry Smith ajnew = xitmp; 55905d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 55915d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5592606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 55935d0c19d7SBarry Smith ajfill = xitmp; 5594eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 55954e2b4712SSatish Balay } 55965d0c19d7SBarry Smith xitmp = 
ajnew + ainew[prow]; 55974e2b4712SSatish Balay flev = ajfill + ainew[prow]; 55984e2b4712SSatish Balay dloc[prow] = nzi; 55994e2b4712SSatish Balay fm = fill[n]; 56004e2b4712SSatish Balay while (nzf--) { 56015d0c19d7SBarry Smith *xitmp++ = fm; 56024e2b4712SSatish Balay *flev++ = im[fm]; 56034e2b4712SSatish Balay fm = fill[fm]; 56044e2b4712SSatish Balay } 5605435faa5fSBarry Smith /* make sure row has diagonal entry */ 5606435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 560777431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 56082401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5609435faa5fSBarry Smith } 56104e2b4712SSatish Balay } 5611606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 56124e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 56134e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5614606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 5615606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 56164e2b4712SSatish Balay 56176cf91177SBarry Smith #if defined(PETSC_USE_INFO) 56184e2b4712SSatish Balay { 5619329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5620ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5621ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5622ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5623ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5624335d9088SBarry Smith if (diagonal_fill) { 5625ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5626335d9088SBarry Smith } 56274e2b4712SSatish Balay } 562863ba0a88SBarry Smith #endif 56294e2b4712SSatish Balay 56304e2b4712SSatish 
Balay /* put together the new matrix */ 5631719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5632719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5633ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5634e6b907acSBarry Smith b->free_a = PETSC_TRUE; 5635e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 56367c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 5637a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 56384e2b4712SSatish Balay b->j = ajnew; 56394e2b4712SSatish Balay b->i = ainew; 56404e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 56414e2b4712SSatish Balay b->diag = dloc; 56427f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 56434e2b4712SSatish Balay b->ilen = 0; 56444e2b4712SSatish Balay b->imax = 0; 56454e2b4712SSatish Balay b->row = isrow; 56464e2b4712SSatish Balay b->col = iscol; 5647bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5648c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5649c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5650e51c0b9cSSatish Balay b->icol = isicol; 565187828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 56524e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 
56534e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 5654719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 56554e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 56564e2b4712SSatish Balay 5657ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 5658ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 5659ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 56606bce7ff8SHong Zhang 566141df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 56628661488fSKris Buschelman PetscFunctionReturn(0); 56638661488fSKris Buschelman } 56648661488fSKris Buschelman 5665732ee342SKris Buschelman #undef __FUNCT__ 56667e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5667dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 56687e7071cdSKris Buschelman { 566912272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 567012272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 56715a9542e3SKris Buschelman PetscFunctionBegin; 56727cf1b8d3SKris Buschelman /* Undo Column scaling */ 56737cf1b8d3SKris Buschelman /* while (nz--) { */ 56747cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 56757cf1b8d3SKris Buschelman /* } */ 5676c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 5677c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 56787cf1b8d3SKris Buschelman PetscFunctionReturn(0); 56797cf1b8d3SKris Buschelman } 56807cf1b8d3SKris Buschelman 56817cf1b8d3SKris Buschelman #undef __FUNCT__ 56827cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5683dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 56847cf1b8d3SKris Buschelman { 56857cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5686b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 56872aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 56885a9542e3SKris Buschelman PetscFunctionBegin; 56890b9da03eSKris Buschelman /* Is this really necessary? */ 569020235379SKris Buschelman while (nz--) { 56910b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 56927e7071cdSKris Buschelman } 5693c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 56947e7071cdSKris Buschelman PetscFunctionReturn(0); 56957e7071cdSKris Buschelman } 56967e7071cdSKris Buschelman 5697732ee342SKris Buschelman 5698