1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry Smith v = aa 
+ diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + 
diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120f1af5d2fSBarry Smith { 121f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122dfbe8321SBarry Smith PetscErrorCode ierr; 123690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 125f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12687828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12787828ca2SBarry Smith PetscScalar *x,*b; 
128f1af5d2fSBarry Smith 129f1af5d2fSBarry Smith PetscFunctionBegin; 130ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133f1af5d2fSBarry Smith 134f1af5d2fSBarry Smith /* forward solve the U^T */ 135f1af5d2fSBarry Smith idx = 0; 136f1af5d2fSBarry Smith for (i=0; i<n; i++) { 137f1af5d2fSBarry Smith 138f1af5d2fSBarry Smith v = aa + 9*diag[i]; 139f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 140ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144f1af5d2fSBarry Smith v += 9; 145f1af5d2fSBarry Smith 146f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 147f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 148f1af5d2fSBarry Smith while (nz--) { 149f1af5d2fSBarry Smith oidx = 3*(*vi++); 150f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153f1af5d2fSBarry Smith v += 9; 154f1af5d2fSBarry Smith } 155f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156f1af5d2fSBarry Smith idx += 3; 157f1af5d2fSBarry Smith } 158f1af5d2fSBarry Smith /* backward solve the L^T */ 159f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 160f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 161f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 162f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 163f1af5d2fSBarry Smith idt = 3*i; 164f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165f1af5d2fSBarry Smith while (nz--) { 166f1af5d2fSBarry Smith idx = 3*(*vi--); 167f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169f1af5d2fSBarry Smith x[idx+2] 
-= v[6]*s1 + v[7]*s2 + v[8]*s3; 170f1af5d2fSBarry Smith v -= 9; 171f1af5d2fSBarry Smith } 172f1af5d2fSBarry Smith } 1731ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176f1af5d2fSBarry Smith PetscFunctionReturn(0); 177f1af5d2fSBarry Smith } 178f1af5d2fSBarry Smith 1794a2ae208SSatish Balay #undef __FUNCT__ 1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182f1af5d2fSBarry Smith { 183f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184dfbe8321SBarry Smith PetscErrorCode ierr; 185690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 187f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18987828ca2SBarry Smith PetscScalar *x,*b; 190f1af5d2fSBarry Smith 191f1af5d2fSBarry Smith PetscFunctionBegin; 192ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1931ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195f1af5d2fSBarry Smith 196f1af5d2fSBarry Smith /* forward solve the U^T */ 197f1af5d2fSBarry Smith idx = 0; 198f1af5d2fSBarry Smith for (i=0; i<n; i++) { 199f1af5d2fSBarry Smith 200f1af5d2fSBarry Smith v = aa + 16*diag[i]; 201f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 202ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + 
v[14]*x3 + v[15]*x4; 207f1af5d2fSBarry Smith v += 16; 208f1af5d2fSBarry Smith 209f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 210f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 211f1af5d2fSBarry Smith while (nz--) { 212f1af5d2fSBarry Smith oidx = 4*(*vi++); 213f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217f1af5d2fSBarry Smith v += 16; 218f1af5d2fSBarry Smith } 219f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220f1af5d2fSBarry Smith idx += 4; 221f1af5d2fSBarry Smith } 222f1af5d2fSBarry Smith /* backward solve the L^T */ 223f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 224f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 225f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 226f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 227f1af5d2fSBarry Smith idt = 4*i; 228f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229f1af5d2fSBarry Smith while (nz--) { 230f1af5d2fSBarry Smith idx = 4*(*vi--); 231f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235f1af5d2fSBarry Smith v -= 16; 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith } 2381ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241f1af5d2fSBarry Smith PetscFunctionReturn(0); 242f1af5d2fSBarry Smith } 243f1af5d2fSBarry Smith 2444a2ae208SSatish Balay #undef __FUNCT__ 2454a2ae208SSatish Balay #define 
__FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247f1af5d2fSBarry Smith { 248f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249dfbe8321SBarry Smith PetscErrorCode ierr; 250690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 252f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 25387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25487828ca2SBarry Smith PetscScalar *x,*b; 255f1af5d2fSBarry Smith 256f1af5d2fSBarry Smith PetscFunctionBegin; 257ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2581ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2591ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260f1af5d2fSBarry Smith 261f1af5d2fSBarry Smith /* forward solve the U^T */ 262f1af5d2fSBarry Smith idx = 0; 263f1af5d2fSBarry Smith for (i=0; i<n; i++) { 264f1af5d2fSBarry Smith 265f1af5d2fSBarry Smith v = aa + 25*diag[i]; 266f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 267ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273f1af5d2fSBarry Smith v += 25; 274f1af5d2fSBarry Smith 275f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 276f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 277f1af5d2fSBarry Smith while (nz--) { 278f1af5d2fSBarry Smith oidx = 5*(*vi++); 279f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280f1af5d2fSBarry Smith x[oidx+1] 
-= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284f1af5d2fSBarry Smith v += 25; 285f1af5d2fSBarry Smith } 286f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287f1af5d2fSBarry Smith idx += 5; 288f1af5d2fSBarry Smith } 289f1af5d2fSBarry Smith /* backward solve the L^T */ 290f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 291f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 292f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 293f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 294f1af5d2fSBarry Smith idt = 5*i; 295f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296f1af5d2fSBarry Smith while (nz--) { 297f1af5d2fSBarry Smith idx = 5*(*vi--); 298f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303f1af5d2fSBarry Smith v -= 25; 304f1af5d2fSBarry Smith } 305f1af5d2fSBarry Smith } 3061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309f1af5d2fSBarry Smith PetscFunctionReturn(0); 310f1af5d2fSBarry Smith } 311f1af5d2fSBarry Smith 3124a2ae208SSatish Balay #undef __FUNCT__ 3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314dfbe8321SBarry Smith 
PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315f1af5d2fSBarry Smith { 316f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317dfbe8321SBarry Smith PetscErrorCode ierr; 318690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 320f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 32187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 32287828ca2SBarry Smith PetscScalar *x,*b; 323f1af5d2fSBarry Smith 324f1af5d2fSBarry Smith PetscFunctionBegin; 325ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328f1af5d2fSBarry Smith 329f1af5d2fSBarry Smith /* forward solve the U^T */ 330f1af5d2fSBarry Smith idx = 0; 331f1af5d2fSBarry Smith for (i=0; i<n; i++) { 332f1af5d2fSBarry Smith 333f1af5d2fSBarry Smith v = aa + 36*diag[i]; 334f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 335ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336ef66eb69SBarry Smith x6 = x[5+idx]; 337f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343f1af5d2fSBarry Smith v += 36; 344f1af5d2fSBarry Smith 345f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 346f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 347f1af5d2fSBarry Smith while (nz--) { 348f1af5d2fSBarry Smith oidx = 6*(*vi++); 
349f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355f1af5d2fSBarry Smith v += 36; 356f1af5d2fSBarry Smith } 357f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358f1af5d2fSBarry Smith x[5+idx] = s6; 359f1af5d2fSBarry Smith idx += 6; 360f1af5d2fSBarry Smith } 361f1af5d2fSBarry Smith /* backward solve the L^T */ 362f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 363f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 364f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 365f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 366f1af5d2fSBarry Smith idt = 6*i; 367f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368f1af5d2fSBarry Smith s6 = x[5+idt]; 369f1af5d2fSBarry Smith while (nz--) { 370f1af5d2fSBarry Smith idx = 6*(*vi--); 371f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 372f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377f1af5d2fSBarry Smith v -= 36; 378f1af5d2fSBarry Smith } 
379f1af5d2fSBarry Smith } 3801ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383f1af5d2fSBarry Smith PetscFunctionReturn(0); 384f1af5d2fSBarry Smith } 385f1af5d2fSBarry Smith 3864a2ae208SSatish Balay #undef __FUNCT__ 3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389f1af5d2fSBarry Smith { 390f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391dfbe8321SBarry Smith PetscErrorCode ierr; 392690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 394f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 39587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 39687828ca2SBarry Smith PetscScalar *x,*b; 397f1af5d2fSBarry Smith 398f1af5d2fSBarry Smith PetscFunctionBegin; 399ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4001ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402f1af5d2fSBarry Smith 403f1af5d2fSBarry Smith /* forward solve the U^T */ 404f1af5d2fSBarry Smith idx = 0; 405f1af5d2fSBarry Smith for (i=0; i<n; i++) { 406f1af5d2fSBarry Smith 407f1af5d2fSBarry Smith v = aa + 49*diag[i]; 408f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 409ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 411f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + 
v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 417f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418f1af5d2fSBarry Smith v += 49; 419f1af5d2fSBarry Smith 420f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 421f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 422f1af5d2fSBarry Smith while (nz--) { 423f1af5d2fSBarry Smith oidx = 7*(*vi++); 424f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431f1af5d2fSBarry Smith v += 49; 432f1af5d2fSBarry Smith } 433f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 435f1af5d2fSBarry Smith idx += 7; 436f1af5d2fSBarry Smith } 437f1af5d2fSBarry Smith /* backward solve the L^T */ 438f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 439f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 440f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 441f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 
442f1af5d2fSBarry Smith idt = 7*i; 443f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 445f1af5d2fSBarry Smith while (nz--) { 446f1af5d2fSBarry Smith idx = 7*(*vi--); 447f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454f1af5d2fSBarry Smith v -= 49; 455f1af5d2fSBarry Smith } 456f1af5d2fSBarry Smith } 4571ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4581ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460f1af5d2fSBarry Smith PetscFunctionReturn(0); 461f1af5d2fSBarry Smith } 462f1af5d2fSBarry Smith 463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4644a2ae208SSatish Balay #undef __FUNCT__ 4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467f1af5d2fSBarry Smith { 468f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 4706849ba73SBarry Smith PetscErrorCode ierr; 4715d0c19d7SBarry Smith const 
PetscInt *r,*c,*rout,*cout; 4725d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473690b6cddSBarry Smith PetscInt *diag = a->diag; 474f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 47587828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 476f1af5d2fSBarry Smith 477f1af5d2fSBarry Smith PetscFunctionBegin; 4781ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480f1af5d2fSBarry Smith t = a->solve_work; 481f1af5d2fSBarry Smith 482f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484f1af5d2fSBarry Smith 485f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 486f1af5d2fSBarry Smith for (i=0; i<n; i++) { 487f1af5d2fSBarry Smith t[i] = b[c[i]]; 488f1af5d2fSBarry Smith } 489f1af5d2fSBarry Smith 490f1af5d2fSBarry Smith /* forward solve the U^T */ 491f1af5d2fSBarry Smith for (i=0; i<n; i++) { 492f1af5d2fSBarry Smith 493f1af5d2fSBarry Smith v = aa + diag[i]; 494f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 495f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 496f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 497f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 498f1af5d2fSBarry Smith while (nz--) { 499f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 500f1af5d2fSBarry Smith } 501f1af5d2fSBarry Smith t[i] = s1; 502f1af5d2fSBarry Smith } 503f1af5d2fSBarry Smith /* backward solve the L^T */ 504f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 505f1af5d2fSBarry Smith v = aa + diag[i] - 1; 506f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 507f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 508f1af5d2fSBarry Smith s1 = t[i]; 509f1af5d2fSBarry Smith while (nz--) { 510f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 511f1af5d2fSBarry Smith } 512f1af5d2fSBarry Smith } 513f1af5d2fSBarry Smith 514f1af5d2fSBarry Smith /* copy t into x according to permutation */ 515f1af5d2fSBarry 
Smith for (i=0; i<n; i++) { 516f1af5d2fSBarry Smith x[r[i]] = t[i]; 517f1af5d2fSBarry Smith } 518f1af5d2fSBarry Smith 519f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5211ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5221ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 524f1af5d2fSBarry Smith PetscFunctionReturn(0); 525f1af5d2fSBarry Smith } 526f1af5d2fSBarry Smith 5274a2ae208SSatish Balay #undef __FUNCT__ 5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530f1af5d2fSBarry Smith { 531f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5336849ba73SBarry Smith PetscErrorCode ierr; 5345d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5355d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 537f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53887828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 53987828ca2SBarry Smith PetscScalar *x,*b,*t; 540f1af5d2fSBarry Smith 541f1af5d2fSBarry Smith PetscFunctionBegin; 5421ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5431ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544f1af5d2fSBarry Smith t = a->solve_work; 545f1af5d2fSBarry Smith 546f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548f1af5d2fSBarry Smith 549f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 550f1af5d2fSBarry Smith ii = 0; 551f1af5d2fSBarry Smith for (i=0; i<n; i++) { 552f1af5d2fSBarry Smith ic = 2*c[i]; 553f1af5d2fSBarry 
Smith t[ii] = b[ic]; 554f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 555f1af5d2fSBarry Smith ii += 2; 556f1af5d2fSBarry Smith } 557f1af5d2fSBarry Smith 558f1af5d2fSBarry Smith /* forward solve the U^T */ 559f1af5d2fSBarry Smith idx = 0; 560f1af5d2fSBarry Smith for (i=0; i<n; i++) { 561f1af5d2fSBarry Smith 562f1af5d2fSBarry Smith v = aa + 4*diag[i]; 563f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 564f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 565f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 566f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 567f1af5d2fSBarry Smith v += 4; 568f1af5d2fSBarry Smith 569f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 570f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 571f1af5d2fSBarry Smith while (nz--) { 572f1af5d2fSBarry Smith oidx = 2*(*vi++); 573f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 574f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 575f1af5d2fSBarry Smith v += 4; 576f1af5d2fSBarry Smith } 577f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 578f1af5d2fSBarry Smith idx += 2; 579f1af5d2fSBarry Smith } 580f1af5d2fSBarry Smith /* backward solve the L^T */ 581f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 582f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 583f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 584f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 585f1af5d2fSBarry Smith idt = 2*i; 586f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 587f1af5d2fSBarry Smith while (nz--) { 588f1af5d2fSBarry Smith idx = 2*(*vi--); 589f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 590f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 591f1af5d2fSBarry Smith v -= 4; 592f1af5d2fSBarry Smith } 593f1af5d2fSBarry Smith } 594f1af5d2fSBarry Smith 595f1af5d2fSBarry Smith /* copy t into x according to permutation */ 596f1af5d2fSBarry Smith ii = 0; 597f1af5d2fSBarry Smith for (i=0; i<n; i++) { 598f1af5d2fSBarry Smith ir = 2*r[i]; 599f1af5d2fSBarry Smith x[ir] = t[ii]; 600f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 601f1af5d2fSBarry Smith 
ii += 2; 602f1af5d2fSBarry Smith } 603f1af5d2fSBarry Smith 604f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609f1af5d2fSBarry Smith PetscFunctionReturn(0); 610f1af5d2fSBarry Smith } 611f1af5d2fSBarry Smith 6124a2ae208SSatish Balay #undef __FUNCT__ 6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615f1af5d2fSBarry Smith { 616f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6186849ba73SBarry Smith PetscErrorCode ierr; 6195d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 6205d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 622f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 62387828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 62487828ca2SBarry Smith PetscScalar *x,*b,*t; 625f1af5d2fSBarry Smith 626f1af5d2fSBarry Smith PetscFunctionBegin; 6271ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6281ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629f1af5d2fSBarry Smith t = a->solve_work; 630f1af5d2fSBarry Smith 631f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633f1af5d2fSBarry Smith 634f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 635f1af5d2fSBarry Smith ii = 0; 636f1af5d2fSBarry Smith for (i=0; i<n; i++) { 637f1af5d2fSBarry Smith ic = 3*c[i]; 638f1af5d2fSBarry Smith t[ii] = b[ic]; 639f1af5d2fSBarry Smith 
t[ii+1] = b[ic+1]; 640f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 641f1af5d2fSBarry Smith ii += 3; 642f1af5d2fSBarry Smith } 643f1af5d2fSBarry Smith 644f1af5d2fSBarry Smith /* forward solve the U^T */ 645f1af5d2fSBarry Smith idx = 0; 646f1af5d2fSBarry Smith for (i=0; i<n; i++) { 647f1af5d2fSBarry Smith 648f1af5d2fSBarry Smith v = aa + 9*diag[i]; 649f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 650f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654f1af5d2fSBarry Smith v += 9; 655f1af5d2fSBarry Smith 656f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 657f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 658f1af5d2fSBarry Smith while (nz--) { 659f1af5d2fSBarry Smith oidx = 3*(*vi++); 660f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663f1af5d2fSBarry Smith v += 9; 664f1af5d2fSBarry Smith } 665f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666f1af5d2fSBarry Smith idx += 3; 667f1af5d2fSBarry Smith } 668f1af5d2fSBarry Smith /* backward solve the L^T */ 669f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 670f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 671f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 672f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 673f1af5d2fSBarry Smith idt = 3*i; 674f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675f1af5d2fSBarry Smith while (nz--) { 676f1af5d2fSBarry Smith idx = 3*(*vi--); 677f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680f1af5d2fSBarry Smith v -= 9; 681f1af5d2fSBarry Smith } 682f1af5d2fSBarry Smith } 683f1af5d2fSBarry 
Smith 684f1af5d2fSBarry Smith /* copy t into x according to permutation */ 685f1af5d2fSBarry Smith ii = 0; 686f1af5d2fSBarry Smith for (i=0; i<n; i++) { 687f1af5d2fSBarry Smith ir = 3*r[i]; 688f1af5d2fSBarry Smith x[ir] = t[ii]; 689f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 690f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 691f1af5d2fSBarry Smith ii += 3; 692f1af5d2fSBarry Smith } 693f1af5d2fSBarry Smith 694f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6961ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699f1af5d2fSBarry Smith PetscFunctionReturn(0); 700f1af5d2fSBarry Smith } 701f1af5d2fSBarry Smith 7024a2ae208SSatish Balay #undef __FUNCT__ 7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705f1af5d2fSBarry Smith { 706f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7086849ba73SBarry Smith PetscErrorCode ierr; 7095d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 7105d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 712f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 71387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 71487828ca2SBarry Smith PetscScalar *x,*b,*t; 715f1af5d2fSBarry Smith 716f1af5d2fSBarry Smith PetscFunctionBegin; 7171ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719f1af5d2fSBarry Smith t = a->solve_work; 720f1af5d2fSBarry Smith 721f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722f1af5d2fSBarry Smith ierr = 
ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723f1af5d2fSBarry Smith 724f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 725f1af5d2fSBarry Smith ii = 0; 726f1af5d2fSBarry Smith for (i=0; i<n; i++) { 727f1af5d2fSBarry Smith ic = 4*c[i]; 728f1af5d2fSBarry Smith t[ii] = b[ic]; 729f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 730f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 731f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 732f1af5d2fSBarry Smith ii += 4; 733f1af5d2fSBarry Smith } 734f1af5d2fSBarry Smith 735f1af5d2fSBarry Smith /* forward solve the U^T */ 736f1af5d2fSBarry Smith idx = 0; 737f1af5d2fSBarry Smith for (i=0; i<n; i++) { 738f1af5d2fSBarry Smith 739f1af5d2fSBarry Smith v = aa + 16*diag[i]; 740f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 741f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746f1af5d2fSBarry Smith v += 16; 747f1af5d2fSBarry Smith 748f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 749f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 750f1af5d2fSBarry Smith while (nz--) { 751f1af5d2fSBarry Smith oidx = 4*(*vi++); 752f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756f1af5d2fSBarry Smith v += 16; 757f1af5d2fSBarry Smith } 758f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759f1af5d2fSBarry Smith idx += 4; 760f1af5d2fSBarry Smith } 761f1af5d2fSBarry Smith /* backward solve the L^T */ 762f1af5d2fSBarry Smith for (i=n-1; 
i>=0; i--){ 763f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 764f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 765f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 766f1af5d2fSBarry Smith idt = 4*i; 767f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768f1af5d2fSBarry Smith while (nz--) { 769f1af5d2fSBarry Smith idx = 4*(*vi--); 770f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774f1af5d2fSBarry Smith v -= 16; 775f1af5d2fSBarry Smith } 776f1af5d2fSBarry Smith } 777f1af5d2fSBarry Smith 778f1af5d2fSBarry Smith /* copy t into x according to permutation */ 779f1af5d2fSBarry Smith ii = 0; 780f1af5d2fSBarry Smith for (i=0; i<n; i++) { 781f1af5d2fSBarry Smith ir = 4*r[i]; 782f1af5d2fSBarry Smith x[ir] = t[ii]; 783f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 784f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 785f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 786f1af5d2fSBarry Smith ii += 4; 787f1af5d2fSBarry Smith } 788f1af5d2fSBarry Smith 789f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794f1af5d2fSBarry Smith PetscFunctionReturn(0); 795f1af5d2fSBarry Smith } 796f1af5d2fSBarry Smith 7974a2ae208SSatish Balay #undef __FUNCT__ 7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800f1af5d2fSBarry Smith { 801f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802f1af5d2fSBarry Smith IS 
iscol=a->col,isrow=a->row; 8036849ba73SBarry Smith PetscErrorCode ierr; 8045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 807f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 80887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 80987828ca2SBarry Smith PetscScalar *x,*b,*t; 810f1af5d2fSBarry Smith 811f1af5d2fSBarry Smith PetscFunctionBegin; 8121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814f1af5d2fSBarry Smith t = a->solve_work; 815f1af5d2fSBarry Smith 816f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818f1af5d2fSBarry Smith 819f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 820f1af5d2fSBarry Smith ii = 0; 821f1af5d2fSBarry Smith for (i=0; i<n; i++) { 822f1af5d2fSBarry Smith ic = 5*c[i]; 823f1af5d2fSBarry Smith t[ii] = b[ic]; 824f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 825f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 826f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 827f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 828f1af5d2fSBarry Smith ii += 5; 829f1af5d2fSBarry Smith } 830f1af5d2fSBarry Smith 831f1af5d2fSBarry Smith /* forward solve the U^T */ 832f1af5d2fSBarry Smith idx = 0; 833f1af5d2fSBarry Smith for (i=0; i<n; i++) { 834f1af5d2fSBarry Smith 835f1af5d2fSBarry Smith v = aa + 25*diag[i]; 836f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 837f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 
841f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843f1af5d2fSBarry Smith v += 25; 844f1af5d2fSBarry Smith 845f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 846f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 847f1af5d2fSBarry Smith while (nz--) { 848f1af5d2fSBarry Smith oidx = 5*(*vi++); 849f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854f1af5d2fSBarry Smith v += 25; 855f1af5d2fSBarry Smith } 856f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857f1af5d2fSBarry Smith idx += 5; 858f1af5d2fSBarry Smith } 859f1af5d2fSBarry Smith /* backward solve the L^T */ 860f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 861f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 862f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 863f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 864f1af5d2fSBarry Smith idt = 5*i; 865f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866f1af5d2fSBarry Smith while (nz--) { 867f1af5d2fSBarry Smith idx = 5*(*vi--); 868f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873f1af5d2fSBarry Smith v -= 25; 
874f1af5d2fSBarry Smith } 875f1af5d2fSBarry Smith } 876f1af5d2fSBarry Smith 877f1af5d2fSBarry Smith /* copy t into x according to permutation */ 878f1af5d2fSBarry Smith ii = 0; 879f1af5d2fSBarry Smith for (i=0; i<n; i++) { 880f1af5d2fSBarry Smith ir = 5*r[i]; 881f1af5d2fSBarry Smith x[ir] = t[ii]; 882f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 883f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 884f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 885f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 886f1af5d2fSBarry Smith ii += 5; 887f1af5d2fSBarry Smith } 888f1af5d2fSBarry Smith 889f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894f1af5d2fSBarry Smith PetscFunctionReturn(0); 895f1af5d2fSBarry Smith } 896f1af5d2fSBarry Smith 8974a2ae208SSatish Balay #undef __FUNCT__ 8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900f1af5d2fSBarry Smith { 901f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9036849ba73SBarry Smith PetscErrorCode ierr; 9045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 907f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 90887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 90987828ca2SBarry Smith PetscScalar *x,*b,*t; 910f1af5d2fSBarry Smith 911f1af5d2fSBarry Smith PetscFunctionBegin; 9121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
914f1af5d2fSBarry Smith t = a->solve_work; 915f1af5d2fSBarry Smith 916f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918f1af5d2fSBarry Smith 919f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 920f1af5d2fSBarry Smith ii = 0; 921f1af5d2fSBarry Smith for (i=0; i<n; i++) { 922f1af5d2fSBarry Smith ic = 6*c[i]; 923f1af5d2fSBarry Smith t[ii] = b[ic]; 924f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 925f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 926f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 927f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 928f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 929f1af5d2fSBarry Smith ii += 6; 930f1af5d2fSBarry Smith } 931f1af5d2fSBarry Smith 932f1af5d2fSBarry Smith /* forward solve the U^T */ 933f1af5d2fSBarry Smith idx = 0; 934f1af5d2fSBarry Smith for (i=0; i<n; i++) { 935f1af5d2fSBarry Smith 936f1af5d2fSBarry Smith v = aa + 36*diag[i]; 937f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 938f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939f1af5d2fSBarry Smith x6 = t[5+idx]; 940f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946f1af5d2fSBarry Smith v += 36; 947f1af5d2fSBarry Smith 948f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 949f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 950f1af5d2fSBarry Smith while (nz--) { 951f1af5d2fSBarry Smith oidx = 6*(*vi++); 
952f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958f1af5d2fSBarry Smith v += 36; 959f1af5d2fSBarry Smith } 960f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961f1af5d2fSBarry Smith t[5+idx] = s6; 962f1af5d2fSBarry Smith idx += 6; 963f1af5d2fSBarry Smith } 964f1af5d2fSBarry Smith /* backward solve the L^T */ 965f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 966f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 967f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 968f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 969f1af5d2fSBarry Smith idt = 6*i; 970f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971f1af5d2fSBarry Smith s6 = t[5+idt]; 972f1af5d2fSBarry Smith while (nz--) { 973f1af5d2fSBarry Smith idx = 6*(*vi--); 974f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980f1af5d2fSBarry Smith v -= 36; 981f1af5d2fSBarry Smith } 
982f1af5d2fSBarry Smith } 983f1af5d2fSBarry Smith 984f1af5d2fSBarry Smith /* copy t into x according to permutation */ 985f1af5d2fSBarry Smith ii = 0; 986f1af5d2fSBarry Smith for (i=0; i<n; i++) { 987f1af5d2fSBarry Smith ir = 6*r[i]; 988f1af5d2fSBarry Smith x[ir] = t[ii]; 989f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 990f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 991f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 992f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 993f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 994f1af5d2fSBarry Smith ii += 6; 995f1af5d2fSBarry Smith } 996f1af5d2fSBarry Smith 997f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9991ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002f1af5d2fSBarry Smith PetscFunctionReturn(0); 1003f1af5d2fSBarry Smith } 1004f1af5d2fSBarry Smith 10054a2ae208SSatish Balay #undef __FUNCT__ 10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008f1af5d2fSBarry Smith { 1009f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10116849ba73SBarry Smith PetscErrorCode ierr; 10125d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10135d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 101687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 101787828ca2SBarry Smith PetscScalar *x,*b,*t; 1018f1af5d2fSBarry Smith 1019f1af5d2fSBarry Smith PetscFunctionBegin; 10201ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10211ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 1022f1af5d2fSBarry Smith t = a->solve_work; 1023f1af5d2fSBarry Smith 1024f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026f1af5d2fSBarry Smith 1027f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1028f1af5d2fSBarry Smith ii = 0; 1029f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1030f1af5d2fSBarry Smith ic = 7*c[i]; 1031f1af5d2fSBarry Smith t[ii] = b[ic]; 1032f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1033f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1034f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1035f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1036f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1037f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1038f1af5d2fSBarry Smith ii += 7; 1039f1af5d2fSBarry Smith } 1040f1af5d2fSBarry Smith 1041f1af5d2fSBarry Smith /* forward solve the U^T */ 1042f1af5d2fSBarry Smith idx = 0; 1043f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1044f1af5d2fSBarry Smith 1045f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1046f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1047f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1049f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055f1af5d2fSBarry Smith s7 = v[42]*x1 + 
v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056f1af5d2fSBarry Smith v += 49; 1057f1af5d2fSBarry Smith 1058f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1059f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1060f1af5d2fSBarry Smith while (nz--) { 1061f1af5d2fSBarry Smith oidx = 7*(*vi++); 1062f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069f1af5d2fSBarry Smith v += 49; 1070f1af5d2fSBarry Smith } 1071f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1073f1af5d2fSBarry Smith idx += 7; 1074f1af5d2fSBarry Smith } 1075f1af5d2fSBarry Smith /* backward solve the L^T */ 1076f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1077f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1078f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1079f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1080f1af5d2fSBarry Smith idt = 7*i; 1081f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1083f1af5d2fSBarry Smith while (nz--) { 1084f1af5d2fSBarry Smith idx = 7*(*vi--); 1085f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 
1086f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092f1af5d2fSBarry Smith v -= 49; 1093f1af5d2fSBarry Smith } 1094f1af5d2fSBarry Smith } 1095f1af5d2fSBarry Smith 1096f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1097f1af5d2fSBarry Smith ii = 0; 1098f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1099f1af5d2fSBarry Smith ir = 7*r[i]; 1100f1af5d2fSBarry Smith x[ir] = t[ii]; 1101f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1102f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1103f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1104f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1105f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1106f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1107f1af5d2fSBarry Smith ii += 7; 1108f1af5d2fSBarry Smith } 1109f1af5d2fSBarry Smith 1110f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11121ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115f1af5d2fSBarry Smith PetscFunctionReturn(0); 1116f1af5d2fSBarry Smith } 1117f1af5d2fSBarry Smith 11184e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11194a2ae208SSatish Balay #undef __FUNCT__ 
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11224e2b4712SSatish Balay { 11234e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11244e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11256849ba73SBarry Smith PetscErrorCode ierr; 11265d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11275d0c19d7SBarry Smith PetscInt i,n=a->mbs; 11285d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11293f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 113087828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11314e2b4712SSatish Balay 11324e2b4712SSatish Balay PetscFunctionBegin; 11331ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135f1af5d2fSBarry Smith t = a->solve_work; 11364e2b4712SSatish Balay 11374e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11384e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11394e2b4712SSatish Balay 11404e2b4712SSatish Balay /* forward solve the lower triangular */ 114187828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11424e2b4712SSatish Balay for (i=1; i<n; i++) { 11434e2b4712SSatish Balay v = aa + bs2*ai[i]; 11444e2b4712SSatish Balay vi = aj + ai[i]; 11454e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1146f1af5d2fSBarry Smith s = t + bs*i; 114787828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11484e2b4712SSatish Balay while (nz--) { 1149f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11504e2b4712SSatish Balay v += bs2; 11514e2b4712SSatish Balay } 11524e2b4712SSatish Balay } 11534e2b4712SSatish Balay /* backward solve the upper triangular */ 1154d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 11554e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 
11564e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11574e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11584e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 115987828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11604e2b4712SSatish Balay while (nz--) { 1161f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11624e2b4712SSatish Balay v += bs2; 11634e2b4712SSatish Balay } 1164f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 116587828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11664e2b4712SSatish Balay } 11674e2b4712SSatish Balay 11684e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11694e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11701ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 11734e2b4712SSatish Balay PetscFunctionReturn(0); 11744e2b4712SSatish Balay } 11754e2b4712SSatish Balay 11765c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 11775c42ef9dSBarry Smith #undef __FUNCT__ 11785c42ef9dSBarry Smith #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 11795c42ef9dSBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11805c42ef9dSBarry Smith { 11815c42ef9dSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11825c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 11835c42ef9dSBarry Smith PetscErrorCode ierr; 11845c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11855c42ef9dSBarry Smith PetscInt i,n=a->mbs,j; 11865c42ef9dSBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11875c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 11885c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 11895c42ef9dSBarry Smith 
const PetscScalar *b; 11905c42ef9dSBarry Smith PetscFunctionBegin; 11915c42ef9dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 11925c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 11935c42ef9dSBarry Smith t = a->solve_work; 11945c42ef9dSBarry Smith 11955c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11965c42ef9dSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 11975c42ef9dSBarry Smith 11985c42ef9dSBarry Smith /* copy the b into temp work space according to permutation */ 11995c42ef9dSBarry Smith for (i=0; i<n; i++) { 12005c42ef9dSBarry Smith for (j=0; j<bs; j++) { 12015c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 12025c42ef9dSBarry Smith } 12035c42ef9dSBarry Smith } 12045c42ef9dSBarry Smith 12055c42ef9dSBarry Smith 12065c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 12075c42ef9dSBarry Smith ls = a->solve_work + A->cmap->n; 12085c42ef9dSBarry Smith for (i=0; i<n; i++){ 12095c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 12105c42ef9dSBarry Smith Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 12115c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 12125c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 12135c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 12145c42ef9dSBarry Smith while (nz--) { 12155c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 12165c42ef9dSBarry Smith v += bs2; 12175c42ef9dSBarry Smith } 12185c42ef9dSBarry Smith } 12195c42ef9dSBarry Smith 12205c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 12215c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 12225c42ef9dSBarry Smith v = aa + bs2*ai[i]; 12235c42ef9dSBarry Smith vi = aj + ai[i]; 12245c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 12255c42ef9dSBarry Smith while (nz--) { 12265c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 12275c42ef9dSBarry Smith v 
+= bs2; 12285c42ef9dSBarry Smith } 12295c42ef9dSBarry Smith } 12305c42ef9dSBarry Smith 12315c42ef9dSBarry Smith /* copy t into x according to permutation */ 12325c42ef9dSBarry Smith for (i=0; i<n; i++) { 12335c42ef9dSBarry Smith for (j=0; j<bs; j++) { 12345c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 12355c42ef9dSBarry Smith } 12365c42ef9dSBarry Smith } 12375c42ef9dSBarry Smith 12385c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12395c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12405c42ef9dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 12415c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 12425c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 12435c42ef9dSBarry Smith PetscFunctionReturn(0); 12445c42ef9dSBarry Smith } 12455c42ef9dSBarry Smith 12464a2ae208SSatish Balay #undef __FUNCT__ 12474a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1248dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 12494e2b4712SSatish Balay { 12504e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 12514e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 12526849ba73SBarry Smith PetscErrorCode ierr; 12535d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 12545d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 12553f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 125687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 125787828ca2SBarry Smith PetscScalar *x,*b,*t; 12584e2b4712SSatish Balay 12594e2b4712SSatish Balay PetscFunctionBegin; 12601ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 12611ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1262f1af5d2fSBarry Smith t = a->solve_work; 12634e2b4712SSatish Balay 12644e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 12654e2b4712SSatish Balay 
ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 12664e2b4712SSatish Balay 12674e2b4712SSatish Balay /* forward solve the lower triangular */ 12684e2b4712SSatish Balay idx = 7*(*r++); 1269f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1270f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1271f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 12724e2b4712SSatish Balay 12734e2b4712SSatish Balay for (i=1; i<n; i++) { 12744e2b4712SSatish Balay v = aa + 49*ai[i]; 12754e2b4712SSatish Balay vi = aj + ai[i]; 12764e2b4712SSatish Balay nz = diag[i] - ai[i]; 12774e2b4712SSatish Balay idx = 7*(*r++); 1278f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1279f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 12804e2b4712SSatish Balay while (nz--) { 12814e2b4712SSatish Balay idx = 7*(*vi++); 1282f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1283f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1284f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1285f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1286f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1287f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1288f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1289f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1290f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1291f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12924e2b4712SSatish Balay v += 49; 12934e2b4712SSatish Balay } 12944e2b4712SSatish Balay idx = 7*i; 1295f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1296f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 
t[4+idx] = s5; 1297f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12984e2b4712SSatish Balay } 12994e2b4712SSatish Balay /* backward solve the upper triangular */ 13004e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 13014e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 13024e2b4712SSatish Balay vi = aj + diag[i] + 1; 13034e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 13044e2b4712SSatish Balay idt = 7*i; 1305f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1306f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1307f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 13084e2b4712SSatish Balay while (nz--) { 13094e2b4712SSatish Balay idx = 7*(*vi++); 1310f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1311f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1312f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1313f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1314f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1315f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1316f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1317f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1318f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1319f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13204e2b4712SSatish Balay v += 49; 13214e2b4712SSatish Balay } 13224e2b4712SSatish Balay idc = 7*(*c--); 13234e2b4712SSatish Balay v = aa + 49*diag[i]; 1324f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1325f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1326f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1327f1af5d2fSBarry Smith 
v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1328f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1329f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1330f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1331f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1332f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1333f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1334f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1335f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1336f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1337f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 13384e2b4712SSatish Balay } 13394e2b4712SSatish Balay 13404e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 13414e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 13421ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 13431ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1344dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 13454e2b4712SSatish Balay PetscFunctionReturn(0); 13464e2b4712SSatish Balay } 13474e2b4712SSatish Balay 1348*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 13494a2ae208SSatish Balay #undef __FUNCT__ 13508f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 13518f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 13528f690400SShri Abhyankar { 13538f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 13548f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 13558f690400SShri Abhyankar PetscErrorCode ierr; 13568f690400SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi; 135729b92fc1SShri Abhyankar PetscInt i,n=a->mbs,nz,idx,idt,idc,k,m; 13588f690400SShri Abhyankar MatScalar *aa=a->a,*v; 13598f690400SShri Abhyankar PetscScalar 
s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 13608f690400SShri Abhyankar PetscScalar *x,*b,*t; 13618f690400SShri Abhyankar 13628f690400SShri Abhyankar PetscFunctionBegin; 13638f690400SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 13648f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 13658f690400SShri Abhyankar t = a->solve_work; 13668f690400SShri Abhyankar 13678f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 136829b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 13698f690400SShri Abhyankar 13708f690400SShri Abhyankar /* forward solve the lower triangular */ 137129b92fc1SShri Abhyankar idx = 7*r[0]; 13728f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 13738f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 13748f690400SShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 13758f690400SShri Abhyankar 13768f690400SShri Abhyankar for (i=1; i<n; i++) { 13778f690400SShri Abhyankar v = aa + 49*ai[i]; 13788f690400SShri Abhyankar vi = aj + ai[i]; 13798f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 138029b92fc1SShri Abhyankar idx = 7*r[i]; 13818f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 13828f690400SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 138329b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 138429b92fc1SShri Abhyankar idx = 7*vi[m]; 13858f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 13868f690400SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 13878f690400SShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 13888f690400SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 13898f690400SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 13908f690400SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 13918f690400SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + 
v[38]*x6 + v[45]*x7; 13928f690400SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 13938f690400SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 13948f690400SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13958f690400SShri Abhyankar v += 49; 13968f690400SShri Abhyankar } 13978f690400SShri Abhyankar idx = 7*i; 13988f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 13998f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 14008f690400SShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 14018f690400SShri Abhyankar } 14028f690400SShri Abhyankar /* backward solve the upper triangular */ 14038f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 14048f690400SShri Abhyankar k = 2*n-i; 14058f690400SShri Abhyankar v = aa + 49*ai[k]; 14068f690400SShri Abhyankar vi = aj + ai[k]; 14078f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 14088f690400SShri Abhyankar idt = 7*i; 14098f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 14108f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 14118f690400SShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 141229b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 141329b92fc1SShri Abhyankar idx = 7*vi[m]; 14148f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 14158f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 14168f690400SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 14178f690400SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 14188f690400SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 14198f690400SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 14208f690400SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 14218f690400SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + 
v[39]*x6 + v[46]*x7; 14228f690400SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 14238f690400SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 14248f690400SShri Abhyankar v += 49; 14258f690400SShri Abhyankar } 142629b92fc1SShri Abhyankar idc = 7*c[i]; 14278f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 14288f690400SShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 14298f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 14308f690400SShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 14318f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 14328f690400SShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 14338f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 14348f690400SShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 14358f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 14368f690400SShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 14378f690400SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 14388f690400SShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 14398f690400SShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 14408f690400SShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 14418f690400SShri Abhyankar } 14428f690400SShri Abhyankar 14438f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 14448f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 14458f690400SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 14468f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 14478f690400SShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 14488f690400SShri Abhyankar PetscFunctionReturn(0); 14498f690400SShri Abhyankar } 1450*a2d6a19aSShri Abhyankar #endif 14518f690400SShri Abhyankar 14528f690400SShri Abhyankar #undef __FUNCT__ 
1453*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1454*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 145535aa4fcfSShri Abhyankar { 145635aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 145735aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 145835aa4fcfSShri Abhyankar PetscErrorCode ierr; 145935aa4fcfSShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 146035aa4fcfSShri Abhyankar PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 146135aa4fcfSShri Abhyankar MatScalar *aa=a->a,*v; 146235aa4fcfSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 146335aa4fcfSShri Abhyankar PetscScalar *x,*b,*t; 146435aa4fcfSShri Abhyankar 146535aa4fcfSShri Abhyankar PetscFunctionBegin; 146635aa4fcfSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 146735aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 146835aa4fcfSShri Abhyankar t = a->solve_work; 146935aa4fcfSShri Abhyankar 147035aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 147135aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 147235aa4fcfSShri Abhyankar 147335aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 147435aa4fcfSShri Abhyankar idx = 7*r[0]; 147535aa4fcfSShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 147635aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 147735aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 147835aa4fcfSShri Abhyankar 147935aa4fcfSShri Abhyankar for (i=1; i<n; i++) { 148035aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 148135aa4fcfSShri Abhyankar vi = aj + ai[i]; 148235aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 148335aa4fcfSShri Abhyankar idx = 7*r[i]; 148435aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 148535aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 148635aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 
148735aa4fcfSShri Abhyankar idx = 7*vi[m]; 148835aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 148935aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 149035aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 149135aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 149235aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 149335aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 149435aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 149535aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 149635aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 149735aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 149835aa4fcfSShri Abhyankar v += 49; 149935aa4fcfSShri Abhyankar } 150035aa4fcfSShri Abhyankar idx = 7*i; 150135aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 150235aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 150335aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 150435aa4fcfSShri Abhyankar } 150535aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 150635aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 150735aa4fcfSShri Abhyankar v = aa + 49*(adiag[i+1]+1); 150835aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 150935aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 151035aa4fcfSShri Abhyankar idt = 7*i; 151135aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 151235aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 151335aa4fcfSShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 151435aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 151535aa4fcfSShri Abhyankar idx = 7*vi[m]; 151635aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 151735aa4fcfSShri Abhyankar x3 = 
t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 151835aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 151935aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 152035aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 152135aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 152235aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 152335aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 152435aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 152535aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 152635aa4fcfSShri Abhyankar v += 49; 152735aa4fcfSShri Abhyankar } 152835aa4fcfSShri Abhyankar idc = 7*c[i]; 152935aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 153035aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 153135aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 153235aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 153335aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 153435aa4fcfSShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 153535aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 153635aa4fcfSShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 153735aa4fcfSShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 153835aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 153935aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 154035aa4fcfSShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 154135aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 154235aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 154335aa4fcfSShri Abhyankar } 
154435aa4fcfSShri Abhyankar 154535aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 154635aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 154735aa4fcfSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 154835aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 154935aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 155035aa4fcfSShri Abhyankar PetscFunctionReturn(0); 155135aa4fcfSShri Abhyankar } 155235aa4fcfSShri Abhyankar 155335aa4fcfSShri Abhyankar #undef __FUNCT__ 15544a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1555dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 155615091d37SBarry Smith { 155715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1558690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1559dfbe8321SBarry Smith PetscErrorCode ierr; 1560690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1561d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1562d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1563d9fead3dSBarry Smith const PetscScalar *b; 156415091d37SBarry Smith 156515091d37SBarry Smith PetscFunctionBegin; 1566d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15671ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 156815091d37SBarry Smith /* forward solve the lower triangular */ 156915091d37SBarry Smith idx = 0; 157015091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 157115091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 157215091d37SBarry Smith x[6] = b[6+idx]; 157315091d37SBarry Smith for (i=1; i<n; i++) { 157415091d37SBarry Smith v = aa + 49*ai[i]; 157515091d37SBarry Smith vi = aj + ai[i]; 157615091d37SBarry Smith nz = diag[i] - ai[i]; 157715091d37SBarry Smith idx = 7*i; 1578f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 
= b[2+idx]; 1579f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1580f1af5d2fSBarry Smith s7 = b[6+idx]; 158115091d37SBarry Smith while (nz--) { 158215091d37SBarry Smith jdx = 7*(*vi++); 158315091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 158415091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 158515091d37SBarry Smith x7 = x[6+jdx]; 1586f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1587f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1588f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1589f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1590f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1591f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1592f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 159315091d37SBarry Smith v += 49; 159415091d37SBarry Smith } 1595f1af5d2fSBarry Smith x[idx] = s1; 1596f1af5d2fSBarry Smith x[1+idx] = s2; 1597f1af5d2fSBarry Smith x[2+idx] = s3; 1598f1af5d2fSBarry Smith x[3+idx] = s4; 1599f1af5d2fSBarry Smith x[4+idx] = s5; 1600f1af5d2fSBarry Smith x[5+idx] = s6; 1601f1af5d2fSBarry Smith x[6+idx] = s7; 160215091d37SBarry Smith } 160315091d37SBarry Smith /* backward solve the upper triangular */ 160415091d37SBarry Smith for (i=n-1; i>=0; i--){ 160515091d37SBarry Smith v = aa + 49*diag[i] + 49; 160615091d37SBarry Smith vi = aj + diag[i] + 1; 160715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 160815091d37SBarry Smith idt = 7*i; 1609f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1610f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1611f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1612f1af5d2fSBarry Smith s7 = x[6+idt]; 
161315091d37SBarry Smith while (nz--) { 161415091d37SBarry Smith idx = 7*(*vi++); 161515091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 161615091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 161715091d37SBarry Smith x7 = x[6+idx]; 1618f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1619f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1620f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1621f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1622f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1623f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1624f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 162515091d37SBarry Smith v += 49; 162615091d37SBarry Smith } 162715091d37SBarry Smith v = aa + 49*diag[i]; 1628f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1629f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1630f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1631f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1632f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1633f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1634f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1635f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1636f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1637f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1638f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1639f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1640f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 
1641f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 164215091d37SBarry Smith } 164315091d37SBarry Smith 1644d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16451ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1646dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 164715091d37SBarry Smith PetscFunctionReturn(0); 164815091d37SBarry Smith } 164915091d37SBarry Smith 1650*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 16514a2ae208SSatish Balay #undef __FUNCT__ 1652cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1653cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1654cee9d6f2SShri Abhyankar { 1655cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 16566464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1657cee9d6f2SShri Abhyankar PetscErrorCode ierr; 1658cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 1659cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1660cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 1661cee9d6f2SShri Abhyankar PetscScalar *x; 1662cee9d6f2SShri Abhyankar const PetscScalar *b; 1663cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1664cee9d6f2SShri Abhyankar 1665cee9d6f2SShri Abhyankar PetscFunctionBegin; 1666cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1667cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1668cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 1669cee9d6f2SShri Abhyankar idx = 0; 1670cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1671cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1672cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 1673cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 1674cee9d6f2SShri Abhyankar vi = aj + 
ai[i]; 1675cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 1676cee9d6f2SShri Abhyankar idx = bs*i; 1677cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1678cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 16796464896eSShri Abhyankar for(k=0;k<nz;k++) { 16806464896eSShri Abhyankar jdx = bs*vi[k]; 1681cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1682cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1683cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1684cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1685cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1686cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1687cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1688cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1689cee9d6f2SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1690cee9d6f2SShri Abhyankar v += bs2; 1691cee9d6f2SShri Abhyankar } 1692cee9d6f2SShri Abhyankar 1693cee9d6f2SShri Abhyankar x[idx] = s1; 1694cee9d6f2SShri Abhyankar x[1+idx] = s2; 1695cee9d6f2SShri Abhyankar x[2+idx] = s3; 1696cee9d6f2SShri Abhyankar x[3+idx] = s4; 1697cee9d6f2SShri Abhyankar x[4+idx] = s5; 1698cee9d6f2SShri Abhyankar x[5+idx] = s6; 1699cee9d6f2SShri Abhyankar x[6+idx] = s7; 1700cee9d6f2SShri Abhyankar } 1701cee9d6f2SShri Abhyankar 1702cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 1703cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 1704cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 1705cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 1706cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 
1707cee9d6f2SShri Abhyankar idt = bs*i; 1708cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1709cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 17106464896eSShri Abhyankar for(k=0;k<nz;k++) { 17116464896eSShri Abhyankar idx = bs*vi[k]; 1712cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1713cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1714cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1715cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1716cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1717cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1718cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1719cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1720cee9d6f2SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1721cee9d6f2SShri Abhyankar v += bs2; 1722cee9d6f2SShri Abhyankar } 1723cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 1724cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1725cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1726cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1727cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1728cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1729cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + 
v[47]*s7; 1730cee9d6f2SShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1731cee9d6f2SShri Abhyankar } 1732cee9d6f2SShri Abhyankar 1733cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1734cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1735cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1736cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 1737cee9d6f2SShri Abhyankar } 1738*a2d6a19aSShri Abhyankar #endif 1739cee9d6f2SShri Abhyankar 1740cee9d6f2SShri Abhyankar #undef __FUNCT__ 1741*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1742*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 174353cca76cSShri Abhyankar { 174453cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 174553cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 174653cca76cSShri Abhyankar PetscErrorCode ierr; 174753cca76cSShri Abhyankar PetscInt idx,jdx,idt; 174853cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 174953cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 175053cca76cSShri Abhyankar PetscScalar *x; 175153cca76cSShri Abhyankar const PetscScalar *b; 175253cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 175353cca76cSShri Abhyankar 175453cca76cSShri Abhyankar PetscFunctionBegin; 175553cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 175653cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 175753cca76cSShri Abhyankar /* forward solve the lower triangular */ 175853cca76cSShri Abhyankar idx = 0; 175953cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 176053cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 176153cca76cSShri Abhyankar for (i=1; i<n; i++) { 
176253cca76cSShri Abhyankar v = aa + bs2*ai[i]; 176353cca76cSShri Abhyankar vi = aj + ai[i]; 176453cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 176553cca76cSShri Abhyankar idx = bs*i; 176653cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 176753cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 176853cca76cSShri Abhyankar for(k=0;k<nz;k++) { 176953cca76cSShri Abhyankar jdx = bs*vi[k]; 177053cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 177153cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 177253cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 177353cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 177453cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 177553cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 177653cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 177753cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 177853cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 177953cca76cSShri Abhyankar v += bs2; 178053cca76cSShri Abhyankar } 178153cca76cSShri Abhyankar 178253cca76cSShri Abhyankar x[idx] = s1; 178353cca76cSShri Abhyankar x[1+idx] = s2; 178453cca76cSShri Abhyankar x[2+idx] = s3; 178553cca76cSShri Abhyankar x[3+idx] = s4; 178653cca76cSShri Abhyankar x[4+idx] = s5; 178753cca76cSShri Abhyankar x[5+idx] = s6; 178853cca76cSShri Abhyankar x[6+idx] = s7; 178953cca76cSShri Abhyankar } 179053cca76cSShri Abhyankar 179153cca76cSShri Abhyankar /* backward solve the upper triangular */ 179253cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 179353cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 179453cca76cSShri 
Abhyankar vi = aj + adiag[i+1]+1; 179553cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 179653cca76cSShri Abhyankar idt = bs*i; 179753cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 179853cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 179953cca76cSShri Abhyankar for(k=0;k<nz;k++) { 180053cca76cSShri Abhyankar idx = bs*vi[k]; 180153cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 180253cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 180353cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 180453cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 180553cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 180653cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 180753cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 180853cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 180953cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 181053cca76cSShri Abhyankar v += bs2; 181153cca76cSShri Abhyankar } 181253cca76cSShri Abhyankar /* x = inv_diagonal*x */ 181353cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 181453cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 181553cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 181653cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 181753cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 181853cca76cSShri 
Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 181953cca76cSShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 182053cca76cSShri Abhyankar } 182153cca76cSShri Abhyankar 182253cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 182353cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 182453cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 182553cca76cSShri Abhyankar PetscFunctionReturn(0); 182653cca76cSShri Abhyankar } 182753cca76cSShri Abhyankar 182853cca76cSShri Abhyankar #undef __FUNCT__ 18294a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1830dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 183115091d37SBarry Smith { 183215091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 183315091d37SBarry Smith IS iscol=a->col,isrow=a->row; 18346849ba73SBarry Smith PetscErrorCode ierr; 18355d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 18365d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1837d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1838d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1839d9fead3dSBarry Smith const PetscScalar *b; 184015091d37SBarry Smith PetscFunctionBegin; 1841d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18421ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1843f1af5d2fSBarry Smith t = a->solve_work; 184415091d37SBarry Smith 184515091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 184615091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 184715091d37SBarry Smith 184815091d37SBarry Smith /* forward solve the lower triangular */ 184915091d37SBarry Smith idx = 6*(*r++); 1850f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1851f1af5d2fSBarry 
Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1852f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 185315091d37SBarry Smith for (i=1; i<n; i++) { 185415091d37SBarry Smith v = aa + 36*ai[i]; 185515091d37SBarry Smith vi = aj + ai[i]; 185615091d37SBarry Smith nz = diag[i] - ai[i]; 185715091d37SBarry Smith idx = 6*(*r++); 1858f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1859f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 186015091d37SBarry Smith while (nz--) { 186115091d37SBarry Smith idx = 6*(*vi++); 1862f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1863f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1864f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1865f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1866f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1867f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1868f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1869f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 187015091d37SBarry Smith v += 36; 187115091d37SBarry Smith } 187215091d37SBarry Smith idx = 6*i; 1873f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1874f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1875f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 187615091d37SBarry Smith } 187715091d37SBarry Smith /* backward solve the upper triangular */ 187815091d37SBarry Smith for (i=n-1; i>=0; i--){ 187915091d37SBarry Smith v = aa + 36*diag[i] + 36; 188015091d37SBarry Smith vi = aj + diag[i] + 1; 188115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 188215091d37SBarry Smith idt = 6*i; 1883f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1884f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1885f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 
188615091d37SBarry Smith while (nz--) { 188715091d37SBarry Smith idx = 6*(*vi++); 1888f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1889f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1890f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1891f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1892f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1893f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1894f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1895f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1896f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 189715091d37SBarry Smith v += 36; 189815091d37SBarry Smith } 189915091d37SBarry Smith idc = 6*(*c--); 190015091d37SBarry Smith v = aa + 36*diag[i]; 1901f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1902f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1903f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1904f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1905f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1906f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1907f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1908f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1909f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1910f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1911f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1912f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 191315091d37SBarry Smith } 191415091d37SBarry Smith 191515091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 191615091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1917d9fead3dSBarry Smith ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19181ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1919dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 192015091d37SBarry Smith PetscFunctionReturn(0); 192115091d37SBarry Smith } 192215091d37SBarry Smith 1923*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 19244a2ae208SSatish Balay #undef __FUNCT__ 19258f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 19268f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 19278f690400SShri Abhyankar { 19288f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 19298f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 19308f690400SShri Abhyankar PetscErrorCode ierr; 19318f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 193229b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 19338f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 19348f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 19358f690400SShri Abhyankar const PetscScalar *b; 19368f690400SShri Abhyankar PetscFunctionBegin; 19378f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19388f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 19398f690400SShri Abhyankar t = a->solve_work; 19408f690400SShri Abhyankar 19418f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 194229b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 19438f690400SShri Abhyankar 19448f690400SShri Abhyankar /* forward solve the lower triangular */ 194529b92fc1SShri Abhyankar idx = 6*r[0]; 19468f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 19478f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 19488f690400SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 19498f690400SShri Abhyankar for (i=1; i<n; i++) { 19508f690400SShri 
Abhyankar v = aa + 36*ai[i]; 19518f690400SShri Abhyankar vi = aj + ai[i]; 19528f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 195329b92fc1SShri Abhyankar idx = 6*r[i]; 19548f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 19558f690400SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 195629b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 195729b92fc1SShri Abhyankar idx = 6*vi[m]; 19588f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 19598f690400SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 19608f690400SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 19618f690400SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 19628f690400SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 19638f690400SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 19648f690400SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 19658f690400SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 19668f690400SShri Abhyankar v += 36; 19678f690400SShri Abhyankar } 19688f690400SShri Abhyankar idx = 6*i; 19698f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 19708f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 19718f690400SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 19728f690400SShri Abhyankar } 19738f690400SShri Abhyankar /* backward solve the upper triangular */ 19748f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 19758f690400SShri Abhyankar k = 2*n-i; 19768f690400SShri Abhyankar v = aa + 36*ai[k]; 19778f690400SShri Abhyankar vi = aj + ai[k]; 19788f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 19798f690400SShri Abhyankar idt = 6*i; 19808f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 19818f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 19828f690400SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 198329b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 
198429b92fc1SShri Abhyankar idx = 6*vi[m]; 19858f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 19868f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 19878f690400SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 19888f690400SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 19898f690400SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 19908f690400SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 19918f690400SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 19928f690400SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 19938f690400SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 19948f690400SShri Abhyankar v += 36; 19958f690400SShri Abhyankar } 199629b92fc1SShri Abhyankar idc = 6*c[i]; 19978f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 19988f690400SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 19998f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 20008f690400SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 20018f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 20028f690400SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 20038f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 20048f690400SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 20058f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 20068f690400SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 20078f690400SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 20088f690400SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 20098f690400SShri Abhyankar } 20108f690400SShri Abhyankar 20118f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 20128f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 20138f690400SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20148f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 20158f690400SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 20168f690400SShri Abhyankar PetscFunctionReturn(0); 20178f690400SShri Abhyankar } 2018*a2d6a19aSShri Abhyankar #endif 20198f690400SShri Abhyankar 20206506fda5SShri Abhyankar #undef __FUNCT__ 2021*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 2022*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 20236506fda5SShri Abhyankar { 20246506fda5SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 20256506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 20266506fda5SShri Abhyankar PetscErrorCode ierr; 20276506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 20286506fda5SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 20296506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 20306506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 20316506fda5SShri Abhyankar const PetscScalar *b; 20326506fda5SShri Abhyankar PetscFunctionBegin; 20336506fda5SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20346506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 20356506fda5SShri Abhyankar t = a->solve_work; 20366506fda5SShri Abhyankar 20376506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 20386506fda5SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 20396506fda5SShri Abhyankar 20406506fda5SShri Abhyankar /* forward solve the lower triangular */ 20416506fda5SShri Abhyankar idx = 6*r[0]; 20426506fda5SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 20436506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 20446506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 20456506fda5SShri Abhyankar for (i=1; i<n; i++) { 20466506fda5SShri 
Abhyankar v = aa + 36*ai[i]; 20476506fda5SShri Abhyankar vi = aj + ai[i]; 20486506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 20496506fda5SShri Abhyankar idx = 6*r[i]; 20506506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 20516506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 20526506fda5SShri Abhyankar for(m=0;m<nz;m++){ 20536506fda5SShri Abhyankar idx = 6*vi[m]; 20546506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 20556506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 20566506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 20576506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 20586506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 20596506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 20606506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 20616506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 20626506fda5SShri Abhyankar v += 36; 20636506fda5SShri Abhyankar } 20646506fda5SShri Abhyankar idx = 6*i; 20656506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 20666506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 20676506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 20686506fda5SShri Abhyankar } 20696506fda5SShri Abhyankar /* backward solve the upper triangular */ 20706506fda5SShri Abhyankar for (i=n-1; i>=0; i--){ 20716506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 20726506fda5SShri Abhyankar vi = aj + adiag[i+1]+1; 20736506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 20746506fda5SShri Abhyankar idt = 6*i; 20756506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 20766506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 20776506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 20786506fda5SShri Abhyankar for(m=0;m<nz;m++){ 20796506fda5SShri 
Abhyankar idx = 6*vi[m]; 20806506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 20816506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 20826506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 20836506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 20846506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 20856506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 20866506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 20876506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 20886506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 20896506fda5SShri Abhyankar v += 36; 20906506fda5SShri Abhyankar } 20916506fda5SShri Abhyankar idc = 6*c[i]; 20926506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 20936506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 20946506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 20956506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 20966506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 20976506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 20986506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 20996506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 21006506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 21016506fda5SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 21026506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 21036506fda5SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 21046506fda5SShri Abhyankar } 21056506fda5SShri Abhyankar 21066506fda5SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21076506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21086506fda5SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 
21096506fda5SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 21106506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 21116506fda5SShri Abhyankar PetscFunctionReturn(0); 21126506fda5SShri Abhyankar } 21138f690400SShri Abhyankar 21148f690400SShri Abhyankar #undef __FUNCT__ 21154a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 2116dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 211715091d37SBarry Smith { 211815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2119690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2120dfbe8321SBarry Smith PetscErrorCode ierr; 2121690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2122d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2123d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2124d9fead3dSBarry Smith const PetscScalar *b; 212515091d37SBarry Smith 212615091d37SBarry Smith PetscFunctionBegin; 2127d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21281ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 212915091d37SBarry Smith /* forward solve the lower triangular */ 213015091d37SBarry Smith idx = 0; 213115091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 213215091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 213315091d37SBarry Smith for (i=1; i<n; i++) { 213415091d37SBarry Smith v = aa + 36*ai[i]; 213515091d37SBarry Smith vi = aj + ai[i]; 213615091d37SBarry Smith nz = diag[i] - ai[i]; 213715091d37SBarry Smith idx = 6*i; 2138f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2139f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 214015091d37SBarry Smith while (nz--) { 214115091d37SBarry Smith jdx = 6*(*vi++); 214215091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 214315091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 
2144f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2145f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2146f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2147f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2148f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2149f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 215015091d37SBarry Smith v += 36; 215115091d37SBarry Smith } 2152f1af5d2fSBarry Smith x[idx] = s1; 2153f1af5d2fSBarry Smith x[1+idx] = s2; 2154f1af5d2fSBarry Smith x[2+idx] = s3; 2155f1af5d2fSBarry Smith x[3+idx] = s4; 2156f1af5d2fSBarry Smith x[4+idx] = s5; 2157f1af5d2fSBarry Smith x[5+idx] = s6; 215815091d37SBarry Smith } 215915091d37SBarry Smith /* backward solve the upper triangular */ 216015091d37SBarry Smith for (i=n-1; i>=0; i--){ 216115091d37SBarry Smith v = aa + 36*diag[i] + 36; 216215091d37SBarry Smith vi = aj + diag[i] + 1; 216315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 216415091d37SBarry Smith idt = 6*i; 2165f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2166f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 2167f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 216815091d37SBarry Smith while (nz--) { 216915091d37SBarry Smith idx = 6*(*vi++); 217015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 217115091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2172f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2173f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2174f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2175f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2176f1af5d2fSBarry Smith s5 -= v[4]*x1 + 
v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2177f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 217815091d37SBarry Smith v += 36; 217915091d37SBarry Smith } 218015091d37SBarry Smith v = aa + 36*diag[i]; 2181f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2182f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2183f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2184f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2185f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2186f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 218715091d37SBarry Smith } 218815091d37SBarry Smith 2189d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21901ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2191dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 219215091d37SBarry Smith PetscFunctionReturn(0); 219315091d37SBarry Smith } 219415091d37SBarry Smith 2195*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 21964a2ae208SSatish Balay #undef __FUNCT__ 2197cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2198cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2199cee9d6f2SShri Abhyankar { 2200cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 22016464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2202cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2203cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 2204cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2205cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2206cee9d6f2SShri 
Abhyankar PetscScalar *x; 2207cee9d6f2SShri Abhyankar const PetscScalar *b; 2208cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2209cee9d6f2SShri Abhyankar 2210cee9d6f2SShri Abhyankar PetscFunctionBegin; 2211cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2212cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2213cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2214cee9d6f2SShri Abhyankar idx = 0; 2215cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2216cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 2217cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2218cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 2219cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2220cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2221cee9d6f2SShri Abhyankar idx = bs*i; 2222cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2223cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 22246464896eSShri Abhyankar for(k=0;k<nz;k++){ 22256464896eSShri Abhyankar jdx = bs*vi[k]; 2226cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2227cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 2228cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2229cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2230cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2231cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2232cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2233cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2234cee9d6f2SShri Abhyankar v += bs2; 2235cee9d6f2SShri Abhyankar } 2236cee9d6f2SShri Abhyankar 2237cee9d6f2SShri Abhyankar x[idx] = s1; 2238cee9d6f2SShri Abhyankar 
x[1+idx] = s2; 2239cee9d6f2SShri Abhyankar x[2+idx] = s3; 2240cee9d6f2SShri Abhyankar x[3+idx] = s4; 2241cee9d6f2SShri Abhyankar x[4+idx] = s5; 2242cee9d6f2SShri Abhyankar x[5+idx] = s6; 2243cee9d6f2SShri Abhyankar } 2244cee9d6f2SShri Abhyankar 2245cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2246cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2247cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 2248cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2249cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2250cee9d6f2SShri Abhyankar idt = bs*i; 2251cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2252cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 22536464896eSShri Abhyankar for(k=0;k<nz;k++){ 22546464896eSShri Abhyankar idx = bs*vi[k]; 2255cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2256cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 2257cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2258cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2259cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2260cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2261cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2262cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2263cee9d6f2SShri Abhyankar v += bs2; 2264cee9d6f2SShri Abhyankar } 2265cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2266cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2267cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2268cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2269cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 
+ v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2270cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2271cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2272cee9d6f2SShri Abhyankar } 2273cee9d6f2SShri Abhyankar 2274cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2275cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2276cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2277cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2278cee9d6f2SShri Abhyankar } 2279*a2d6a19aSShri Abhyankar #endif 22808f690400SShri Abhyankar 2281cee9d6f2SShri Abhyankar #undef __FUNCT__ 2282*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2283*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 228453cca76cSShri Abhyankar { 228553cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 228653cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 228753cca76cSShri Abhyankar PetscErrorCode ierr; 228853cca76cSShri Abhyankar PetscInt idx,jdx,idt; 228953cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 229053cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 229153cca76cSShri Abhyankar PetscScalar *x; 229253cca76cSShri Abhyankar const PetscScalar *b; 229353cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 229453cca76cSShri Abhyankar 229553cca76cSShri Abhyankar PetscFunctionBegin; 229653cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 229753cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 229853cca76cSShri Abhyankar /* forward solve the lower triangular */ 229953cca76cSShri Abhyankar idx = 0; 230053cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 
230153cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 230253cca76cSShri Abhyankar for (i=1; i<n; i++) { 230353cca76cSShri Abhyankar v = aa + bs2*ai[i]; 230453cca76cSShri Abhyankar vi = aj + ai[i]; 230553cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 230653cca76cSShri Abhyankar idx = bs*i; 230753cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 230853cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 230953cca76cSShri Abhyankar for(k=0;k<nz;k++){ 231053cca76cSShri Abhyankar jdx = bs*vi[k]; 231153cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 231253cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 231353cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 231453cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 231553cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 231653cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 231753cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 231853cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 231953cca76cSShri Abhyankar v += bs2; 232053cca76cSShri Abhyankar } 232153cca76cSShri Abhyankar 232253cca76cSShri Abhyankar x[idx] = s1; 232353cca76cSShri Abhyankar x[1+idx] = s2; 232453cca76cSShri Abhyankar x[2+idx] = s3; 232553cca76cSShri Abhyankar x[3+idx] = s4; 232653cca76cSShri Abhyankar x[4+idx] = s5; 232753cca76cSShri Abhyankar x[5+idx] = s6; 232853cca76cSShri Abhyankar } 232953cca76cSShri Abhyankar 233053cca76cSShri Abhyankar /* backward solve the upper triangular */ 233153cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 233253cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 233353cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 233453cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 233553cca76cSShri Abhyankar idt = bs*i; 
233653cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 233753cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 233853cca76cSShri Abhyankar for(k=0;k<nz;k++){ 233953cca76cSShri Abhyankar idx = bs*vi[k]; 234053cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 234153cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 234253cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 234353cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 234453cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 234553cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 234653cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 234753cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 234853cca76cSShri Abhyankar v += bs2; 234953cca76cSShri Abhyankar } 235053cca76cSShri Abhyankar /* x = inv_diagonal*x */ 235153cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 235253cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 235353cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 235453cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 235553cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 235653cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 235753cca76cSShri Abhyankar } 235853cca76cSShri Abhyankar 235953cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 236053cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 236153cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - 
bs*A->cmap->n);CHKERRQ(ierr); 236253cca76cSShri Abhyankar PetscFunctionReturn(0); 236353cca76cSShri Abhyankar } 236453cca76cSShri Abhyankar 236553cca76cSShri Abhyankar #undef __FUNCT__ 23664a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2367dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 23684e2b4712SSatish Balay { 23694e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 23704e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 23716849ba73SBarry Smith PetscErrorCode ierr; 23725d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 23735d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2374d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2375d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2376d9fead3dSBarry Smith const PetscScalar *b; 23774e2b4712SSatish Balay 23784e2b4712SSatish Balay PetscFunctionBegin; 2379d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23801ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2381f1af5d2fSBarry Smith t = a->solve_work; 23824e2b4712SSatish Balay 23834e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 23844e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 23854e2b4712SSatish Balay 23864e2b4712SSatish Balay /* forward solve the lower triangular */ 23874e2b4712SSatish Balay idx = 5*(*r++); 2388f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2389f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 23904e2b4712SSatish Balay for (i=1; i<n; i++) { 23914e2b4712SSatish Balay v = aa + 25*ai[i]; 23924e2b4712SSatish Balay vi = aj + ai[i]; 23934e2b4712SSatish Balay nz = diag[i] - ai[i]; 23944e2b4712SSatish Balay idx = 5*(*r++); 2395f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2396f1af5d2fSBarry Smith s5 = b[4+idx]; 23974e2b4712SSatish Balay while (nz--) { 23984e2b4712SSatish 
Balay idx = 5*(*vi++); 2399f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2400f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2401f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2402f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2403f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2404f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2405f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 24064e2b4712SSatish Balay v += 25; 24074e2b4712SSatish Balay } 24084e2b4712SSatish Balay idx = 5*i; 2409f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2410f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 24114e2b4712SSatish Balay } 24124e2b4712SSatish Balay /* backward solve the upper triangular */ 24134e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 24144e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 24154e2b4712SSatish Balay vi = aj + diag[i] + 1; 24164e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 24174e2b4712SSatish Balay idt = 5*i; 2418f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2419f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 24204e2b4712SSatish Balay while (nz--) { 24214e2b4712SSatish Balay idx = 5*(*vi++); 2422f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2423f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2424f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2425f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2426f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2427f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2428f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 24294e2b4712SSatish Balay v += 25; 24304e2b4712SSatish Balay } 24314e2b4712SSatish Balay idc = 5*(*c--); 24324e2b4712SSatish Balay v = aa 
+ 25*diag[i]; 2433f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2434f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 2435f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2436f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 2437f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2438f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 2439f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2440f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 2441f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2442f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 24434e2b4712SSatish Balay } 24444e2b4712SSatish Balay 24454e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 24464e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2447d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24481ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2449dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 24504e2b4712SSatish Balay PetscFunctionReturn(0); 24514e2b4712SSatish Balay } 24524e2b4712SSatish Balay 2453*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 24544a2ae208SSatish Balay #undef __FUNCT__ 24558f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 24568f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 24578f690400SShri Abhyankar { 24588f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 24598f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 24608f690400SShri Abhyankar PetscErrorCode ierr; 24618f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 246229b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 24638f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 24648f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 24658f690400SShri Abhyankar const PetscScalar *b; 
24668f690400SShri Abhyankar 24678f690400SShri Abhyankar PetscFunctionBegin; 24688f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24698f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24708f690400SShri Abhyankar t = a->solve_work; 24718f690400SShri Abhyankar 24728f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 247329b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 24748f690400SShri Abhyankar 24758f690400SShri Abhyankar /* forward solve the lower triangular */ 247629b92fc1SShri Abhyankar idx = 5*r[0]; 24778f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 24788f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 24798f690400SShri Abhyankar for (i=1; i<n; i++) { 24808f690400SShri Abhyankar v = aa + 25*ai[i]; 24818f690400SShri Abhyankar vi = aj + ai[i]; 24828f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 248329b92fc1SShri Abhyankar idx = 5*r[i]; 24848f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 24858f690400SShri Abhyankar s5 = b[4+idx]; 248629b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 248729b92fc1SShri Abhyankar idx = 5*vi[m]; 24888f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 24898f690400SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 24908f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 24918f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 24928f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 24938f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 24948f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 24958f690400SShri Abhyankar v += 25; 24968f690400SShri Abhyankar } 24978f690400SShri Abhyankar idx = 5*i; 24988f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 24998f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 
25008f690400SShri Abhyankar } 25018f690400SShri Abhyankar /* backward solve the upper triangular */ 25028f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 25038f690400SShri Abhyankar k = 2*n-i; 25048f690400SShri Abhyankar v = aa + 25*ai[k]; 25058f690400SShri Abhyankar vi = aj + ai[k]; 25068f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 25078f690400SShri Abhyankar idt = 5*i; 25088f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 25098f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 251029b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 251129b92fc1SShri Abhyankar idx = 5*vi[m]; 25128f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 25138f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 25148f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 25158f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 25168f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 25178f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 25188f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 25198f690400SShri Abhyankar v += 25; 25208f690400SShri Abhyankar } 252129b92fc1SShri Abhyankar idc = 5*c[i]; 25228f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 25238f690400SShri Abhyankar v[15]*s4+v[20]*s5; 25248f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 25258f690400SShri Abhyankar v[16]*s4+v[21]*s5; 25268f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 25278f690400SShri Abhyankar v[17]*s4+v[22]*s5; 25288f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 25298f690400SShri Abhyankar v[18]*s4+v[23]*s5; 25308f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 25318f690400SShri Abhyankar v[19]*s4+v[24]*s5; 25328f690400SShri Abhyankar } 25338f690400SShri Abhyankar 25348f690400SShri Abhyankar ierr = 
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 25358f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 25368f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25378f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 25388f690400SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 25398f690400SShri Abhyankar PetscFunctionReturn(0); 25408f690400SShri Abhyankar } 2541*a2d6a19aSShri Abhyankar #endif 254278bb4007SShri Abhyankar 254378bb4007SShri Abhyankar #undef __FUNCT__ 2544*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2545*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 254678bb4007SShri Abhyankar { 254778bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 254878bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 254978bb4007SShri Abhyankar PetscErrorCode ierr; 255078bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 255178bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 255278bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 255378bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 255478bb4007SShri Abhyankar const PetscScalar *b; 255578bb4007SShri Abhyankar 255678bb4007SShri Abhyankar PetscFunctionBegin; 255778bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 255878bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 255978bb4007SShri Abhyankar t = a->solve_work; 256078bb4007SShri Abhyankar 256178bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 256278bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 256378bb4007SShri Abhyankar 256478bb4007SShri Abhyankar /* forward solve the lower triangular */ 256578bb4007SShri Abhyankar idx = 5*r[0]; 256678bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 256778bb4007SShri 
Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 256878bb4007SShri Abhyankar for (i=1; i<n; i++) { 256978bb4007SShri Abhyankar v = aa + 25*ai[i]; 257078bb4007SShri Abhyankar vi = aj + ai[i]; 257178bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 257278bb4007SShri Abhyankar idx = 5*r[i]; 257378bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 257478bb4007SShri Abhyankar s5 = b[4+idx]; 257578bb4007SShri Abhyankar for(m=0;m<nz;m++){ 257678bb4007SShri Abhyankar idx = 5*vi[m]; 257778bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 257878bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 257978bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 258078bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 258178bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 258278bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 258378bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 258478bb4007SShri Abhyankar v += 25; 258578bb4007SShri Abhyankar } 258678bb4007SShri Abhyankar idx = 5*i; 258778bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 258878bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 258978bb4007SShri Abhyankar } 259078bb4007SShri Abhyankar /* backward solve the upper triangular */ 259178bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 259278bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 259378bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 259478bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 259578bb4007SShri Abhyankar idt = 5*i; 259678bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 259778bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 259878bb4007SShri Abhyankar for(m=0;m<nz;m++){ 259978bb4007SShri Abhyankar idx = 5*vi[m]; 260078bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 260178bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = 
t[4+idx]; 260278bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 260378bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 260478bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 260578bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 260678bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 260778bb4007SShri Abhyankar v += 25; 260878bb4007SShri Abhyankar } 260978bb4007SShri Abhyankar idc = 5*c[i]; 261078bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 261178bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 261278bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 261378bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 261478bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 261578bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 261678bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 261778bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 261878bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 261978bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 262078bb4007SShri Abhyankar } 262178bb4007SShri Abhyankar 262278bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 262378bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 262478bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 262578bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 262678bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 262778bb4007SShri Abhyankar PetscFunctionReturn(0); 262878bb4007SShri Abhyankar } 262978bb4007SShri Abhyankar 26308f690400SShri Abhyankar #undef __FUNCT__ 26314a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2632dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 263315091d37SBarry Smith { 
263415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2635690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2636dfbe8321SBarry Smith PetscErrorCode ierr; 2637690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2638d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2639d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2640d9fead3dSBarry Smith const PetscScalar *b; 264115091d37SBarry Smith 264215091d37SBarry Smith PetscFunctionBegin; 2643d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26441ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 264515091d37SBarry Smith /* forward solve the lower triangular */ 264615091d37SBarry Smith idx = 0; 264715091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 264815091d37SBarry Smith for (i=1; i<n; i++) { 264915091d37SBarry Smith v = aa + 25*ai[i]; 265015091d37SBarry Smith vi = aj + ai[i]; 265115091d37SBarry Smith nz = diag[i] - ai[i]; 265215091d37SBarry Smith idx = 5*i; 2653f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 265415091d37SBarry Smith while (nz--) { 265515091d37SBarry Smith jdx = 5*(*vi++); 265615091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2657f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2658f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2659f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2660f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2661f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 266215091d37SBarry Smith v += 25; 266315091d37SBarry Smith } 2664f1af5d2fSBarry Smith x[idx] = s1; 2665f1af5d2fSBarry Smith x[1+idx] = s2; 2666f1af5d2fSBarry Smith x[2+idx] = s3; 2667f1af5d2fSBarry Smith x[3+idx] = s4; 2668f1af5d2fSBarry Smith x[4+idx] = 
s5; 266915091d37SBarry Smith } 267015091d37SBarry Smith /* backward solve the upper triangular */ 267115091d37SBarry Smith for (i=n-1; i>=0; i--){ 267215091d37SBarry Smith v = aa + 25*diag[i] + 25; 267315091d37SBarry Smith vi = aj + diag[i] + 1; 267415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 267515091d37SBarry Smith idt = 5*i; 2676f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2677f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 267815091d37SBarry Smith while (nz--) { 267915091d37SBarry Smith idx = 5*(*vi++); 268015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2681f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2682f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2683f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2684f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2685f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 268615091d37SBarry Smith v += 25; 268715091d37SBarry Smith } 268815091d37SBarry Smith v = aa + 25*diag[i]; 2689f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2690f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2691f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2692f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2693f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 269415091d37SBarry Smith } 269515091d37SBarry Smith 2696d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2698dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 269915091d37SBarry Smith PetscFunctionReturn(0); 270015091d37SBarry Smith } 270115091d37SBarry Smith 
2702*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 27034a2ae208SSatish Balay #undef __FUNCT__ 2704cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2705cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2706cee9d6f2SShri Abhyankar { 2707cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 27086464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2709cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2710cee9d6f2SShri Abhyankar PetscInt jdx; 2711cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2712cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2713cee9d6f2SShri Abhyankar const PetscScalar *b; 2714cee9d6f2SShri Abhyankar 2715cee9d6f2SShri Abhyankar PetscFunctionBegin; 2716cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2717cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2718cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2719cee9d6f2SShri Abhyankar idx = 0; 2720cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2721cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2722cee9d6f2SShri Abhyankar v = aa + 25*ai[i]; 2723cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2724cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2725cee9d6f2SShri Abhyankar idx = 5*i; 2726cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 27276464896eSShri Abhyankar for(k=0;k<nz;k++) { 27286464896eSShri Abhyankar jdx = 5*vi[k]; 2729cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2730cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2731cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2732cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 
2733cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2734cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2735cee9d6f2SShri Abhyankar v += 25; 2736cee9d6f2SShri Abhyankar } 2737cee9d6f2SShri Abhyankar x[idx] = s1; 2738cee9d6f2SShri Abhyankar x[1+idx] = s2; 2739cee9d6f2SShri Abhyankar x[2+idx] = s3; 2740cee9d6f2SShri Abhyankar x[3+idx] = s4; 2741cee9d6f2SShri Abhyankar x[4+idx] = s5; 2742cee9d6f2SShri Abhyankar } 2743cee9d6f2SShri Abhyankar 2744cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2745cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2746cee9d6f2SShri Abhyankar v = aa + 25*ai[2*n-i]; 2747cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2748cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2749cee9d6f2SShri Abhyankar idt = 5*i; 2750cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 2751cee9d6f2SShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 27526464896eSShri Abhyankar for(k=0;k<nz;k++){ 27536464896eSShri Abhyankar idx = 5*vi[k]; 2754cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2755cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2756cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2757cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2758cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2759cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2760cee9d6f2SShri Abhyankar v += 25; 2761cee9d6f2SShri Abhyankar } 2762cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2763cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2764cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2765cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2766cee9d6f2SShri Abhyankar 
x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2767cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2768cee9d6f2SShri Abhyankar } 2769cee9d6f2SShri Abhyankar 2770cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2771cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2772cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2773cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2774cee9d6f2SShri Abhyankar } 2775*a2d6a19aSShri Abhyankar #endif 2776cee9d6f2SShri Abhyankar 2777cee9d6f2SShri Abhyankar #undef __FUNCT__ 2778*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2779*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 278053cca76cSShri Abhyankar { 278153cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 278253cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 278353cca76cSShri Abhyankar PetscErrorCode ierr; 278453cca76cSShri Abhyankar PetscInt jdx; 278553cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 278653cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 278753cca76cSShri Abhyankar const PetscScalar *b; 278853cca76cSShri Abhyankar 278953cca76cSShri Abhyankar PetscFunctionBegin; 279053cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 279153cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 279253cca76cSShri Abhyankar /* forward solve the lower triangular */ 279353cca76cSShri Abhyankar idx = 0; 279453cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 279553cca76cSShri Abhyankar for (i=1; i<n; i++) { 279653cca76cSShri Abhyankar v = aa + 25*ai[i]; 279753cca76cSShri Abhyankar vi = aj + ai[i]; 279853cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 279953cca76cSShri 
Abhyankar idx = 5*i; 280053cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 280153cca76cSShri Abhyankar for(k=0;k<nz;k++) { 280253cca76cSShri Abhyankar jdx = 5*vi[k]; 280353cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 280453cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 280553cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 280653cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 280753cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 280853cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 280953cca76cSShri Abhyankar v += 25; 281053cca76cSShri Abhyankar } 281153cca76cSShri Abhyankar x[idx] = s1; 281253cca76cSShri Abhyankar x[1+idx] = s2; 281353cca76cSShri Abhyankar x[2+idx] = s3; 281453cca76cSShri Abhyankar x[3+idx] = s4; 281553cca76cSShri Abhyankar x[4+idx] = s5; 281653cca76cSShri Abhyankar } 281753cca76cSShri Abhyankar 281853cca76cSShri Abhyankar /* backward solve the upper triangular */ 281953cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 282053cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 282153cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 282253cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 282353cca76cSShri Abhyankar idt = 5*i; 282453cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 282553cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 282653cca76cSShri Abhyankar for(k=0;k<nz;k++){ 282753cca76cSShri Abhyankar idx = 5*vi[k]; 282853cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 282953cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 283053cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 283153cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 283253cca76cSShri 
Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 283353cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 283453cca76cSShri Abhyankar v += 25; 283553cca76cSShri Abhyankar } 283653cca76cSShri Abhyankar /* x = inv_diagonal*x */ 283753cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 283853cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 283953cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 284053cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 284153cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 284253cca76cSShri Abhyankar } 284353cca76cSShri Abhyankar 284453cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 284553cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 284653cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 284753cca76cSShri Abhyankar PetscFunctionReturn(0); 284853cca76cSShri Abhyankar } 284953cca76cSShri Abhyankar 285053cca76cSShri Abhyankar #undef __FUNCT__ 28514a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2852dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 28534e2b4712SSatish Balay { 28544e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 28554e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 28566849ba73SBarry Smith PetscErrorCode ierr; 28575d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 28585d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2859d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2860d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2861d9fead3dSBarry Smith const PetscScalar *b; 28624e2b4712SSatish Balay 28634e2b4712SSatish Balay PetscFunctionBegin; 2864d9fead3dSBarry Smith ierr = 
VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28651ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2866f1af5d2fSBarry Smith t = a->solve_work; 28674e2b4712SSatish Balay 28684e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 28694e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 28704e2b4712SSatish Balay 28714e2b4712SSatish Balay /* forward solve the lower triangular */ 28724e2b4712SSatish Balay idx = 4*(*r++); 2873f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2874f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 28754e2b4712SSatish Balay for (i=1; i<n; i++) { 28764e2b4712SSatish Balay v = aa + 16*ai[i]; 28774e2b4712SSatish Balay vi = aj + ai[i]; 28784e2b4712SSatish Balay nz = diag[i] - ai[i]; 28794e2b4712SSatish Balay idx = 4*(*r++); 2880f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 28814e2b4712SSatish Balay while (nz--) { 28824e2b4712SSatish Balay idx = 4*(*vi++); 2883f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2884f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2885f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2886f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2887f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 28884e2b4712SSatish Balay v += 16; 28894e2b4712SSatish Balay } 28904e2b4712SSatish Balay idx = 4*i; 2891f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2892f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 28934e2b4712SSatish Balay } 28944e2b4712SSatish Balay /* backward solve the upper triangular */ 28954e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 28964e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 28974e2b4712SSatish Balay vi = aj + diag[i] + 1; 28984e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 28994e2b4712SSatish Balay idt = 4*i; 2900f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2901f1af5d2fSBarry Smith s3 = 
t[2+idt];s4 = t[3+idt]; 29024e2b4712SSatish Balay while (nz--) { 29034e2b4712SSatish Balay idx = 4*(*vi++); 2904f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2905f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2906f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2907f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2908f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2909f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 29104e2b4712SSatish Balay v += 16; 29114e2b4712SSatish Balay } 29124e2b4712SSatish Balay idc = 4*(*c--); 29134e2b4712SSatish Balay v = aa + 16*diag[i]; 2914f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2915f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2916f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2917f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 29184e2b4712SSatish Balay } 29194e2b4712SSatish Balay 29204e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 29214e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2922d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29231ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2924dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 29254e2b4712SSatish Balay PetscFunctionReturn(0); 29264e2b4712SSatish Balay } 2927f26ec98cSKris Buschelman 2928*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 2929f26ec98cSKris Buschelman #undef __FUNCT__ 29308f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 29318f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 29328f690400SShri Abhyankar { 29338f690400SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 29348f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 
29358f690400SShri Abhyankar PetscErrorCode ierr; 293629b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 29378f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 29388f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 29398f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 29408f690400SShri Abhyankar const PetscScalar *b; 29418f690400SShri Abhyankar 29428f690400SShri Abhyankar PetscFunctionBegin; 29438f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29448f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 29458f690400SShri Abhyankar t = a->solve_work; 29468f690400SShri Abhyankar 29478f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 294829b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 29498f690400SShri Abhyankar 29508f690400SShri Abhyankar /* forward solve the lower triangular */ 295129b92fc1SShri Abhyankar idx = 4*r[0]; 29528f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 29538f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 29548f690400SShri Abhyankar for (i=1; i<n; i++) { 29558f690400SShri Abhyankar v = aa + 16*ai[i]; 29568f690400SShri Abhyankar vi = aj + ai[i]; 29578f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 295829b92fc1SShri Abhyankar idx = 4*r[i]; 29598f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 296029b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 296129b92fc1SShri Abhyankar idx = 4*vi[m]; 29628f690400SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 29638f690400SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 29648f690400SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 29658f690400SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 29668f690400SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 29678f690400SShri Abhyankar v += 16; 29688f690400SShri Abhyankar } 29698f690400SShri Abhyankar 
idx = 4*i; 29708f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 29718f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 29728f690400SShri Abhyankar } 29738f690400SShri Abhyankar /* backward solve the upper triangular */ 29748f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 29758f690400SShri Abhyankar k = 2*n-i; 29768f690400SShri Abhyankar v = aa + 16*ai[k]; 29778f690400SShri Abhyankar vi = aj + ai[k]; 29788f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 29798f690400SShri Abhyankar idt = 4*i; 29808f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 29818f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 298229b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 298329b92fc1SShri Abhyankar idx = 4*vi[m]; 29848f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 29858f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 29868f690400SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 29878f690400SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 29888f690400SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 29898f690400SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 29908f690400SShri Abhyankar v += 16; 29918f690400SShri Abhyankar } 299229b92fc1SShri Abhyankar idc = 4*c[i]; 29938f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 29948f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 29958f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 29968f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 29978f690400SShri Abhyankar } 29988f690400SShri Abhyankar 29998f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 30008f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 30018f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30028f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 30038f690400SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) 
- 4.0*A->cmap->n);CHKERRQ(ierr); 30048f690400SShri Abhyankar PetscFunctionReturn(0); 30058f690400SShri Abhyankar } 3006*a2d6a19aSShri Abhyankar #endif 30078f690400SShri Abhyankar 30088f690400SShri Abhyankar #undef __FUNCT__ 3009*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 3010*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 301178bb4007SShri Abhyankar { 301278bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 301378bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 301478bb4007SShri Abhyankar PetscErrorCode ierr; 301578bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 301678bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 301778bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 301878bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 301978bb4007SShri Abhyankar const PetscScalar *b; 302078bb4007SShri Abhyankar 302178bb4007SShri Abhyankar PetscFunctionBegin; 302278bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 302378bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 302478bb4007SShri Abhyankar t = a->solve_work; 302578bb4007SShri Abhyankar 302678bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 302778bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 302878bb4007SShri Abhyankar 302978bb4007SShri Abhyankar /* forward solve the lower triangular */ 303078bb4007SShri Abhyankar idx = 4*r[0]; 303178bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 303278bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 303378bb4007SShri Abhyankar for (i=1; i<n; i++) { 303478bb4007SShri Abhyankar v = aa + 16*ai[i]; 303578bb4007SShri Abhyankar vi = aj + ai[i]; 303678bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 303778bb4007SShri Abhyankar idx = 4*r[i]; 303878bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = 
b[3+idx]; 303978bb4007SShri Abhyankar for(m=0;m<nz;m++){ 304078bb4007SShri Abhyankar idx = 4*vi[m]; 304178bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 304278bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 304378bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 304478bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 304578bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 304678bb4007SShri Abhyankar v += 16; 304778bb4007SShri Abhyankar } 304878bb4007SShri Abhyankar idx = 4*i; 304978bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 305078bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 305178bb4007SShri Abhyankar } 305278bb4007SShri Abhyankar /* backward solve the upper triangular */ 305378bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 305478bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 305578bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 305678bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 305778bb4007SShri Abhyankar idt = 4*i; 305878bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 305978bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 306078bb4007SShri Abhyankar for(m=0;m<nz;m++){ 306178bb4007SShri Abhyankar idx = 4*vi[m]; 306278bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 306378bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 306478bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 306578bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 306678bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 306778bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 306878bb4007SShri Abhyankar v += 16; 306978bb4007SShri Abhyankar } 307078bb4007SShri Abhyankar idc = 4*c[i]; 307178bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 307278bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 307378bb4007SShri Abhyankar x[2+idc] = t[2+idt] = 
v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 307478bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 307578bb4007SShri Abhyankar } 307678bb4007SShri Abhyankar 307778bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 307878bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 307978bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 308078bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308178bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 308278bb4007SShri Abhyankar PetscFunctionReturn(0); 308378bb4007SShri Abhyankar } 308478bb4007SShri Abhyankar 308578bb4007SShri Abhyankar #undef __FUNCT__ 3086f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3087dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3088f26ec98cSKris Buschelman { 3089f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3090f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 30916849ba73SBarry Smith PetscErrorCode ierr; 30925d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 30935d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3094d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3095d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3096d9fead3dSBarry Smith PetscScalar *x; 3097d9fead3dSBarry Smith const PetscScalar *b; 3098f26ec98cSKris Buschelman 3099f26ec98cSKris Buschelman PetscFunctionBegin; 3100d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 31011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3102f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 3103f26ec98cSKris Buschelman 3104f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3105f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3106f26ec98cSKris 
Buschelman 3107f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3108f26ec98cSKris Buschelman idx = 4*(*r++); 3109f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 3110f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 3111f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 3112f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 3113f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3114f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3115f26ec98cSKris Buschelman vi = aj + ai[i]; 3116f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3117f26ec98cSKris Buschelman idx = 4*(*r++); 3118f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3119f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3120f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3121f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3122f26ec98cSKris Buschelman while (nz--) { 3123f26ec98cSKris Buschelman idx = 4*(*vi++); 3124f26ec98cSKris Buschelman x1 = t[idx]; 3125f26ec98cSKris Buschelman x2 = t[1+idx]; 3126f26ec98cSKris Buschelman x3 = t[2+idx]; 3127f26ec98cSKris Buschelman x4 = t[3+idx]; 3128f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3129f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3130f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3131f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3132f26ec98cSKris Buschelman v += 16; 3133f26ec98cSKris Buschelman } 3134f26ec98cSKris Buschelman idx = 4*i; 3135f26ec98cSKris Buschelman t[idx] = s1; 3136f26ec98cSKris Buschelman t[1+idx] = s2; 3137f26ec98cSKris Buschelman t[2+idx] = s3; 3138f26ec98cSKris Buschelman t[3+idx] = s4; 3139f26ec98cSKris Buschelman } 3140f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3141f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3142f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 3143f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3144f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3145f26ec98cSKris 
Buschelman idt = 4*i; 3146f26ec98cSKris Buschelman s1 = t[idt]; 3147f26ec98cSKris Buschelman s2 = t[1+idt]; 3148f26ec98cSKris Buschelman s3 = t[2+idt]; 3149f26ec98cSKris Buschelman s4 = t[3+idt]; 3150f26ec98cSKris Buschelman while (nz--) { 3151f26ec98cSKris Buschelman idx = 4*(*vi++); 3152f26ec98cSKris Buschelman x1 = t[idx]; 3153f26ec98cSKris Buschelman x2 = t[1+idx]; 3154f26ec98cSKris Buschelman x3 = t[2+idx]; 3155f26ec98cSKris Buschelman x4 = t[3+idx]; 3156f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3157f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3158f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3159f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3160f26ec98cSKris Buschelman v += 16; 3161f26ec98cSKris Buschelman } 3162f26ec98cSKris Buschelman idc = 4*(*c--); 3163f26ec98cSKris Buschelman v = aa + 16*diag[i]; 3164f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3165f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3166f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3167f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3168f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 3169f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 3170f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 3171f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 3172f26ec98cSKris Buschelman } 3173f26ec98cSKris Buschelman 3174f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3175f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3176d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 31771ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3178dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3179f26ec98cSKris Buschelman PetscFunctionReturn(0); 
3180f26ec98cSKris Buschelman } 3181f26ec98cSKris Buschelman 318224c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 318324c233c2SKris Buschelman 318424c233c2SKris Buschelman #include PETSC_HAVE_SSE 318524c233c2SKris Buschelman 318624c233c2SKris Buschelman #undef __FUNCT__ 318724c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3188dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 318924c233c2SKris Buschelman { 319024c233c2SKris Buschelman /* 319124c233c2SKris Buschelman Note: This code uses demotion of double 319224c233c2SKris Buschelman to float when performing the mixed-mode computation. 319324c233c2SKris Buschelman This may not be numerically reasonable for all applications. 319424c233c2SKris Buschelman */ 319524c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 319624c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 31976849ba73SBarry Smith PetscErrorCode ierr; 31985d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 31995d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 320024c233c2SKris Buschelman MatScalar *aa=a->a,*v; 320187828ca2SBarry Smith PetscScalar *x,*b,*t; 320224c233c2SKris Buschelman 320324c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 320424c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 320524c233c2SKris Buschelman unsigned long offset; 320624c233c2SKris Buschelman 320724c233c2SKris Buschelman PetscFunctionBegin; 320824c233c2SKris Buschelman SSE_SCOPE_BEGIN; 320924c233c2SKris Buschelman 321024c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 321124c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 321224c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 321324c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 321424c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 321524c233c2SKris Buschelman 32161ebc52fbSHong Zhang ierr = 
VecGetArray(bb,&b);CHKERRQ(ierr); 32171ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 321824c233c2SKris Buschelman t = a->solve_work; 321924c233c2SKris Buschelman 322024c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 322124c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 322224c233c2SKris Buschelman 322324c233c2SKris Buschelman /* forward solve the lower triangular */ 322424c233c2SKris Buschelman idx = 4*(*r++); 322524c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 322624c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 322724c233c2SKris Buschelman v = aa + 16*ai[1]; 322824c233c2SKris Buschelman 322924c233c2SKris Buschelman for (i=1; i<n;) { 323024c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 323124c233c2SKris Buschelman vi = aj + ai[i]; 323224c233c2SKris Buschelman nz = diag[i] - ai[i]; 323324c233c2SKris Buschelman idx = 4*(*r++); 323424c233c2SKris Buschelman 323524c233c2SKris Buschelman /* Demote sum from double to float */ 323624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 323724c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 323824c233c2SKris Buschelman 323924c233c2SKris Buschelman while (nz--) { 324024c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 324124c233c2SKris Buschelman idx = 4*(*vi++); 324224c233c2SKris Buschelman 324324c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 324424c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 324524c233c2SKris Buschelman 324624c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 324724c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 324824c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 324924c233c2SKris Buschelman 325024c233c2SKris Buschelman /* First Column */ 325124c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 325224c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 325324c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 
325424c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 325524c233c2SKris Buschelman 325624c233c2SKris Buschelman /* Second Column */ 325724c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 325824c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 325924c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 326024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 326124c233c2SKris Buschelman 326224c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 326324c233c2SKris Buschelman 326424c233c2SKris Buschelman /* Third Column */ 326524c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 326624c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 326724c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 326824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 326924c233c2SKris Buschelman 327024c233c2SKris Buschelman /* Fourth Column */ 327124c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 327224c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 327324c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 327424c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 327524c233c2SKris Buschelman SSE_INLINE_END_2 327624c233c2SKris Buschelman 327724c233c2SKris Buschelman v += 16; 327824c233c2SKris Buschelman } 327924c233c2SKris Buschelman idx = 4*i; 328024c233c2SKris Buschelman v = aa + 16*ai[++i]; 328124c233c2SKris Buschelman PREFETCH_NTA(v); 328224c233c2SKris Buschelman STORE_PS(tmps,XMM7); 328324c233c2SKris Buschelman 328424c233c2SKris Buschelman /* Promote result from float to double */ 328524c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 328624c233c2SKris Buschelman } 328724c233c2SKris Buschelman /* backward solve the upper triangular */ 328824c233c2SKris Buschelman idt = 4*(n-1); 328924c233c2SKris Buschelman ai16 = 16*diag[n-1]; 329024c233c2SKris Buschelman v = aa + ai16 + 16; 329124c233c2SKris Buschelman for (i=n-1; i>=0;){ 329224c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 329324c233c2SKris Buschelman vi = aj + diag[i] + 1; 329424c233c2SKris Buschelman nz = ai[i+1] - 
diag[i] - 1; 329524c233c2SKris Buschelman 329624c233c2SKris Buschelman /* Demote accumulator from double to float */ 329724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 329824c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 329924c233c2SKris Buschelman 330024c233c2SKris Buschelman while (nz--) { 330124c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 330224c233c2SKris Buschelman idx = 4*(*vi++); 330324c233c2SKris Buschelman 330424c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 330524c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 330624c233c2SKris Buschelman 330724c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 330824c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 330924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 331024c233c2SKris Buschelman 331124c233c2SKris Buschelman /* First Column */ 331224c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 331324c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 331424c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 331524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 331624c233c2SKris Buschelman 331724c233c2SKris Buschelman /* Second Column */ 331824c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 331924c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 332024c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 332124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 332224c233c2SKris Buschelman 332324c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 332424c233c2SKris Buschelman 332524c233c2SKris Buschelman /* Third Column */ 332624c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 332724c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 332824c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 332924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 333024c233c2SKris Buschelman 333124c233c2SKris Buschelman /* Fourth Column */ 333224c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 333324c233c2SKris Buschelman 
SSE_SHUFFLE(XMM3,XMM3,0xFF) 333424c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 333524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 333624c233c2SKris Buschelman SSE_INLINE_END_2 333724c233c2SKris Buschelman v += 16; 333824c233c2SKris Buschelman } 333924c233c2SKris Buschelman v = aa + ai16; 334024c233c2SKris Buschelman ai16 = 16*diag[--i]; 334124c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 334224c233c2SKris Buschelman /* 334324c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 334424c233c2SKris Buschelman which was inverted as part of the factorization 334524c233c2SKris Buschelman */ 334624c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 334724c233c2SKris Buschelman /* First Column */ 334824c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 334924c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 335024c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 335124c233c2SKris Buschelman 335224c233c2SKris Buschelman /* Second Column */ 335324c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 335424c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 335524c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 335624c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 335724c233c2SKris Buschelman 335824c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 335924c233c2SKris Buschelman 336024c233c2SKris Buschelman /* Third Column */ 336124c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 336224c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 336324c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 336424c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 336524c233c2SKris Buschelman 336624c233c2SKris Buschelman /* Fourth Column */ 336724c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 336824c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 336924c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 337024c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 337124c233c2SKris Buschelman 337224c233c2SKris Buschelman 
SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 337324c233c2SKris Buschelman SSE_INLINE_END_3 337424c233c2SKris Buschelman 337524c233c2SKris Buschelman /* Promote solution from float to double */ 337624c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 337724c233c2SKris Buschelman 337824c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 337924c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 338024c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 338124c233c2SKris Buschelman idc = 4*(*c--); 338224c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 338324c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 338424c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 338524c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 338624c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 338724c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 338824c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 338924c233c2SKris Buschelman SSE_INLINE_END_2 339024c233c2SKris Buschelman v = aa + ai16 + 16; 339124c233c2SKris Buschelman idt -= 4; 339224c233c2SKris Buschelman } 339324c233c2SKris Buschelman 339424c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 339524c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 33961ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 33971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3398dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 339924c233c2SKris Buschelman SSE_SCOPE_END; 340024c233c2SKris Buschelman PetscFunctionReturn(0); 340124c233c2SKris Buschelman } 340224c233c2SKris Buschelman 340324c233c2SKris Buschelman #endif 34040ef38995SBarry Smith 34050ef38995SBarry Smith 34064e2b4712SSatish Balay /* 34074e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the 
natural 34084e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 34094e2b4712SSatish Balay */ 34104a2ae208SSatish Balay #undef __FUNCT__ 34114a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3412dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 34134e2b4712SSatish Balay { 34144e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3415356650c2SBarry Smith PetscInt n=a->mbs; 3416356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 3417dfbe8321SBarry Smith PetscErrorCode ierr; 3418356650c2SBarry Smith const PetscInt *diag = a->diag; 3419d9fead3dSBarry Smith const MatScalar *aa=a->a; 3420d9fead3dSBarry Smith PetscScalar *x; 3421d9fead3dSBarry Smith const PetscScalar *b; 34224e2b4712SSatish Balay 34234e2b4712SSatish Balay PetscFunctionBegin; 3424d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 34251ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 34264e2b4712SSatish Balay 3427aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 34282853dc0eSBarry Smith { 342987828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 34302853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 34312853dc0eSBarry Smith } 3432aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 34332853dc0eSBarry Smith { 343487828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 34352853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 34362853dc0eSBarry Smith } 3437aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 34382853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3439e1293385SBarry Smith #else 344030d4dcafSBarry Smith { 344187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3442d9fead3dSBarry Smith const MatScalar *v; 3443356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 3444356650c2SBarry 
Smith const PetscInt *vi; 3445e1293385SBarry Smith 34464e2b4712SSatish Balay /* forward solve the lower triangular */ 34474e2b4712SSatish Balay idx = 0; 3448e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 34494e2b4712SSatish Balay for (i=1; i<n; i++) { 34504e2b4712SSatish Balay v = aa + 16*ai[i]; 34514e2b4712SSatish Balay vi = aj + ai[i]; 34524e2b4712SSatish Balay nz = diag[i] - ai[i]; 3453e1293385SBarry Smith idx += 4; 3454f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 34554e2b4712SSatish Balay while (nz--) { 34564e2b4712SSatish Balay jdx = 4*(*vi++); 34574e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3458f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3459f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3460f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3461f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 34624e2b4712SSatish Balay v += 16; 34634e2b4712SSatish Balay } 3464f1af5d2fSBarry Smith x[idx] = s1; 3465f1af5d2fSBarry Smith x[1+idx] = s2; 3466f1af5d2fSBarry Smith x[2+idx] = s3; 3467f1af5d2fSBarry Smith x[3+idx] = s4; 34684e2b4712SSatish Balay } 34694e2b4712SSatish Balay /* backward solve the upper triangular */ 34704e555682SBarry Smith idt = 4*(n-1); 34714e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 34724e555682SBarry Smith ai16 = 16*diag[i]; 34734e555682SBarry Smith v = aa + ai16 + 16; 34744e2b4712SSatish Balay vi = aj + diag[i] + 1; 34754e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3476f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3477f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 34784e2b4712SSatish Balay while (nz--) { 34794e2b4712SSatish Balay idx = 4*(*vi++); 34804e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3481f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3482f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 
3483f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3484f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 34854e2b4712SSatish Balay v += 16; 34864e2b4712SSatish Balay } 34874e555682SBarry Smith v = aa + ai16; 3488f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3489f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3490f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3491f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3492329f5518SBarry Smith idt -= 4; 34934e2b4712SSatish Balay } 349430d4dcafSBarry Smith } 3495e1293385SBarry Smith #endif 34964e2b4712SSatish Balay 3497d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 34981ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3499dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 35004e2b4712SSatish Balay PetscFunctionReturn(0); 35014e2b4712SSatish Balay } 35024e2b4712SSatish Balay 3503*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 3504f26ec98cSKris Buschelman #undef __FUNCT__ 3505cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3506cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3507cee9d6f2SShri Abhyankar { 3508cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 35096464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3510cee9d6f2SShri Abhyankar PetscErrorCode ierr; 3511cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 3512cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3513cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 3514cee9d6f2SShri Abhyankar PetscScalar *x; 3515cee9d6f2SShri Abhyankar const PetscScalar *b; 3516cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3517cee9d6f2SShri Abhyankar 3518cee9d6f2SShri Abhyankar 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct - older variant, compiled
   only when OLD_ROUTINE_TO_BE_REPLACED is defined.

   Reads the right-hand side from bb and writes the result of a forward and a
   backward block sweep into xx.  The lower part of block row i is found at
   ai[i] .. ai[i+1]-1.  The upper part of block row i is located through the
   row pointer entry ai[2*n-i]; its off-diagonal blocks come first and the
   block used for the final multiplication immediately follows them.

   Blocks are addressed as blk[r + 4*c].  Four components per block row are
   hard-coded, so bs is presumably 4 and bs2 16 -- TODO confirm with caller.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *baij   = (Mat_SeqBAIJ *)A->data;
  PetscInt          nblk    = baij->mbs,*rowptr = baij->i,*colidx = baij->j;
  PetscInt          bs      = A->rmap->bs,bs2 = baij->bs2;
  PetscInt          row,k,cnt,urow,*cols;
  const MatScalar   *vals   = baij->a,*blk;
  const PetscScalar *b,*rhs,*xc;
  PetscScalar       *x,*out;
  PetscScalar       r1,r2,r3,r4,c1,c2,c3,c4;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward sweep; block row 0 has nothing to subtract */
  x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
  for (row=1; row<nblk; row++) {
    blk  = vals   + bs2*rowptr[row];
    cols = colidx + rowptr[row];
    cnt  = rowptr[row+1] - rowptr[row];
    rhs  = b + bs*row;
    out  = x + bs*row;
    r1   = rhs[0]; r2 = rhs[1]; r3 = rhs[2]; r4 = rhs[3];
    for (k=0; k<cnt; k++, blk += bs2) {
      xc  = x + bs*cols[k];
      c1  = xc[0]; c2 = xc[1]; c3 = xc[2]; c4 = xc[3];
      r1 -= blk[0]*c1 + blk[4]*c2 + blk[8]*c3  + blk[12]*c4;
      r2 -= blk[1]*c1 + blk[5]*c2 + blk[9]*c3  + blk[13]*c4;
      r3 -= blk[2]*c1 + blk[6]*c2 + blk[10]*c3 + blk[14]*c4;
      r4 -= blk[3]*c1 + blk[7]*c2 + blk[11]*c3 + blk[15]*c4;
    }
    out[0] = r1;
    out[1] = r2;
    out[2] = r3;
    out[3] = r4;
  }

  /* backward sweep, last block row first */
  for (row=nblk-1; row>=0; row--) {
    urow = 2*nblk - row;                  /* row-pointer slot of the upper part of this row */
    blk  = vals   + bs2*rowptr[urow];
    cols = colidx + rowptr[urow];
    cnt  = rowptr[urow+1] - rowptr[urow] - 1;
    out  = x + bs*row;
    r1   = out[0]; r2 = out[1]; r3 = out[2]; r4 = out[3];
    for (k=0; k<cnt; k++, blk += bs2) {
      xc  = x + bs*cols[k];
      c1  = xc[0]; c2 = xc[1]; c3 = xc[2]; c4 = xc[3];
      r1 -= blk[0]*c1 + blk[4]*c2 + blk[8]*c3  + blk[12]*c4;
      r2 -= blk[1]*c1 + blk[5]*c2 + blk[9]*c3  + blk[13]*c4;
      r3 -= blk[2]*c1 + blk[6]*c2 + blk[10]*c3 + blk[14]*c4;
      r4 -= blk[3]*c1 + blk[7]*c2 + blk[11]*c3 + blk[15]*c4;
    }
    /* blk now addresses the block stored right after the off-diagonal ones: x = inv_diagonal*x */
    out[0] = blk[0]*r1 + blk[4]*r2 + blk[8]*r3  + blk[12]*r4;
    out[1] = blk[1]*r1 + blk[5]*r2 + blk[9]*r3  + blk[13]*r4;
    out[2] = blk[2]*r1 + blk[6]*r2 + blk[10]*r3 + blk[14]*r4;
    out[3] = blk[3]*r1 + blk[7]*r2 + blk[11]*r3 + blk[15]*r4;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(baij->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
3589b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 3590b2b2dd24SShri Abhyankar PetscScalar *x; 3591b2b2dd24SShri Abhyankar const PetscScalar *b; 3592b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3593cee9d6f2SShri Abhyankar 3594b2b2dd24SShri Abhyankar PetscFunctionBegin; 3595b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3596b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3597b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 3598b2b2dd24SShri Abhyankar idx = 0; 3599b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3600b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 3601b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 3602b2b2dd24SShri Abhyankar vi = aj + ai[i]; 3603b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 3604b2b2dd24SShri Abhyankar idx = bs*i; 3605b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3606b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 3607b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 3608b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3609b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3610b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3611b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3612b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3613b2b2dd24SShri Abhyankar 3614b2b2dd24SShri Abhyankar v += bs2; 3615b2b2dd24SShri Abhyankar } 3616b2b2dd24SShri Abhyankar 3617b2b2dd24SShri Abhyankar x[idx] = s1; 3618b2b2dd24SShri Abhyankar x[1+idx] = s2; 3619b2b2dd24SShri Abhyankar x[2+idx] = s3; 3620b2b2dd24SShri Abhyankar x[3+idx] = s4; 3621b2b2dd24SShri Abhyankar } 3622b2b2dd24SShri Abhyankar 3623b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 3624b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 3625b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 3626b2b2dd24SShri Abhyankar vi = aj + 
adiag[i+1]+1; 3627b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 3628b2b2dd24SShri Abhyankar idt = bs*i; 3629b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3630b2b2dd24SShri Abhyankar 3631b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 3632b2b2dd24SShri Abhyankar idx = bs*vi[k]; 3633b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3634b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3635b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3636b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3637b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3638b2b2dd24SShri Abhyankar 3639b2b2dd24SShri Abhyankar v += bs2; 3640b2b2dd24SShri Abhyankar } 3641b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 3642b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3643b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3644b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3645b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3646b2b2dd24SShri Abhyankar 3647b2b2dd24SShri Abhyankar } 3648b2b2dd24SShri Abhyankar 3649b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3650b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3651b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3652b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 3653b2b2dd24SShri Abhyankar } 3654cee9d6f2SShri Abhyankar 3655cee9d6f2SShri Abhyankar #undef __FUNCT__ 3656f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3657dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3658f26ec98cSKris Buschelman { 3659f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3660690b6cddSBarry Smith PetscInt 
n=a->mbs,*ai=a->i,*aj=a->j; 3661dfbe8321SBarry Smith PetscErrorCode ierr; 3662690b6cddSBarry Smith PetscInt *diag = a->diag; 3663f26ec98cSKris Buschelman MatScalar *aa=a->a; 3664f26ec98cSKris Buschelman PetscScalar *x,*b; 3665f26ec98cSKris Buschelman 3666f26ec98cSKris Buschelman PetscFunctionBegin; 36671ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 36681ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3669f26ec98cSKris Buschelman 3670f26ec98cSKris Buschelman { 3671f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3672f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 3673690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3674f26ec98cSKris Buschelman 3675f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3676f26ec98cSKris Buschelman idx = 0; 3677f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 3678f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 3679f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 3680f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 3681f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3682f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3683f26ec98cSKris Buschelman vi = aj + ai[i]; 3684f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3685f26ec98cSKris Buschelman idx += 4; 3686f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3687f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3688f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3689f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3690f26ec98cSKris Buschelman while (nz--) { 3691f26ec98cSKris Buschelman jdx = 4*(*vi++); 3692f26ec98cSKris Buschelman x1 = t[jdx]; 3693f26ec98cSKris Buschelman x2 = t[1+jdx]; 3694f26ec98cSKris Buschelman x3 = t[2+jdx]; 3695f26ec98cSKris Buschelman x4 = t[3+jdx]; 3696f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3697f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3698f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3699f26ec98cSKris Buschelman s4 
-= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3700f26ec98cSKris Buschelman v += 16; 3701f26ec98cSKris Buschelman } 3702f26ec98cSKris Buschelman t[idx] = s1; 3703f26ec98cSKris Buschelman t[1+idx] = s2; 3704f26ec98cSKris Buschelman t[2+idx] = s3; 3705f26ec98cSKris Buschelman t[3+idx] = s4; 3706f26ec98cSKris Buschelman } 3707f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3708f26ec98cSKris Buschelman idt = 4*(n-1); 3709f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3710f26ec98cSKris Buschelman ai16 = 16*diag[i]; 3711f26ec98cSKris Buschelman v = aa + ai16 + 16; 3712f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3713f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3714f26ec98cSKris Buschelman s1 = t[idt]; 3715f26ec98cSKris Buschelman s2 = t[1+idt]; 3716f26ec98cSKris Buschelman s3 = t[2+idt]; 3717f26ec98cSKris Buschelman s4 = t[3+idt]; 3718f26ec98cSKris Buschelman while (nz--) { 3719f26ec98cSKris Buschelman idx = 4*(*vi++); 3720f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 3721f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 3722f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 3723f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 3724f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3725f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3726f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3727f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3728f26ec98cSKris Buschelman v += 16; 3729f26ec98cSKris Buschelman } 3730f26ec98cSKris Buschelman v = aa + ai16; 3731f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3732f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3733f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3734f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3735f26ec98cSKris Buschelman 
idt -= 4; 3736f26ec98cSKris Buschelman } 3737f26ec98cSKris Buschelman } 3738f26ec98cSKris Buschelman 37391ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 37401ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3741dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3742f26ec98cSKris Buschelman PetscFunctionReturn(0); 3743f26ec98cSKris Buschelman } 3744f26ec98cSKris Buschelman 37453660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 37463660e330SKris Buschelman 37473660e330SKris Buschelman #include PETSC_HAVE_SSE 37483660e330SKris Buschelman #undef __FUNCT__ 37497cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3750dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 37513660e330SKris Buschelman { 37523660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 37532aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 3754dfbe8321SBarry Smith PetscErrorCode ierr; 3755dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 37563660e330SKris Buschelman MatScalar *aa=a->a; 375787828ca2SBarry Smith PetscScalar *x,*b; 37583660e330SKris Buschelman 37593660e330SKris Buschelman PetscFunctionBegin; 37603660e330SKris Buschelman SSE_SCOPE_BEGIN; 37613660e330SKris Buschelman /* 37623660e330SKris Buschelman Note: This code currently uses demotion of double 37633660e330SKris Buschelman to float when performing the mixed-mode computation. 37643660e330SKris Buschelman This may not be numerically reasonable for all applications. 
37653660e330SKris Buschelman */ 37663660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 37673660e330SKris Buschelman 37681ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 37691ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 37703660e330SKris Buschelman { 3771eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 3772eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 37732aa5897fSKris Buschelman int nz,i,idt,ai16; 37742aa5897fSKris Buschelman unsigned int jdx,idx; 37752aa5897fSKris Buschelman unsigned short *vi; 3776eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 37773660e330SKris Buschelman 3778eb05f457SKris Buschelman /* First block is the identity. */ 37793660e330SKris Buschelman idx = 0; 3780eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 37812aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 37823660e330SKris Buschelman 37833660e330SKris Buschelman for (i=1; i<n;) { 37843660e330SKris Buschelman PREFETCH_NTA(&v[8]); 37853660e330SKris Buschelman vi = aj + ai[i]; 37863660e330SKris Buschelman nz = diag[i] - ai[i]; 37873660e330SKris Buschelman idx += 4; 37883660e330SKris Buschelman 3789eb05f457SKris Buschelman /* Demote RHS from double to float. 
*/ 3790eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3791eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 37923660e330SKris Buschelman 37933660e330SKris Buschelman while (nz--) { 37943660e330SKris Buschelman PREFETCH_NTA(&v[16]); 37952aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 37963660e330SKris Buschelman 37973660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 3798eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 37993660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 38003660e330SKris Buschelman 38013660e330SKris Buschelman /* First Column */ 38023660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 38033660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 38043660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 38053660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 38063660e330SKris Buschelman 38073660e330SKris Buschelman /* Second Column */ 38083660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 38093660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 38103660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 38113660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 38123660e330SKris Buschelman 38133660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 38143660e330SKris Buschelman 38153660e330SKris Buschelman /* Third Column */ 38163660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 38173660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 38183660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 38193660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 38203660e330SKris Buschelman 38213660e330SKris Buschelman /* Fourth Column */ 38223660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 38233660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 38243660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 38253660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 38263660e330SKris Buschelman SSE_INLINE_END_2 38273660e330SKris Buschelman 38283660e330SKris Buschelman v += 16; 38293660e330SKris 
Buschelman } 38303660e330SKris Buschelman v = aa + 16*ai[++i]; 38313660e330SKris Buschelman PREFETCH_NTA(v); 3832eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 38333660e330SKris Buschelman } 3834eb05f457SKris Buschelman 3835eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 3836eb05f457SKris Buschelman 38373660e330SKris Buschelman idt = 4*(n-1); 38383660e330SKris Buschelman ai16 = 16*diag[n-1]; 38393660e330SKris Buschelman v = aa + ai16 + 16; 38403660e330SKris Buschelman for (i=n-1; i>=0;){ 38413660e330SKris Buschelman PREFETCH_NTA(&v[8]); 38423660e330SKris Buschelman vi = aj + diag[i] + 1; 38433660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 38443660e330SKris Buschelman 3845eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 38463660e330SKris Buschelman 38473660e330SKris Buschelman while (nz--) { 38483660e330SKris Buschelman PREFETCH_NTA(&v[16]); 38492aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 38503660e330SKris Buschelman 38513660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 3852eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 38533660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 38543660e330SKris Buschelman 38553660e330SKris Buschelman /* First Column */ 38563660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 38573660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 38583660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 38593660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 38603660e330SKris Buschelman 38613660e330SKris Buschelman /* Second Column */ 38623660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 38633660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 38643660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 38653660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 38663660e330SKris Buschelman 38673660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 38683660e330SKris Buschelman 38693660e330SKris Buschelman /* Third Column */ 38703660e330SKris Buschelman 
SSE_COPY_PS(XMM2,XMM6) 38713660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 38723660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 38733660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 38743660e330SKris Buschelman 38753660e330SKris Buschelman /* Fourth Column */ 38763660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 38773660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 38783660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 38793660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 38803660e330SKris Buschelman SSE_INLINE_END_2 38813660e330SKris Buschelman v += 16; 38823660e330SKris Buschelman } 38833660e330SKris Buschelman v = aa + ai16; 38843660e330SKris Buschelman ai16 = 16*diag[--i]; 38853660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 38863660e330SKris Buschelman /* 38873660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 38883660e330SKris Buschelman which was inverted as part of the factorization 38893660e330SKris Buschelman */ 3890eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 38913660e330SKris Buschelman /* First Column */ 38923660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 38933660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 38943660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 38953660e330SKris Buschelman 38963660e330SKris Buschelman /* Second Column */ 38973660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 38983660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 38993660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 39003660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 39013660e330SKris Buschelman 39023660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 39033660e330SKris Buschelman 39043660e330SKris Buschelman /* Third Column */ 39053660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 39063660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 39073660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 39083660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 39093660e330SKris 
Buschelman 39103660e330SKris Buschelman /* Fourth Column */ 39113660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 39123660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 39133660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 39143660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 39153660e330SKris Buschelman 39163660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 39173660e330SKris Buschelman SSE_INLINE_END_3 39183660e330SKris Buschelman 39193660e330SKris Buschelman v = aa + ai16 + 16; 39203660e330SKris Buschelman idt -= 4; 39213660e330SKris Buschelman } 3922eb05f457SKris Buschelman 3923eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 3924eb05f457SKris Buschelman idt = 4*(n-1); 3925eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 3926eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3927eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3928eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 3929eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 3930eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 3931eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 3932eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 3933eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 393454693613SKris Buschelman idt -= 4; 39353660e330SKris Buschelman } 3936eb05f457SKris Buschelman 3937eb05f457SKris Buschelman } /* End of artificial scope. 
*/ 39381ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 39391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3940dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 39413660e330SKris Buschelman SSE_SCOPE_END; 39423660e330SKris Buschelman PetscFunctionReturn(0); 39433660e330SKris Buschelman } 39443660e330SKris Buschelman 39457cf1b8d3SKris Buschelman #undef __FUNCT__ 39467cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3947dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 39487cf1b8d3SKris Buschelman { 39497cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 39507cf1b8d3SKris Buschelman int *aj=a->j; 3951dfbe8321SBarry Smith PetscErrorCode ierr; 3952dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 39537cf1b8d3SKris Buschelman MatScalar *aa=a->a; 39547cf1b8d3SKris Buschelman PetscScalar *x,*b; 39557cf1b8d3SKris Buschelman 39567cf1b8d3SKris Buschelman PetscFunctionBegin; 39577cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 39587cf1b8d3SKris Buschelman /* 39597cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 39607cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 39617cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
39627cf1b8d3SKris Buschelman */ 39637cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 39647cf1b8d3SKris Buschelman 39651ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 39661ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 39677cf1b8d3SKris Buschelman { 39687cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 39697cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 39707cf1b8d3SKris Buschelman int nz,i,idt,ai16; 39717cf1b8d3SKris Buschelman int jdx,idx; 39727cf1b8d3SKris Buschelman int *vi; 39737cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 39747cf1b8d3SKris Buschelman 39757cf1b8d3SKris Buschelman /* First block is the identity. */ 39767cf1b8d3SKris Buschelman idx = 0; 39777cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 39787cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 39797cf1b8d3SKris Buschelman 39807cf1b8d3SKris Buschelman for (i=1; i<n;) { 39817cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 39827cf1b8d3SKris Buschelman vi = aj + ai[i]; 39837cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 39847cf1b8d3SKris Buschelman idx += 4; 39857cf1b8d3SKris Buschelman 39867cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 39877cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 39887cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 39897cf1b8d3SKris Buschelman 39907cf1b8d3SKris Buschelman while (nz--) { 39917cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 39927cf1b8d3SKris Buschelman jdx = 4*(*vi++); 39937cf1b8d3SKris Buschelman /* jdx = *vi++; */ 39947cf1b8d3SKris Buschelman 39957cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 39967cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 39977cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 39987cf1b8d3SKris Buschelman 39997cf1b8d3SKris Buschelman /* First Column */ 40007cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 40017cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 40027cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 40037cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 40047cf1b8d3SKris Buschelman 40057cf1b8d3SKris Buschelman /* Second Column */ 40067cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 40077cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 40087cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 40097cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 40107cf1b8d3SKris Buschelman 40117cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 40127cf1b8d3SKris Buschelman 40137cf1b8d3SKris Buschelman /* Third Column */ 40147cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 40157cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 40167cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 40177cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 40187cf1b8d3SKris Buschelman 40197cf1b8d3SKris Buschelman /* Fourth Column */ 40207cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 40217cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 40227cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 40237cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 40247cf1b8d3SKris Buschelman SSE_INLINE_END_2 40257cf1b8d3SKris Buschelman 40267cf1b8d3SKris 
Buschelman v += 16; 40277cf1b8d3SKris Buschelman } 40287cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 40297cf1b8d3SKris Buschelman PREFETCH_NTA(v); 40307cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 40317cf1b8d3SKris Buschelman } 40327cf1b8d3SKris Buschelman 40337cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 40347cf1b8d3SKris Buschelman 40357cf1b8d3SKris Buschelman idt = 4*(n-1); 40367cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 40377cf1b8d3SKris Buschelman v = aa + ai16 + 16; 40387cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 40397cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 40407cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 40417cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 40427cf1b8d3SKris Buschelman 40437cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 40447cf1b8d3SKris Buschelman 40457cf1b8d3SKris Buschelman while (nz--) { 40467cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 40477cf1b8d3SKris Buschelman idx = 4*(*vi++); 40487cf1b8d3SKris Buschelman /* idx = *vi++; */ 40497cf1b8d3SKris Buschelman 40507cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 40517cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 40527cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 40537cf1b8d3SKris Buschelman 40547cf1b8d3SKris Buschelman /* First Column */ 40557cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 40567cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 40577cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 40587cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 40597cf1b8d3SKris Buschelman 40607cf1b8d3SKris Buschelman /* Second Column */ 40617cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 40627cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 40637cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 40647cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 40657cf1b8d3SKris Buschelman 40667cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 40677cf1b8d3SKris Buschelman 
40687cf1b8d3SKris Buschelman /* Third Column */ 40697cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 40707cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 40717cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 40727cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 40737cf1b8d3SKris Buschelman 40747cf1b8d3SKris Buschelman /* Fourth Column */ 40757cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 40767cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 40777cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 40787cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 40797cf1b8d3SKris Buschelman SSE_INLINE_END_2 40807cf1b8d3SKris Buschelman v += 16; 40817cf1b8d3SKris Buschelman } 40827cf1b8d3SKris Buschelman v = aa + ai16; 40837cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 40847cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 40857cf1b8d3SKris Buschelman /* 40867cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 40877cf1b8d3SKris Buschelman which was inverted as part of the factorization 40887cf1b8d3SKris Buschelman */ 40897cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 40907cf1b8d3SKris Buschelman /* First Column */ 40917cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 40927cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 40937cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 40947cf1b8d3SKris Buschelman 40957cf1b8d3SKris Buschelman /* Second Column */ 40967cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 40977cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 40987cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 40997cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 41007cf1b8d3SKris Buschelman 41017cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 41027cf1b8d3SKris Buschelman 41037cf1b8d3SKris Buschelman /* Third Column */ 41047cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 41057cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 41067cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 41077cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 41087cf1b8d3SKris Buschelman 41097cf1b8d3SKris Buschelman /* Fourth Column */ 41107cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 41117cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 41127cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 41137cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 41147cf1b8d3SKris Buschelman 41157cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 41167cf1b8d3SKris Buschelman SSE_INLINE_END_3 41177cf1b8d3SKris Buschelman 41187cf1b8d3SKris Buschelman v = aa + ai16 + 16; 41197cf1b8d3SKris Buschelman idt -= 4; 41207cf1b8d3SKris Buschelman } 41217cf1b8d3SKris Buschelman 41227cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 41237cf1b8d3SKris Buschelman idt = 4*(n-1); 41247cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 41257cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 41267cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 41277cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 41287cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 41297cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 41307cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 41317cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 41327cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 41337cf1b8d3SKris Buschelman idt -= 4; 41347cf1b8d3SKris Buschelman } 41357cf1b8d3SKris Buschelman 41367cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 41371ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 41381ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4139dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 41407cf1b8d3SKris Buschelman SSE_SCOPE_END; 41417cf1b8d3SKris Buschelman PetscFunctionReturn(0); 41427cf1b8d3SKris Buschelman } 41437cf1b8d3SKris Buschelman 41443660e330SKris Buschelman #endif 41458f690400SShri Abhyankar 41464a2ae208SSatish Balay #undef __FUNCT__ 41474a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4148dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 41494e2b4712SSatish Balay { 41504e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 41514e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 41526849ba73SBarry Smith PetscErrorCode ierr; 41535d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 41545d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4155d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4156d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4157d9fead3dSBarry Smith const PetscScalar *b; 41584e2b4712SSatish Balay 41594e2b4712SSatish Balay PetscFunctionBegin; 4160d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41611ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4162f1af5d2fSBarry Smith t = a->solve_work; 41634e2b4712SSatish Balay 41644e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 41654e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 41664e2b4712SSatish Balay 41674e2b4712SSatish Balay /* forward solve the lower triangular */ 41684e2b4712SSatish Balay idx = 3*(*r++); 4169f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 41704e2b4712SSatish Balay for (i=1; i<n; i++) { 41714e2b4712SSatish Balay v = aa + 9*ai[i]; 41724e2b4712SSatish Balay vi = aj + ai[i]; 41734e2b4712SSatish Balay nz = 
diag[i] - ai[i]; 41744e2b4712SSatish Balay idx = 3*(*r++); 4175f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 41764e2b4712SSatish Balay while (nz--) { 41774e2b4712SSatish Balay idx = 3*(*vi++); 4178f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4179f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4180f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4181f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 41824e2b4712SSatish Balay v += 9; 41834e2b4712SSatish Balay } 41844e2b4712SSatish Balay idx = 3*i; 4185f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 41864e2b4712SSatish Balay } 41874e2b4712SSatish Balay /* backward solve the upper triangular */ 41884e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 41894e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 41904e2b4712SSatish Balay vi = aj + diag[i] + 1; 41914e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 41924e2b4712SSatish Balay idt = 3*i; 4193f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 41944e2b4712SSatish Balay while (nz--) { 41954e2b4712SSatish Balay idx = 3*(*vi++); 4196f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4197f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4198f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4199f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 42004e2b4712SSatish Balay v += 9; 42014e2b4712SSatish Balay } 42024e2b4712SSatish Balay idc = 3*(*c--); 42034e2b4712SSatish Balay v = aa + 9*diag[i]; 4204f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4205f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4206f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 42074e2b4712SSatish Balay } 42084e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 42094e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4210d9fead3dSBarry Smith ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42111ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4212dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 42134e2b4712SSatish Balay PetscFunctionReturn(0); 42144e2b4712SSatish Balay } 42154e2b4712SSatish Balay 4216*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 42178f690400SShri Abhyankar #undef __FUNCT__ 42188f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 42198f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 42208f690400SShri Abhyankar { 42218f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 42228f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 42238f690400SShri Abhyankar PetscErrorCode ierr; 422429b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 42258f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 42268f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 42278f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 42288f690400SShri Abhyankar const PetscScalar *b; 42298f690400SShri Abhyankar 42308f690400SShri Abhyankar PetscFunctionBegin; 42318f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42328f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 42338f690400SShri Abhyankar t = a->solve_work; 42348f690400SShri Abhyankar 42358f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 423629b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 42378f690400SShri Abhyankar 42388f690400SShri Abhyankar /* forward solve the lower triangular */ 423929b92fc1SShri Abhyankar idx = 3*r[0]; 42408f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 42418f690400SShri Abhyankar for (i=1; i<n; i++) { 42428f690400SShri Abhyankar v = aa + 9*ai[i]; 42438f690400SShri Abhyankar vi = aj + ai[i]; 42448f690400SShri 
Abhyankar nz = ai[i+1] - ai[i]; 424529b92fc1SShri Abhyankar idx = 3*r[i]; 42468f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 424729b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 424829b92fc1SShri Abhyankar idx = 3*vi[m]; 42498f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 42508f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 42518f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 42528f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 42538f690400SShri Abhyankar v += 9; 42548f690400SShri Abhyankar } 42558f690400SShri Abhyankar idx = 3*i; 42568f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 42578f690400SShri Abhyankar } 42588f690400SShri Abhyankar /* backward solve the upper triangular */ 42598f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 42608f690400SShri Abhyankar k = 2*n-i; 42618f690400SShri Abhyankar v = aa + 9*ai[k]; 42628f690400SShri Abhyankar vi = aj + ai[k]; 42638f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 42648f690400SShri Abhyankar idt = 3*i; 42658f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 426629b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 426729b92fc1SShri Abhyankar idx = 3*vi[m]; 42688f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 42698f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 42708f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 42718f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 42728f690400SShri Abhyankar v += 9; 42738f690400SShri Abhyankar } 427429b92fc1SShri Abhyankar idc = 3*c[i]; 42758f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 42768f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 42778f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 42788f690400SShri Abhyankar } 42798f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 42808f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 
42818f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42828f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 42838f690400SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 42848f690400SShri Abhyankar PetscFunctionReturn(0); 42858f690400SShri Abhyankar } 4286*a2d6a19aSShri Abhyankar #endif 42878f690400SShri Abhyankar 42880c4413a7SShri Abhyankar #undef __FUNCT__ 4289*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 4290*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 42910c4413a7SShri Abhyankar { 42920c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 42930c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 42940c4413a7SShri Abhyankar PetscErrorCode ierr; 42950c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 42960c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 42970c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 42980c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 42990c4413a7SShri Abhyankar const PetscScalar *b; 43000c4413a7SShri Abhyankar 43010c4413a7SShri Abhyankar PetscFunctionBegin; 43020c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43030c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 43040c4413a7SShri Abhyankar t = a->solve_work; 43050c4413a7SShri Abhyankar 43060c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 43070c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 43080c4413a7SShri Abhyankar 43090c4413a7SShri Abhyankar /* forward solve the lower triangular */ 43100c4413a7SShri Abhyankar idx = 3*r[0]; 43110c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 43120c4413a7SShri Abhyankar for (i=1; i<n; i++) { 43130c4413a7SShri Abhyankar v = aa + 9*ai[i]; 43140c4413a7SShri Abhyankar vi = aj + 
ai[i]; 43150c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 43160c4413a7SShri Abhyankar idx = 3*r[i]; 43170c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 43180c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 43190c4413a7SShri Abhyankar idx = 3*vi[m]; 43200c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 43210c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 43220c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 43230c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 43240c4413a7SShri Abhyankar v += 9; 43250c4413a7SShri Abhyankar } 43260c4413a7SShri Abhyankar idx = 3*i; 43270c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 43280c4413a7SShri Abhyankar } 43290c4413a7SShri Abhyankar /* backward solve the upper triangular */ 43300c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 43310c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 43320c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 43330c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 43340c4413a7SShri Abhyankar idt = 3*i; 43350c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 43360c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 43370c4413a7SShri Abhyankar idx = 3*vi[m]; 43380c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 43390c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 43400c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 43410c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 43420c4413a7SShri Abhyankar v += 9; 43430c4413a7SShri Abhyankar } 43440c4413a7SShri Abhyankar idc = 3*c[i]; 43450c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 43460c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 43470c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 43480c4413a7SShri Abhyankar } 43490c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 43500c4413a7SShri Abhyankar ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 43510c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43520c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 43530c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 43540c4413a7SShri Abhyankar PetscFunctionReturn(0); 43550c4413a7SShri Abhyankar } 43560c4413a7SShri Abhyankar 435715091d37SBarry Smith /* 435815091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 435915091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 436015091d37SBarry Smith */ 43614a2ae208SSatish Balay #undef __FUNCT__ 43624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4363dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 436415091d37SBarry Smith { 436515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4366690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4367dfbe8321SBarry Smith PetscErrorCode ierr; 4368690b6cddSBarry Smith PetscInt *diag = a->diag; 4369d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4370d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4371d9fead3dSBarry Smith const PetscScalar *b; 4372690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 437315091d37SBarry Smith 437415091d37SBarry Smith PetscFunctionBegin; 4375d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43761ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 437715091d37SBarry Smith 437815091d37SBarry Smith /* forward solve the lower triangular */ 437915091d37SBarry Smith idx = 0; 438015091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 438115091d37SBarry Smith for (i=1; i<n; i++) { 438215091d37SBarry Smith v = aa + 9*ai[i]; 438315091d37SBarry Smith vi = aj + ai[i]; 438415091d37SBarry Smith nz = diag[i] - ai[i]; 438515091d37SBarry Smith idx += 3; 4386f1af5d2fSBarry Smith s1 = 
b[idx];s2 = b[1+idx];s3 = b[2+idx]; 438715091d37SBarry Smith while (nz--) { 438815091d37SBarry Smith jdx = 3*(*vi++); 438915091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4390f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4391f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4392f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 439315091d37SBarry Smith v += 9; 439415091d37SBarry Smith } 4395f1af5d2fSBarry Smith x[idx] = s1; 4396f1af5d2fSBarry Smith x[1+idx] = s2; 4397f1af5d2fSBarry Smith x[2+idx] = s3; 439815091d37SBarry Smith } 439915091d37SBarry Smith /* backward solve the upper triangular */ 440015091d37SBarry Smith for (i=n-1; i>=0; i--){ 440115091d37SBarry Smith v = aa + 9*diag[i] + 9; 440215091d37SBarry Smith vi = aj + diag[i] + 1; 440315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 440415091d37SBarry Smith idt = 3*i; 4405f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4406f1af5d2fSBarry Smith s3 = x[2+idt]; 440715091d37SBarry Smith while (nz--) { 440815091d37SBarry Smith idx = 3*(*vi++); 440915091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4410f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4411f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4412f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 441315091d37SBarry Smith v += 9; 441415091d37SBarry Smith } 441515091d37SBarry Smith v = aa + 9*diag[i]; 4416f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4417f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4418f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 441915091d37SBarry Smith } 442015091d37SBarry Smith 4421d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 44221ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4423dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 442415091d37SBarry Smith PetscFunctionReturn(0); 442515091d37SBarry Smith } 442615091d37SBarry Smith 
4427*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 44284a2ae208SSatish Balay #undef __FUNCT__ 4429cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4430cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4431cee9d6f2SShri Abhyankar { 4432cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4433ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4434cee9d6f2SShri Abhyankar PetscErrorCode ierr; 4435cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 4436cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4437cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 4438cee9d6f2SShri Abhyankar PetscScalar *x; 4439cee9d6f2SShri Abhyankar const PetscScalar *b; 4440cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4441cee9d6f2SShri Abhyankar 4442cee9d6f2SShri Abhyankar PetscFunctionBegin; 4443cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4444cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4445cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 4446cee9d6f2SShri Abhyankar idx = 0; 4447cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4448cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 4449cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 4450cee9d6f2SShri Abhyankar vi = aj + ai[i]; 4451cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 4452cee9d6f2SShri Abhyankar idx = bs*i; 4453cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4454ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4455ce3d78c0SShri Abhyankar jdx = bs*vi[k]; 4456cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4457cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4458cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4459cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4460cee9d6f2SShri Abhyankar 4461cee9d6f2SShri Abhyankar v += 
bs2; 4462cee9d6f2SShri Abhyankar } 4463cee9d6f2SShri Abhyankar 4464cee9d6f2SShri Abhyankar x[idx] = s1; 4465cee9d6f2SShri Abhyankar x[1+idx] = s2; 4466cee9d6f2SShri Abhyankar x[2+idx] = s3; 4467cee9d6f2SShri Abhyankar } 4468cee9d6f2SShri Abhyankar 4469cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 4470cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 4471cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 4472cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 4473cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 4474cee9d6f2SShri Abhyankar idt = bs*i; 4475cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4476cee9d6f2SShri Abhyankar 4477ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4478ce3d78c0SShri Abhyankar idx = bs*vi[k]; 4479cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4480cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4481cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4482cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4483cee9d6f2SShri Abhyankar 4484cee9d6f2SShri Abhyankar v += bs2; 4485cee9d6f2SShri Abhyankar } 4486cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 4487cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4488cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4489cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4490cee9d6f2SShri Abhyankar 4491cee9d6f2SShri Abhyankar } 4492cee9d6f2SShri Abhyankar 4493cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4494cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4495cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4496cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 4497cee9d6f2SShri Abhyankar } 4498*a2d6a19aSShri Abhyankar #endif 4499cee9d6f2SShri Abhyankar 4500cee9d6f2SShri Abhyankar #undef __FUNCT__ 4501*a2d6a19aSShri Abhyankar #define __FUNCT__ 
"MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4502*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4503b2b2dd24SShri Abhyankar { 4504b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4505b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4506b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4507b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 4508b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4509b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4510b2b2dd24SShri Abhyankar PetscScalar *x; 4511b2b2dd24SShri Abhyankar const PetscScalar *b; 4512b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4513b2b2dd24SShri Abhyankar 4514b2b2dd24SShri Abhyankar PetscFunctionBegin; 4515b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4516b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4517b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4518b2b2dd24SShri Abhyankar idx = 0; 4519b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4520b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4521b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4522b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4523b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4524b2b2dd24SShri Abhyankar idx = bs*i; 4525b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4526b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4527b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4528b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4529b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4530b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4531b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4532b2b2dd24SShri Abhyankar 4533b2b2dd24SShri Abhyankar v += bs2; 4534b2b2dd24SShri Abhyankar } 4535b2b2dd24SShri Abhyankar 4536b2b2dd24SShri Abhyankar x[idx] = s1; 4537b2b2dd24SShri Abhyankar x[1+idx] = 
s2; 4538b2b2dd24SShri Abhyankar x[2+idx] = s3; 4539b2b2dd24SShri Abhyankar } 4540b2b2dd24SShri Abhyankar 4541b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4542b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4543b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4544b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4545b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4546b2b2dd24SShri Abhyankar idt = bs*i; 4547b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4548b2b2dd24SShri Abhyankar 4549b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4550b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4551b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4552b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4553b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4554b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4555b2b2dd24SShri Abhyankar 4556b2b2dd24SShri Abhyankar v += bs2; 4557b2b2dd24SShri Abhyankar } 4558b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4559b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4560b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4561b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4562b2b2dd24SShri Abhyankar 4563b2b2dd24SShri Abhyankar } 4564b2b2dd24SShri Abhyankar 4565b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4566b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4567b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4568b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4569b2b2dd24SShri Abhyankar } 4570b2b2dd24SShri Abhyankar 4571b2b2dd24SShri Abhyankar #undef __FUNCT__ 45724a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4573dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 45744e2b4712SSatish Balay { 45754e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 45764e2b4712SSatish Balay IS 
iscol=a->col,isrow=a->row; 45776849ba73SBarry Smith PetscErrorCode ierr; 45785d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 45795d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4580d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4581d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 4582d9fead3dSBarry Smith const PetscScalar *b; 45834e2b4712SSatish Balay 45844e2b4712SSatish Balay PetscFunctionBegin; 4585d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 45861ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4587f1af5d2fSBarry Smith t = a->solve_work; 45884e2b4712SSatish Balay 45894e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 45904e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 45914e2b4712SSatish Balay 45924e2b4712SSatish Balay /* forward solve the lower triangular */ 45934e2b4712SSatish Balay idx = 2*(*r++); 4594f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 45954e2b4712SSatish Balay for (i=1; i<n; i++) { 45964e2b4712SSatish Balay v = aa + 4*ai[i]; 45974e2b4712SSatish Balay vi = aj + ai[i]; 45984e2b4712SSatish Balay nz = diag[i] - ai[i]; 45994e2b4712SSatish Balay idx = 2*(*r++); 4600f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 46014e2b4712SSatish Balay while (nz--) { 46024e2b4712SSatish Balay idx = 2*(*vi++); 4603f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4604f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4605f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 46064e2b4712SSatish Balay v += 4; 46074e2b4712SSatish Balay } 46084e2b4712SSatish Balay idx = 2*i; 4609f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 46104e2b4712SSatish Balay } 46114e2b4712SSatish Balay /* backward solve the upper triangular */ 46124e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 46134e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 46144e2b4712SSatish Balay vi = aj + diag[i] + 1; 46154e2b4712SSatish Balay nz = ai[i+1] - 
diag[i] - 1; 46164e2b4712SSatish Balay idt = 2*i; 4617f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 46184e2b4712SSatish Balay while (nz--) { 46194e2b4712SSatish Balay idx = 2*(*vi++); 4620f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4621f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4622f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 46234e2b4712SSatish Balay v += 4; 46244e2b4712SSatish Balay } 46254e2b4712SSatish Balay idc = 2*(*c--); 46264e2b4712SSatish Balay v = aa + 4*diag[i]; 4627f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4628f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 46294e2b4712SSatish Balay } 46304e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 46314e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4632d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4634dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 46354e2b4712SSatish Balay PetscFunctionReturn(0); 46364e2b4712SSatish Balay } 46374e2b4712SSatish Balay 4638*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 46398f690400SShri Abhyankar #undef __FUNCT__ 46408f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 46418f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 46428f690400SShri Abhyankar { 46438f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 46448f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 46458f690400SShri Abhyankar PetscErrorCode ierr; 464629b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 46478f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 46488f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 46498f690400SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 46508f690400SShri Abhyankar const PetscScalar *b; 46518f690400SShri Abhyankar 
46528f690400SShri Abhyankar PetscFunctionBegin; 46538f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 46548f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 46558f690400SShri Abhyankar t = a->solve_work; 46568f690400SShri Abhyankar 46578f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 465829b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 46598f690400SShri Abhyankar 46608f690400SShri Abhyankar /* forward solve the lower triangular */ 466129b92fc1SShri Abhyankar idx = 2*r[0]; 46628f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 46638f690400SShri Abhyankar for (i=1; i<n; i++) { 46648f690400SShri Abhyankar v = aa + 4*ai[i]; 46658f690400SShri Abhyankar vi = aj + ai[i]; 46668f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 466729b92fc1SShri Abhyankar idx = 2*r[i]; 46688f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 466929b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 467029b92fc1SShri Abhyankar jdx = 2*vi[m]; 46718f690400SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 46728f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 46738f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 46748f690400SShri Abhyankar v += 4; 46758f690400SShri Abhyankar } 46768f690400SShri Abhyankar idx = 2*i; 46778f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 46788f690400SShri Abhyankar } 46798f690400SShri Abhyankar /* backward solve the upper triangular */ 46808f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 46818f690400SShri Abhyankar k = 2*n-i; 46828f690400SShri Abhyankar v = aa + 4*ai[k]; 46838f690400SShri Abhyankar vi = aj + ai[k]; 46848f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 46858f690400SShri Abhyankar idt = 2*i; 46868f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 468729b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 468829b92fc1SShri Abhyankar idx = 2*vi[m]; 46898f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 46908f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 46918f690400SShri 
Abhyankar s2 -= v[1]*x1 + v[3]*x2; 46928f690400SShri Abhyankar v += 4; 46938f690400SShri Abhyankar } 469429b92fc1SShri Abhyankar idc = 2*c[i]; 46958f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 46968f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 46978f690400SShri Abhyankar } 46988f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 46998f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 47008f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47018f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 47028f690400SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 47038f690400SShri Abhyankar PetscFunctionReturn(0); 47048f690400SShri Abhyankar } 4705*a2d6a19aSShri Abhyankar #endif 47068f690400SShri Abhyankar 47070c4413a7SShri Abhyankar #undef __FUNCT__ 4708*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 4709*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 47100c4413a7SShri Abhyankar { 47110c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 47120c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 47130c4413a7SShri Abhyankar PetscErrorCode ierr; 47140c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 47150c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 47160c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 47170c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 47180c4413a7SShri Abhyankar const PetscScalar *b; 47190c4413a7SShri Abhyankar 47200c4413a7SShri Abhyankar PetscFunctionBegin; 47210c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47220c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 47230c4413a7SShri Abhyankar t = a->solve_work; 47240c4413a7SShri Abhyankar 47250c4413a7SShri Abhyankar ierr = 
ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 47260c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 47270c4413a7SShri Abhyankar 47280c4413a7SShri Abhyankar /* forward solve the lower triangular */ 47290c4413a7SShri Abhyankar idx = 2*r[0]; 47300c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 47310c4413a7SShri Abhyankar for (i=1; i<n; i++) { 47320c4413a7SShri Abhyankar v = aa + 4*ai[i]; 47330c4413a7SShri Abhyankar vi = aj + ai[i]; 47340c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 47350c4413a7SShri Abhyankar idx = 2*r[i]; 47360c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 47370c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 47380c4413a7SShri Abhyankar jdx = 2*vi[m]; 47390c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 47400c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 47410c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 47420c4413a7SShri Abhyankar v += 4; 47430c4413a7SShri Abhyankar } 47440c4413a7SShri Abhyankar idx = 2*i; 47450c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 47460c4413a7SShri Abhyankar } 47470c4413a7SShri Abhyankar /* backward solve the upper triangular */ 47480c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 47490c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 47500c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 47510c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 47520c4413a7SShri Abhyankar idt = 2*i; 47530c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 47540c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 47550c4413a7SShri Abhyankar idx = 2*vi[m]; 47560c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 47570c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 47580c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 47590c4413a7SShri Abhyankar v += 4; 47600c4413a7SShri Abhyankar } 47610c4413a7SShri Abhyankar idc = 2*c[i]; 47620c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 47630c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 47640c4413a7SShri Abhyankar } 47650c4413a7SShri 
Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 47660c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 47670c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47680c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 47690c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 47700c4413a7SShri Abhyankar PetscFunctionReturn(0); 47710c4413a7SShri Abhyankar } 47728f690400SShri Abhyankar 477315091d37SBarry Smith /* 477415091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 477515091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 477615091d37SBarry Smith */ 47774a2ae208SSatish Balay #undef __FUNCT__ 47784a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4779dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 478015091d37SBarry Smith { 478115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4782690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4783dfbe8321SBarry Smith PetscErrorCode ierr; 4784690b6cddSBarry Smith PetscInt *diag = a->diag; 4785d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4786d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 4787d9fead3dSBarry Smith const PetscScalar *b; 4788690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 478915091d37SBarry Smith 479015091d37SBarry Smith PetscFunctionBegin; 4791d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 47921ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 479315091d37SBarry Smith 479415091d37SBarry Smith /* forward solve the lower triangular */ 479515091d37SBarry Smith idx = 0; 479615091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 479715091d37SBarry Smith for (i=1; i<n; i++) { 479815091d37SBarry Smith v = aa + 4*ai[i]; 479915091d37SBarry Smith vi = aj + ai[i]; 480015091d37SBarry Smith nz = diag[i] - 
ai[i]; 480115091d37SBarry Smith idx += 2; 4802f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 480315091d37SBarry Smith while (nz--) { 480415091d37SBarry Smith jdx = 2*(*vi++); 480515091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 4806f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4807f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 480815091d37SBarry Smith v += 4; 480915091d37SBarry Smith } 4810f1af5d2fSBarry Smith x[idx] = s1; 4811f1af5d2fSBarry Smith x[1+idx] = s2; 481215091d37SBarry Smith } 481315091d37SBarry Smith /* backward solve the upper triangular */ 481415091d37SBarry Smith for (i=n-1; i>=0; i--){ 481515091d37SBarry Smith v = aa + 4*diag[i] + 4; 481615091d37SBarry Smith vi = aj + diag[i] + 1; 481715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 481815091d37SBarry Smith idt = 2*i; 4819f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 482015091d37SBarry Smith while (nz--) { 482115091d37SBarry Smith idx = 2*(*vi++); 482215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 4823f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4824f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 482515091d37SBarry Smith v += 4; 482615091d37SBarry Smith } 482715091d37SBarry Smith v = aa + 4*diag[i]; 4828f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 4829f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 483015091d37SBarry Smith } 483115091d37SBarry Smith 4832d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 48331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4834dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 483515091d37SBarry Smith PetscFunctionReturn(0); 483615091d37SBarry Smith } 483715091d37SBarry Smith 4838*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED) 48394a2ae208SSatish Balay #undef __FUNCT__ 4840cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4841cee9d6f2SShri Abhyankar PetscErrorCode 
MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4842cee9d6f2SShri Abhyankar { 4843cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4844ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4845cee9d6f2SShri Abhyankar PetscErrorCode ierr; 4846cee9d6f2SShri Abhyankar PetscInt jdx; 4847cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 4848cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 4849cee9d6f2SShri Abhyankar const PetscScalar *b; 4850cee9d6f2SShri Abhyankar 4851cee9d6f2SShri Abhyankar PetscFunctionBegin; 4852cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4853cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4854cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 4855cee9d6f2SShri Abhyankar idx = 0; 4856cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4857cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 4858cee9d6f2SShri Abhyankar v = aa + 4*ai[i]; 4859cee9d6f2SShri Abhyankar vi = aj + ai[i]; 4860cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 4861cee9d6f2SShri Abhyankar idx = 2*i; 4862cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4863ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4864ce3d78c0SShri Abhyankar jdx = 2*vi[k]; 4865cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4866cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4867cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4868cee9d6f2SShri Abhyankar v += 4; 4869cee9d6f2SShri Abhyankar } 4870cee9d6f2SShri Abhyankar x[idx] = s1; 4871cee9d6f2SShri Abhyankar x[1+idx] = s2; 4872cee9d6f2SShri Abhyankar } 4873cee9d6f2SShri Abhyankar 4874cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 4875cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 4876cee9d6f2SShri Abhyankar v = aa + 4*ai[2*n-i]; 4877cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 4878cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 4879cee9d6f2SShri Abhyankar idt = 2*i; 4880cee9d6f2SShri Abhyankar s1 = 
x[idt]; s2 = x[1+idt]; 4881ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4882ce3d78c0SShri Abhyankar idx = 2*vi[k]; 4883cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4884cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4885cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4886cee9d6f2SShri Abhyankar v += 4; 4887cee9d6f2SShri Abhyankar } 4888cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 4889cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4890cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4891cee9d6f2SShri Abhyankar } 4892cee9d6f2SShri Abhyankar 4893cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4894cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4895cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4896cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 4897cee9d6f2SShri Abhyankar } 4898*a2d6a19aSShri Abhyankar #endif 4899cee9d6f2SShri Abhyankar 4900cee9d6f2SShri Abhyankar #undef __FUNCT__ 4901*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4902*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4903b2b2dd24SShri Abhyankar { 4904b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4905b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4906b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4907b2b2dd24SShri Abhyankar PetscInt jdx; 4908b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4909b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 4910b2b2dd24SShri Abhyankar const PetscScalar *b; 4911b2b2dd24SShri Abhyankar 4912b2b2dd24SShri Abhyankar PetscFunctionBegin; 4913b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4914b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4915b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4916b2b2dd24SShri 
Abhyankar idx = 0; 4917b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4918b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4919b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 4920b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4921b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4922b2b2dd24SShri Abhyankar idx = 2*i; 4923b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4924b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4925b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 4926b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4927b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4928b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4929b2b2dd24SShri Abhyankar v += 4; 4930b2b2dd24SShri Abhyankar } 4931b2b2dd24SShri Abhyankar x[idx] = s1; 4932b2b2dd24SShri Abhyankar x[1+idx] = s2; 4933b2b2dd24SShri Abhyankar } 4934b2b2dd24SShri Abhyankar 4935b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4936b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4937b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 4938b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4939b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4940b2b2dd24SShri Abhyankar idt = 2*i; 4941b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4942b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4943b2b2dd24SShri Abhyankar idx = 2*vi[k]; 4944b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4945b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4946b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4947b2b2dd24SShri Abhyankar v += 4; 4948b2b2dd24SShri Abhyankar } 4949b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4950b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4951b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4952b2b2dd24SShri Abhyankar } 4953b2b2dd24SShri Abhyankar 4954b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4955b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4956b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 
2.0*A->cmap->n);CHKERRQ(ierr); 4957b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4958b2b2dd24SShri Abhyankar } 4959b2b2dd24SShri Abhyankar 4960b2b2dd24SShri Abhyankar #undef __FUNCT__ 49614a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4962dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 49634e2b4712SSatish Balay { 49644e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 49654e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 49666849ba73SBarry Smith PetscErrorCode ierr; 49675d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 49685d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 49693f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 497087828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 49714e2b4712SSatish Balay 49724e2b4712SSatish Balay PetscFunctionBegin; 49734e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 49744e2b4712SSatish Balay 49751ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 49761ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4977f1af5d2fSBarry Smith t = a->solve_work; 49784e2b4712SSatish Balay 49794e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 49804e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 49814e2b4712SSatish Balay 49824e2b4712SSatish Balay /* forward solve the lower triangular */ 4983f1af5d2fSBarry Smith t[0] = b[*r++]; 49844e2b4712SSatish Balay for (i=1; i<n; i++) { 49854e2b4712SSatish Balay v = aa + ai[i]; 49864e2b4712SSatish Balay vi = aj + ai[i]; 49874e2b4712SSatish Balay nz = diag[i] - ai[i]; 4988f1af5d2fSBarry Smith s1 = b[*r++]; 49894e2b4712SSatish Balay while (nz--) { 4990f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 49914e2b4712SSatish Balay } 4992f1af5d2fSBarry Smith t[i] = s1; 49934e2b4712SSatish Balay } 49944e2b4712SSatish Balay /* backward solve the upper triangular */ 49954e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 49964e2b4712SSatish Balay v = aa + diag[i] + 
1; 49974e2b4712SSatish Balay vi = aj + diag[i] + 1; 49984e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4999f1af5d2fSBarry Smith s1 = t[i]; 50004e2b4712SSatish Balay while (nz--) { 5001f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 50024e2b4712SSatish Balay } 5003f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 50044e2b4712SSatish Balay } 50054e2b4712SSatish Balay 50064e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 50074e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 50081ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50091ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5010dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 50114e2b4712SSatish Balay PetscFunctionReturn(0); 50124e2b4712SSatish Balay } 501315091d37SBarry Smith /* 501415091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 501515091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
501615091d37SBarry Smith */ 50174a2ae208SSatish Balay #undef __FUNCT__ 50184a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5019dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 502015091d37SBarry Smith { 502115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5022690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 5023dfbe8321SBarry Smith PetscErrorCode ierr; 5024690b6cddSBarry Smith PetscInt *diag = a->diag; 502515091d37SBarry Smith MatScalar *aa=a->a; 502687828ca2SBarry Smith PetscScalar *x,*b; 502787828ca2SBarry Smith PetscScalar s1,x1; 502815091d37SBarry Smith MatScalar *v; 5029690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 503015091d37SBarry Smith 503115091d37SBarry Smith PetscFunctionBegin; 50321ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 50331ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 503415091d37SBarry Smith 503515091d37SBarry Smith /* forward solve the lower triangular */ 503615091d37SBarry Smith idx = 0; 503715091d37SBarry Smith x[0] = b[0]; 503815091d37SBarry Smith for (i=1; i<n; i++) { 503915091d37SBarry Smith v = aa + ai[i]; 504015091d37SBarry Smith vi = aj + ai[i]; 504115091d37SBarry Smith nz = diag[i] - ai[i]; 504215091d37SBarry Smith idx += 1; 5043f1af5d2fSBarry Smith s1 = b[idx]; 504415091d37SBarry Smith while (nz--) { 504515091d37SBarry Smith jdx = *vi++; 504615091d37SBarry Smith x1 = x[jdx]; 5047f1af5d2fSBarry Smith s1 -= v[0]*x1; 504815091d37SBarry Smith v += 1; 504915091d37SBarry Smith } 5050f1af5d2fSBarry Smith x[idx] = s1; 505115091d37SBarry Smith } 505215091d37SBarry Smith /* backward solve the upper triangular */ 505315091d37SBarry Smith for (i=n-1; i>=0; i--){ 505415091d37SBarry Smith v = aa + diag[i] + 1; 505515091d37SBarry Smith vi = aj + diag[i] + 1; 505615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 505715091d37SBarry Smith idt = i; 5058f1af5d2fSBarry Smith s1 = x[idt]; 505915091d37SBarry Smith while (nz--) { 
506015091d37SBarry Smith idx = *vi++; 506115091d37SBarry Smith x1 = x[idx]; 5062f1af5d2fSBarry Smith s1 -= v[0]*x1; 506315091d37SBarry Smith v += 1; 506415091d37SBarry Smith } 506515091d37SBarry Smith v = aa + diag[i]; 5066f1af5d2fSBarry Smith x[idt] = v[0]*s1; 506715091d37SBarry Smith } 50681ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50691ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5070dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 507115091d37SBarry Smith PetscFunctionReturn(0); 507215091d37SBarry Smith } 50734e2b4712SSatish Balay 50744e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 507516a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 50766bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 5077ae3d28f0SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth); 50786bce7ff8SHong Zhang 50796bce7ff8SHong Zhang #undef __FUNCT__ 50806bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 50816bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 50826bce7ff8SHong Zhang { 50836bce7ff8SHong Zhang Mat C=B; 50846bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 50856bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 50866bce7ff8SHong Zhang PetscErrorCode ierr; 50876bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 50886bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 50896bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5090b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5091914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5092914a18a2SHong Zhang MatScalar *v_work; 5093ae3d28f0SHong Zhang PetscTruth 
col_identity,row_identity,both_identity; 50946bce7ff8SHong Zhang 50956bce7ff8SHong Zhang PetscFunctionBegin; 50966bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 50976bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5098ae3d28f0SHong Zhang 5099fca92195SBarry Smith ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5100fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 51016bce7ff8SHong Zhang ics = ic; 51026bce7ff8SHong Zhang 5103914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 5104fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5105914a18a2SHong Zhang 51066bce7ff8SHong Zhang for (i=0; i<n; i++){ 51076bce7ff8SHong Zhang /* zero rtmp */ 51086bce7ff8SHong Zhang /* L part */ 51096bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 51106bce7ff8SHong Zhang bjtmp = bj + bi[i]; 5111914a18a2SHong Zhang for (j=0; j<nz; j++){ 5112914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5113914a18a2SHong Zhang } 51146bce7ff8SHong Zhang 51156bce7ff8SHong Zhang /* U part */ 51161a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 51171a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 51181a83e813SShri Abhyankar for (j=0; j<nz; j++){ 51191a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 51201a83e813SShri Abhyankar } 51211a83e813SShri Abhyankar 51221a83e813SShri Abhyankar /* load in initial (unfactored row) */ 51231a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 51241a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 51251a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 51261a83e813SShri Abhyankar for (j=0; j<nz; j++) { 51271a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 51281a83e813SShri Abhyankar } 51291a83e813SShri Abhyankar 51301a83e813SShri Abhyankar /* 
elimination */ 51311a83e813SShri Abhyankar bjtmp = bj + bi[i]; 51321a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 51331a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 51341a83e813SShri Abhyankar row = bjtmp[k]; 51351a83e813SShri Abhyankar pc = rtmp + bs2*row; 51361a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 51371a83e813SShri Abhyankar if (flg) { 51381a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 51391a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 51401a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 51411a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 51421a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 51431a83e813SShri Abhyankar for (j=0; j<nz; j++) { 51441a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 51451a83e813SShri Abhyankar } 51461a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 51471a83e813SShri Abhyankar } 51481a83e813SShri Abhyankar } 51491a83e813SShri Abhyankar 51501a83e813SShri Abhyankar /* finished row so stick it into b->a */ 51511a83e813SShri Abhyankar /* L part */ 51521a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 51531a83e813SShri Abhyankar pj = b->j + bi[i] ; 51541a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 51551a83e813SShri Abhyankar for (j=0; j<nz; j++) { 51561a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 51571a83e813SShri Abhyankar } 51581a83e813SShri Abhyankar 51591a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 51601a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 51611a83e813SShri Abhyankar pj = b->j + bdiag[i]; 51621a83e813SShri Abhyankar /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 51631a83e813SShri Abhyankar ierr = 
PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 51641a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 51651a83e813SShri Abhyankar 51661a83e813SShri Abhyankar /* U part */ 51671a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 51681a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 51691a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 51701a83e813SShri Abhyankar for (j=0; j<nz; j++){ 51711a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 51721a83e813SShri Abhyankar } 51731a83e813SShri Abhyankar } 51741a83e813SShri Abhyankar 51751a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 5176fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 51771a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 51781a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 51791a83e813SShri Abhyankar 5180ae3d28f0SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5181ae3d28f0SHong Zhang ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5182ae3d28f0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 5183ae3d28f0SHong Zhang if (both_identity){ 5184*a2d6a19aSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 5185ae3d28f0SHong Zhang } else { 5186*a2d6a19aSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 5187ae3d28f0SHong Zhang } 5188ae3d28f0SHong Zhang 51891a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 51901a83e813SShri Abhyankar ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 51911a83e813SShri Abhyankar PetscFunctionReturn(0); 51921a83e813SShri Abhyankar } 51931a83e813SShri Abhyankar 51946bce7ff8SHong Zhang /* 51956bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 
519616a2bf60SHong Zhang See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 519716a2bf60SHong Zhang because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 51986bce7ff8SHong Zhang */ 5199c0c7eb62SShri Abhyankar 52006bce7ff8SHong Zhang #undef __FUNCT__ 52016bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 52026bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 52036bce7ff8SHong Zhang { 52046bce7ff8SHong Zhang 52056bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 52066bce7ff8SHong Zhang PetscErrorCode ierr; 520716a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 520835aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 520935aa4fcfSShri Abhyankar 521035aa4fcfSShri Abhyankar PetscFunctionBegin; 521135aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 521235aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 521335aa4fcfSShri Abhyankar 521435aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 521535aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 521635aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 521735aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 521835aa4fcfSShri Abhyankar if (!b->diag){ 521935aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 522035aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 522135aa4fcfSShri Abhyankar } 522235aa4fcfSShri Abhyankar bdiag = b->diag; 522335aa4fcfSShri Abhyankar 522435aa4fcfSShri Abhyankar if (n > 0) { 522535aa4fcfSShri Abhyankar ierr = 
PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 522635aa4fcfSShri Abhyankar } 522735aa4fcfSShri Abhyankar 522835aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 522935aa4fcfSShri Abhyankar bi = b->i; 523035aa4fcfSShri Abhyankar bj = b->j; 523135aa4fcfSShri Abhyankar 523235aa4fcfSShri Abhyankar /* L part */ 523335aa4fcfSShri Abhyankar bi[0] = 0; 523435aa4fcfSShri Abhyankar for (i=0; i<n; i++){ 523535aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 523635aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 523735aa4fcfSShri Abhyankar aj = a->j + ai[i]; 523835aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 523935aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 524035aa4fcfSShri Abhyankar } 524135aa4fcfSShri Abhyankar } 524235aa4fcfSShri Abhyankar 524335aa4fcfSShri Abhyankar /* U part */ 524435aa4fcfSShri Abhyankar bi_temp = bi[n]; 524535aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 524635aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 524735aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 524835aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 524935aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 525035aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 525135aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 525235aa4fcfSShri Abhyankar } 525335aa4fcfSShri Abhyankar /* diag[i] */ 525435aa4fcfSShri Abhyankar *bj = i; bj++; 525535aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 525635aa4fcfSShri Abhyankar } 525735aa4fcfSShri Abhyankar PetscFunctionReturn(0); 525835aa4fcfSShri Abhyankar } 525935aa4fcfSShri Abhyankar 526035aa4fcfSShri Abhyankar #undef __FUNCT__ 526116a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 526216a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 526316a2bf60SHong Zhang { 526416a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 526516a2bf60SHong Zhang IS isicol; 526616a2bf60SHong Zhang PetscErrorCode ierr; 526716a2bf60SHong Zhang const 
PetscInt *r,*ic; 52687fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 526916a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 527016a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 527116a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 52727fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 527316a2bf60SHong Zhang PetscReal f; 527416a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 527516a2bf60SHong Zhang PetscBT lnkbt; 527616a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 527716a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 527816a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 527916a2bf60SHong Zhang PetscTruth missing; 52807fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 528116a2bf60SHong Zhang 528216a2bf60SHong Zhang PetscFunctionBegin; 528316a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 528416a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 528516a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 528616a2bf60SHong Zhang 528716a2bf60SHong Zhang f = info->fill; 528816a2bf60SHong Zhang levels = (PetscInt)info->levels; 528916a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 529016a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 529116a2bf60SHong Zhang 529216a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 529316a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 52947fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 529516a2bf60SHong Zhang 52967fa3a6a0SHong Zhang if (!levels && both_identity) { 529716a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 529816a2bf60SHong Zhang ierr = 
MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5299ae3d28f0SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 530035aa4fcfSShri Abhyankar 530135aa4fcfSShri Abhyankar fact->factor = MAT_FACTOR_ILU; 530235aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 530335aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 530435aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 530535aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 530635aa4fcfSShri Abhyankar b->row = isrow; 530735aa4fcfSShri Abhyankar b->col = iscol; 530835aa4fcfSShri Abhyankar b->icol = isicol; 530935aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 531035aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 531135aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 531235aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 531335aa4fcfSShri Abhyankar PetscFunctionReturn(0); 531435aa4fcfSShri Abhyankar } 531535aa4fcfSShri Abhyankar 531635aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 531735aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 531835aa4fcfSShri Abhyankar 531935aa4fcfSShri Abhyankar /* get new row pointers */ 532035aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 532135aa4fcfSShri Abhyankar bi[0] = 0; 532235aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 532335aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 532435aa4fcfSShri Abhyankar bdiag[0] = 0; 532535aa4fcfSShri Abhyankar 5326fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 532735aa4fcfSShri Abhyankar 532835aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active 
row */ 532935aa4fcfSShri Abhyankar nlnk = n + 1; 533035aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 533135aa4fcfSShri Abhyankar 533235aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 533335aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 533435aa4fcfSShri Abhyankar current_space = free_space; 533535aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 533635aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 533735aa4fcfSShri Abhyankar 533835aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 533935aa4fcfSShri Abhyankar nzi = 0; 534035aa4fcfSShri Abhyankar /* copy current row into linked list */ 534135aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 534235aa4fcfSShri Abhyankar if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 534335aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 534435aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 534535aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 534635aa4fcfSShri Abhyankar nzi += nlnk; 534735aa4fcfSShri Abhyankar 534835aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 534935aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 535035aa4fcfSShri Abhyankar fm = n; 535135aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 535235aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 535335aa4fcfSShri Abhyankar lnk[fm] = i; 535435aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 535535aa4fcfSShri Abhyankar nzi++; dcount++; 535635aa4fcfSShri Abhyankar } 535735aa4fcfSShri Abhyankar 535835aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 535935aa4fcfSShri Abhyankar nzbd = 0; 536035aa4fcfSShri Abhyankar prow = lnk[n]; 536135aa4fcfSShri Abhyankar while (prow < i) { 
536235aa4fcfSShri Abhyankar nnz = bdiag[prow]; 536335aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 536435aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 536535aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 536635aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 536735aa4fcfSShri Abhyankar nzi += nlnk; 536835aa4fcfSShri Abhyankar prow = lnk[prow]; 536935aa4fcfSShri Abhyankar nzbd++; 537035aa4fcfSShri Abhyankar } 537135aa4fcfSShri Abhyankar bdiag[i] = nzbd; 537235aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 537335aa4fcfSShri Abhyankar 537435aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 537535aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 537635aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 537735aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 537835aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 537935aa4fcfSShri Abhyankar reallocs++; 538035aa4fcfSShri Abhyankar } 538135aa4fcfSShri Abhyankar 538235aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 538335aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 538435aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 538535aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 538635aa4fcfSShri Abhyankar 538735aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 538835aa4fcfSShri Abhyankar if (*(bj_ptr[i]+bdiag[i]) != i) { 538935aa4fcfSShri Abhyankar SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 539035aa4fcfSShri Abhyankar try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 539135aa4fcfSShri Abhyankar } 539235aa4fcfSShri Abhyankar 
539335aa4fcfSShri Abhyankar current_space->array += nzi; 539435aa4fcfSShri Abhyankar current_space->local_used += nzi; 539535aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 539635aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 539735aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 539835aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 539935aa4fcfSShri Abhyankar } 540035aa4fcfSShri Abhyankar 540135aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 540235aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 540335aa4fcfSShri Abhyankar 540435aa4fcfSShri Abhyankar /* destroy list of free space and other temporary arrays */ 540535aa4fcfSShri Abhyankar ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 540635aa4fcfSShri Abhyankar 540735aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 540835aa4fcfSShri Abhyankar ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 540935aa4fcfSShri Abhyankar 541035aa4fcfSShri Abhyankar ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 541135aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5412fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 541335aa4fcfSShri Abhyankar 541435aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 541535aa4fcfSShri Abhyankar { 541635aa4fcfSShri Abhyankar PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 541735aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 541835aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 541935aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 542035aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 542135aa4fcfSShri Abhyankar if 
(diagonal_fill) { 542235aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 542335aa4fcfSShri Abhyankar } 542435aa4fcfSShri Abhyankar } 542535aa4fcfSShri Abhyankar #endif 542635aa4fcfSShri Abhyankar 542735aa4fcfSShri Abhyankar /* put together the new matrix */ 542835aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 542935aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 543035aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 543135aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 543235aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 543335aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 543435aa4fcfSShri Abhyankar ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 543535aa4fcfSShri Abhyankar b->j = bj; 543635aa4fcfSShri Abhyankar b->i = bi; 543735aa4fcfSShri Abhyankar b->diag = bdiag; 543835aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 543935aa4fcfSShri Abhyankar b->ilen = 0; 544035aa4fcfSShri Abhyankar b->imax = 0; 544135aa4fcfSShri Abhyankar b->row = isrow; 544235aa4fcfSShri Abhyankar b->col = iscol; 544335aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 544435aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 544535aa4fcfSShri Abhyankar b->icol = isicol; 544635aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 544735aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 
544835aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 544935aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 545035aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 5451ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 5452ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 5453ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5454ae3d28f0SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 545535aa4fcfSShri Abhyankar PetscFunctionReturn(0); 545635aa4fcfSShri Abhyankar } 545735aa4fcfSShri Abhyankar 545835aa4fcfSShri Abhyankar 54594e2b4712SSatish Balay /* 54604e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 54614e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 54624e2b4712SSatish Balay Not a good example of code reuse. 
54634e2b4712SSatish Balay */ 54644a2ae208SSatish Balay #undef __FUNCT__ 54654a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 54660481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 54674e2b4712SSatish Balay { 54684e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 54694e2b4712SSatish Balay IS isicol; 54706849ba73SBarry Smith PetscErrorCode ierr; 54715d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 54725d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5473a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5474d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 547541df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 5476329f5518SBarry Smith PetscReal f; 5477c0c7eb62SShri Abhyankar PetscTruth newdatastruct = PETSC_FALSE; 54784e2b4712SSatish Balay 54794e2b4712SSatish Balay PetscFunctionBegin; 548016a2bf60SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 548116a2bf60SHong Zhang if (newdatastruct){ 548216a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 548316a2bf60SHong Zhang PetscFunctionReturn(0); 548416a2bf60SHong Zhang } 548516a2bf60SHong Zhang 54866bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 54876bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 54886bce7ff8SHong Zhang 5489435faa5fSBarry Smith f = info->fill; 5490690b6cddSBarry Smith levels = (PetscInt)info->levels; 5491690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 54924c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 549316a2bf60SHong Zhang 5494667159a5SBarry 
Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5495667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 54967d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 5497309c388cSBarry Smith 549841df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 549916a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 55006bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 55016bce7ff8SHong Zhang 5502719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 5503ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5504bb3d539aSBarry Smith b->row = isrow; 5505bb3d539aSBarry Smith b->col = iscol; 5506bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5507bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5508bb3d539aSBarry Smith b->icol = isicol; 5509bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5510b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 55116bce7ff8SHong Zhang PetscFunctionReturn(0); 55126bce7ff8SHong Zhang } 55136bce7ff8SHong Zhang 55146bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 55154e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 55164e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 55174e2b4712SSatish Balay 55184e2b4712SSatish Balay /* get new row pointers */ 5519690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 55204e2b4712SSatish Balay ainew[0] = 0; 55214e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 5522690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 5523690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 55244e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 5525690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 55264e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 5527690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 55284e2b4712SSatish Balay /* im is level for each filled value */ 5529690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 55304e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 5531690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 55324e2b4712SSatish Balay dloc[0] = 0; 55334e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 5534435faa5fSBarry Smith 5535435faa5fSBarry Smith /* copy prow into linked list */ 55364e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 55373b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 55384e2b4712SSatish Balay xi = aj + ai[r[prow]]; 
55394e2b4712SSatish Balay fill[n] = n; 5540435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 55414e2b4712SSatish Balay while (nz--) { 55424e2b4712SSatish Balay fm = n; 55434e2b4712SSatish Balay idx = ic[*xi++]; 55444e2b4712SSatish Balay do { 55454e2b4712SSatish Balay m = fm; 55464e2b4712SSatish Balay fm = fill[m]; 55474e2b4712SSatish Balay } while (fm < idx); 55484e2b4712SSatish Balay fill[m] = idx; 55494e2b4712SSatish Balay fill[idx] = fm; 55504e2b4712SSatish Balay im[idx] = 0; 55514e2b4712SSatish Balay } 5552435faa5fSBarry Smith 5553435faa5fSBarry Smith /* make sure diagonal entry is included */ 5554435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 5555435faa5fSBarry Smith fm = n; 5556435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 5557435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5558435faa5fSBarry Smith fill[fm] = prow; 5559435faa5fSBarry Smith im[prow] = 0; 5560435faa5fSBarry Smith nzf++; 5561335d9088SBarry Smith dcount++; 5562435faa5fSBarry Smith } 5563435faa5fSBarry Smith 55644e2b4712SSatish Balay nzi = 0; 55654e2b4712SSatish Balay row = fill[n]; 55664e2b4712SSatish Balay while (row < prow) { 55674e2b4712SSatish Balay incrlev = im[row] + 1; 55684e2b4712SSatish Balay nz = dloc[row]; 5569435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 55704e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 55714e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 55724e2b4712SSatish Balay fm = row; 55734e2b4712SSatish Balay while (nnz-- > 0) { 55744e2b4712SSatish Balay idx = *xi++; 55754e2b4712SSatish Balay if (*flev + incrlev > levels) { 55764e2b4712SSatish Balay flev++; 55774e2b4712SSatish Balay continue; 55784e2b4712SSatish Balay } 55794e2b4712SSatish Balay do { 55804e2b4712SSatish Balay m = fm; 55814e2b4712SSatish Balay fm = fill[m]; 55824e2b4712SSatish Balay } while (fm < idx); 55834e2b4712SSatish Balay if (fm != idx) { 55844e2b4712SSatish Balay im[idx] = *flev + 
incrlev; 55854e2b4712SSatish Balay fill[m] = idx; 55864e2b4712SSatish Balay fill[idx] = fm; 55874e2b4712SSatish Balay fm = idx; 55884e2b4712SSatish Balay nzf++; 5589ecf371e4SBarry Smith } else { 55904e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 55914e2b4712SSatish Balay } 55924e2b4712SSatish Balay flev++; 55934e2b4712SSatish Balay } 55944e2b4712SSatish Balay row = fill[row]; 55954e2b4712SSatish Balay nzi++; 55964e2b4712SSatish Balay } 55974e2b4712SSatish Balay /* copy new filled row into permanent storage */ 55984e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 55994e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 5600ecf371e4SBarry Smith 5601ecf371e4SBarry Smith /* estimate how much additional space we will need */ 5602ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5603ecf371e4SBarry Smith /* just double the memory each time */ 5604690b6cddSBarry Smith PetscInt maxadd = jmax; 5605ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 56064e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 56074e2b4712SSatish Balay jmax += maxadd; 5608ecf371e4SBarry Smith 5609ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 56105d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 56115d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5612606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 56135d0c19d7SBarry Smith ajnew = xitmp; 56145d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 56155d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5616606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 56175d0c19d7SBarry Smith ajfill = xitmp; 5618eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 56194e2b4712SSatish Balay } 56205d0c19d7SBarry Smith xitmp = 
ajnew + ainew[prow]; 56214e2b4712SSatish Balay flev = ajfill + ainew[prow]; 56224e2b4712SSatish Balay dloc[prow] = nzi; 56234e2b4712SSatish Balay fm = fill[n]; 56244e2b4712SSatish Balay while (nzf--) { 56255d0c19d7SBarry Smith *xitmp++ = fm; 56264e2b4712SSatish Balay *flev++ = im[fm]; 56274e2b4712SSatish Balay fm = fill[fm]; 56284e2b4712SSatish Balay } 5629435faa5fSBarry Smith /* make sure row has diagonal entry */ 5630435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 563177431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 56322401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5633435faa5fSBarry Smith } 56344e2b4712SSatish Balay } 5635606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 56364e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 56374e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5638606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 5639606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 56404e2b4712SSatish Balay 56416cf91177SBarry Smith #if defined(PETSC_USE_INFO) 56424e2b4712SSatish Balay { 5643329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5644ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5645ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5646ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5647ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5648335d9088SBarry Smith if (diagonal_fill) { 5649ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5650335d9088SBarry Smith } 56514e2b4712SSatish Balay } 565263ba0a88SBarry Smith #endif 56534e2b4712SSatish Balay 56544e2b4712SSatish 
Balay /* put together the new matrix */ 5655719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5656719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5657ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5658e6b907acSBarry Smith b->free_a = PETSC_TRUE; 5659e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 56607c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 5661a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 56624e2b4712SSatish Balay b->j = ajnew; 56634e2b4712SSatish Balay b->i = ainew; 56644e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 56654e2b4712SSatish Balay b->diag = dloc; 56667f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 56674e2b4712SSatish Balay b->ilen = 0; 56684e2b4712SSatish Balay b->imax = 0; 56694e2b4712SSatish Balay b->row = isrow; 56704e2b4712SSatish Balay b->col = iscol; 5671bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5672c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5673c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5674e51c0b9cSSatish Balay b->icol = isicol; 567587828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 56764e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 
56774e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 5678719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 56794e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 56804e2b4712SSatish Balay 5681ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 5682ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 5683ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 56846bce7ff8SHong Zhang 568541df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 56868661488fSKris Buschelman PetscFunctionReturn(0); 56878661488fSKris Buschelman } 56888661488fSKris Buschelman 5689732ee342SKris Buschelman #undef __FUNCT__ 56907e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5691dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 56927e7071cdSKris Buschelman { 569312272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 569412272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 56955a9542e3SKris Buschelman PetscFunctionBegin; 56967cf1b8d3SKris Buschelman /* Undo Column scaling */ 56977cf1b8d3SKris Buschelman /* while (nz--) { */ 56987cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 56997cf1b8d3SKris Buschelman /* } */ 5700c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 5701c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 57027cf1b8d3SKris Buschelman PetscFunctionReturn(0); 57037cf1b8d3SKris Buschelman } 57047cf1b8d3SKris Buschelman 57057cf1b8d3SKris Buschelman #undef __FUNCT__ 57067cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5707dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 57087cf1b8d3SKris Buschelman { 57097cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5710b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 57112aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 57125a9542e3SKris Buschelman PetscFunctionBegin; 57130b9da03eSKris Buschelman /* Is this really necessary? */ 571420235379SKris Buschelman while (nz--) { 57150b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 57167e7071cdSKris Buschelman } 5717c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 57187e7071cdSKris Buschelman PetscFunctionReturn(0); 57197e7071cdSKris Buschelman } 57207e7071cdSKris Buschelman 5721732ee342SKris Buschelman 5722