1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry Smith v = aa + diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120f1af5d2fSBarry Smith { 121f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122dfbe8321SBarry Smith PetscErrorCode ierr; 123690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 125f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12687828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12787828ca2SBarry Smith PetscScalar *x,*b; 128f1af5d2fSBarry Smith 129f1af5d2fSBarry Smith PetscFunctionBegin; 130ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133f1af5d2fSBarry Smith 134f1af5d2fSBarry Smith /* forward solve the U^T */ 135f1af5d2fSBarry Smith idx = 0; 136f1af5d2fSBarry Smith for (i=0; i<n; i++) { 137f1af5d2fSBarry Smith 138f1af5d2fSBarry Smith v = aa + 9*diag[i]; 139f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 140ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144f1af5d2fSBarry Smith v += 9; 145f1af5d2fSBarry Smith 146f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 147f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 148f1af5d2fSBarry Smith while (nz--) { 149f1af5d2fSBarry Smith oidx = 3*(*vi++); 150f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153f1af5d2fSBarry Smith v += 9; 154f1af5d2fSBarry Smith } 155f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156f1af5d2fSBarry Smith idx += 3; 157f1af5d2fSBarry Smith } 158f1af5d2fSBarry Smith /* backward solve the L^T */ 159f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 160f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 161f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 162f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 163f1af5d2fSBarry Smith idt = 3*i; 164f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165f1af5d2fSBarry Smith while (nz--) { 166f1af5d2fSBarry Smith idx = 3*(*vi--); 167f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 170f1af5d2fSBarry Smith v -= 9; 171f1af5d2fSBarry Smith } 172f1af5d2fSBarry Smith } 1731ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176f1af5d2fSBarry Smith PetscFunctionReturn(0); 177f1af5d2fSBarry Smith } 178f1af5d2fSBarry Smith 1794a2ae208SSatish Balay #undef __FUNCT__ 1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182f1af5d2fSBarry Smith { 183f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184dfbe8321SBarry Smith PetscErrorCode ierr; 185690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 187f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18987828ca2SBarry Smith PetscScalar *x,*b; 190f1af5d2fSBarry Smith 191f1af5d2fSBarry Smith PetscFunctionBegin; 192ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1931ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195f1af5d2fSBarry Smith 196f1af5d2fSBarry Smith /* forward solve the U^T */ 197f1af5d2fSBarry Smith idx = 0; 198f1af5d2fSBarry Smith for (i=0; i<n; i++) { 199f1af5d2fSBarry Smith 200f1af5d2fSBarry Smith v = aa + 16*diag[i]; 201f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 202ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 207f1af5d2fSBarry Smith v += 16; 208f1af5d2fSBarry Smith 209f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 210f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 211f1af5d2fSBarry Smith while (nz--) { 212f1af5d2fSBarry Smith oidx = 4*(*vi++); 213f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217f1af5d2fSBarry Smith v += 16; 218f1af5d2fSBarry Smith } 219f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220f1af5d2fSBarry Smith idx += 4; 221f1af5d2fSBarry Smith } 222f1af5d2fSBarry Smith /* backward solve the L^T */ 223f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 224f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 225f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 226f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 227f1af5d2fSBarry Smith idt = 4*i; 228f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229f1af5d2fSBarry Smith while (nz--) { 230f1af5d2fSBarry Smith idx = 4*(*vi--); 231f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235f1af5d2fSBarry Smith v -= 16; 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith } 2381ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241f1af5d2fSBarry Smith PetscFunctionReturn(0); 242f1af5d2fSBarry Smith } 243f1af5d2fSBarry Smith 2444a2ae208SSatish Balay #undef __FUNCT__ 2454a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247f1af5d2fSBarry Smith { 248f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249dfbe8321SBarry Smith PetscErrorCode ierr; 250690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 252f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 25387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25487828ca2SBarry Smith PetscScalar *x,*b; 255f1af5d2fSBarry Smith 256f1af5d2fSBarry Smith PetscFunctionBegin; 257ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2581ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2591ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260f1af5d2fSBarry Smith 261f1af5d2fSBarry Smith /* forward solve the U^T */ 262f1af5d2fSBarry Smith idx = 0; 263f1af5d2fSBarry Smith for (i=0; i<n; i++) { 264f1af5d2fSBarry Smith 265f1af5d2fSBarry Smith v = aa + 25*diag[i]; 266f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 267ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273f1af5d2fSBarry Smith v += 25; 274f1af5d2fSBarry Smith 275f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 276f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 277f1af5d2fSBarry Smith while (nz--) { 278f1af5d2fSBarry Smith oidx = 5*(*vi++); 279f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284f1af5d2fSBarry Smith v += 25; 285f1af5d2fSBarry Smith } 286f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287f1af5d2fSBarry Smith idx += 5; 288f1af5d2fSBarry Smith } 289f1af5d2fSBarry Smith /* backward solve the L^T */ 290f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 291f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 292f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 293f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 294f1af5d2fSBarry Smith idt = 5*i; 295f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296f1af5d2fSBarry Smith while (nz--) { 297f1af5d2fSBarry Smith idx = 5*(*vi--); 298f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303f1af5d2fSBarry Smith v -= 25; 304f1af5d2fSBarry Smith } 305f1af5d2fSBarry Smith } 3061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309f1af5d2fSBarry Smith PetscFunctionReturn(0); 310f1af5d2fSBarry Smith } 311f1af5d2fSBarry Smith 3124a2ae208SSatish Balay #undef __FUNCT__ 3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315f1af5d2fSBarry Smith { 316f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317dfbe8321SBarry Smith PetscErrorCode ierr; 318690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 320f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 32187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 32287828ca2SBarry Smith PetscScalar *x,*b; 323f1af5d2fSBarry Smith 324f1af5d2fSBarry Smith PetscFunctionBegin; 325ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328f1af5d2fSBarry Smith 329f1af5d2fSBarry Smith /* forward solve the U^T */ 330f1af5d2fSBarry Smith idx = 0; 331f1af5d2fSBarry Smith for (i=0; i<n; i++) { 332f1af5d2fSBarry Smith 333f1af5d2fSBarry Smith v = aa + 36*diag[i]; 334f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 335ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336ef66eb69SBarry Smith x6 = x[5+idx]; 337f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343f1af5d2fSBarry Smith v += 36; 344f1af5d2fSBarry Smith 345f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 346f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 347f1af5d2fSBarry Smith while (nz--) { 348f1af5d2fSBarry Smith oidx = 6*(*vi++); 349f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355f1af5d2fSBarry Smith v += 36; 356f1af5d2fSBarry Smith } 357f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358f1af5d2fSBarry Smith x[5+idx] = s6; 359f1af5d2fSBarry Smith idx += 6; 360f1af5d2fSBarry Smith } 361f1af5d2fSBarry Smith /* backward solve the L^T */ 362f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 363f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 364f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 365f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 366f1af5d2fSBarry Smith idt = 6*i; 367f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368f1af5d2fSBarry Smith s6 = x[5+idt]; 369f1af5d2fSBarry Smith while (nz--) { 370f1af5d2fSBarry Smith idx = 6*(*vi--); 371f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 372f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377f1af5d2fSBarry Smith v -= 36; 378f1af5d2fSBarry Smith } 379f1af5d2fSBarry Smith } 3801ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383f1af5d2fSBarry Smith PetscFunctionReturn(0); 384f1af5d2fSBarry Smith } 385f1af5d2fSBarry Smith 3864a2ae208SSatish Balay #undef __FUNCT__ 3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389f1af5d2fSBarry Smith { 390f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391dfbe8321SBarry Smith PetscErrorCode ierr; 392690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 394f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 39587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 39687828ca2SBarry Smith PetscScalar *x,*b; 397f1af5d2fSBarry Smith 398f1af5d2fSBarry Smith PetscFunctionBegin; 399ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4001ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402f1af5d2fSBarry Smith 403f1af5d2fSBarry Smith /* forward solve the U^T */ 404f1af5d2fSBarry Smith idx = 0; 405f1af5d2fSBarry Smith for (i=0; i<n; i++) { 406f1af5d2fSBarry Smith 407f1af5d2fSBarry Smith v = aa + 49*diag[i]; 408f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 409ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 411f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 417f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418f1af5d2fSBarry Smith v += 49; 419f1af5d2fSBarry Smith 420f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 421f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 422f1af5d2fSBarry Smith while (nz--) { 423f1af5d2fSBarry Smith oidx = 7*(*vi++); 424f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431f1af5d2fSBarry Smith v += 49; 432f1af5d2fSBarry Smith } 433f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 435f1af5d2fSBarry Smith idx += 7; 436f1af5d2fSBarry Smith } 437f1af5d2fSBarry Smith /* backward solve the L^T */ 438f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 439f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 440f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 441f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 442f1af5d2fSBarry Smith idt = 7*i; 443f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 445f1af5d2fSBarry Smith while (nz--) { 446f1af5d2fSBarry Smith idx = 7*(*vi--); 447f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454f1af5d2fSBarry Smith v -= 49; 455f1af5d2fSBarry Smith } 456f1af5d2fSBarry Smith } 4571ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4581ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460f1af5d2fSBarry Smith PetscFunctionReturn(0); 461f1af5d2fSBarry Smith } 462f1af5d2fSBarry Smith 463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4644a2ae208SSatish Balay #undef __FUNCT__ 4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467f1af5d2fSBarry Smith { 468f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 4706849ba73SBarry Smith PetscErrorCode ierr; 4715d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 4725d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473690b6cddSBarry Smith PetscInt *diag = a->diag; 474f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 47587828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 476f1af5d2fSBarry Smith 477f1af5d2fSBarry Smith PetscFunctionBegin; 4781ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480f1af5d2fSBarry Smith t = a->solve_work; 481f1af5d2fSBarry Smith 482f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484f1af5d2fSBarry Smith 485f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 486f1af5d2fSBarry Smith for (i=0; i<n; i++) { 487f1af5d2fSBarry Smith t[i] = b[c[i]]; 488f1af5d2fSBarry Smith } 489f1af5d2fSBarry Smith 490f1af5d2fSBarry Smith /* forward solve the U^T */ 491f1af5d2fSBarry Smith for (i=0; i<n; i++) { 492f1af5d2fSBarry Smith 493f1af5d2fSBarry Smith v = aa + diag[i]; 494f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 495f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 496f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 497f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 498f1af5d2fSBarry Smith while (nz--) { 499f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 500f1af5d2fSBarry Smith } 501f1af5d2fSBarry Smith t[i] = s1; 502f1af5d2fSBarry Smith } 503f1af5d2fSBarry Smith /* backward solve the L^T */ 504f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 505f1af5d2fSBarry Smith v = aa + diag[i] - 1; 506f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 507f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 508f1af5d2fSBarry Smith s1 = t[i]; 509f1af5d2fSBarry Smith while (nz--) { 510f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 511f1af5d2fSBarry Smith } 512f1af5d2fSBarry Smith } 513f1af5d2fSBarry Smith 514f1af5d2fSBarry Smith /* copy t into x according to permutation */ 515f1af5d2fSBarry Smith for (i=0; i<n; i++) { 516f1af5d2fSBarry Smith x[r[i]] = t[i]; 517f1af5d2fSBarry Smith } 518f1af5d2fSBarry Smith 519f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5211ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5221ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 524f1af5d2fSBarry Smith PetscFunctionReturn(0); 525f1af5d2fSBarry Smith } 526f1af5d2fSBarry Smith 5274a2ae208SSatish Balay #undef __FUNCT__ 5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530f1af5d2fSBarry Smith { 531f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5336849ba73SBarry Smith PetscErrorCode ierr; 5345d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5355d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 537f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53887828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 53987828ca2SBarry Smith PetscScalar *x,*b,*t; 540f1af5d2fSBarry Smith 541f1af5d2fSBarry Smith PetscFunctionBegin; 5421ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5431ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544f1af5d2fSBarry Smith t = a->solve_work; 545f1af5d2fSBarry Smith 546f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548f1af5d2fSBarry Smith 549f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 550f1af5d2fSBarry Smith ii = 0; 551f1af5d2fSBarry Smith for (i=0; i<n; i++) { 552f1af5d2fSBarry Smith ic = 2*c[i]; 553f1af5d2fSBarry Smith t[ii] = b[ic]; 554f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 555f1af5d2fSBarry Smith ii += 2; 556f1af5d2fSBarry Smith } 557f1af5d2fSBarry Smith 558f1af5d2fSBarry Smith /* forward solve the U^T */ 559f1af5d2fSBarry Smith idx = 0; 560f1af5d2fSBarry Smith for (i=0; i<n; i++) { 561f1af5d2fSBarry Smith 562f1af5d2fSBarry Smith v = aa + 4*diag[i]; 563f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 564f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 565f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 566f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 567f1af5d2fSBarry Smith v += 4; 568f1af5d2fSBarry Smith 569f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 570f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 571f1af5d2fSBarry Smith while (nz--) { 572f1af5d2fSBarry Smith oidx = 2*(*vi++); 573f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 574f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 575f1af5d2fSBarry Smith v += 4; 576f1af5d2fSBarry Smith } 577f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 578f1af5d2fSBarry Smith idx += 2; 579f1af5d2fSBarry Smith } 580f1af5d2fSBarry Smith /* backward solve the L^T */ 581f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 582f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 583f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 584f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 585f1af5d2fSBarry Smith idt = 2*i; 586f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 587f1af5d2fSBarry Smith while (nz--) { 588f1af5d2fSBarry Smith idx = 2*(*vi--); 589f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 590f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 591f1af5d2fSBarry Smith v -= 4; 592f1af5d2fSBarry Smith } 593f1af5d2fSBarry Smith } 594f1af5d2fSBarry Smith 595f1af5d2fSBarry Smith /* copy t into x according to permutation */ 596f1af5d2fSBarry Smith ii = 0; 597f1af5d2fSBarry Smith for (i=0; i<n; i++) { 598f1af5d2fSBarry Smith ir = 2*r[i]; 599f1af5d2fSBarry Smith x[ir] = t[ii]; 600f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 601f1af5d2fSBarry Smith ii += 2; 602f1af5d2fSBarry Smith } 603f1af5d2fSBarry Smith 604f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609f1af5d2fSBarry Smith PetscFunctionReturn(0); 610f1af5d2fSBarry Smith } 611f1af5d2fSBarry Smith 6124a2ae208SSatish Balay #undef __FUNCT__ 6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615f1af5d2fSBarry Smith { 616f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6186849ba73SBarry Smith PetscErrorCode ierr; 6195d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 6205d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 622f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 62387828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 62487828ca2SBarry Smith PetscScalar *x,*b,*t; 625f1af5d2fSBarry Smith 626f1af5d2fSBarry Smith PetscFunctionBegin; 6271ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6281ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629f1af5d2fSBarry Smith t = a->solve_work; 630f1af5d2fSBarry Smith 631f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633f1af5d2fSBarry Smith 634f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 635f1af5d2fSBarry Smith ii = 0; 636f1af5d2fSBarry Smith for (i=0; i<n; i++) { 637f1af5d2fSBarry Smith ic = 3*c[i]; 638f1af5d2fSBarry Smith t[ii] = b[ic]; 639f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 640f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 641f1af5d2fSBarry Smith ii += 3; 642f1af5d2fSBarry Smith } 643f1af5d2fSBarry Smith 644f1af5d2fSBarry Smith /* forward solve the U^T */ 645f1af5d2fSBarry Smith idx = 0; 646f1af5d2fSBarry Smith for (i=0; i<n; i++) { 647f1af5d2fSBarry Smith 648f1af5d2fSBarry Smith v = aa + 9*diag[i]; 649f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 650f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654f1af5d2fSBarry Smith v += 9; 655f1af5d2fSBarry Smith 656f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 657f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 658f1af5d2fSBarry Smith while (nz--) { 659f1af5d2fSBarry Smith oidx = 3*(*vi++); 660f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663f1af5d2fSBarry Smith v += 9; 664f1af5d2fSBarry Smith } 665f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666f1af5d2fSBarry Smith idx += 3; 667f1af5d2fSBarry Smith } 668f1af5d2fSBarry Smith /* backward solve the L^T */ 669f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 670f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 671f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 672f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 673f1af5d2fSBarry Smith idt = 3*i; 674f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675f1af5d2fSBarry Smith while (nz--) { 676f1af5d2fSBarry Smith idx = 3*(*vi--); 677f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680f1af5d2fSBarry Smith v -= 9; 681f1af5d2fSBarry Smith } 682f1af5d2fSBarry Smith } 683f1af5d2fSBarry Smith 684f1af5d2fSBarry Smith /* copy t into x according to permutation */ 685f1af5d2fSBarry Smith ii = 0; 686f1af5d2fSBarry Smith for (i=0; i<n; i++) { 687f1af5d2fSBarry Smith ir = 3*r[i]; 688f1af5d2fSBarry Smith x[ir] = t[ii]; 689f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 690f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 691f1af5d2fSBarry Smith ii += 3; 692f1af5d2fSBarry Smith } 693f1af5d2fSBarry Smith 694f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6961ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699f1af5d2fSBarry Smith PetscFunctionReturn(0); 700f1af5d2fSBarry Smith } 701f1af5d2fSBarry Smith 7024a2ae208SSatish Balay #undef __FUNCT__ 7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705f1af5d2fSBarry Smith { 706f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7086849ba73SBarry Smith PetscErrorCode ierr; 7095d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 7105d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 712f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 71387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 71487828ca2SBarry Smith PetscScalar *x,*b,*t; 715f1af5d2fSBarry Smith 716f1af5d2fSBarry Smith PetscFunctionBegin; 7171ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719f1af5d2fSBarry Smith t = a->solve_work; 720f1af5d2fSBarry Smith 721f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723f1af5d2fSBarry Smith 724f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 725f1af5d2fSBarry Smith ii = 0; 726f1af5d2fSBarry Smith for (i=0; i<n; i++) { 727f1af5d2fSBarry Smith ic = 4*c[i]; 728f1af5d2fSBarry Smith t[ii] = b[ic]; 729f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 730f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 731f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 732f1af5d2fSBarry Smith ii += 4; 733f1af5d2fSBarry Smith } 734f1af5d2fSBarry Smith 735f1af5d2fSBarry Smith /* forward solve the U^T */ 736f1af5d2fSBarry Smith idx = 0; 737f1af5d2fSBarry Smith for (i=0; i<n; i++) { 738f1af5d2fSBarry Smith 739f1af5d2fSBarry Smith v = aa + 16*diag[i]; 740f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 741f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746f1af5d2fSBarry Smith v += 16; 747f1af5d2fSBarry Smith 748f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 749f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 750f1af5d2fSBarry Smith while (nz--) { 751f1af5d2fSBarry Smith oidx = 4*(*vi++); 752f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756f1af5d2fSBarry Smith v += 16; 757f1af5d2fSBarry Smith } 758f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759f1af5d2fSBarry Smith idx += 4; 760f1af5d2fSBarry Smith } 761f1af5d2fSBarry Smith /* backward solve the L^T */ 762f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 763f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 764f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 765f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 766f1af5d2fSBarry Smith idt = 4*i; 767f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768f1af5d2fSBarry Smith while (nz--) { 769f1af5d2fSBarry Smith idx = 4*(*vi--); 770f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774f1af5d2fSBarry Smith v -= 16; 775f1af5d2fSBarry Smith } 776f1af5d2fSBarry Smith } 777f1af5d2fSBarry Smith 778f1af5d2fSBarry Smith /* copy t into x according to permutation */ 779f1af5d2fSBarry Smith ii = 0; 780f1af5d2fSBarry Smith for (i=0; i<n; i++) { 781f1af5d2fSBarry Smith ir = 4*r[i]; 782f1af5d2fSBarry Smith x[ir] = t[ii]; 783f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 784f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 785f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 786f1af5d2fSBarry Smith ii += 4; 787f1af5d2fSBarry Smith } 788f1af5d2fSBarry Smith 789f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794f1af5d2fSBarry Smith PetscFunctionReturn(0); 795f1af5d2fSBarry Smith } 796f1af5d2fSBarry Smith 7974a2ae208SSatish Balay #undef __FUNCT__ 7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800f1af5d2fSBarry Smith { 801f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 8036849ba73SBarry Smith PetscErrorCode ierr; 8045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 807f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 80887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 80987828ca2SBarry Smith PetscScalar *x,*b,*t; 810f1af5d2fSBarry Smith 811f1af5d2fSBarry Smith PetscFunctionBegin; 8121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814f1af5d2fSBarry Smith t = a->solve_work; 815f1af5d2fSBarry Smith 816f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818f1af5d2fSBarry Smith 819f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 820f1af5d2fSBarry Smith ii = 0; 821f1af5d2fSBarry Smith for (i=0; i<n; i++) { 822f1af5d2fSBarry Smith ic = 5*c[i]; 823f1af5d2fSBarry Smith t[ii] = b[ic]; 824f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 825f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 826f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 827f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 828f1af5d2fSBarry Smith ii += 5; 829f1af5d2fSBarry Smith } 830f1af5d2fSBarry Smith 831f1af5d2fSBarry Smith /* forward solve the U^T */ 832f1af5d2fSBarry Smith idx = 0; 833f1af5d2fSBarry Smith for (i=0; i<n; i++) { 834f1af5d2fSBarry Smith 835f1af5d2fSBarry Smith v = aa + 25*diag[i]; 836f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 837f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 841f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843f1af5d2fSBarry Smith v += 25; 844f1af5d2fSBarry Smith 845f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 846f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 847f1af5d2fSBarry Smith while (nz--) { 848f1af5d2fSBarry Smith oidx = 5*(*vi++); 849f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854f1af5d2fSBarry Smith v += 25; 855f1af5d2fSBarry Smith } 856f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857f1af5d2fSBarry Smith idx += 5; 858f1af5d2fSBarry Smith } 859f1af5d2fSBarry Smith /* backward solve the L^T */ 860f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 861f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 862f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 863f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 864f1af5d2fSBarry Smith idt = 5*i; 865f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866f1af5d2fSBarry Smith while (nz--) { 867f1af5d2fSBarry Smith idx = 5*(*vi--); 868f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873f1af5d2fSBarry Smith v -= 25; 874f1af5d2fSBarry Smith } 875f1af5d2fSBarry Smith } 876f1af5d2fSBarry Smith 877f1af5d2fSBarry Smith /* copy t into x according to permutation */ 878f1af5d2fSBarry Smith ii = 0; 879f1af5d2fSBarry Smith for (i=0; i<n; i++) { 880f1af5d2fSBarry Smith ir = 5*r[i]; 881f1af5d2fSBarry Smith x[ir] = t[ii]; 882f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 883f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 884f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 885f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 886f1af5d2fSBarry Smith ii += 5; 887f1af5d2fSBarry Smith } 888f1af5d2fSBarry Smith 889f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894f1af5d2fSBarry Smith PetscFunctionReturn(0); 895f1af5d2fSBarry Smith } 896f1af5d2fSBarry Smith 8974a2ae208SSatish Balay #undef __FUNCT__ 8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900f1af5d2fSBarry Smith { 901f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9036849ba73SBarry Smith PetscErrorCode ierr; 9045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 907f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 90887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 90987828ca2SBarry Smith PetscScalar *x,*b,*t; 910f1af5d2fSBarry Smith 911f1af5d2fSBarry Smith PetscFunctionBegin; 9121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 914f1af5d2fSBarry Smith t = a->solve_work; 915f1af5d2fSBarry Smith 916f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918f1af5d2fSBarry Smith 919f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 920f1af5d2fSBarry Smith ii = 0; 921f1af5d2fSBarry Smith for (i=0; i<n; i++) { 922f1af5d2fSBarry Smith ic = 6*c[i]; 923f1af5d2fSBarry Smith t[ii] = b[ic]; 924f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 925f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 926f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 927f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 928f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 929f1af5d2fSBarry Smith ii += 6; 930f1af5d2fSBarry Smith } 931f1af5d2fSBarry Smith 932f1af5d2fSBarry Smith /* forward solve the U^T */ 933f1af5d2fSBarry Smith idx = 0; 934f1af5d2fSBarry Smith for (i=0; i<n; i++) { 935f1af5d2fSBarry Smith 936f1af5d2fSBarry Smith v = aa + 36*diag[i]; 937f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 938f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939f1af5d2fSBarry Smith x6 = t[5+idx]; 940f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946f1af5d2fSBarry Smith v += 36; 947f1af5d2fSBarry Smith 948f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 949f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 950f1af5d2fSBarry Smith while (nz--) { 951f1af5d2fSBarry Smith oidx = 6*(*vi++); 952f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958f1af5d2fSBarry Smith v += 36; 959f1af5d2fSBarry Smith } 960f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961f1af5d2fSBarry Smith t[5+idx] = s6; 962f1af5d2fSBarry Smith idx += 6; 963f1af5d2fSBarry Smith } 964f1af5d2fSBarry Smith /* backward solve the L^T */ 965f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 966f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 967f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 968f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 969f1af5d2fSBarry Smith idt = 6*i; 970f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971f1af5d2fSBarry Smith s6 = t[5+idt]; 972f1af5d2fSBarry Smith while (nz--) { 973f1af5d2fSBarry Smith idx = 6*(*vi--); 974f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980f1af5d2fSBarry Smith v -= 36; 981f1af5d2fSBarry Smith } 982f1af5d2fSBarry Smith } 983f1af5d2fSBarry Smith 984f1af5d2fSBarry Smith /* copy t into x according to permutation */ 985f1af5d2fSBarry Smith ii = 0; 986f1af5d2fSBarry Smith for (i=0; i<n; i++) { 987f1af5d2fSBarry Smith ir = 6*r[i]; 988f1af5d2fSBarry Smith x[ir] = t[ii]; 989f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 990f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 991f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 992f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 993f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 994f1af5d2fSBarry Smith ii += 6; 995f1af5d2fSBarry Smith } 996f1af5d2fSBarry Smith 997f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9991ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002f1af5d2fSBarry Smith PetscFunctionReturn(0); 1003f1af5d2fSBarry Smith } 1004f1af5d2fSBarry Smith 10054a2ae208SSatish Balay #undef __FUNCT__ 10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008f1af5d2fSBarry Smith { 1009f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10116849ba73SBarry Smith PetscErrorCode ierr; 10125d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10135d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 101687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 101787828ca2SBarry Smith PetscScalar *x,*b,*t; 1018f1af5d2fSBarry Smith 1019f1af5d2fSBarry Smith PetscFunctionBegin; 10201ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10211ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1022f1af5d2fSBarry Smith t = a->solve_work; 1023f1af5d2fSBarry Smith 1024f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026f1af5d2fSBarry Smith 1027f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1028f1af5d2fSBarry Smith ii = 0; 1029f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1030f1af5d2fSBarry Smith ic = 7*c[i]; 1031f1af5d2fSBarry Smith t[ii] = b[ic]; 1032f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1033f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1034f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1035f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1036f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1037f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1038f1af5d2fSBarry Smith ii += 7; 1039f1af5d2fSBarry Smith } 1040f1af5d2fSBarry Smith 1041f1af5d2fSBarry Smith /* forward solve the U^T */ 1042f1af5d2fSBarry Smith idx = 0; 1043f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1044f1af5d2fSBarry Smith 1045f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1046f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1047f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1049f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056f1af5d2fSBarry Smith v += 49; 1057f1af5d2fSBarry Smith 1058f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1059f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1060f1af5d2fSBarry Smith while (nz--) { 1061f1af5d2fSBarry Smith oidx = 7*(*vi++); 1062f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069f1af5d2fSBarry Smith v += 49; 1070f1af5d2fSBarry Smith } 1071f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1073f1af5d2fSBarry Smith idx += 7; 1074f1af5d2fSBarry Smith } 1075f1af5d2fSBarry Smith /* backward solve the L^T */ 1076f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1077f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1078f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1079f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1080f1af5d2fSBarry Smith idt = 7*i; 1081f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1083f1af5d2fSBarry Smith while (nz--) { 1084f1af5d2fSBarry Smith idx = 7*(*vi--); 1085f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1086f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092f1af5d2fSBarry Smith v -= 49; 1093f1af5d2fSBarry Smith } 1094f1af5d2fSBarry Smith } 1095f1af5d2fSBarry Smith 1096f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1097f1af5d2fSBarry Smith ii = 0; 1098f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1099f1af5d2fSBarry Smith ir = 7*r[i]; 1100f1af5d2fSBarry Smith x[ir] = t[ii]; 1101f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1102f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1103f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1104f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1105f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1106f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1107f1af5d2fSBarry Smith ii += 7; 1108f1af5d2fSBarry Smith } 1109f1af5d2fSBarry Smith 1110f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11121ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115f1af5d2fSBarry Smith PetscFunctionReturn(0); 1116f1af5d2fSBarry Smith } 1117f1af5d2fSBarry Smith 11184e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11194a2ae208SSatish Balay #undef __FUNCT__ 11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11224e2b4712SSatish Balay { 11234e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11244e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11256849ba73SBarry Smith PetscErrorCode ierr; 11265d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11275d0c19d7SBarry Smith PetscInt i,n=a->mbs; 11285d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11293f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 113087828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11314e2b4712SSatish Balay 11324e2b4712SSatish Balay PetscFunctionBegin; 11331ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135f1af5d2fSBarry Smith t = a->solve_work; 11364e2b4712SSatish Balay 11374e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11384e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11394e2b4712SSatish Balay 11404e2b4712SSatish Balay /* forward solve the lower triangular */ 114187828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11424e2b4712SSatish Balay for (i=1; i<n; i++) { 11434e2b4712SSatish Balay v = aa + bs2*ai[i]; 11444e2b4712SSatish Balay vi = aj + ai[i]; 11454e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1146f1af5d2fSBarry Smith s = t + bs*i; 114787828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11484e2b4712SSatish Balay while (nz--) { 1149f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11504e2b4712SSatish Balay v += bs2; 11514e2b4712SSatish Balay } 11524e2b4712SSatish Balay } 11534e2b4712SSatish Balay /* backward solve the upper triangular */ 1154d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 11554e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 11564e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11574e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11584e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 115987828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11604e2b4712SSatish Balay while (nz--) { 1161f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11624e2b4712SSatish Balay v += bs2; 11634e2b4712SSatish Balay } 1164f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 116587828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11664e2b4712SSatish Balay } 11674e2b4712SSatish Balay 11684e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11694e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11701ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 11734e2b4712SSatish Balay PetscFunctionReturn(0); 11744e2b4712SSatish Balay } 11754e2b4712SSatish Balay 11764a2ae208SSatish Balay #undef __FUNCT__ 11774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1178dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11794e2b4712SSatish Balay { 11804e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11814e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11826849ba73SBarry Smith PetscErrorCode ierr; 11835d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 11845d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 11853f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 118687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 118787828ca2SBarry Smith PetscScalar *x,*b,*t; 11884e2b4712SSatish Balay 11894e2b4712SSatish Balay PetscFunctionBegin; 11901ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1192f1af5d2fSBarry Smith t = a->solve_work; 11934e2b4712SSatish Balay 11944e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11954e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11964e2b4712SSatish Balay 11974e2b4712SSatish Balay /* forward solve the lower triangular */ 11984e2b4712SSatish Balay idx = 7*(*r++); 1199f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1200f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1201f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 12024e2b4712SSatish Balay 12034e2b4712SSatish Balay for (i=1; i<n; i++) { 12044e2b4712SSatish Balay v = aa + 49*ai[i]; 12054e2b4712SSatish Balay vi = aj + ai[i]; 12064e2b4712SSatish Balay nz = diag[i] - ai[i]; 12074e2b4712SSatish Balay idx = 7*(*r++); 1208f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1209f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 12104e2b4712SSatish Balay while (nz--) { 12114e2b4712SSatish Balay idx = 7*(*vi++); 1212f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1213f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1214f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1215f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1216f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1217f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1218f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1219f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1220f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1221f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12224e2b4712SSatish Balay v += 49; 12234e2b4712SSatish Balay } 12244e2b4712SSatish Balay idx = 7*i; 1225f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1226f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1227f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12284e2b4712SSatish Balay } 12294e2b4712SSatish Balay /* backward solve the upper triangular */ 12304e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12314e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12324e2b4712SSatish Balay vi = aj + diag[i] + 1; 12334e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12344e2b4712SSatish Balay idt = 7*i; 1235f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1236f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1237f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12384e2b4712SSatish Balay while (nz--) { 12394e2b4712SSatish Balay idx = 7*(*vi++); 1240f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1241f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1243f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1244f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1245f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1246f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1247f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1248f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1249f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12504e2b4712SSatish Balay v += 49; 12514e2b4712SSatish Balay } 12524e2b4712SSatish Balay idc = 7*(*c--); 12534e2b4712SSatish Balay v = aa + 49*diag[i]; 1254f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1255f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1256f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1257f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1258f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1259f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1260f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1261f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1262f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1263f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1264f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1265f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1266f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1267f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12684e2b4712SSatish Balay } 12694e2b4712SSatish Balay 12704e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12714e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12721ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12731ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1274dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 12754e2b4712SSatish Balay PetscFunctionReturn(0); 12764e2b4712SSatish Balay } 12774e2b4712SSatish Balay 12784a2ae208SSatish Balay #undef __FUNCT__ 12798f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 12808f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 12818f690400SShri Abhyankar { 12828f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 12838f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 12848f690400SShri Abhyankar PetscErrorCode ierr; 12858f690400SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi; 128629b92fc1SShri Abhyankar PetscInt i,n=a->mbs,nz,idx,idt,idc,k,m; 12878f690400SShri Abhyankar MatScalar *aa=a->a,*v; 12888f690400SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 12898f690400SShri Abhyankar PetscScalar *x,*b,*t; 12908f690400SShri Abhyankar 12918f690400SShri Abhyankar PetscFunctionBegin; 12928f690400SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 12938f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 12948f690400SShri Abhyankar t = a->solve_work; 12958f690400SShri Abhyankar 12968f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 129729b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 12988f690400SShri Abhyankar 12998f690400SShri Abhyankar /* forward solve the lower triangular */ 130029b92fc1SShri Abhyankar idx = 7*r[0]; 13018f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 13028f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 13038f690400SShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 13048f690400SShri Abhyankar 13058f690400SShri Abhyankar for (i=1; i<n; i++) { 13068f690400SShri Abhyankar v = aa + 49*ai[i]; 13078f690400SShri Abhyankar vi = aj + ai[i]; 13088f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 130929b92fc1SShri Abhyankar idx = 7*r[i]; 13108f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 13118f690400SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 131229b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 131329b92fc1SShri Abhyankar idx = 7*vi[m]; 13148f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 13158f690400SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 13168f690400SShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 13178f690400SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 13188f690400SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 13198f690400SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 13208f690400SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 13218f690400SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 13228f690400SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 13238f690400SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13248f690400SShri Abhyankar v += 49; 13258f690400SShri Abhyankar } 13268f690400SShri Abhyankar idx = 7*i; 13278f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 13288f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 13298f690400SShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 13308f690400SShri Abhyankar } 13318f690400SShri Abhyankar /* backward solve the upper triangular */ 13328f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 13338f690400SShri Abhyankar k = 2*n-i; 13348f690400SShri Abhyankar v = aa + 49*ai[k]; 13358f690400SShri Abhyankar vi = aj + ai[k]; 13368f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 13378f690400SShri Abhyankar idt = 7*i; 13388f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 13398f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 13408f690400SShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 134129b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 134229b92fc1SShri Abhyankar idx = 7*vi[m]; 13438f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 13448f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 13458f690400SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 13468f690400SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 13478f690400SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 13488f690400SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 13498f690400SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 13508f690400SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 13518f690400SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 13528f690400SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13538f690400SShri Abhyankar v += 49; 13548f690400SShri Abhyankar } 135529b92fc1SShri Abhyankar idc = 7*c[i]; 13568f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 13578f690400SShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 13588f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 13598f690400SShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 13608f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 13618f690400SShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 13628f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 13638f690400SShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 13648f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 13658f690400SShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 13668f690400SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 13678f690400SShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 13688f690400SShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 13698f690400SShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 13708f690400SShri Abhyankar } 13718f690400SShri Abhyankar 13728f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 13738f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 13748f690400SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 13758f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 13768f690400SShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 13778f690400SShri Abhyankar PetscFunctionReturn(0); 13788f690400SShri Abhyankar } 13798f690400SShri Abhyankar 13808f690400SShri Abhyankar #undef __FUNCT__ 13814a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1382dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 138315091d37SBarry Smith { 138415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1385690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1386dfbe8321SBarry Smith PetscErrorCode ierr; 1387690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1388d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1389d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1390d9fead3dSBarry Smith const PetscScalar *b; 139115091d37SBarry Smith 139215091d37SBarry Smith PetscFunctionBegin; 1393d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 13941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 139515091d37SBarry Smith /* forward solve the lower triangular */ 139615091d37SBarry Smith idx = 0; 139715091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 139815091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 139915091d37SBarry Smith x[6] = b[6+idx]; 140015091d37SBarry Smith for (i=1; i<n; i++) { 140115091d37SBarry Smith v = aa + 49*ai[i]; 140215091d37SBarry Smith vi = aj + ai[i]; 140315091d37SBarry Smith nz = diag[i] - ai[i]; 140415091d37SBarry Smith idx = 7*i; 1405f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1406f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1407f1af5d2fSBarry Smith s7 = b[6+idx]; 140815091d37SBarry Smith while (nz--) { 140915091d37SBarry Smith jdx = 7*(*vi++); 141015091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 141115091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 141215091d37SBarry Smith x7 = x[6+jdx]; 1413f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1414f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1415f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1416f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1417f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1418f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1419f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 142015091d37SBarry Smith v += 49; 142115091d37SBarry Smith } 1422f1af5d2fSBarry Smith x[idx] = s1; 1423f1af5d2fSBarry Smith x[1+idx] = s2; 1424f1af5d2fSBarry Smith x[2+idx] = s3; 1425f1af5d2fSBarry Smith x[3+idx] = s4; 1426f1af5d2fSBarry Smith x[4+idx] = s5; 1427f1af5d2fSBarry Smith x[5+idx] = s6; 1428f1af5d2fSBarry Smith x[6+idx] = s7; 142915091d37SBarry Smith } 143015091d37SBarry Smith /* backward solve the upper triangular */ 143115091d37SBarry Smith for (i=n-1; i>=0; i--){ 143215091d37SBarry Smith v = aa + 49*diag[i] + 49; 143315091d37SBarry Smith vi = aj + diag[i] + 1; 143415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 143515091d37SBarry Smith idt = 7*i; 1436f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1437f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1438f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1439f1af5d2fSBarry Smith s7 = x[6+idt]; 144015091d37SBarry Smith while (nz--) { 144115091d37SBarry Smith idx = 7*(*vi++); 144215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 144315091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 144415091d37SBarry Smith x7 = x[6+idx]; 1445f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1446f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1447f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1448f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1449f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1450f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1451f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 145215091d37SBarry Smith v += 49; 145315091d37SBarry Smith } 145415091d37SBarry Smith v = aa + 49*diag[i]; 1455f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1456f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1457f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1458f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1459f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1460f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1461f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1462f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1463f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1464f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1465f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1466f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1467f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1468f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 146915091d37SBarry Smith } 147015091d37SBarry Smith 1471d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14721ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1473dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 147415091d37SBarry Smith PetscFunctionReturn(0); 147515091d37SBarry Smith } 147615091d37SBarry Smith 14774a2ae208SSatish Balay #undef __FUNCT__ 1478cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1479cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1480cee9d6f2SShri Abhyankar { 1481cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 14826464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1483cee9d6f2SShri Abhyankar PetscErrorCode ierr; 1484cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 1485cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1486cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 1487cee9d6f2SShri Abhyankar PetscScalar *x; 1488cee9d6f2SShri Abhyankar const PetscScalar *b; 1489cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1490cee9d6f2SShri Abhyankar 1491cee9d6f2SShri Abhyankar PetscFunctionBegin; 1492cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1493cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1494cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 1495cee9d6f2SShri Abhyankar idx = 0; 1496cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1497cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1498cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 1499cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 1500cee9d6f2SShri Abhyankar vi = aj + ai[i]; 1501cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 1502cee9d6f2SShri Abhyankar idx = bs*i; 1503cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1504cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 15056464896eSShri Abhyankar for(k=0;k<nz;k++) { 15066464896eSShri Abhyankar jdx = bs*vi[k]; 1507cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1508cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1509cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1510cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1511cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1512cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1513cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1514cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1515cee9d6f2SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1516cee9d6f2SShri Abhyankar v += bs2; 1517cee9d6f2SShri Abhyankar } 1518cee9d6f2SShri Abhyankar 1519cee9d6f2SShri Abhyankar x[idx] = s1; 1520cee9d6f2SShri Abhyankar x[1+idx] = s2; 1521cee9d6f2SShri Abhyankar x[2+idx] = s3; 1522cee9d6f2SShri Abhyankar x[3+idx] = s4; 1523cee9d6f2SShri Abhyankar x[4+idx] = s5; 1524cee9d6f2SShri Abhyankar x[5+idx] = s6; 1525cee9d6f2SShri Abhyankar x[6+idx] = s7; 1526cee9d6f2SShri Abhyankar } 1527cee9d6f2SShri Abhyankar 1528cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 1529cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 1530cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 1531cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 1532cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 1533cee9d6f2SShri Abhyankar idt = bs*i; 1534cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1535cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 15366464896eSShri Abhyankar for(k=0;k<nz;k++) { 15376464896eSShri Abhyankar idx = bs*vi[k]; 1538cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1539cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1540cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1541cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1542cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1543cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1544cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1545cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1546cee9d6f2SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1547cee9d6f2SShri Abhyankar v += bs2; 1548cee9d6f2SShri Abhyankar } 1549cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 1550cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1551cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1552cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1553cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1554cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1555cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1556cee9d6f2SShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1557cee9d6f2SShri Abhyankar } 1558cee9d6f2SShri Abhyankar 1559cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1560cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1561cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1562cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 1563cee9d6f2SShri Abhyankar } 1564cee9d6f2SShri Abhyankar 1565cee9d6f2SShri Abhyankar #undef __FUNCT__ 156653cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2" 156753cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 156853cca76cSShri Abhyankar { 156953cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 157053cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 157153cca76cSShri Abhyankar PetscErrorCode ierr; 157253cca76cSShri Abhyankar PetscInt idx,jdx,idt; 157353cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 157453cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 157553cca76cSShri Abhyankar PetscScalar *x; 157653cca76cSShri Abhyankar const PetscScalar *b; 157753cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 157853cca76cSShri Abhyankar 157953cca76cSShri Abhyankar PetscFunctionBegin; 158053cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 158153cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 158253cca76cSShri Abhyankar /* forward solve the lower triangular */ 158353cca76cSShri Abhyankar idx = 0; 158453cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 158553cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 158653cca76cSShri Abhyankar for (i=1; i<n; i++) { 158753cca76cSShri Abhyankar v = aa + bs2*ai[i]; 158853cca76cSShri Abhyankar vi = aj + ai[i]; 158953cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 159053cca76cSShri Abhyankar idx = bs*i; 159153cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 159253cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 159353cca76cSShri Abhyankar for(k=0;k<nz;k++) { 159453cca76cSShri Abhyankar jdx = bs*vi[k]; 159553cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 159653cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 159753cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 159853cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 159953cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 160053cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 160153cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 160253cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 160353cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 160453cca76cSShri Abhyankar v += bs2; 160553cca76cSShri Abhyankar } 160653cca76cSShri Abhyankar 160753cca76cSShri Abhyankar x[idx] = s1; 160853cca76cSShri Abhyankar x[1+idx] = s2; 160953cca76cSShri Abhyankar x[2+idx] = s3; 161053cca76cSShri Abhyankar x[3+idx] = s4; 161153cca76cSShri Abhyankar x[4+idx] = s5; 161253cca76cSShri Abhyankar x[5+idx] = s6; 161353cca76cSShri Abhyankar x[6+idx] = s7; 161453cca76cSShri Abhyankar } 161553cca76cSShri Abhyankar 161653cca76cSShri Abhyankar /* backward solve the upper triangular */ 161753cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 161853cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 161953cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 162053cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 162153cca76cSShri Abhyankar idt = bs*i; 162253cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 162353cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 162453cca76cSShri Abhyankar for(k=0;k<nz;k++) { 162553cca76cSShri Abhyankar idx = bs*vi[k]; 162653cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 162753cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 162853cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 162953cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 163053cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 163153cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 163253cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 163353cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 163453cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 163553cca76cSShri Abhyankar v += bs2; 163653cca76cSShri Abhyankar } 163753cca76cSShri Abhyankar /* x = inv_diagonal*x */ 163853cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 163953cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 164053cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 164153cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 164253cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 164353cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 164453cca76cSShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 164553cca76cSShri Abhyankar } 164653cca76cSShri Abhyankar 164753cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 164853cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 164953cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 165053cca76cSShri Abhyankar PetscFunctionReturn(0); 165153cca76cSShri Abhyankar } 165253cca76cSShri Abhyankar 165353cca76cSShri Abhyankar #undef __FUNCT__ 16544a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1655dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 165615091d37SBarry Smith { 165715091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 165815091d37SBarry Smith IS iscol=a->col,isrow=a->row; 16596849ba73SBarry Smith PetscErrorCode ierr; 16605d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 16615d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1662d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1663d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1664d9fead3dSBarry Smith const PetscScalar *b; 166515091d37SBarry Smith PetscFunctionBegin; 1666d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16671ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1668f1af5d2fSBarry Smith t = a->solve_work; 166915091d37SBarry Smith 167015091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 167115091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 167215091d37SBarry Smith 167315091d37SBarry Smith /* forward solve the lower triangular */ 167415091d37SBarry Smith idx = 6*(*r++); 1675f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1676f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1677f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 167815091d37SBarry Smith for (i=1; i<n; i++) { 167915091d37SBarry Smith v = aa + 36*ai[i]; 168015091d37SBarry Smith vi = aj + ai[i]; 168115091d37SBarry Smith nz = diag[i] - ai[i]; 168215091d37SBarry Smith idx = 6*(*r++); 1683f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1684f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 168515091d37SBarry Smith while (nz--) { 168615091d37SBarry Smith idx = 6*(*vi++); 1687f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1688f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1689f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1690f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1691f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1692f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1693f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1694f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 169515091d37SBarry Smith v += 36; 169615091d37SBarry Smith } 169715091d37SBarry Smith idx = 6*i; 1698f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1699f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1700f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 170115091d37SBarry Smith } 170215091d37SBarry Smith /* backward solve the upper triangular */ 170315091d37SBarry Smith for (i=n-1; i>=0; i--){ 170415091d37SBarry Smith v = aa + 36*diag[i] + 36; 170515091d37SBarry Smith vi = aj + diag[i] + 1; 170615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 170715091d37SBarry Smith idt = 6*i; 1708f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1709f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1710f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 171115091d37SBarry Smith while (nz--) { 171215091d37SBarry Smith idx = 6*(*vi++); 1713f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1714f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1715f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1716f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1717f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1718f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1719f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1720f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1721f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 172215091d37SBarry Smith v += 36; 172315091d37SBarry Smith } 172415091d37SBarry Smith idc = 6*(*c--); 172515091d37SBarry Smith v = aa + 36*diag[i]; 1726f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1727f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1728f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1729f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1730f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1731f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1732f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1733f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1734f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1735f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1736f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1737f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 173815091d37SBarry Smith } 173915091d37SBarry Smith 174015091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 174115091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1742d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17431ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1744dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 174515091d37SBarry Smith PetscFunctionReturn(0); 174615091d37SBarry Smith } 174715091d37SBarry Smith 17484a2ae208SSatish Balay #undef __FUNCT__ 17498f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 17508f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 17518f690400SShri Abhyankar { 17528f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 17538f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 17548f690400SShri Abhyankar PetscErrorCode ierr; 17558f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 175629b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 17578f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 17588f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 17598f690400SShri Abhyankar const PetscScalar *b; 17608f690400SShri Abhyankar PetscFunctionBegin; 17618f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17628f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 17638f690400SShri Abhyankar t = a->solve_work; 17648f690400SShri Abhyankar 17658f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 176629b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 17678f690400SShri Abhyankar 17688f690400SShri Abhyankar /* forward solve the lower triangular */ 176929b92fc1SShri Abhyankar idx = 6*r[0]; 17708f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 17718f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 17728f690400SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 17738f690400SShri Abhyankar for (i=1; i<n; i++) { 17748f690400SShri Abhyankar v = aa + 36*ai[i]; 17758f690400SShri Abhyankar vi = aj + ai[i]; 17768f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 177729b92fc1SShri Abhyankar idx = 6*r[i]; 17788f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 17798f690400SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 178029b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 178129b92fc1SShri Abhyankar idx = 6*vi[m]; 17828f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 17838f690400SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 17848f690400SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 17858f690400SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 17868f690400SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 17878f690400SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 17888f690400SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 17898f690400SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 17908f690400SShri Abhyankar v += 36; 17918f690400SShri Abhyankar } 17928f690400SShri Abhyankar idx = 6*i; 17938f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 17948f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 17958f690400SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 17968f690400SShri Abhyankar } 17978f690400SShri Abhyankar /* backward solve the upper triangular */ 17988f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 17998f690400SShri Abhyankar k = 2*n-i; 18008f690400SShri Abhyankar v = aa + 36*ai[k]; 18018f690400SShri Abhyankar vi = aj + ai[k]; 18028f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 18038f690400SShri Abhyankar idt = 6*i; 18048f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 18058f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 18068f690400SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 180729b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 180829b92fc1SShri Abhyankar idx = 6*vi[m]; 18098f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 18108f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 18118f690400SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 18128f690400SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 18138f690400SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 18148f690400SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 18158f690400SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 18168f690400SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 18178f690400SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 18188f690400SShri Abhyankar v += 36; 18198f690400SShri Abhyankar } 182029b92fc1SShri Abhyankar idc = 6*c[i]; 18218f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 18228f690400SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 18238f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 18248f690400SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 18258f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 18268f690400SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 18278f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 18288f690400SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 18298f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 18308f690400SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 18318f690400SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 18328f690400SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 18338f690400SShri Abhyankar } 18348f690400SShri Abhyankar 18358f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 18368f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 18378f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18388f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 18398f690400SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 18408f690400SShri Abhyankar PetscFunctionReturn(0); 18418f690400SShri Abhyankar } 18428f690400SShri Abhyankar 18438f690400SShri Abhyankar 18448f690400SShri Abhyankar #undef __FUNCT__ 18454a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1846dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 184715091d37SBarry Smith { 184815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1849690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1850dfbe8321SBarry Smith PetscErrorCode ierr; 1851690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1852d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1853d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1854d9fead3dSBarry Smith const PetscScalar *b; 185515091d37SBarry Smith 185615091d37SBarry Smith PetscFunctionBegin; 1857d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18581ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 185915091d37SBarry Smith /* forward solve the lower triangular */ 186015091d37SBarry Smith idx = 0; 186115091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 186215091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 186315091d37SBarry Smith for (i=1; i<n; i++) { 186415091d37SBarry Smith v = aa + 36*ai[i]; 186515091d37SBarry Smith vi = aj + ai[i]; 186615091d37SBarry Smith nz = diag[i] - ai[i]; 186715091d37SBarry Smith idx = 6*i; 1868f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1869f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 187015091d37SBarry Smith while (nz--) { 187115091d37SBarry Smith jdx = 6*(*vi++); 187215091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 187315091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1874f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1875f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1876f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1877f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1878f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1879f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 188015091d37SBarry Smith v += 36; 188115091d37SBarry Smith } 1882f1af5d2fSBarry Smith x[idx] = s1; 1883f1af5d2fSBarry Smith x[1+idx] = s2; 1884f1af5d2fSBarry Smith x[2+idx] = s3; 1885f1af5d2fSBarry Smith x[3+idx] = s4; 1886f1af5d2fSBarry Smith x[4+idx] = s5; 1887f1af5d2fSBarry Smith x[5+idx] = s6; 188815091d37SBarry Smith } 188915091d37SBarry Smith /* backward solve the upper triangular */ 189015091d37SBarry Smith for (i=n-1; i>=0; i--){ 189115091d37SBarry Smith v = aa + 36*diag[i] + 36; 189215091d37SBarry Smith vi = aj + diag[i] + 1; 189315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 189415091d37SBarry Smith idt = 6*i; 1895f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1896f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1897f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 189815091d37SBarry Smith while (nz--) { 189915091d37SBarry Smith idx = 6*(*vi++); 190015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 190115091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1902f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1903f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1904f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1905f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1906f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1907f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 190815091d37SBarry Smith v += 36; 190915091d37SBarry Smith } 191015091d37SBarry Smith v = aa + 36*diag[i]; 1911f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1912f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1913f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1914f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1915f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1916f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 191715091d37SBarry Smith } 191815091d37SBarry Smith 1919d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19201ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1921dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 192215091d37SBarry Smith PetscFunctionReturn(0); 192315091d37SBarry Smith } 192415091d37SBarry Smith 19254a2ae208SSatish Balay #undef __FUNCT__ 1926cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 1927cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1928cee9d6f2SShri Abhyankar { 1929cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 19306464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1931cee9d6f2SShri Abhyankar PetscErrorCode ierr; 1932cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 1933cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1934cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 1935cee9d6f2SShri Abhyankar PetscScalar *x; 1936cee9d6f2SShri Abhyankar const PetscScalar *b; 1937cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1938cee9d6f2SShri Abhyankar 1939cee9d6f2SShri Abhyankar PetscFunctionBegin; 1940cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1941cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1942cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 1943cee9d6f2SShri Abhyankar idx = 0; 1944cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1945cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 1946cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 1947cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 1948cee9d6f2SShri Abhyankar vi = aj + ai[i]; 1949cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 1950cee9d6f2SShri Abhyankar idx = bs*i; 1951cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1952cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 19536464896eSShri Abhyankar for(k=0;k<nz;k++){ 19546464896eSShri Abhyankar jdx = bs*vi[k]; 1955cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1956cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 1957cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1958cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1959cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1960cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1961cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1962cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1963cee9d6f2SShri Abhyankar v += bs2; 1964cee9d6f2SShri Abhyankar } 1965cee9d6f2SShri Abhyankar 1966cee9d6f2SShri Abhyankar x[idx] = s1; 1967cee9d6f2SShri Abhyankar x[1+idx] = s2; 1968cee9d6f2SShri Abhyankar x[2+idx] = s3; 1969cee9d6f2SShri Abhyankar x[3+idx] = s4; 1970cee9d6f2SShri Abhyankar x[4+idx] = s5; 1971cee9d6f2SShri Abhyankar x[5+idx] = s6; 1972cee9d6f2SShri Abhyankar } 1973cee9d6f2SShri Abhyankar 1974cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 1975cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 1976cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 1977cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 1978cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 1979cee9d6f2SShri Abhyankar idt = bs*i; 1980cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1981cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 19826464896eSShri Abhyankar for(k=0;k<nz;k++){ 19836464896eSShri Abhyankar idx = bs*vi[k]; 1984cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1985cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 1986cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1987cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1988cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1989cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1990cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1991cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1992cee9d6f2SShri Abhyankar v += bs2; 1993cee9d6f2SShri Abhyankar } 1994cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 1995cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1996cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1997cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1998cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1999cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2000cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2001cee9d6f2SShri Abhyankar } 2002cee9d6f2SShri Abhyankar 2003cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2004cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2005cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2006cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2007cee9d6f2SShri Abhyankar } 20088f690400SShri Abhyankar 2009cee9d6f2SShri Abhyankar #undef __FUNCT__ 201053cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2" 201153cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 201253cca76cSShri Abhyankar { 201353cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 201453cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 201553cca76cSShri Abhyankar PetscErrorCode ierr; 201653cca76cSShri Abhyankar PetscInt idx,jdx,idt; 201753cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 201853cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 201953cca76cSShri Abhyankar PetscScalar *x; 202053cca76cSShri Abhyankar const PetscScalar *b; 202153cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 202253cca76cSShri Abhyankar 202353cca76cSShri Abhyankar PetscFunctionBegin; 202453cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 202553cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 202653cca76cSShri Abhyankar /* forward solve the lower triangular */ 202753cca76cSShri Abhyankar idx = 0; 202853cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 202953cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 203053cca76cSShri Abhyankar for (i=1; i<n; i++) { 203153cca76cSShri Abhyankar v = aa + bs2*ai[i]; 203253cca76cSShri Abhyankar vi = aj + ai[i]; 203353cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 203453cca76cSShri Abhyankar idx = bs*i; 203553cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 203653cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 203753cca76cSShri Abhyankar for(k=0;k<nz;k++){ 203853cca76cSShri Abhyankar jdx = bs*vi[k]; 203953cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 204053cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 204153cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 204253cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 204353cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 204453cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 204553cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 204653cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 204753cca76cSShri Abhyankar v += bs2; 204853cca76cSShri Abhyankar } 204953cca76cSShri Abhyankar 205053cca76cSShri Abhyankar x[idx] = s1; 205153cca76cSShri Abhyankar x[1+idx] = s2; 205253cca76cSShri Abhyankar x[2+idx] = s3; 205353cca76cSShri Abhyankar x[3+idx] = s4; 205453cca76cSShri Abhyankar x[4+idx] = s5; 205553cca76cSShri Abhyankar x[5+idx] = s6; 205653cca76cSShri Abhyankar } 205753cca76cSShri Abhyankar 205853cca76cSShri Abhyankar /* backward solve the upper triangular */ 205953cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 206053cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 206153cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 206253cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 206353cca76cSShri Abhyankar idt = bs*i; 206453cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 206553cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 206653cca76cSShri Abhyankar for(k=0;k<nz;k++){ 206753cca76cSShri Abhyankar idx = bs*vi[k]; 206853cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 206953cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 207053cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 207153cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 207253cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 207353cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 207453cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 207553cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 207653cca76cSShri Abhyankar v += bs2; 207753cca76cSShri Abhyankar } 207853cca76cSShri Abhyankar /* x = inv_diagonal*x */ 207953cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 208053cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 208153cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 208253cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 208353cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 208453cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 208553cca76cSShri Abhyankar } 208653cca76cSShri Abhyankar 208753cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 208853cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 208953cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 209053cca76cSShri Abhyankar PetscFunctionReturn(0); 209153cca76cSShri Abhyankar } 209253cca76cSShri Abhyankar 209353cca76cSShri Abhyankar #undef __FUNCT__ 20944a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2095dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 20964e2b4712SSatish Balay { 20974e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 20984e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 20996849ba73SBarry Smith PetscErrorCode ierr; 21005d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 21015d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2102d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2103d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2104d9fead3dSBarry Smith const PetscScalar *b; 21054e2b4712SSatish Balay 21064e2b4712SSatish Balay PetscFunctionBegin; 2107d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21081ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2109f1af5d2fSBarry Smith t = a->solve_work; 21104e2b4712SSatish Balay 21114e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21124e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 21134e2b4712SSatish Balay 21144e2b4712SSatish Balay /* forward solve the lower triangular */ 21154e2b4712SSatish Balay idx = 5*(*r++); 2116f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2117f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 21184e2b4712SSatish Balay for (i=1; i<n; i++) { 21194e2b4712SSatish Balay v = aa + 25*ai[i]; 21204e2b4712SSatish Balay vi = aj + ai[i]; 21214e2b4712SSatish Balay nz = diag[i] - ai[i]; 21224e2b4712SSatish Balay idx = 5*(*r++); 2123f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2124f1af5d2fSBarry Smith s5 = b[4+idx]; 21254e2b4712SSatish Balay while (nz--) { 21264e2b4712SSatish Balay idx = 5*(*vi++); 2127f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2128f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2129f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2130f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2131f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2132f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2133f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 21344e2b4712SSatish Balay v += 25; 21354e2b4712SSatish Balay } 21364e2b4712SSatish Balay idx = 5*i; 2137f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2138f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 21394e2b4712SSatish Balay } 21404e2b4712SSatish Balay /* backward solve the upper triangular */ 21414e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 21424e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 21434e2b4712SSatish Balay vi = aj + diag[i] + 1; 21444e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 21454e2b4712SSatish Balay idt = 5*i; 2146f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2147f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 21484e2b4712SSatish Balay while (nz--) { 21494e2b4712SSatish Balay idx = 5*(*vi++); 2150f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2151f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2152f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2153f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2154f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2155f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2156f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 21574e2b4712SSatish Balay v += 25; 21584e2b4712SSatish Balay } 21594e2b4712SSatish Balay idc = 5*(*c--); 21604e2b4712SSatish Balay v = aa + 25*diag[i]; 2161f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2162f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 2163f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2164f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 2165f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2166f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 2167f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2168f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 2169f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2170f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 21714e2b4712SSatish Balay } 21724e2b4712SSatish Balay 21734e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21744e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2175d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21761ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2177dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 21784e2b4712SSatish Balay PetscFunctionReturn(0); 21794e2b4712SSatish Balay } 21804e2b4712SSatish Balay 21814a2ae208SSatish Balay #undef __FUNCT__ 21828f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 21838f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 21848f690400SShri Abhyankar { 21858f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21868f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 21878f690400SShri Abhyankar PetscErrorCode ierr; 21888f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 218929b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 21908f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 21918f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 21928f690400SShri Abhyankar const PetscScalar *b; 21938f690400SShri Abhyankar 21948f690400SShri Abhyankar PetscFunctionBegin; 21958f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21968f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21978f690400SShri Abhyankar t = a->solve_work; 21988f690400SShri Abhyankar 21998f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 220029b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22018f690400SShri Abhyankar 22028f690400SShri Abhyankar /* forward solve the lower triangular */ 220329b92fc1SShri Abhyankar idx = 5*r[0]; 22048f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 22058f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 22068f690400SShri Abhyankar for (i=1; i<n; i++) { 22078f690400SShri Abhyankar v = aa + 25*ai[i]; 22088f690400SShri Abhyankar vi = aj + ai[i]; 22098f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 221029b92fc1SShri Abhyankar idx = 5*r[i]; 22118f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 22128f690400SShri Abhyankar s5 = b[4+idx]; 221329b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 221429b92fc1SShri Abhyankar idx = 5*vi[m]; 22158f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 22168f690400SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 22178f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 22188f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 22198f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 22208f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 22218f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 22228f690400SShri Abhyankar v += 25; 22238f690400SShri Abhyankar } 22248f690400SShri Abhyankar idx = 5*i; 22258f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 22268f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 22278f690400SShri Abhyankar } 22288f690400SShri Abhyankar /* backward solve the upper triangular */ 22298f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 22308f690400SShri Abhyankar k = 2*n-i; 22318f690400SShri Abhyankar v = aa + 25*ai[k]; 22328f690400SShri Abhyankar vi = aj + ai[k]; 22338f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 22348f690400SShri Abhyankar idt = 5*i; 22358f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 22368f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 223729b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 223829b92fc1SShri Abhyankar idx = 5*vi[m]; 22398f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 22408f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 22418f690400SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 22428f690400SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 22438f690400SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 22448f690400SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 22458f690400SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 22468f690400SShri Abhyankar v += 25; 22478f690400SShri Abhyankar } 224829b92fc1SShri Abhyankar idc = 5*c[i]; 22498f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 22508f690400SShri Abhyankar v[15]*s4+v[20]*s5; 22518f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 22528f690400SShri Abhyankar v[16]*s4+v[21]*s5; 22538f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 22548f690400SShri Abhyankar v[17]*s4+v[22]*s5; 22558f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 22568f690400SShri Abhyankar v[18]*s4+v[23]*s5; 22578f690400SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 22588f690400SShri Abhyankar v[19]*s4+v[24]*s5; 22598f690400SShri Abhyankar } 22608f690400SShri Abhyankar 22618f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22628f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 22638f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22648f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 22658f690400SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 22668f690400SShri Abhyankar PetscFunctionReturn(0); 22678f690400SShri Abhyankar } 22688f690400SShri Abhyankar #undef __FUNCT__ 22694a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2270dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 227115091d37SBarry Smith { 227215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2273690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2274dfbe8321SBarry Smith PetscErrorCode ierr; 2275690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2276d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2277d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2278d9fead3dSBarry Smith const PetscScalar *b; 227915091d37SBarry Smith 228015091d37SBarry Smith PetscFunctionBegin; 2281d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 228315091d37SBarry Smith /* forward solve the lower triangular */ 228415091d37SBarry Smith idx = 0; 228515091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 228615091d37SBarry Smith for (i=1; i<n; i++) { 228715091d37SBarry Smith v = aa + 25*ai[i]; 228815091d37SBarry Smith vi = aj + ai[i]; 228915091d37SBarry Smith nz = diag[i] - ai[i]; 229015091d37SBarry Smith idx = 5*i; 2291f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 229215091d37SBarry Smith while (nz--) { 229315091d37SBarry Smith jdx = 5*(*vi++); 229415091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2295f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2296f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2297f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2298f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2299f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 230015091d37SBarry Smith v += 25; 230115091d37SBarry Smith } 2302f1af5d2fSBarry Smith x[idx] = s1; 2303f1af5d2fSBarry Smith x[1+idx] = s2; 2304f1af5d2fSBarry Smith x[2+idx] = s3; 2305f1af5d2fSBarry Smith x[3+idx] = s4; 2306f1af5d2fSBarry Smith x[4+idx] = s5; 230715091d37SBarry Smith } 230815091d37SBarry Smith /* backward solve the upper triangular */ 230915091d37SBarry Smith for (i=n-1; i>=0; i--){ 231015091d37SBarry Smith v = aa + 25*diag[i] + 25; 231115091d37SBarry Smith vi = aj + diag[i] + 1; 231215091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 231315091d37SBarry Smith idt = 5*i; 2314f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2315f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 231615091d37SBarry Smith while (nz--) { 231715091d37SBarry Smith idx = 5*(*vi++); 231815091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2319f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2320f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2321f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2322f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2323f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 232415091d37SBarry Smith v += 25; 232515091d37SBarry Smith } 232615091d37SBarry Smith v = aa + 25*diag[i]; 2327f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2328f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2329f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2330f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2331f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 233215091d37SBarry Smith } 233315091d37SBarry Smith 2334d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23351ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2336dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 233715091d37SBarry Smith PetscFunctionReturn(0); 233815091d37SBarry Smith } 233915091d37SBarry Smith 23404a2ae208SSatish Balay #undef __FUNCT__ 2341cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2342cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2343cee9d6f2SShri Abhyankar { 2344cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 23456464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2346cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2347cee9d6f2SShri Abhyankar PetscInt jdx; 2348cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2349cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2350cee9d6f2SShri Abhyankar const PetscScalar *b; 2351cee9d6f2SShri Abhyankar 2352cee9d6f2SShri Abhyankar PetscFunctionBegin; 2353cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2354cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2355cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2356cee9d6f2SShri Abhyankar idx = 0; 2357cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2358cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2359cee9d6f2SShri Abhyankar v = aa + 25*ai[i]; 2360cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2361cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2362cee9d6f2SShri Abhyankar idx = 5*i; 2363cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 23646464896eSShri Abhyankar for(k=0;k<nz;k++) { 23656464896eSShri Abhyankar jdx = 5*vi[k]; 2366cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2367cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2368cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2369cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2370cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2371cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2372cee9d6f2SShri Abhyankar v += 25; 2373cee9d6f2SShri Abhyankar } 2374cee9d6f2SShri Abhyankar x[idx] = s1; 2375cee9d6f2SShri Abhyankar x[1+idx] = s2; 2376cee9d6f2SShri Abhyankar x[2+idx] = s3; 2377cee9d6f2SShri Abhyankar x[3+idx] = s4; 2378cee9d6f2SShri Abhyankar x[4+idx] = s5; 2379cee9d6f2SShri Abhyankar } 2380cee9d6f2SShri Abhyankar 2381cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2382cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2383cee9d6f2SShri Abhyankar v = aa + 25*ai[2*n-i]; 2384cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2385cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2386cee9d6f2SShri Abhyankar idt = 5*i; 2387cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 2388cee9d6f2SShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 23896464896eSShri Abhyankar for(k=0;k<nz;k++){ 23906464896eSShri Abhyankar idx = 5*vi[k]; 2391cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2392cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2393cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2394cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2395cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2396cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2397cee9d6f2SShri Abhyankar v += 25; 2398cee9d6f2SShri Abhyankar } 2399cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2400cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2401cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2402cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2403cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2404cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2405cee9d6f2SShri Abhyankar } 2406cee9d6f2SShri Abhyankar 2407cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2408cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2409cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2410cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2411cee9d6f2SShri Abhyankar } 2412cee9d6f2SShri Abhyankar 2413cee9d6f2SShri Abhyankar #undef __FUNCT__ 241453cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2" 241553cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 241653cca76cSShri Abhyankar { 241753cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 241853cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 241953cca76cSShri Abhyankar PetscErrorCode ierr; 242053cca76cSShri Abhyankar PetscInt jdx; 242153cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 242253cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 242353cca76cSShri Abhyankar const PetscScalar *b; 242453cca76cSShri Abhyankar 242553cca76cSShri Abhyankar PetscFunctionBegin; 242653cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 242753cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 242853cca76cSShri Abhyankar /* forward solve the lower triangular */ 242953cca76cSShri Abhyankar idx = 0; 243053cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 243153cca76cSShri Abhyankar for (i=1; i<n; i++) { 243253cca76cSShri Abhyankar v = aa + 25*ai[i]; 243353cca76cSShri Abhyankar vi = aj + ai[i]; 243453cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 243553cca76cSShri Abhyankar idx = 5*i; 243653cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 243753cca76cSShri Abhyankar for(k=0;k<nz;k++) { 243853cca76cSShri Abhyankar jdx = 5*vi[k]; 243953cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 244053cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 244153cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 244253cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 244353cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 244453cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 244553cca76cSShri Abhyankar v += 25; 244653cca76cSShri Abhyankar } 244753cca76cSShri Abhyankar x[idx] = s1; 244853cca76cSShri Abhyankar x[1+idx] = s2; 244953cca76cSShri Abhyankar x[2+idx] = s3; 245053cca76cSShri Abhyankar x[3+idx] = s4; 245153cca76cSShri Abhyankar x[4+idx] = s5; 245253cca76cSShri Abhyankar } 245353cca76cSShri Abhyankar 245453cca76cSShri Abhyankar /* backward solve the upper triangular */ 245553cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 245653cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 245753cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 245853cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 245953cca76cSShri Abhyankar idt = 5*i; 246053cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 246153cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 246253cca76cSShri Abhyankar for(k=0;k<nz;k++){ 246353cca76cSShri Abhyankar idx = 5*vi[k]; 246453cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 246553cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 246653cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 246753cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 246853cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 246953cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 247053cca76cSShri Abhyankar v += 25; 247153cca76cSShri Abhyankar } 247253cca76cSShri Abhyankar /* x = inv_diagonal*x */ 247353cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 247453cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 247553cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 247653cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 247753cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 247853cca76cSShri Abhyankar } 247953cca76cSShri Abhyankar 248053cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 248153cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 248253cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 248353cca76cSShri Abhyankar PetscFunctionReturn(0); 248453cca76cSShri Abhyankar } 248553cca76cSShri Abhyankar 248653cca76cSShri Abhyankar #undef __FUNCT__ 24874a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2488dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 24894e2b4712SSatish Balay { 24904e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 24914e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 24926849ba73SBarry Smith PetscErrorCode ierr; 24935d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 24945d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2495d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2496d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2497d9fead3dSBarry Smith const PetscScalar *b; 24984e2b4712SSatish Balay 24994e2b4712SSatish Balay PetscFunctionBegin; 2500d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2502f1af5d2fSBarry Smith t = a->solve_work; 25034e2b4712SSatish Balay 25044e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 25054e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 25064e2b4712SSatish Balay 25074e2b4712SSatish Balay /* forward solve the lower triangular */ 25084e2b4712SSatish Balay idx = 4*(*r++); 2509f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2510f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 25114e2b4712SSatish Balay for (i=1; i<n; i++) { 25124e2b4712SSatish Balay v = aa + 16*ai[i]; 25134e2b4712SSatish Balay vi = aj + ai[i]; 25144e2b4712SSatish Balay nz = diag[i] - ai[i]; 25154e2b4712SSatish Balay idx = 4*(*r++); 2516f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 25174e2b4712SSatish Balay while (nz--) { 25184e2b4712SSatish Balay idx = 4*(*vi++); 2519f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2520f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2521f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2522f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2523f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 25244e2b4712SSatish Balay v += 16; 25254e2b4712SSatish Balay } 25264e2b4712SSatish Balay idx = 4*i; 2527f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2528f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 25294e2b4712SSatish Balay } 25304e2b4712SSatish Balay /* backward solve the upper triangular */ 25314e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 25324e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 25334e2b4712SSatish Balay vi = aj + diag[i] + 1; 25344e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 25354e2b4712SSatish Balay idt = 4*i; 2536f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2537f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 25384e2b4712SSatish Balay while (nz--) { 25394e2b4712SSatish Balay idx = 4*(*vi++); 2540f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2541f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2542f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2543f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2544f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2545f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 25464e2b4712SSatish Balay v += 16; 25474e2b4712SSatish Balay } 25484e2b4712SSatish Balay idc = 4*(*c--); 25494e2b4712SSatish Balay v = aa + 16*diag[i]; 2550f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2551f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2552f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2553f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 25544e2b4712SSatish Balay } 25554e2b4712SSatish Balay 25564e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 25574e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2558d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25591ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2560dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 25614e2b4712SSatish Balay PetscFunctionReturn(0); 25624e2b4712SSatish Balay } 2563f26ec98cSKris Buschelman 2564f26ec98cSKris Buschelman #undef __FUNCT__ 25658f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 25668f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 25678f690400SShri Abhyankar { 25688f690400SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 25698f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 25708f690400SShri Abhyankar PetscErrorCode ierr; 257129b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 25728f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 25738f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 25748f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 25758f690400SShri Abhyankar const PetscScalar *b; 25768f690400SShri Abhyankar 25778f690400SShri Abhyankar PetscFunctionBegin; 25788f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25798f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25808f690400SShri Abhyankar t = a->solve_work; 25818f690400SShri Abhyankar 25828f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 258329b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 25848f690400SShri Abhyankar 25858f690400SShri Abhyankar /* forward solve the lower triangular */ 258629b92fc1SShri Abhyankar idx = 4*r[0]; 25878f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 25888f690400SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 25898f690400SShri Abhyankar for (i=1; i<n; i++) { 25908f690400SShri Abhyankar v = aa + 16*ai[i]; 25918f690400SShri Abhyankar vi = aj + ai[i]; 25928f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 259329b92fc1SShri Abhyankar idx = 4*r[i]; 25948f690400SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 259529b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 259629b92fc1SShri Abhyankar idx = 4*vi[m]; 25978f690400SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 25988f690400SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 25998f690400SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 26008f690400SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 26018f690400SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 26028f690400SShri Abhyankar v += 16; 26038f690400SShri Abhyankar } 26048f690400SShri Abhyankar idx = 4*i; 26058f690400SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 26068f690400SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 26078f690400SShri Abhyankar } 26088f690400SShri Abhyankar /* backward solve the upper triangular */ 26098f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 26108f690400SShri Abhyankar k = 2*n-i; 26118f690400SShri Abhyankar v = aa + 16*ai[k]; 26128f690400SShri Abhyankar vi = aj + ai[k]; 26138f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 26148f690400SShri Abhyankar idt = 4*i; 26158f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 26168f690400SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 261729b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 261829b92fc1SShri Abhyankar idx = 4*vi[m]; 26198f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 26208f690400SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 26218f690400SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 26228f690400SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 26238f690400SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 26248f690400SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 26258f690400SShri Abhyankar v += 16; 26268f690400SShri Abhyankar } 262729b92fc1SShri Abhyankar idc = 4*c[i]; 26288f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 26298f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 26308f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 26318f690400SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 26328f690400SShri Abhyankar } 26338f690400SShri Abhyankar 26348f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 26358f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 26368f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26378f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 26388f690400SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 26398f690400SShri Abhyankar PetscFunctionReturn(0); 26408f690400SShri Abhyankar } 26418f690400SShri Abhyankar 26428f690400SShri Abhyankar #undef __FUNCT__ 2643f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2644dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2645f26ec98cSKris Buschelman { 2646f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2647f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 26486849ba73SBarry Smith PetscErrorCode ierr; 26495d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 26505d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2651d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2652d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2653d9fead3dSBarry Smith PetscScalar *x; 2654d9fead3dSBarry Smith const PetscScalar *b; 2655f26ec98cSKris Buschelman 2656f26ec98cSKris Buschelman PetscFunctionBegin; 2657d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26581ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2659f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 2660f26ec98cSKris Buschelman 2661f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2662f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2663f26ec98cSKris Buschelman 2664f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2665f26ec98cSKris Buschelman idx = 4*(*r++); 2666f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 2667f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 2668f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 2669f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 2670f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2671f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2672f26ec98cSKris Buschelman vi = aj + ai[i]; 2673f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2674f26ec98cSKris Buschelman idx = 4*(*r++); 2675f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2676f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2677f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2678f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2679f26ec98cSKris Buschelman while (nz--) { 2680f26ec98cSKris Buschelman idx = 4*(*vi++); 2681f26ec98cSKris Buschelman x1 = t[idx]; 2682f26ec98cSKris Buschelman x2 = t[1+idx]; 2683f26ec98cSKris Buschelman x3 = t[2+idx]; 2684f26ec98cSKris Buschelman x4 = t[3+idx]; 2685f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2686f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2687f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2688f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2689f26ec98cSKris Buschelman v += 16; 2690f26ec98cSKris Buschelman } 2691f26ec98cSKris Buschelman idx = 4*i; 2692f26ec98cSKris Buschelman t[idx] = s1; 2693f26ec98cSKris Buschelman t[1+idx] = s2; 2694f26ec98cSKris Buschelman t[2+idx] = s3; 2695f26ec98cSKris Buschelman t[3+idx] = s4; 2696f26ec98cSKris Buschelman } 2697f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2698f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2699f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 2700f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2701f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2702f26ec98cSKris Buschelman idt = 4*i; 2703f26ec98cSKris Buschelman s1 = t[idt]; 2704f26ec98cSKris Buschelman s2 = t[1+idt]; 2705f26ec98cSKris Buschelman s3 = t[2+idt]; 2706f26ec98cSKris Buschelman s4 = t[3+idt]; 2707f26ec98cSKris Buschelman while (nz--) { 2708f26ec98cSKris Buschelman idx = 4*(*vi++); 2709f26ec98cSKris Buschelman x1 = t[idx]; 2710f26ec98cSKris Buschelman x2 = t[1+idx]; 2711f26ec98cSKris Buschelman x3 = t[2+idx]; 2712f26ec98cSKris Buschelman x4 = t[3+idx]; 2713f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2714f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2715f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2716f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2717f26ec98cSKris Buschelman v += 16; 2718f26ec98cSKris Buschelman } 2719f26ec98cSKris Buschelman idc = 4*(*c--); 2720f26ec98cSKris Buschelman v = aa + 16*diag[i]; 2721f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2722f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2723f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2724f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2725f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 2726f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 2727f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 2728f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 2729f26ec98cSKris Buschelman } 2730f26ec98cSKris Buschelman 2731f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2732f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2733d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27341ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2735dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2736f26ec98cSKris Buschelman PetscFunctionReturn(0); 2737f26ec98cSKris Buschelman } 2738f26ec98cSKris Buschelman 273924c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 274024c233c2SKris Buschelman 274124c233c2SKris Buschelman #include PETSC_HAVE_SSE 274224c233c2SKris Buschelman 274324c233c2SKris Buschelman #undef __FUNCT__ 274424c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2745dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 274624c233c2SKris Buschelman { 274724c233c2SKris Buschelman /* 274824c233c2SKris Buschelman Note: This code uses demotion of double 274924c233c2SKris Buschelman to float when performing the mixed-mode computation. 275024c233c2SKris Buschelman This may not be numerically reasonable for all applications. 275124c233c2SKris Buschelman */ 275224c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 275324c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 27546849ba73SBarry Smith PetscErrorCode ierr; 27555d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 27565d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 275724c233c2SKris Buschelman MatScalar *aa=a->a,*v; 275887828ca2SBarry Smith PetscScalar *x,*b,*t; 275924c233c2SKris Buschelman 276024c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 276124c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 276224c233c2SKris Buschelman unsigned long offset; 276324c233c2SKris Buschelman 276424c233c2SKris Buschelman PetscFunctionBegin; 276524c233c2SKris Buschelman SSE_SCOPE_BEGIN; 276624c233c2SKris Buschelman 276724c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 276824c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 276924c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 277024c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 277124c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 277224c233c2SKris Buschelman 27731ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27741ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 277524c233c2SKris Buschelman t = a->solve_work; 277624c233c2SKris Buschelman 277724c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 277824c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 277924c233c2SKris Buschelman 278024c233c2SKris Buschelman /* forward solve the lower triangular */ 278124c233c2SKris Buschelman idx = 4*(*r++); 278224c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 278324c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 278424c233c2SKris Buschelman v = aa + 16*ai[1]; 278524c233c2SKris Buschelman 278624c233c2SKris Buschelman for (i=1; i<n;) { 278724c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 278824c233c2SKris Buschelman vi = aj + ai[i]; 278924c233c2SKris Buschelman nz = diag[i] - ai[i]; 279024c233c2SKris Buschelman idx = 4*(*r++); 279124c233c2SKris Buschelman 279224c233c2SKris Buschelman /* Demote sum from double to float */ 279324c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 279424c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 279524c233c2SKris Buschelman 279624c233c2SKris Buschelman while (nz--) { 279724c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 279824c233c2SKris Buschelman idx = 4*(*vi++); 279924c233c2SKris Buschelman 280024c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 280124c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 280224c233c2SKris Buschelman 280324c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 280424c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 280524c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 280624c233c2SKris Buschelman 280724c233c2SKris Buschelman /* First Column */ 280824c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 280924c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 281024c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 281124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 281224c233c2SKris Buschelman 281324c233c2SKris Buschelman /* Second Column */ 281424c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 281524c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 281624c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 281724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 281824c233c2SKris Buschelman 281924c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 282024c233c2SKris Buschelman 282124c233c2SKris Buschelman /* Third Column */ 282224c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 282324c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 282424c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 282524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 282624c233c2SKris Buschelman 282724c233c2SKris Buschelman /* Fourth Column */ 282824c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 282924c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 283024c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 283124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 283224c233c2SKris Buschelman SSE_INLINE_END_2 283324c233c2SKris Buschelman 283424c233c2SKris Buschelman v += 16; 283524c233c2SKris Buschelman } 283624c233c2SKris Buschelman idx = 4*i; 283724c233c2SKris Buschelman v = aa + 16*ai[++i]; 283824c233c2SKris Buschelman PREFETCH_NTA(v); 283924c233c2SKris Buschelman STORE_PS(tmps,XMM7); 284024c233c2SKris Buschelman 284124c233c2SKris Buschelman /* Promote result from float to double */ 284224c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 284324c233c2SKris Buschelman } 284424c233c2SKris Buschelman /* backward solve the upper triangular */ 284524c233c2SKris Buschelman idt = 4*(n-1); 284624c233c2SKris Buschelman ai16 = 16*diag[n-1]; 284724c233c2SKris Buschelman v = aa + ai16 + 16; 284824c233c2SKris Buschelman for (i=n-1; i>=0;){ 284924c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 285024c233c2SKris Buschelman vi = aj + diag[i] + 1; 285124c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 285224c233c2SKris Buschelman 285324c233c2SKris Buschelman /* Demote accumulator from double to float */ 285424c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 285524c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 285624c233c2SKris Buschelman 285724c233c2SKris Buschelman while (nz--) { 285824c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 285924c233c2SKris Buschelman idx = 4*(*vi++); 286024c233c2SKris Buschelman 286124c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 286224c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 286324c233c2SKris Buschelman 286424c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 286524c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 286624c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 286724c233c2SKris Buschelman 286824c233c2SKris Buschelman /* First Column */ 286924c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 287024c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 287124c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 287224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 287324c233c2SKris Buschelman 287424c233c2SKris Buschelman /* Second Column */ 287524c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 287624c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 287724c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 287824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 287924c233c2SKris Buschelman 288024c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 288124c233c2SKris Buschelman 288224c233c2SKris Buschelman /* Third Column */ 288324c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 288424c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 288524c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 288624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 288724c233c2SKris Buschelman 288824c233c2SKris Buschelman /* Fourth Column */ 288924c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 289024c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 289124c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 289224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 289324c233c2SKris Buschelman SSE_INLINE_END_2 289424c233c2SKris Buschelman v += 16; 289524c233c2SKris Buschelman } 289624c233c2SKris Buschelman v = aa + ai16; 289724c233c2SKris Buschelman ai16 = 16*diag[--i]; 289824c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 289924c233c2SKris Buschelman /* 290024c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 290124c233c2SKris Buschelman which was inverted as part of the factorization 290224c233c2SKris Buschelman */ 290324c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 290424c233c2SKris Buschelman /* First Column */ 290524c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 290624c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 290724c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 290824c233c2SKris Buschelman 290924c233c2SKris Buschelman /* Second Column */ 291024c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 291124c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 291224c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 291324c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 291424c233c2SKris Buschelman 291524c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 291624c233c2SKris Buschelman 291724c233c2SKris Buschelman /* Third Column */ 291824c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 291924c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 292024c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 292124c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 292224c233c2SKris Buschelman 292324c233c2SKris Buschelman /* Fourth Column */ 292424c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 292524c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 292624c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 292724c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 292824c233c2SKris Buschelman 292924c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 293024c233c2SKris Buschelman SSE_INLINE_END_3 293124c233c2SKris Buschelman 293224c233c2SKris Buschelman /* Promote solution from float to double */ 293324c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 293424c233c2SKris Buschelman 293524c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 293624c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 293724c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 293824c233c2SKris Buschelman idc = 4*(*c--); 293924c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 294024c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 294124c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 294224c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 294324c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 294424c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 294524c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 294624c233c2SKris Buschelman SSE_INLINE_END_2 294724c233c2SKris Buschelman v = aa + ai16 + 16; 294824c233c2SKris Buschelman idt -= 4; 294924c233c2SKris Buschelman } 295024c233c2SKris Buschelman 295124c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 295224c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 29531ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 29541ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2955dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 295624c233c2SKris Buschelman SSE_SCOPE_END; 295724c233c2SKris Buschelman PetscFunctionReturn(0); 295824c233c2SKris Buschelman } 295924c233c2SKris Buschelman 296024c233c2SKris Buschelman #endif 29610ef38995SBarry Smith 29620ef38995SBarry Smith 29634e2b4712SSatish Balay /* 29644e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 29654e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 29664e2b4712SSatish Balay */ 29674a2ae208SSatish Balay #undef __FUNCT__ 29684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2969dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 29704e2b4712SSatish Balay { 29714e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2972356650c2SBarry Smith PetscInt n=a->mbs; 2973356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 2974dfbe8321SBarry Smith PetscErrorCode ierr; 2975356650c2SBarry Smith const PetscInt *diag = a->diag; 2976d9fead3dSBarry Smith const MatScalar *aa=a->a; 2977d9fead3dSBarry Smith PetscScalar *x; 2978d9fead3dSBarry Smith const PetscScalar *b; 29794e2b4712SSatish Balay 29804e2b4712SSatish Balay PetscFunctionBegin; 2981d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 29834e2b4712SSatish Balay 2984aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 29852853dc0eSBarry Smith { 298687828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 29872853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 29882853dc0eSBarry Smith } 2989aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 29902853dc0eSBarry Smith { 299187828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 29922853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 29932853dc0eSBarry Smith } 2994aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 29952853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2996e1293385SBarry Smith #else 299730d4dcafSBarry Smith { 299887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2999d9fead3dSBarry Smith const MatScalar *v; 3000356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 3001356650c2SBarry Smith const PetscInt *vi; 3002e1293385SBarry Smith 30034e2b4712SSatish Balay /* forward solve the lower triangular */ 30044e2b4712SSatish Balay idx = 0; 3005e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 30064e2b4712SSatish Balay for (i=1; i<n; i++) { 30074e2b4712SSatish Balay v = aa + 16*ai[i]; 30084e2b4712SSatish Balay vi = aj + ai[i]; 30094e2b4712SSatish Balay nz = diag[i] - ai[i]; 3010e1293385SBarry Smith idx += 4; 3011f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 30124e2b4712SSatish Balay while (nz--) { 30134e2b4712SSatish Balay jdx = 4*(*vi++); 30144e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3015f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3016f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3017f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3018f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 30194e2b4712SSatish Balay v += 16; 30204e2b4712SSatish Balay } 3021f1af5d2fSBarry Smith x[idx] = s1; 3022f1af5d2fSBarry Smith x[1+idx] = s2; 3023f1af5d2fSBarry Smith x[2+idx] = s3; 3024f1af5d2fSBarry Smith x[3+idx] = s4; 30254e2b4712SSatish Balay } 30264e2b4712SSatish Balay /* backward solve the upper triangular */ 30274e555682SBarry Smith idt = 4*(n-1); 30284e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 30294e555682SBarry Smith ai16 = 16*diag[i]; 30304e555682SBarry Smith v = aa + ai16 + 16; 30314e2b4712SSatish Balay vi = aj + diag[i] + 1; 30324e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3033f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3034f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 30354e2b4712SSatish Balay while (nz--) { 30364e2b4712SSatish Balay idx = 4*(*vi++); 30374e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3038f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3039f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3040f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3041f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 30424e2b4712SSatish Balay v += 16; 30434e2b4712SSatish Balay } 30444e555682SBarry Smith v = aa + ai16; 3045f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3046f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3047f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3048f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3049329f5518SBarry Smith idt -= 4; 30504e2b4712SSatish Balay } 305130d4dcafSBarry Smith } 3052e1293385SBarry Smith #endif 30534e2b4712SSatish Balay 3054d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30551ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3056dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 30574e2b4712SSatish Balay PetscFunctionReturn(0); 30584e2b4712SSatish Balay } 30594e2b4712SSatish Balay 3060f26ec98cSKris Buschelman #undef __FUNCT__ 3061cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3062cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3063cee9d6f2SShri Abhyankar { 3064cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 30656464896eSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3066cee9d6f2SShri Abhyankar PetscErrorCode ierr; 3067cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 3068cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3069cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 3070cee9d6f2SShri Abhyankar PetscScalar *x; 3071cee9d6f2SShri Abhyankar const PetscScalar *b; 3072cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3073cee9d6f2SShri Abhyankar 3074cee9d6f2SShri Abhyankar PetscFunctionBegin; 3075cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3076cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3077cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 3078cee9d6f2SShri Abhyankar idx = 0; 3079cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3080cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 3081cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 3082cee9d6f2SShri Abhyankar vi = aj + ai[i]; 3083cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 3084cee9d6f2SShri Abhyankar idx = bs*i; 3085cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 30866464896eSShri Abhyankar for(k=0;k<nz;k++) { 30876464896eSShri Abhyankar jdx = bs*vi[k]; 3088cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3089cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3090cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3091cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3092cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3093cee9d6f2SShri Abhyankar 3094cee9d6f2SShri Abhyankar v += bs2; 3095cee9d6f2SShri Abhyankar } 3096cee9d6f2SShri Abhyankar 3097cee9d6f2SShri Abhyankar x[idx] = s1; 3098cee9d6f2SShri Abhyankar x[1+idx] = s2; 3099cee9d6f2SShri Abhyankar x[2+idx] = s3; 3100cee9d6f2SShri Abhyankar x[3+idx] = s4; 3101cee9d6f2SShri Abhyankar } 3102cee9d6f2SShri Abhyankar 3103cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 3104cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 3105cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 3106cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 3107cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 3108cee9d6f2SShri Abhyankar idt = bs*i; 3109cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3110cee9d6f2SShri Abhyankar 31116464896eSShri Abhyankar for(k=0;k<nz;k++){ 31126464896eSShri Abhyankar idx = bs*vi[k]; 3113cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3114cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3115cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3116cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3117cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3118cee9d6f2SShri Abhyankar 3119cee9d6f2SShri Abhyankar v += bs2; 3120cee9d6f2SShri Abhyankar } 3121cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 3122cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3123cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3124cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3125cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3126cee9d6f2SShri Abhyankar 3127cee9d6f2SShri Abhyankar } 3128cee9d6f2SShri Abhyankar 3129cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3130cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3131cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3132cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 3133cee9d6f2SShri Abhyankar } 3134cee9d6f2SShri Abhyankar 3135b2b2dd24SShri Abhyankar #undef __FUNCT__ 3136b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2" 3137b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 3138b2b2dd24SShri Abhyankar { 3139b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3140b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3141b2b2dd24SShri Abhyankar PetscErrorCode ierr; 3142b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 3143b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3144b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 3145b2b2dd24SShri Abhyankar PetscScalar *x; 3146b2b2dd24SShri Abhyankar const PetscScalar *b; 3147b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3148cee9d6f2SShri Abhyankar 3149b2b2dd24SShri Abhyankar PetscFunctionBegin; 3150b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3151b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3152b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 3153b2b2dd24SShri Abhyankar idx = 0; 3154b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3155b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 3156b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 3157b2b2dd24SShri Abhyankar vi = aj + ai[i]; 3158b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 3159b2b2dd24SShri Abhyankar idx = bs*i; 3160b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3161b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 3162b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 3163b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3164b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3165b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3166b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3167b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3168b2b2dd24SShri Abhyankar 3169b2b2dd24SShri Abhyankar v += bs2; 3170b2b2dd24SShri Abhyankar } 3171b2b2dd24SShri Abhyankar 3172b2b2dd24SShri Abhyankar x[idx] = s1; 3173b2b2dd24SShri Abhyankar x[1+idx] = s2; 3174b2b2dd24SShri Abhyankar x[2+idx] = s3; 3175b2b2dd24SShri Abhyankar x[3+idx] = s4; 3176b2b2dd24SShri Abhyankar } 3177b2b2dd24SShri Abhyankar 3178b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 3179b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 3180b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 3181b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 3182b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 3183b2b2dd24SShri Abhyankar idt = bs*i; 3184b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3185b2b2dd24SShri Abhyankar 3186b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 3187b2b2dd24SShri Abhyankar idx = bs*vi[k]; 3188b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3189b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3190b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3191b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3192b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3193b2b2dd24SShri Abhyankar 3194b2b2dd24SShri Abhyankar v += bs2; 3195b2b2dd24SShri Abhyankar } 3196b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 3197b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3198b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3199b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3200b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3201b2b2dd24SShri Abhyankar 3202b2b2dd24SShri Abhyankar } 3203b2b2dd24SShri Abhyankar 3204b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3205b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3206b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3207b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 3208b2b2dd24SShri Abhyankar } 3209cee9d6f2SShri Abhyankar 3210cee9d6f2SShri Abhyankar #undef __FUNCT__ 3211f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3212dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3213f26ec98cSKris Buschelman { 3214f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3215690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3216dfbe8321SBarry Smith PetscErrorCode ierr; 3217690b6cddSBarry Smith PetscInt *diag = a->diag; 3218f26ec98cSKris Buschelman MatScalar *aa=a->a; 3219f26ec98cSKris Buschelman PetscScalar *x,*b; 3220f26ec98cSKris Buschelman 3221f26ec98cSKris Buschelman PetscFunctionBegin; 32221ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 32231ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3224f26ec98cSKris Buschelman 3225f26ec98cSKris Buschelman { 3226f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3227f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 3228690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3229f26ec98cSKris Buschelman 3230f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3231f26ec98cSKris Buschelman idx = 0; 3232f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 3233f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 3234f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 3235f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 3236f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3237f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3238f26ec98cSKris Buschelman vi = aj + ai[i]; 3239f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3240f26ec98cSKris Buschelman idx += 4; 3241f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3242f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3243f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3244f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3245f26ec98cSKris Buschelman while (nz--) { 3246f26ec98cSKris Buschelman jdx = 4*(*vi++); 3247f26ec98cSKris Buschelman x1 = t[jdx]; 3248f26ec98cSKris Buschelman x2 = t[1+jdx]; 3249f26ec98cSKris Buschelman x3 = t[2+jdx]; 3250f26ec98cSKris Buschelman x4 = t[3+jdx]; 3251f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3252f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3253f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3254f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3255f26ec98cSKris Buschelman v += 16; 3256f26ec98cSKris Buschelman } 3257f26ec98cSKris Buschelman t[idx] = s1; 3258f26ec98cSKris Buschelman t[1+idx] = s2; 3259f26ec98cSKris Buschelman t[2+idx] = s3; 3260f26ec98cSKris Buschelman t[3+idx] = s4; 3261f26ec98cSKris Buschelman } 3262f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3263f26ec98cSKris Buschelman idt = 4*(n-1); 3264f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3265f26ec98cSKris Buschelman ai16 = 16*diag[i]; 3266f26ec98cSKris Buschelman v = aa + ai16 + 16; 3267f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3268f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3269f26ec98cSKris Buschelman s1 = t[idt]; 3270f26ec98cSKris Buschelman s2 = t[1+idt]; 3271f26ec98cSKris Buschelman s3 = t[2+idt]; 3272f26ec98cSKris Buschelman s4 = t[3+idt]; 3273f26ec98cSKris Buschelman while (nz--) { 3274f26ec98cSKris Buschelman idx = 4*(*vi++); 3275f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 3276f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 3277f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 3278f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 3279f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3280f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3281f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3282f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3283f26ec98cSKris Buschelman v += 16; 3284f26ec98cSKris Buschelman } 3285f26ec98cSKris Buschelman v = aa + ai16; 3286f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3287f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3288f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3289f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3290f26ec98cSKris Buschelman idt -= 4; 3291f26ec98cSKris Buschelman } 3292f26ec98cSKris Buschelman } 3293f26ec98cSKris Buschelman 32941ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 32951ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3296dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3297f26ec98cSKris Buschelman PetscFunctionReturn(0); 3298f26ec98cSKris Buschelman } 3299f26ec98cSKris Buschelman 33003660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 33013660e330SKris Buschelman 33023660e330SKris Buschelman #include PETSC_HAVE_SSE 33033660e330SKris Buschelman #undef __FUNCT__ 33047cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3305dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 33063660e330SKris Buschelman { 33073660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33082aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 3309dfbe8321SBarry Smith PetscErrorCode ierr; 3310dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 33113660e330SKris Buschelman MatScalar *aa=a->a; 331287828ca2SBarry Smith PetscScalar *x,*b; 33133660e330SKris Buschelman 33143660e330SKris Buschelman PetscFunctionBegin; 33153660e330SKris Buschelman SSE_SCOPE_BEGIN; 33163660e330SKris Buschelman /* 33173660e330SKris Buschelman Note: This code currently uses demotion of double 33183660e330SKris Buschelman to float when performing the mixed-mode computation. 33193660e330SKris Buschelman This may not be numerically reasonable for all applications. 33203660e330SKris Buschelman */ 33213660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 33223660e330SKris Buschelman 33231ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 33241ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 33253660e330SKris Buschelman { 3326eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 3327eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 33282aa5897fSKris Buschelman int nz,i,idt,ai16; 33292aa5897fSKris Buschelman unsigned int jdx,idx; 33302aa5897fSKris Buschelman unsigned short *vi; 3331eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 33323660e330SKris Buschelman 3333eb05f457SKris Buschelman /* First block is the identity. */ 33343660e330SKris Buschelman idx = 0; 3335eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 33362aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 33373660e330SKris Buschelman 33383660e330SKris Buschelman for (i=1; i<n;) { 33393660e330SKris Buschelman PREFETCH_NTA(&v[8]); 33403660e330SKris Buschelman vi = aj + ai[i]; 33413660e330SKris Buschelman nz = diag[i] - ai[i]; 33423660e330SKris Buschelman idx += 4; 33433660e330SKris Buschelman 3344eb05f457SKris Buschelman /* Demote RHS from double to float. */ 3345eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3346eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 33473660e330SKris Buschelman 33483660e330SKris Buschelman while (nz--) { 33493660e330SKris Buschelman PREFETCH_NTA(&v[16]); 33502aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 33513660e330SKris Buschelman 33523660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 3353eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 33543660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 33553660e330SKris Buschelman 33563660e330SKris Buschelman /* First Column */ 33573660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 33583660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 33593660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 33603660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 33613660e330SKris Buschelman 33623660e330SKris Buschelman /* Second Column */ 33633660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 33643660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 33653660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 33663660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 33673660e330SKris Buschelman 33683660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 33693660e330SKris Buschelman 33703660e330SKris Buschelman /* Third Column */ 33713660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 33723660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 33733660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 33743660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 33753660e330SKris Buschelman 33763660e330SKris Buschelman /* Fourth Column */ 33773660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 33783660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 33793660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 33803660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 33813660e330SKris Buschelman SSE_INLINE_END_2 33823660e330SKris Buschelman 33833660e330SKris Buschelman v += 16; 33843660e330SKris Buschelman } 33853660e330SKris Buschelman v = aa + 16*ai[++i]; 33863660e330SKris Buschelman PREFETCH_NTA(v); 3387eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 33883660e330SKris Buschelman } 3389eb05f457SKris Buschelman 3390eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 3391eb05f457SKris Buschelman 33923660e330SKris Buschelman idt = 4*(n-1); 33933660e330SKris Buschelman ai16 = 16*diag[n-1]; 33943660e330SKris Buschelman v = aa + ai16 + 16; 33953660e330SKris Buschelman for (i=n-1; i>=0;){ 33963660e330SKris Buschelman PREFETCH_NTA(&v[8]); 33973660e330SKris Buschelman vi = aj + diag[i] + 1; 33983660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 33993660e330SKris Buschelman 3400eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 34013660e330SKris Buschelman 34023660e330SKris Buschelman while (nz--) { 34033660e330SKris Buschelman PREFETCH_NTA(&v[16]); 34042aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 34053660e330SKris Buschelman 34063660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 3407eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 34083660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 34093660e330SKris Buschelman 34103660e330SKris Buschelman /* First Column */ 34113660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 34123660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 34133660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 34143660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 34153660e330SKris Buschelman 34163660e330SKris Buschelman /* Second Column */ 34173660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 34183660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 34193660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 34203660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 34213660e330SKris Buschelman 34223660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 34233660e330SKris Buschelman 34243660e330SKris Buschelman /* Third Column */ 34253660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 34263660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 34273660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 34283660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 34293660e330SKris Buschelman 34303660e330SKris Buschelman /* Fourth Column */ 34313660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 34323660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 34333660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 34343660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 34353660e330SKris Buschelman SSE_INLINE_END_2 34363660e330SKris Buschelman v += 16; 34373660e330SKris Buschelman } 34383660e330SKris Buschelman v = aa + ai16; 34393660e330SKris Buschelman ai16 = 16*diag[--i]; 34403660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 34413660e330SKris Buschelman /* 34423660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 34433660e330SKris Buschelman which was inverted as part of the factorization 34443660e330SKris Buschelman */ 3445eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 34463660e330SKris Buschelman /* First Column */ 34473660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 34483660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 34493660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 34503660e330SKris Buschelman 34513660e330SKris Buschelman /* Second Column */ 34523660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 34533660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 34543660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 34553660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 34563660e330SKris Buschelman 34573660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 34583660e330SKris Buschelman 34593660e330SKris Buschelman /* Third Column */ 34603660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 34613660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 34623660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 34633660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 34643660e330SKris Buschelman 34653660e330SKris Buschelman /* Fourth Column */ 34663660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 34673660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 34683660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 34693660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 34703660e330SKris Buschelman 34713660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 34723660e330SKris Buschelman SSE_INLINE_END_3 34733660e330SKris Buschelman 34743660e330SKris Buschelman v = aa + ai16 + 16; 34753660e330SKris Buschelman idt -= 4; 34763660e330SKris Buschelman } 3477eb05f457SKris Buschelman 3478eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 3479eb05f457SKris Buschelman idt = 4*(n-1); 3480eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 3481eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3482eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3483eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 3484eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 3485eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 3486eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 3487eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 3488eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 348954693613SKris Buschelman idt -= 4; 34903660e330SKris Buschelman } 3491eb05f457SKris Buschelman 3492eb05f457SKris Buschelman } /* End of artificial scope. */ 34931ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 34941ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3495dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 34963660e330SKris Buschelman SSE_SCOPE_END; 34973660e330SKris Buschelman PetscFunctionReturn(0); 34983660e330SKris Buschelman } 34993660e330SKris Buschelman 35007cf1b8d3SKris Buschelman #undef __FUNCT__ 35017cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3502dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 35037cf1b8d3SKris Buschelman { 35047cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 35057cf1b8d3SKris Buschelman int *aj=a->j; 3506dfbe8321SBarry Smith PetscErrorCode ierr; 3507dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 35087cf1b8d3SKris Buschelman MatScalar *aa=a->a; 35097cf1b8d3SKris Buschelman PetscScalar *x,*b; 35107cf1b8d3SKris Buschelman 35117cf1b8d3SKris Buschelman PetscFunctionBegin; 35127cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 35137cf1b8d3SKris Buschelman /* 35147cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 35157cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 35167cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 35177cf1b8d3SKris Buschelman */ 35187cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 35197cf1b8d3SKris Buschelman 35201ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 35211ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 35227cf1b8d3SKris Buschelman { 35237cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 35247cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 35257cf1b8d3SKris Buschelman int nz,i,idt,ai16; 35267cf1b8d3SKris Buschelman int jdx,idx; 35277cf1b8d3SKris Buschelman int *vi; 35287cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 35297cf1b8d3SKris Buschelman 35307cf1b8d3SKris Buschelman /* First block is the identity. */ 35317cf1b8d3SKris Buschelman idx = 0; 35327cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 35337cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 35347cf1b8d3SKris Buschelman 35357cf1b8d3SKris Buschelman for (i=1; i<n;) { 35367cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 35377cf1b8d3SKris Buschelman vi = aj + ai[i]; 35387cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 35397cf1b8d3SKris Buschelman idx += 4; 35407cf1b8d3SKris Buschelman 35417cf1b8d3SKris Buschelman /* Demote RHS from double to float. */ 35427cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 35437cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 35447cf1b8d3SKris Buschelman 35457cf1b8d3SKris Buschelman while (nz--) { 35467cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 35477cf1b8d3SKris Buschelman jdx = 4*(*vi++); 35487cf1b8d3SKris Buschelman /* jdx = *vi++; */ 35497cf1b8d3SKris Buschelman 35507cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 35517cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 35527cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 35537cf1b8d3SKris Buschelman 35547cf1b8d3SKris Buschelman /* First Column */ 35557cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 35567cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 35577cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 35587cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 35597cf1b8d3SKris Buschelman 35607cf1b8d3SKris Buschelman /* Second Column */ 35617cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 35627cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 35637cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 35647cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 35657cf1b8d3SKris Buschelman 35667cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 35677cf1b8d3SKris Buschelman 35687cf1b8d3SKris Buschelman /* Third Column */ 35697cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 35707cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 35717cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 35727cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 35737cf1b8d3SKris Buschelman 35747cf1b8d3SKris Buschelman /* Fourth Column */ 35757cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 35767cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 35777cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 35787cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 35797cf1b8d3SKris Buschelman SSE_INLINE_END_2 35807cf1b8d3SKris Buschelman 35817cf1b8d3SKris Buschelman v += 16; 35827cf1b8d3SKris Buschelman } 35837cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 35847cf1b8d3SKris Buschelman PREFETCH_NTA(v); 35857cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 35867cf1b8d3SKris Buschelman } 35877cf1b8d3SKris Buschelman 35887cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 35897cf1b8d3SKris Buschelman 35907cf1b8d3SKris Buschelman idt = 4*(n-1); 35917cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 35927cf1b8d3SKris Buschelman v = aa + ai16 + 16; 35937cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 35947cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 35957cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 35967cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 35977cf1b8d3SKris Buschelman 35987cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 35997cf1b8d3SKris Buschelman 36007cf1b8d3SKris Buschelman while (nz--) { 36017cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 36027cf1b8d3SKris Buschelman idx = 4*(*vi++); 36037cf1b8d3SKris Buschelman /* idx = *vi++; */ 36047cf1b8d3SKris Buschelman 36057cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 36067cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 36077cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 36087cf1b8d3SKris Buschelman 36097cf1b8d3SKris Buschelman /* First Column */ 36107cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 36117cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 36127cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 36137cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 36147cf1b8d3SKris Buschelman 36157cf1b8d3SKris Buschelman /* Second Column */ 36167cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 36177cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 36187cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 36197cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 36207cf1b8d3SKris Buschelman 36217cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 36227cf1b8d3SKris Buschelman 36237cf1b8d3SKris Buschelman /* Third Column */ 36247cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 36257cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 36267cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 36277cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 36287cf1b8d3SKris Buschelman 36297cf1b8d3SKris Buschelman /* Fourth Column */ 36307cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 36317cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 36327cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 36337cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 36347cf1b8d3SKris Buschelman SSE_INLINE_END_2 36357cf1b8d3SKris Buschelman v += 16; 36367cf1b8d3SKris Buschelman } 36377cf1b8d3SKris Buschelman v = aa + ai16; 36387cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 36397cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 36407cf1b8d3SKris Buschelman /* 36417cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 36427cf1b8d3SKris Buschelman which was inverted as part of the factorization 36437cf1b8d3SKris Buschelman */ 36447cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 36457cf1b8d3SKris Buschelman /* First Column */ 36467cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 36477cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 36487cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 36497cf1b8d3SKris Buschelman 36507cf1b8d3SKris Buschelman /* Second Column */ 36517cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 36527cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 36537cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 36547cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 36557cf1b8d3SKris Buschelman 36567cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 36577cf1b8d3SKris Buschelman 36587cf1b8d3SKris Buschelman /* Third Column */ 36597cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 36607cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 36617cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 36627cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 36637cf1b8d3SKris Buschelman 36647cf1b8d3SKris Buschelman /* Fourth Column */ 36657cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 36667cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 36677cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 36687cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 36697cf1b8d3SKris Buschelman 36707cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 36717cf1b8d3SKris Buschelman SSE_INLINE_END_3 36727cf1b8d3SKris Buschelman 36737cf1b8d3SKris Buschelman v = aa + ai16 + 16; 36747cf1b8d3SKris Buschelman idt -= 4; 36757cf1b8d3SKris Buschelman } 36767cf1b8d3SKris Buschelman 36777cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 36787cf1b8d3SKris Buschelman idt = 4*(n-1); 36797cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 36807cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 36817cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 36827cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 36837cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 36847cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 36857cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 36867cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 36877cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 36887cf1b8d3SKris Buschelman idt -= 4; 36897cf1b8d3SKris Buschelman } 36907cf1b8d3SKris Buschelman 36917cf1b8d3SKris Buschelman } /* End of artificial scope. */ 36921ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 36931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3694dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 36957cf1b8d3SKris Buschelman SSE_SCOPE_END; 36967cf1b8d3SKris Buschelman PetscFunctionReturn(0); 36977cf1b8d3SKris Buschelman } 36987cf1b8d3SKris Buschelman 36993660e330SKris Buschelman #endif 37008f690400SShri Abhyankar 37014a2ae208SSatish Balay #undef __FUNCT__ 37024a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3703dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 37044e2b4712SSatish Balay { 37054e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 37064e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 37076849ba73SBarry Smith PetscErrorCode ierr; 37085d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 37095d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3710d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3711d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3712d9fead3dSBarry Smith const PetscScalar *b; 37134e2b4712SSatish Balay 37144e2b4712SSatish Balay PetscFunctionBegin; 3715d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 37161ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3717f1af5d2fSBarry Smith t = a->solve_work; 37184e2b4712SSatish Balay 37194e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 37204e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 37214e2b4712SSatish Balay 37224e2b4712SSatish Balay /* forward solve the lower triangular */ 37234e2b4712SSatish Balay idx = 3*(*r++); 3724f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 37254e2b4712SSatish Balay for (i=1; i<n; i++) { 37264e2b4712SSatish Balay v = aa + 9*ai[i]; 37274e2b4712SSatish Balay vi = aj + ai[i]; 37284e2b4712SSatish Balay nz = diag[i] - ai[i]; 37294e2b4712SSatish Balay idx = 3*(*r++); 3730f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 37314e2b4712SSatish Balay while (nz--) { 37324e2b4712SSatish Balay idx = 3*(*vi++); 3733f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3734f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3735f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3736f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 37374e2b4712SSatish Balay v += 9; 37384e2b4712SSatish Balay } 37394e2b4712SSatish Balay idx = 3*i; 3740f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 37414e2b4712SSatish Balay } 37424e2b4712SSatish Balay /* backward solve the upper triangular */ 37434e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 37444e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 37454e2b4712SSatish Balay vi = aj + diag[i] + 1; 37464e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 37474e2b4712SSatish Balay idt = 3*i; 3748f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 37494e2b4712SSatish Balay while (nz--) { 37504e2b4712SSatish Balay idx = 3*(*vi++); 3751f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3752f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3753f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3754f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 37554e2b4712SSatish Balay v += 9; 37564e2b4712SSatish Balay } 37574e2b4712SSatish Balay idc = 3*(*c--); 37584e2b4712SSatish Balay v = aa + 9*diag[i]; 3759f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3760f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3761f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 37624e2b4712SSatish Balay } 37634e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 37644e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3765d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 37661ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3767dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 37684e2b4712SSatish Balay PetscFunctionReturn(0); 37694e2b4712SSatish Balay } 37704e2b4712SSatish Balay 37718f690400SShri Abhyankar #undef __FUNCT__ 37728f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 37738f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 37748f690400SShri Abhyankar { 37758f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 37768f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 37778f690400SShri Abhyankar PetscErrorCode ierr; 377829b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 37798f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 37808f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 37818f690400SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 37828f690400SShri Abhyankar const PetscScalar *b; 37838f690400SShri Abhyankar 37848f690400SShri Abhyankar PetscFunctionBegin; 37858f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 37868f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 37878f690400SShri Abhyankar t = a->solve_work; 37888f690400SShri Abhyankar 37898f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 379029b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 37918f690400SShri Abhyankar 37928f690400SShri Abhyankar /* forward solve the lower triangular */ 379329b92fc1SShri Abhyankar idx = 3*r[0]; 37948f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 37958f690400SShri Abhyankar for (i=1; i<n; i++) { 37968f690400SShri Abhyankar v = aa + 9*ai[i]; 37978f690400SShri Abhyankar vi = aj + ai[i]; 37988f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 379929b92fc1SShri Abhyankar idx = 3*r[i]; 38008f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 380129b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 380229b92fc1SShri Abhyankar idx = 3*vi[m]; 38038f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 38048f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 38058f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 38068f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 38078f690400SShri Abhyankar v += 9; 38088f690400SShri Abhyankar } 38098f690400SShri Abhyankar idx = 3*i; 38108f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 38118f690400SShri Abhyankar } 38128f690400SShri Abhyankar /* backward solve the upper triangular */ 38138f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 38148f690400SShri Abhyankar k = 2*n-i; 38158f690400SShri Abhyankar v = aa + 9*ai[k]; 38168f690400SShri Abhyankar vi = aj + ai[k]; 38178f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 38188f690400SShri Abhyankar idt = 3*i; 38198f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 382029b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 382129b92fc1SShri Abhyankar idx = 3*vi[m]; 38228f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 38238f690400SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 38248f690400SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 38258f690400SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 38268f690400SShri Abhyankar v += 9; 38278f690400SShri Abhyankar } 382829b92fc1SShri Abhyankar idc = 3*c[i]; 38298f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 38308f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 38318f690400SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 38328f690400SShri Abhyankar } 38338f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 38348f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 38358f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 38368f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 38378f690400SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 38388f690400SShri Abhyankar PetscFunctionReturn(0); 38398f690400SShri Abhyankar } 38408f690400SShri Abhyankar 3841*0c4413a7SShri Abhyankar #undef __FUNCT__ 3842*0c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2" 3843*0c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx) 3844*0c4413a7SShri Abhyankar { 3845*0c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3846*0c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 3847*0c4413a7SShri Abhyankar PetscErrorCode ierr; 3848*0c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3849*0c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 3850*0c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 3851*0c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3852*0c4413a7SShri Abhyankar const PetscScalar *b; 3853*0c4413a7SShri Abhyankar 3854*0c4413a7SShri Abhyankar PetscFunctionBegin; 3855*0c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3856*0c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3857*0c4413a7SShri Abhyankar t = a->solve_work; 3858*0c4413a7SShri Abhyankar 3859*0c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3860*0c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3861*0c4413a7SShri Abhyankar 3862*0c4413a7SShri Abhyankar /* forward solve the lower triangular */ 3863*0c4413a7SShri Abhyankar idx = 3*r[0]; 3864*0c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3865*0c4413a7SShri Abhyankar for (i=1; i<n; i++) { 3866*0c4413a7SShri Abhyankar v = aa + 9*ai[i]; 3867*0c4413a7SShri Abhyankar vi = aj + ai[i]; 3868*0c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 3869*0c4413a7SShri Abhyankar idx = 3*r[i]; 3870*0c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3871*0c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 3872*0c4413a7SShri Abhyankar idx = 3*vi[m]; 3873*0c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3874*0c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3875*0c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3876*0c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3877*0c4413a7SShri Abhyankar v += 9; 3878*0c4413a7SShri Abhyankar } 3879*0c4413a7SShri Abhyankar idx = 3*i; 3880*0c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3881*0c4413a7SShri Abhyankar } 3882*0c4413a7SShri Abhyankar /* backward solve the upper triangular */ 3883*0c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 3884*0c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 3885*0c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 3886*0c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 3887*0c4413a7SShri Abhyankar idt = 3*i; 3888*0c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 3889*0c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 3890*0c4413a7SShri Abhyankar idx = 3*vi[m]; 3891*0c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3892*0c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3893*0c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3894*0c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3895*0c4413a7SShri Abhyankar v += 9; 3896*0c4413a7SShri Abhyankar } 3897*0c4413a7SShri Abhyankar idc = 3*c[i]; 3898*0c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3899*0c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3900*0c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3901*0c4413a7SShri Abhyankar } 3902*0c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3903*0c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3904*0c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3905*0c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3906*0c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3907*0c4413a7SShri Abhyankar PetscFunctionReturn(0); 3908*0c4413a7SShri Abhyankar } 3909*0c4413a7SShri Abhyankar 391015091d37SBarry Smith /* 391115091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 391215091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 391315091d37SBarry Smith */ 39144a2ae208SSatish Balay #undef __FUNCT__ 39154a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 3916dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 391715091d37SBarry Smith { 391815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3919690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3920dfbe8321SBarry Smith PetscErrorCode ierr; 3921690b6cddSBarry Smith PetscInt *diag = a->diag; 3922d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3923d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 3924d9fead3dSBarry Smith const PetscScalar *b; 3925690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 392615091d37SBarry Smith 392715091d37SBarry Smith PetscFunctionBegin; 3928d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39291ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 393015091d37SBarry Smith 393115091d37SBarry Smith /* forward solve the lower triangular */ 393215091d37SBarry Smith idx = 0; 393315091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 393415091d37SBarry Smith for (i=1; i<n; i++) { 393515091d37SBarry Smith v = aa + 9*ai[i]; 393615091d37SBarry Smith vi = aj + ai[i]; 393715091d37SBarry Smith nz = diag[i] - ai[i]; 393815091d37SBarry Smith idx += 3; 3939f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 394015091d37SBarry Smith while (nz--) { 394115091d37SBarry Smith jdx = 3*(*vi++); 394215091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 3943f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3944f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3945f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 394615091d37SBarry Smith v += 9; 394715091d37SBarry Smith } 3948f1af5d2fSBarry Smith x[idx] = s1; 3949f1af5d2fSBarry Smith x[1+idx] = s2; 3950f1af5d2fSBarry Smith x[2+idx] = s3; 395115091d37SBarry Smith } 395215091d37SBarry Smith /* backward solve the upper triangular */ 395315091d37SBarry Smith for (i=n-1; i>=0; i--){ 395415091d37SBarry Smith v = aa + 9*diag[i] + 9; 395515091d37SBarry Smith vi = aj + diag[i] + 1; 395615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 395715091d37SBarry Smith idt = 3*i; 3958f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3959f1af5d2fSBarry Smith s3 = x[2+idt]; 396015091d37SBarry Smith while (nz--) { 396115091d37SBarry Smith idx = 3*(*vi++); 396215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 3963f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3964f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3965f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 396615091d37SBarry Smith v += 9; 396715091d37SBarry Smith } 396815091d37SBarry Smith v = aa + 9*diag[i]; 3969f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3970f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3971f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 397215091d37SBarry Smith } 397315091d37SBarry Smith 3974d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39751ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3976dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 397715091d37SBarry Smith PetscFunctionReturn(0); 397815091d37SBarry Smith } 397915091d37SBarry Smith 39804a2ae208SSatish Balay #undef __FUNCT__ 3981cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 3982cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3983cee9d6f2SShri Abhyankar { 3984cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3985ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3986cee9d6f2SShri Abhyankar PetscErrorCode ierr; 3987cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 3988cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3989cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 3990cee9d6f2SShri Abhyankar PetscScalar *x; 3991cee9d6f2SShri Abhyankar const PetscScalar *b; 3992cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 3993cee9d6f2SShri Abhyankar 3994cee9d6f2SShri Abhyankar PetscFunctionBegin; 3995cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3996cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3997cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 3998cee9d6f2SShri Abhyankar idx = 0; 3999cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4000cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 4001cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 4002cee9d6f2SShri Abhyankar vi = aj + ai[i]; 4003cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 4004cee9d6f2SShri Abhyankar idx = bs*i; 4005cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4006ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4007ce3d78c0SShri Abhyankar jdx = bs*vi[k]; 4008cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4009cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4010cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4011cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4012cee9d6f2SShri Abhyankar 4013cee9d6f2SShri Abhyankar v += bs2; 4014cee9d6f2SShri Abhyankar } 4015cee9d6f2SShri Abhyankar 4016cee9d6f2SShri Abhyankar x[idx] = s1; 4017cee9d6f2SShri Abhyankar x[1+idx] = s2; 4018cee9d6f2SShri Abhyankar x[2+idx] = s3; 4019cee9d6f2SShri Abhyankar } 4020cee9d6f2SShri Abhyankar 4021cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 4022cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 4023cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 4024cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 4025cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 4026cee9d6f2SShri Abhyankar idt = bs*i; 4027cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4028cee9d6f2SShri Abhyankar 4029ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4030ce3d78c0SShri Abhyankar idx = bs*vi[k]; 4031cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4032cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4033cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4034cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4035cee9d6f2SShri Abhyankar 4036cee9d6f2SShri Abhyankar v += bs2; 4037cee9d6f2SShri Abhyankar } 4038cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 4039cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4040cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4041cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4042cee9d6f2SShri Abhyankar 4043cee9d6f2SShri Abhyankar } 4044cee9d6f2SShri Abhyankar 4045cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4046cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4047cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4048cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 4049cee9d6f2SShri Abhyankar } 4050cee9d6f2SShri Abhyankar 4051cee9d6f2SShri Abhyankar #undef __FUNCT__ 4052b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2" 4053b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4054b2b2dd24SShri Abhyankar { 4055b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4056b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4057b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4058b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 4059b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4060b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4061b2b2dd24SShri Abhyankar PetscScalar *x; 4062b2b2dd24SShri Abhyankar const PetscScalar *b; 4063b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4064b2b2dd24SShri Abhyankar 4065b2b2dd24SShri Abhyankar PetscFunctionBegin; 4066b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4067b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4068b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4069b2b2dd24SShri Abhyankar idx = 0; 4070b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4071b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4072b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4073b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4074b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4075b2b2dd24SShri Abhyankar idx = bs*i; 4076b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4077b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4078b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4079b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4080b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4081b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4082b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4083b2b2dd24SShri Abhyankar 4084b2b2dd24SShri Abhyankar v += bs2; 4085b2b2dd24SShri Abhyankar } 4086b2b2dd24SShri Abhyankar 4087b2b2dd24SShri Abhyankar x[idx] = s1; 4088b2b2dd24SShri Abhyankar x[1+idx] = s2; 4089b2b2dd24SShri Abhyankar x[2+idx] = s3; 4090b2b2dd24SShri Abhyankar } 4091b2b2dd24SShri Abhyankar 4092b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4093b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4094b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4095b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4096b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4097b2b2dd24SShri Abhyankar idt = bs*i; 4098b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4099b2b2dd24SShri Abhyankar 4100b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4101b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4102b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4103b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4104b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4105b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4106b2b2dd24SShri Abhyankar 4107b2b2dd24SShri Abhyankar v += bs2; 4108b2b2dd24SShri Abhyankar } 4109b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4110b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4111b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4112b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4113b2b2dd24SShri Abhyankar 4114b2b2dd24SShri Abhyankar } 4115b2b2dd24SShri Abhyankar 4116b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4117b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4118b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4119b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4120b2b2dd24SShri Abhyankar } 4121b2b2dd24SShri Abhyankar 4122b2b2dd24SShri Abhyankar #undef __FUNCT__ 41234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4124dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 41254e2b4712SSatish Balay { 41264e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 41274e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 41286849ba73SBarry Smith PetscErrorCode ierr; 41295d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 41305d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4131d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4132d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 4133d9fead3dSBarry Smith const PetscScalar *b; 41344e2b4712SSatish Balay 41354e2b4712SSatish Balay PetscFunctionBegin; 4136d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41371ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4138f1af5d2fSBarry Smith t = a->solve_work; 41394e2b4712SSatish Balay 41404e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 41414e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 41424e2b4712SSatish Balay 41434e2b4712SSatish Balay /* forward solve the lower triangular */ 41444e2b4712SSatish Balay idx = 2*(*r++); 4145f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 41464e2b4712SSatish Balay for (i=1; i<n; i++) { 41474e2b4712SSatish Balay v = aa + 4*ai[i]; 41484e2b4712SSatish Balay vi = aj + ai[i]; 41494e2b4712SSatish Balay nz = diag[i] - ai[i]; 41504e2b4712SSatish Balay idx = 2*(*r++); 4151f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 41524e2b4712SSatish Balay while (nz--) { 41534e2b4712SSatish Balay idx = 2*(*vi++); 4154f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4155f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4156f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 41574e2b4712SSatish Balay v += 4; 41584e2b4712SSatish Balay } 41594e2b4712SSatish Balay idx = 2*i; 4160f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 41614e2b4712SSatish Balay } 41624e2b4712SSatish Balay /* backward solve the upper triangular */ 41634e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 41644e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 41654e2b4712SSatish Balay vi = aj + diag[i] + 1; 41664e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 41674e2b4712SSatish Balay idt = 2*i; 4168f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 41694e2b4712SSatish Balay while (nz--) { 41704e2b4712SSatish Balay idx = 2*(*vi++); 4171f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4172f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4173f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 41744e2b4712SSatish Balay v += 4; 41754e2b4712SSatish Balay } 41764e2b4712SSatish Balay idc = 2*(*c--); 41774e2b4712SSatish Balay v = aa + 4*diag[i]; 4178f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4179f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 41804e2b4712SSatish Balay } 41814e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 41824e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4183d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41841ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4185dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 41864e2b4712SSatish Balay PetscFunctionReturn(0); 41874e2b4712SSatish Balay } 41884e2b4712SSatish Balay 41898f690400SShri Abhyankar #undef __FUNCT__ 41908f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 41918f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 41928f690400SShri Abhyankar { 41938f690400SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 41948f690400SShri Abhyankar IS iscol=a->col,isrow=a->row; 41958f690400SShri Abhyankar PetscErrorCode ierr; 419629b92fc1SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 41978f690400SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 41988f690400SShri Abhyankar const MatScalar *aa=a->a,*v; 41998f690400SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 42008f690400SShri Abhyankar const PetscScalar *b; 42018f690400SShri Abhyankar 42028f690400SShri Abhyankar PetscFunctionBegin; 42038f690400SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42048f690400SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 42058f690400SShri Abhyankar t = a->solve_work; 42068f690400SShri Abhyankar 42078f690400SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 420829b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 42098f690400SShri Abhyankar 42108f690400SShri Abhyankar /* forward solve the lower triangular */ 421129b92fc1SShri Abhyankar idx = 2*r[0]; 42128f690400SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 42138f690400SShri Abhyankar for (i=1; i<n; i++) { 42148f690400SShri Abhyankar v = aa + 4*ai[i]; 42158f690400SShri Abhyankar vi = aj + ai[i]; 42168f690400SShri Abhyankar nz = ai[i+1] - ai[i]; 421729b92fc1SShri Abhyankar idx = 2*r[i]; 42188f690400SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 421929b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 422029b92fc1SShri Abhyankar jdx = 2*vi[m]; 42218f690400SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 42228f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 42238f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 42248f690400SShri Abhyankar v += 4; 42258f690400SShri Abhyankar } 42268f690400SShri Abhyankar idx = 2*i; 42278f690400SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 42288f690400SShri Abhyankar } 42298f690400SShri Abhyankar /* backward solve the upper triangular */ 42308f690400SShri Abhyankar for (i=n-1; i>=0; i--){ 42318f690400SShri Abhyankar k = 2*n-i; 42328f690400SShri Abhyankar v = aa + 4*ai[k]; 42338f690400SShri Abhyankar vi = aj + ai[k]; 42348f690400SShri Abhyankar nz = ai[k +1] - ai[k] - 1; 42358f690400SShri Abhyankar idt = 2*i; 42368f690400SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 423729b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 423829b92fc1SShri Abhyankar idx = 2*vi[m]; 42398f690400SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 42408f690400SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 42418f690400SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 42428f690400SShri Abhyankar v += 4; 42438f690400SShri Abhyankar } 424429b92fc1SShri Abhyankar idc = 2*c[i]; 42458f690400SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 42468f690400SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 42478f690400SShri Abhyankar } 42488f690400SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 42498f690400SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 42508f690400SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42518f690400SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 42528f690400SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 42538f690400SShri Abhyankar PetscFunctionReturn(0); 42548f690400SShri Abhyankar } 42558f690400SShri Abhyankar 4256*0c4413a7SShri Abhyankar #undef __FUNCT__ 4257*0c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2" 4258*0c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4259*0c4413a7SShri Abhyankar { 4260*0c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4261*0c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 4262*0c4413a7SShri Abhyankar PetscErrorCode ierr; 4263*0c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 4264*0c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 4265*0c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 4266*0c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 4267*0c4413a7SShri Abhyankar const PetscScalar *b; 4268*0c4413a7SShri Abhyankar 4269*0c4413a7SShri Abhyankar PetscFunctionBegin; 4270*0c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4271*0c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4272*0c4413a7SShri Abhyankar t = a->solve_work; 4273*0c4413a7SShri Abhyankar 4274*0c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4275*0c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4276*0c4413a7SShri Abhyankar 4277*0c4413a7SShri Abhyankar /* forward solve the lower triangular */ 4278*0c4413a7SShri Abhyankar idx = 2*r[0]; 4279*0c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 4280*0c4413a7SShri Abhyankar for (i=1; i<n; i++) { 4281*0c4413a7SShri Abhyankar v = aa + 4*ai[i]; 4282*0c4413a7SShri Abhyankar vi = aj + ai[i]; 4283*0c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 4284*0c4413a7SShri Abhyankar idx = 2*r[i]; 4285*0c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 4286*0c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 4287*0c4413a7SShri Abhyankar jdx = 2*vi[m]; 4288*0c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 4289*0c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4290*0c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4291*0c4413a7SShri Abhyankar v += 4; 4292*0c4413a7SShri Abhyankar } 4293*0c4413a7SShri Abhyankar idx = 2*i; 4294*0c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 4295*0c4413a7SShri Abhyankar } 4296*0c4413a7SShri Abhyankar /* backward solve the upper triangular */ 4297*0c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 4298*0c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 4299*0c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 4300*0c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 4301*0c4413a7SShri Abhyankar idt = 2*i; 4302*0c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 4303*0c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 4304*0c4413a7SShri Abhyankar idx = 2*vi[m]; 4305*0c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 4306*0c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4307*0c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4308*0c4413a7SShri Abhyankar v += 4; 4309*0c4413a7SShri Abhyankar } 4310*0c4413a7SShri Abhyankar idc = 2*c[i]; 4311*0c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4312*0c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4313*0c4413a7SShri Abhyankar } 4314*0c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4315*0c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4316*0c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4317*0c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4318*0c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4319*0c4413a7SShri Abhyankar PetscFunctionReturn(0); 4320*0c4413a7SShri Abhyankar } 43218f690400SShri Abhyankar 432215091d37SBarry Smith /* 432315091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 432415091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 432515091d37SBarry Smith */ 43264a2ae208SSatish Balay #undef __FUNCT__ 43274a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4328dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 432915091d37SBarry Smith { 433015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4331690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4332dfbe8321SBarry Smith PetscErrorCode ierr; 4333690b6cddSBarry Smith PetscInt *diag = a->diag; 4334d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4335d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 4336d9fead3dSBarry Smith const PetscScalar *b; 4337690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 433815091d37SBarry Smith 433915091d37SBarry Smith PetscFunctionBegin; 4340d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43411ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 434215091d37SBarry Smith 434315091d37SBarry Smith /* forward solve the lower triangular */ 434415091d37SBarry Smith idx = 0; 434515091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 434615091d37SBarry Smith for (i=1; i<n; i++) { 434715091d37SBarry Smith v = aa + 4*ai[i]; 434815091d37SBarry Smith vi = aj + ai[i]; 434915091d37SBarry Smith nz = diag[i] - ai[i]; 435015091d37SBarry Smith idx += 2; 4351f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 435215091d37SBarry Smith while (nz--) { 435315091d37SBarry Smith jdx = 2*(*vi++); 435415091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 4355f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4356f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 435715091d37SBarry Smith v += 4; 435815091d37SBarry Smith } 4359f1af5d2fSBarry Smith x[idx] = s1; 4360f1af5d2fSBarry Smith x[1+idx] = s2; 436115091d37SBarry Smith } 436215091d37SBarry Smith /* backward solve the upper triangular */ 436315091d37SBarry Smith for (i=n-1; i>=0; i--){ 436415091d37SBarry Smith v = aa + 4*diag[i] + 4; 436515091d37SBarry Smith vi = aj + diag[i] + 1; 436615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 436715091d37SBarry Smith idt = 2*i; 4368f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 436915091d37SBarry Smith while (nz--) { 437015091d37SBarry Smith idx = 2*(*vi++); 437115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 4372f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4373f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 437415091d37SBarry Smith v += 4; 437515091d37SBarry Smith } 437615091d37SBarry Smith v = aa + 4*diag[i]; 4377f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 4378f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 437915091d37SBarry Smith } 438015091d37SBarry Smith 4381d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43821ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4383dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 438415091d37SBarry Smith PetscFunctionReturn(0); 438515091d37SBarry Smith } 438615091d37SBarry Smith 43874a2ae208SSatish Balay #undef __FUNCT__ 4388cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4389cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4390cee9d6f2SShri Abhyankar { 4391cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4392ce3d78c0SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4393cee9d6f2SShri Abhyankar PetscErrorCode ierr; 4394cee9d6f2SShri Abhyankar PetscInt jdx; 4395cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 4396cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 4397cee9d6f2SShri Abhyankar const PetscScalar *b; 4398cee9d6f2SShri Abhyankar 4399cee9d6f2SShri Abhyankar PetscFunctionBegin; 4400cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4401cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4402cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 4403cee9d6f2SShri Abhyankar idx = 0; 4404cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4405cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 4406cee9d6f2SShri Abhyankar v = aa + 4*ai[i]; 4407cee9d6f2SShri Abhyankar vi = aj + ai[i]; 4408cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 4409cee9d6f2SShri Abhyankar idx = 2*i; 4410cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4411ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4412ce3d78c0SShri Abhyankar jdx = 2*vi[k]; 4413cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4414cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4415cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4416cee9d6f2SShri Abhyankar v += 4; 4417cee9d6f2SShri Abhyankar } 4418cee9d6f2SShri Abhyankar x[idx] = s1; 4419cee9d6f2SShri Abhyankar x[1+idx] = s2; 4420cee9d6f2SShri Abhyankar } 4421cee9d6f2SShri Abhyankar 4422cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 4423cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 4424cee9d6f2SShri Abhyankar v = aa + 4*ai[2*n-i]; 4425cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 4426cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 4427cee9d6f2SShri Abhyankar idt = 2*i; 4428cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4429ce3d78c0SShri Abhyankar for(k=0;k<nz;k++){ 4430ce3d78c0SShri Abhyankar idx = 2*vi[k]; 4431cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4432cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4433cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4434cee9d6f2SShri Abhyankar v += 4; 4435cee9d6f2SShri Abhyankar } 4436cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 4437cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4438cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4439cee9d6f2SShri Abhyankar } 4440cee9d6f2SShri Abhyankar 4441cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4442cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4443cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4444cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 4445cee9d6f2SShri Abhyankar } 4446cee9d6f2SShri Abhyankar 4447cee9d6f2SShri Abhyankar #undef __FUNCT__ 4448b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2" 4449b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4450b2b2dd24SShri Abhyankar { 4451b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4452b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4453b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4454b2b2dd24SShri Abhyankar PetscInt jdx; 4455b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4456b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 4457b2b2dd24SShri Abhyankar const PetscScalar *b; 4458b2b2dd24SShri Abhyankar 4459b2b2dd24SShri Abhyankar PetscFunctionBegin; 4460b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4461b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4462b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4463b2b2dd24SShri Abhyankar idx = 0; 4464b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4465b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4466b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 4467b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4468b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4469b2b2dd24SShri Abhyankar idx = 2*i; 4470b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4471b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4472b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 4473b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4474b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4475b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4476b2b2dd24SShri Abhyankar v += 4; 4477b2b2dd24SShri Abhyankar } 4478b2b2dd24SShri Abhyankar x[idx] = s1; 4479b2b2dd24SShri Abhyankar x[1+idx] = s2; 4480b2b2dd24SShri Abhyankar } 4481b2b2dd24SShri Abhyankar 4482b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4483b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4484b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 4485b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4486b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4487b2b2dd24SShri Abhyankar idt = 2*i; 4488b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4489b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4490b2b2dd24SShri Abhyankar idx = 2*vi[k]; 4491b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4492b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4493b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4494b2b2dd24SShri Abhyankar v += 4; 4495b2b2dd24SShri Abhyankar } 4496b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4497b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4498b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4499b2b2dd24SShri Abhyankar } 4500b2b2dd24SShri Abhyankar 4501b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4502b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4503b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4504b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4505b2b2dd24SShri Abhyankar } 4506b2b2dd24SShri Abhyankar 4507b2b2dd24SShri Abhyankar #undef __FUNCT__ 45084a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4509dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 45104e2b4712SSatish Balay { 45114e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 45124e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 45136849ba73SBarry Smith PetscErrorCode ierr; 45145d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 45155d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 45163f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 451787828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 45184e2b4712SSatish Balay 45194e2b4712SSatish Balay PetscFunctionBegin; 45204e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 45214e2b4712SSatish Balay 45221ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 45231ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4524f1af5d2fSBarry Smith t = a->solve_work; 45254e2b4712SSatish Balay 45264e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 45274e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 45284e2b4712SSatish Balay 45294e2b4712SSatish Balay /* forward solve the lower triangular */ 4530f1af5d2fSBarry Smith t[0] = b[*r++]; 45314e2b4712SSatish Balay for (i=1; i<n; i++) { 45324e2b4712SSatish Balay v = aa + ai[i]; 45334e2b4712SSatish Balay vi = aj + ai[i]; 45344e2b4712SSatish Balay nz = diag[i] - ai[i]; 4535f1af5d2fSBarry Smith s1 = b[*r++]; 45364e2b4712SSatish Balay while (nz--) { 4537f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 45384e2b4712SSatish Balay } 4539f1af5d2fSBarry Smith t[i] = s1; 45404e2b4712SSatish Balay } 45414e2b4712SSatish Balay /* backward solve the upper triangular */ 45424e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 45434e2b4712SSatish Balay v = aa + diag[i] + 1; 45444e2b4712SSatish Balay vi = aj + diag[i] + 1; 45454e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4546f1af5d2fSBarry Smith s1 = t[i]; 45474e2b4712SSatish Balay while (nz--) { 4548f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 45494e2b4712SSatish Balay } 4550f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 45514e2b4712SSatish Balay } 45524e2b4712SSatish Balay 45534e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 45544e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 45551ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 45561ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4557dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 45584e2b4712SSatish Balay PetscFunctionReturn(0); 45594e2b4712SSatish Balay } 456015091d37SBarry Smith /* 456115091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 456215091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 456315091d37SBarry Smith */ 45644a2ae208SSatish Balay #undef __FUNCT__ 45654a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4566dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 456715091d37SBarry Smith { 456815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4569690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4570dfbe8321SBarry Smith PetscErrorCode ierr; 4571690b6cddSBarry Smith PetscInt *diag = a->diag; 457215091d37SBarry Smith MatScalar *aa=a->a; 457387828ca2SBarry Smith PetscScalar *x,*b; 457487828ca2SBarry Smith PetscScalar s1,x1; 457515091d37SBarry Smith MatScalar *v; 4576690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 457715091d37SBarry Smith 457815091d37SBarry Smith PetscFunctionBegin; 45791ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 45801ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 458115091d37SBarry Smith 458215091d37SBarry Smith /* forward solve the lower triangular */ 458315091d37SBarry Smith idx = 0; 458415091d37SBarry Smith x[0] = b[0]; 458515091d37SBarry Smith for (i=1; i<n; i++) { 458615091d37SBarry Smith v = aa + ai[i]; 458715091d37SBarry Smith vi = aj + ai[i]; 458815091d37SBarry Smith nz = diag[i] - ai[i]; 458915091d37SBarry Smith idx += 1; 4590f1af5d2fSBarry Smith s1 = b[idx]; 459115091d37SBarry Smith while (nz--) { 459215091d37SBarry Smith jdx = *vi++; 459315091d37SBarry Smith x1 = x[jdx]; 4594f1af5d2fSBarry Smith s1 -= v[0]*x1; 459515091d37SBarry Smith v += 1; 459615091d37SBarry Smith } 4597f1af5d2fSBarry Smith x[idx] = s1; 459815091d37SBarry Smith } 459915091d37SBarry Smith /* backward solve the upper triangular */ 460015091d37SBarry Smith for (i=n-1; i>=0; i--){ 460115091d37SBarry Smith v = aa + diag[i] + 1; 460215091d37SBarry Smith vi = aj + diag[i] + 1; 460315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 460415091d37SBarry Smith idt = i; 4605f1af5d2fSBarry Smith s1 = x[idt]; 460615091d37SBarry Smith while (nz--) { 460715091d37SBarry Smith idx = *vi++; 460815091d37SBarry Smith x1 = x[idx]; 4609f1af5d2fSBarry Smith s1 -= v[0]*x1; 461015091d37SBarry Smith v += 1; 461115091d37SBarry Smith } 461215091d37SBarry Smith v = aa + diag[i]; 4613f1af5d2fSBarry Smith x[idt] = v[0]*s1; 461415091d37SBarry Smith } 46151ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 46161ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4617dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 461815091d37SBarry Smith PetscFunctionReturn(0); 461915091d37SBarry Smith } 46204e2b4712SSatish Balay 46214e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 462216a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 46236bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 46246bce7ff8SHong Zhang 46256bce7ff8SHong Zhang #undef __FUNCT__ 46266bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 46276bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 46286bce7ff8SHong Zhang { 46296bce7ff8SHong Zhang Mat C=B; 46306bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 46316bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 46326bce7ff8SHong Zhang PetscErrorCode ierr; 46336bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 46346bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 46356bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4636b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4637914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4638914a18a2SHong Zhang MatScalar *v_work; 46396bce7ff8SHong Zhang 46406bce7ff8SHong Zhang PetscFunctionBegin; 46416bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 46426bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4643914a18a2SHong Zhang ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4644914a18a2SHong Zhang ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 46456bce7ff8SHong Zhang ics = ic; 46466bce7ff8SHong Zhang 4647914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 4648914a18a2SHong Zhang ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 4649b588c5a2SHong Zhang mwork = v_work + bs; 4650b588c5a2SHong Zhang v_pivots = (PetscInt*)(mwork + bs2); 4651914a18a2SHong Zhang 46526bce7ff8SHong Zhang for (i=0; i<n; i++){ 46536bce7ff8SHong Zhang /* zero rtmp */ 46546bce7ff8SHong Zhang /* L part */ 46556bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 46566bce7ff8SHong Zhang bjtmp = bj + bi[i]; 4657914a18a2SHong Zhang for (j=0; j<nz; j++){ 4658914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4659914a18a2SHong Zhang } 46606bce7ff8SHong Zhang 46616bce7ff8SHong Zhang /* U part */ 46626bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i]; 46636bce7ff8SHong Zhang bjtmp = bj + bi[2*n-i]; 4664914a18a2SHong Zhang for (j=0; j<nz; j++){ 4665914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4666914a18a2SHong Zhang } 46676bce7ff8SHong Zhang 46686bce7ff8SHong Zhang /* load in initial (unfactored row) */ 46696bce7ff8SHong Zhang nz = ai[r[i]+1] - ai[r[i]]; 46706bce7ff8SHong Zhang ajtmp = aj + ai[r[i]]; 4671914a18a2SHong Zhang v = aa + bs2*ai[r[i]]; 46726bce7ff8SHong Zhang for (j=0; j<nz; j++) { 4673914a18a2SHong Zhang ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 46746bce7ff8SHong Zhang } 46756bce7ff8SHong Zhang 46766bce7ff8SHong Zhang /* elimination */ 46776bce7ff8SHong Zhang bjtmp = bj + bi[i]; 46786bce7ff8SHong Zhang nzL = bi[i+1] - bi[i]; 4679b1646270SShri Abhyankar for(k=0;k < nzL;k++) { 4680b1646270SShri Abhyankar row = bjtmp[k]; 4681914a18a2SHong Zhang pc = rtmp + bs2*row; 4682914a18a2SHong Zhang for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 4683914a18a2SHong Zhang if (flg) { 4684914a18a2SHong Zhang pv = b->a + bs2*bdiag[row]; 4685b588c5a2SHong Zhang Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 46866bce7ff8SHong Zhang pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 4687914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-row]; 46886bce7ff8SHong Zhang nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 4689914a18a2SHong Zhang for (j=0; j<nz; j++) { 4690914a18a2SHong Zhang Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 4691914a18a2SHong Zhang } 4692b588c5a2SHong Zhang ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 46936bce7ff8SHong Zhang } 46946bce7ff8SHong Zhang } 46956bce7ff8SHong Zhang 46966bce7ff8SHong Zhang /* finished row so stick it into b->a */ 46976bce7ff8SHong Zhang /* L part */ 4698914a18a2SHong Zhang pv = b->a + bs2*bi[i] ; 46996bce7ff8SHong Zhang pj = b->j + bi[i] ; 47006bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 47016bce7ff8SHong Zhang for (j=0; j<nz; j++) { 4702914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 47036bce7ff8SHong Zhang } 47046bce7ff8SHong Zhang 47056bce7ff8SHong Zhang /* Mark diagonal and invert diagonal for simplier triangular solves */ 4706914a18a2SHong Zhang pv = b->a + bs2*bdiag[i]; 47076bce7ff8SHong Zhang pj = b->j + bdiag[i]; 4708914a18a2SHong Zhang /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 4709914a18a2SHong Zhang ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4710914a18a2SHong Zhang ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 47116bce7ff8SHong Zhang 47126bce7ff8SHong Zhang /* U part */ 4713914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-i]; 47146bce7ff8SHong Zhang pj = b->j + bi[2*n-i]; 47156bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i] - 1; 4716914a18a2SHong Zhang for (j=0; j<nz; j++){ 4717914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4718914a18a2SHong Zhang } 47196bce7ff8SHong Zhang } 47206bce7ff8SHong Zhang 47216bce7ff8SHong Zhang ierr = PetscFree(rtmp);CHKERRQ(ierr); 47226bce7ff8SHong Zhang ierr = PetscFree(v_work);CHKERRQ(ierr); 47236bce7ff8SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 47246bce7ff8SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 472527019359SHong Zhang 47266bce7ff8SHong Zhang C->assembled = PETSC_TRUE; 4727914a18a2SHong Zhang ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 47286bce7ff8SHong Zhang PetscFunctionReturn(0); 47296bce7ff8SHong Zhang } 47306bce7ff8SHong Zhang 47311a83e813SShri Abhyankar #undef __FUNCT__ 47321a83e813SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2" 47331a83e813SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info) 47341a83e813SShri Abhyankar { 47351a83e813SShri Abhyankar Mat C=B; 47361a83e813SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 47371a83e813SShri Abhyankar IS isrow = b->row,isicol = b->icol; 47381a83e813SShri Abhyankar PetscErrorCode ierr; 47391a83e813SShri Abhyankar const PetscInt *r,*ic,*ics; 47401a83e813SShri Abhyankar PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 47411a83e813SShri Abhyankar PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 47421a83e813SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 47431a83e813SShri Abhyankar PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 47441a83e813SShri Abhyankar MatScalar *v_work; 47451a83e813SShri Abhyankar 47461a83e813SShri Abhyankar PetscFunctionBegin; 47471a83e813SShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 47481a83e813SShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 47491a83e813SShri Abhyankar ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 47501a83e813SShri Abhyankar ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 47511a83e813SShri Abhyankar ics = ic; 47521a83e813SShri Abhyankar 47531a83e813SShri Abhyankar /* generate work space needed by dense LU factorization */ 47541a83e813SShri Abhyankar ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 47551a83e813SShri Abhyankar mwork = v_work + bs; 47561a83e813SShri Abhyankar v_pivots = (PetscInt*)(mwork + bs2); 47571a83e813SShri Abhyankar 47581a83e813SShri Abhyankar for (i=0; i<n; i++){ 47591a83e813SShri Abhyankar /* zero rtmp */ 47601a83e813SShri Abhyankar /* L part */ 47611a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 47621a83e813SShri Abhyankar bjtmp = bj + bi[i]; 47631a83e813SShri Abhyankar for (j=0; j<nz; j++){ 47641a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 47651a83e813SShri Abhyankar } 47661a83e813SShri Abhyankar 47671a83e813SShri Abhyankar /* U part */ 47681a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 47691a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 47701a83e813SShri Abhyankar for (j=0; j<nz; j++){ 47711a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 47721a83e813SShri Abhyankar } 47731a83e813SShri Abhyankar 47741a83e813SShri Abhyankar /* load in initial (unfactored row) */ 47751a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 47761a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 47771a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 47781a83e813SShri Abhyankar for (j=0; j<nz; j++) { 47791a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 47801a83e813SShri Abhyankar } 47811a83e813SShri Abhyankar 47821a83e813SShri Abhyankar /* elimination */ 47831a83e813SShri Abhyankar bjtmp = bj + bi[i]; 47841a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 47851a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 47861a83e813SShri Abhyankar row = bjtmp[k]; 47871a83e813SShri Abhyankar pc = rtmp + bs2*row; 47881a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 47891a83e813SShri Abhyankar if (flg) { 47901a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 47911a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 47921a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 47931a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 47941a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 47951a83e813SShri Abhyankar for (j=0; j<nz; j++) { 47961a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 47971a83e813SShri Abhyankar } 47981a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 47991a83e813SShri Abhyankar } 48001a83e813SShri Abhyankar } 48011a83e813SShri Abhyankar 48021a83e813SShri Abhyankar /* finished row so stick it into b->a */ 48031a83e813SShri Abhyankar /* L part */ 48041a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 48051a83e813SShri Abhyankar pj = b->j + bi[i] ; 48061a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 48071a83e813SShri Abhyankar for (j=0; j<nz; j++) { 48081a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 48091a83e813SShri Abhyankar } 48101a83e813SShri Abhyankar 48111a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 48121a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 48131a83e813SShri Abhyankar pj = b->j + bdiag[i]; 48141a83e813SShri Abhyankar /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 48151a83e813SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 48161a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 48171a83e813SShri Abhyankar 48181a83e813SShri Abhyankar /* U part */ 48191a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 48201a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 48211a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 48221a83e813SShri Abhyankar for (j=0; j<nz; j++){ 48231a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 48241a83e813SShri Abhyankar } 48251a83e813SShri Abhyankar } 48261a83e813SShri Abhyankar 48271a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 48281a83e813SShri Abhyankar ierr = PetscFree(v_work);CHKERRQ(ierr); 48291a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 48301a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 48311a83e813SShri Abhyankar 48321a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 48331a83e813SShri Abhyankar ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 48341a83e813SShri Abhyankar PetscFunctionReturn(0); 48351a83e813SShri Abhyankar } 48361a83e813SShri Abhyankar 48376bce7ff8SHong Zhang /* 48386bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 483916a2bf60SHong Zhang See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 484016a2bf60SHong Zhang because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 48416bce7ff8SHong Zhang */ 48426bce7ff8SHong Zhang #undef __FUNCT__ 48436bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 48446bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 48456bce7ff8SHong Zhang { 48466bce7ff8SHong Zhang 48476bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 48486bce7ff8SHong Zhang PetscErrorCode ierr; 484916a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 485016a2bf60SHong Zhang PetscInt i,j,nz,*bi,*bj,*bdiag; 48516bce7ff8SHong Zhang 48526bce7ff8SHong Zhang PetscFunctionBegin; 485316a2bf60SHong Zhang /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */ 485416a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 48556bce7ff8SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 485616a2bf60SHong Zhang 485716a2bf60SHong Zhang /* allocate matrix arrays for new data structure */ 485816a2bf60SHong Zhang ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr); 485916a2bf60SHong Zhang ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr); 486016a2bf60SHong Zhang b->singlemalloc = PETSC_TRUE; 486116a2bf60SHong Zhang if (!b->diag){ 486216a2bf60SHong Zhang ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 486316a2bf60SHong Zhang } 4864914a18a2SHong Zhang bdiag = b->diag; 48656bce7ff8SHong Zhang 486616a2bf60SHong Zhang if (n > 0) { 486716a2bf60SHong Zhang ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 48686bce7ff8SHong Zhang } 48696bce7ff8SHong Zhang 48706bce7ff8SHong Zhang /* set bi and bj with new data structure */ 48716bce7ff8SHong Zhang bi = b->i; 48726bce7ff8SHong Zhang bj = b->j; 48736bce7ff8SHong Zhang 48746bce7ff8SHong Zhang /* L part */ 48756bce7ff8SHong Zhang bi[0] = 0; 487616a2bf60SHong Zhang for (i=0; i<n; i++){ 48776bce7ff8SHong Zhang nz = adiag[i] - ai[i]; 4878914a18a2SHong Zhang bi[i+1] = bi[i] + nz; 48796bce7ff8SHong Zhang aj = a->j + ai[i]; 48806bce7ff8SHong Zhang for (j=0; j<nz; j++){ 48816bce7ff8SHong Zhang *bj = aj[j]; bj++; 48826bce7ff8SHong Zhang } 48836bce7ff8SHong Zhang } 48846bce7ff8SHong Zhang 48856bce7ff8SHong Zhang /* U part */ 488616a2bf60SHong Zhang bi[n+1] = bi[n]; 488716a2bf60SHong Zhang for (i=n-1; i>=0; i--){ 48886bce7ff8SHong Zhang nz = ai[i+1] - adiag[i] - 1; 488916a2bf60SHong Zhang bi[2*n-i+1] = bi[2*n-i] + nz + 1; 48906bce7ff8SHong Zhang aj = a->j + adiag[i] + 1; 48916bce7ff8SHong Zhang for (j=0; j<nz; j++){ 48926bce7ff8SHong Zhang *bj = aj[j]; bj++; 48936bce7ff8SHong Zhang } 48946bce7ff8SHong Zhang /* diag[i] */ 48956bce7ff8SHong Zhang *bj = i; bj++; 489616a2bf60SHong Zhang bdiag[i] = bi[2*n-i+1]-1; 48976bce7ff8SHong Zhang } 48986bce7ff8SHong Zhang PetscFunctionReturn(0); 48996bce7ff8SHong Zhang } 49006bce7ff8SHong Zhang 490116a2bf60SHong Zhang #undef __FUNCT__ 490216a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 490316a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 490416a2bf60SHong Zhang { 490516a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 490616a2bf60SHong Zhang IS isicol; 490716a2bf60SHong Zhang PetscErrorCode ierr; 490816a2bf60SHong Zhang const PetscInt *r,*ic; 49097fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 491016a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 491116a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 491216a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 49137fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 491416a2bf60SHong Zhang PetscReal f; 491516a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 491616a2bf60SHong Zhang PetscBT lnkbt; 491716a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 491816a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 491916a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 492016a2bf60SHong Zhang PetscTruth missing; 49217fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 492216a2bf60SHong Zhang 492316a2bf60SHong Zhang PetscFunctionBegin; 492416a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 492516a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 492616a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 492716a2bf60SHong Zhang 492816a2bf60SHong Zhang f = info->fill; 492916a2bf60SHong Zhang levels = (PetscInt)info->levels; 493016a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 493116a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 493216a2bf60SHong Zhang 493316a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 493416a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 49357fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 493616a2bf60SHong Zhang 49377fa3a6a0SHong Zhang if (!levels && both_identity) { 493816a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 493916a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 494016a2bf60SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 49417fa3a6a0SHong Zhang /* set MatSolve routines */ 49427fa3a6a0SHong Zhang switch (bs){ 49437fa3a6a0SHong Zhang case 2: 49447fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 49457fa3a6a0SHong Zhang break; 49467fa3a6a0SHong Zhang case 3: 49477fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 49487fa3a6a0SHong Zhang break; 49497fa3a6a0SHong Zhang case 4: 49507fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 49517fa3a6a0SHong Zhang break; 49527fa3a6a0SHong Zhang case 5: 49537fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 49547fa3a6a0SHong Zhang break; 49557fa3a6a0SHong Zhang case 6: 49567fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 49577fa3a6a0SHong Zhang break; 49587fa3a6a0SHong Zhang case 7: 49597fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 49607fa3a6a0SHong Zhang break; 49617fa3a6a0SHong Zhang default: 49627fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 49637fa3a6a0SHong Zhang break; 49647fa3a6a0SHong Zhang } 496516a2bf60SHong Zhang 496616a2bf60SHong Zhang fact->factor = MAT_FACTOR_ILU; 496716a2bf60SHong Zhang (fact)->info.factor_mallocs = 0; 496816a2bf60SHong Zhang (fact)->info.fill_ratio_given = info->fill; 496916a2bf60SHong Zhang (fact)->info.fill_ratio_needed = 1.0; 497016a2bf60SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 497116a2bf60SHong Zhang b->row = isrow; 497216a2bf60SHong Zhang b->col = iscol; 497316a2bf60SHong Zhang b->icol = isicol; 497416a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 497516a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 497616a2bf60SHong Zhang b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4977b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 497816a2bf60SHong Zhang PetscFunctionReturn(0); 497916a2bf60SHong Zhang } 498016a2bf60SHong Zhang 498116a2bf60SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 498216a2bf60SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 498316a2bf60SHong Zhang 498416a2bf60SHong Zhang /* get new row pointers */ 498516a2bf60SHong Zhang ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 498616a2bf60SHong Zhang bi[0] = 0; 498716a2bf60SHong Zhang /* bdiag is location of diagonal in factor */ 498816a2bf60SHong Zhang ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 498916a2bf60SHong Zhang bdiag[0] = 0; 499016a2bf60SHong Zhang 499116a2bf60SHong Zhang ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr); 499216a2bf60SHong Zhang bjlvl_ptr = (PetscInt**)(bj_ptr + n); 499316a2bf60SHong Zhang 499416a2bf60SHong Zhang /* create a linked list for storing column indices of the active row */ 499516a2bf60SHong Zhang nlnk = n + 1; 499616a2bf60SHong Zhang ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 499716a2bf60SHong Zhang 499816a2bf60SHong Zhang /* initial FreeSpace size is f*(ai[n]+1) */ 499916a2bf60SHong Zhang ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 500016a2bf60SHong Zhang current_space = free_space; 500116a2bf60SHong Zhang ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 500216a2bf60SHong Zhang current_space_lvl = free_space_lvl; 500316a2bf60SHong Zhang 500416a2bf60SHong Zhang for (i=0; i<n; i++) { 500516a2bf60SHong Zhang nzi = 0; 500616a2bf60SHong Zhang /* copy current row into linked list */ 500716a2bf60SHong Zhang nnz = ai[r[i]+1] - ai[r[i]]; 500816a2bf60SHong Zhang if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 500916a2bf60SHong Zhang cols = aj + ai[r[i]]; 501016a2bf60SHong Zhang lnk[i] = -1; /* marker to indicate if diagonal exists */ 501116a2bf60SHong Zhang ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 501216a2bf60SHong Zhang nzi += nlnk; 501316a2bf60SHong Zhang 501416a2bf60SHong Zhang /* make sure diagonal entry is included */ 501516a2bf60SHong Zhang if (diagonal_fill && lnk[i] == -1) { 501616a2bf60SHong Zhang fm = n; 501716a2bf60SHong Zhang while (lnk[fm] < i) fm = lnk[fm]; 501816a2bf60SHong Zhang lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 501916a2bf60SHong Zhang lnk[fm] = i; 502016a2bf60SHong Zhang lnk_lvl[i] = 0; 502116a2bf60SHong Zhang nzi++; dcount++; 502216a2bf60SHong Zhang } 502316a2bf60SHong Zhang 502416a2bf60SHong Zhang /* add pivot rows into the active row */ 502516a2bf60SHong Zhang nzbd = 0; 502616a2bf60SHong Zhang prow = lnk[n]; 502716a2bf60SHong Zhang while (prow < i) { 502816a2bf60SHong Zhang nnz = bdiag[prow]; 502916a2bf60SHong Zhang cols = bj_ptr[prow] + nnz + 1; 503016a2bf60SHong Zhang cols_lvl = bjlvl_ptr[prow] + nnz + 1; 503116a2bf60SHong Zhang nnz = bi[prow+1] - bi[prow] - nnz - 1; 503216a2bf60SHong Zhang ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 503316a2bf60SHong Zhang nzi += nlnk; 503416a2bf60SHong Zhang prow = lnk[prow]; 503516a2bf60SHong Zhang nzbd++; 503616a2bf60SHong Zhang } 503716a2bf60SHong Zhang bdiag[i] = nzbd; 503816a2bf60SHong Zhang bi[i+1] = bi[i] + nzi; 503916a2bf60SHong Zhang 504016a2bf60SHong Zhang /* if free space is not available, make more free space */ 504116a2bf60SHong Zhang if (current_space->local_remaining<nzi) { 504216a2bf60SHong Zhang nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 504316a2bf60SHong Zhang ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 504416a2bf60SHong Zhang ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 504516a2bf60SHong Zhang reallocs++; 504616a2bf60SHong Zhang } 504716a2bf60SHong Zhang 504816a2bf60SHong Zhang /* copy data into free_space and free_space_lvl, then initialize lnk */ 504916a2bf60SHong Zhang ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 505016a2bf60SHong Zhang bj_ptr[i] = current_space->array; 505116a2bf60SHong Zhang bjlvl_ptr[i] = current_space_lvl->array; 505216a2bf60SHong Zhang 505316a2bf60SHong Zhang /* make sure the active row i has diagonal entry */ 505416a2bf60SHong Zhang if (*(bj_ptr[i]+bdiag[i]) != i) { 505516a2bf60SHong Zhang SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 505616a2bf60SHong Zhang try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 505716a2bf60SHong Zhang } 505816a2bf60SHong Zhang 505916a2bf60SHong Zhang current_space->array += nzi; 506016a2bf60SHong Zhang current_space->local_used += nzi; 506116a2bf60SHong Zhang current_space->local_remaining -= nzi; 506216a2bf60SHong Zhang current_space_lvl->array += nzi; 506316a2bf60SHong Zhang current_space_lvl->local_used += nzi; 506416a2bf60SHong Zhang current_space_lvl->local_remaining -= nzi; 506516a2bf60SHong Zhang } 506616a2bf60SHong Zhang 506716a2bf60SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 506816a2bf60SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 506916a2bf60SHong Zhang 507016a2bf60SHong Zhang /* destroy list of free space and other temporary arrays */ 507116a2bf60SHong Zhang ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 507216a2bf60SHong Zhang 507316a2bf60SHong Zhang /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5074783ef271SHong Zhang ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 507516a2bf60SHong Zhang 507616a2bf60SHong Zhang ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 507716a2bf60SHong Zhang ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 507816a2bf60SHong Zhang ierr = PetscFree(bj_ptr);CHKERRQ(ierr); 507916a2bf60SHong Zhang 508016a2bf60SHong Zhang #if defined(PETSC_USE_INFO) 508116a2bf60SHong Zhang { 508216a2bf60SHong Zhang PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 508316a2bf60SHong Zhang ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 508416a2bf60SHong Zhang ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 508516a2bf60SHong Zhang ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 508616a2bf60SHong Zhang ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 508716a2bf60SHong Zhang if (diagonal_fill) { 508816a2bf60SHong Zhang ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 508916a2bf60SHong Zhang } 509016a2bf60SHong Zhang } 509116a2bf60SHong Zhang #endif 509216a2bf60SHong Zhang 509316a2bf60SHong Zhang /* put together the new matrix */ 509416a2bf60SHong Zhang ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 509516a2bf60SHong Zhang ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 509616a2bf60SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 509716a2bf60SHong Zhang b->free_a = PETSC_TRUE; 509816a2bf60SHong Zhang b->free_ij = PETSC_TRUE; 509916a2bf60SHong Zhang b->singlemalloc = PETSC_FALSE; 51007fa3a6a0SHong Zhang ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 510116a2bf60SHong Zhang b->j = bj; 510216a2bf60SHong Zhang b->i = bi; 510316a2bf60SHong Zhang b->diag = bdiag; 51047f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 510516a2bf60SHong Zhang b->ilen = 0; 510616a2bf60SHong Zhang b->imax = 0; 510716a2bf60SHong Zhang b->row = isrow; 510816a2bf60SHong Zhang b->col = iscol; 510916a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 511016a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 511116a2bf60SHong Zhang b->icol = isicol; 51127fa3a6a0SHong Zhang ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 511316a2bf60SHong Zhang /* In b structure: Free imax, ilen, old a, old j. 511416a2bf60SHong Zhang Allocate bdiag, solve_work, new a, new j */ 51157fa3a6a0SHong Zhang ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 511616a2bf60SHong Zhang b->maxnz = b->nz = bi[2*n+1] ; 511716a2bf60SHong Zhang (fact)->info.factor_mallocs = reallocs; 511816a2bf60SHong Zhang (fact)->info.fill_ratio_given = f; 511916a2bf60SHong Zhang (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]); 512016a2bf60SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 51217fa3a6a0SHong Zhang /* set MatSolve routines */ 51227fa3a6a0SHong Zhang if (both_identity){ 51237fa3a6a0SHong Zhang switch (bs){ 51247fa3a6a0SHong Zhang case 2: 51257fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 51267fa3a6a0SHong Zhang break; 51277fa3a6a0SHong Zhang case 3: 51287fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 51297fa3a6a0SHong Zhang break; 51307fa3a6a0SHong Zhang case 4: 51317fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 51327fa3a6a0SHong Zhang break; 51337fa3a6a0SHong Zhang case 5: 51347fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 51357fa3a6a0SHong Zhang break; 51367fa3a6a0SHong Zhang case 6: 51377fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 51387fa3a6a0SHong Zhang break; 51397fa3a6a0SHong Zhang case 7: 51407fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 51417fa3a6a0SHong Zhang break; 51427fa3a6a0SHong Zhang default: 51437fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 51447fa3a6a0SHong Zhang break; 51457fa3a6a0SHong Zhang } 51467fa3a6a0SHong Zhang } else { 51477fa3a6a0SHong Zhang switch (bs){ 51487fa3a6a0SHong Zhang case 2: 51497fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct; 51507fa3a6a0SHong Zhang break; 51517fa3a6a0SHong Zhang case 3: 51527fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct; 51537fa3a6a0SHong Zhang break; 51547fa3a6a0SHong Zhang case 4: 51557fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct; 51567fa3a6a0SHong Zhang break; 51577fa3a6a0SHong Zhang case 5: 51587fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct; 51597fa3a6a0SHong Zhang break; 51607fa3a6a0SHong Zhang case 6: 51617fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct; 51627fa3a6a0SHong Zhang break; 51637fa3a6a0SHong Zhang case 7: 51647fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct; 51657fa3a6a0SHong Zhang break; 51667fa3a6a0SHong Zhang default: 51677fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 51687fa3a6a0SHong Zhang break; 51697fa3a6a0SHong Zhang } 51707fa3a6a0SHong Zhang } 517116a2bf60SHong Zhang PetscFunctionReturn(0); 517216a2bf60SHong Zhang } 517316a2bf60SHong Zhang 51744e2b4712SSatish Balay /* 51754e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 51764e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 51774e2b4712SSatish Balay Not a good example of code reuse. 51784e2b4712SSatish Balay */ 51794a2ae208SSatish Balay #undef __FUNCT__ 51804a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 51810481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 51824e2b4712SSatish Balay { 51834e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 51844e2b4712SSatish Balay IS isicol; 51856849ba73SBarry Smith PetscErrorCode ierr; 51865d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 51875d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5188a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5189d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 519041df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 5191329f5518SBarry Smith PetscReal f; 519216a2bf60SHong Zhang PetscTruth newdatastruct=PETSC_FALSE; 51934e2b4712SSatish Balay 51944e2b4712SSatish Balay PetscFunctionBegin; 519516a2bf60SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 519616a2bf60SHong Zhang if (newdatastruct){ 519716a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 519816a2bf60SHong Zhang PetscFunctionReturn(0); 519916a2bf60SHong Zhang } 520016a2bf60SHong Zhang 52016bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 52026bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 52036bce7ff8SHong Zhang 5204435faa5fSBarry Smith f = info->fill; 5205690b6cddSBarry Smith levels = (PetscInt)info->levels; 5206690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 52074c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 520816a2bf60SHong Zhang 5209667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5210667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 52117d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 5212309c388cSBarry Smith 521341df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 521416a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 52156bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 52166bce7ff8SHong Zhang 5217719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 5218719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 5219bb3d539aSBarry Smith b->row = isrow; 5220bb3d539aSBarry Smith b->col = iscol; 5221bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5222bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5223bb3d539aSBarry Smith b->icol = isicol; 5224bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5225b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 52266bce7ff8SHong Zhang PetscFunctionReturn(0); 52276bce7ff8SHong Zhang } 52286bce7ff8SHong Zhang 52296bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 52304e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 52314e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 52324e2b4712SSatish Balay 52334e2b4712SSatish Balay /* get new row pointers */ 5234690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 52354e2b4712SSatish Balay ainew[0] = 0; 52364e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 5237690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 5238690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 52394e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 5240690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 52414e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 5242690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 52434e2b4712SSatish Balay /* im is level for each filled value */ 5244690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 52454e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 5246690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 52474e2b4712SSatish Balay dloc[0] = 0; 52484e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 5249435faa5fSBarry Smith 5250435faa5fSBarry Smith /* copy prow into linked list */ 52514e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 52523b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 52534e2b4712SSatish Balay xi = aj + ai[r[prow]]; 52544e2b4712SSatish Balay fill[n] = n; 5255435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 52564e2b4712SSatish Balay while (nz--) { 52574e2b4712SSatish Balay fm = n; 52584e2b4712SSatish Balay idx = ic[*xi++]; 52594e2b4712SSatish Balay do { 52604e2b4712SSatish Balay m = fm; 52614e2b4712SSatish Balay fm = fill[m]; 52624e2b4712SSatish Balay } while (fm < idx); 52634e2b4712SSatish Balay fill[m] = idx; 52644e2b4712SSatish Balay fill[idx] = fm; 52654e2b4712SSatish Balay im[idx] = 0; 52664e2b4712SSatish Balay } 5267435faa5fSBarry Smith 5268435faa5fSBarry Smith /* make sure diagonal entry is included */ 5269435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 5270435faa5fSBarry Smith fm = n; 5271435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 5272435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5273435faa5fSBarry Smith fill[fm] = prow; 5274435faa5fSBarry Smith im[prow] = 0; 5275435faa5fSBarry Smith nzf++; 5276335d9088SBarry Smith dcount++; 5277435faa5fSBarry Smith } 5278435faa5fSBarry Smith 52794e2b4712SSatish Balay nzi = 0; 52804e2b4712SSatish Balay row = fill[n]; 52814e2b4712SSatish Balay while (row < prow) { 52824e2b4712SSatish Balay incrlev = im[row] + 1; 52834e2b4712SSatish Balay nz = dloc[row]; 5284435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 52854e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 52864e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 52874e2b4712SSatish Balay fm = row; 52884e2b4712SSatish Balay while (nnz-- > 0) { 52894e2b4712SSatish Balay idx = *xi++; 52904e2b4712SSatish Balay if (*flev + incrlev > levels) { 52914e2b4712SSatish Balay flev++; 52924e2b4712SSatish Balay continue; 52934e2b4712SSatish Balay } 52944e2b4712SSatish Balay do { 52954e2b4712SSatish Balay m = fm; 52964e2b4712SSatish Balay fm = fill[m]; 52974e2b4712SSatish Balay } while (fm < idx); 52984e2b4712SSatish Balay if (fm != idx) { 52994e2b4712SSatish Balay im[idx] = *flev + incrlev; 53004e2b4712SSatish Balay fill[m] = idx; 53014e2b4712SSatish Balay fill[idx] = fm; 53024e2b4712SSatish Balay fm = idx; 53034e2b4712SSatish Balay nzf++; 5304ecf371e4SBarry Smith } else { 53054e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 53064e2b4712SSatish Balay } 53074e2b4712SSatish Balay flev++; 53084e2b4712SSatish Balay } 53094e2b4712SSatish Balay row = fill[row]; 53104e2b4712SSatish Balay nzi++; 53114e2b4712SSatish Balay } 53124e2b4712SSatish Balay /* copy new filled row into permanent storage */ 53134e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 53144e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 5315ecf371e4SBarry Smith 5316ecf371e4SBarry Smith /* estimate how much additional space we will need */ 5317ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5318ecf371e4SBarry Smith /* just double the memory each time */ 5319690b6cddSBarry Smith PetscInt maxadd = jmax; 5320ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 53214e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 53224e2b4712SSatish Balay jmax += maxadd; 5323ecf371e4SBarry Smith 5324ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 53255d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 53265d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5327606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 53285d0c19d7SBarry Smith ajnew = xitmp; 53295d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 53305d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5331606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 53325d0c19d7SBarry Smith ajfill = xitmp; 5333eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 53344e2b4712SSatish Balay } 53355d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 53364e2b4712SSatish Balay flev = ajfill + ainew[prow]; 53374e2b4712SSatish Balay dloc[prow] = nzi; 53384e2b4712SSatish Balay fm = fill[n]; 53394e2b4712SSatish Balay while (nzf--) { 53405d0c19d7SBarry Smith *xitmp++ = fm; 53414e2b4712SSatish Balay *flev++ = im[fm]; 53424e2b4712SSatish Balay fm = fill[fm]; 53434e2b4712SSatish Balay } 5344435faa5fSBarry Smith /* make sure row has diagonal entry */ 5345435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 534677431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 53472401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5348435faa5fSBarry Smith } 53494e2b4712SSatish Balay } 5350606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 53514e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 53524e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5353606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 5354606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 53554e2b4712SSatish Balay 53566cf91177SBarry Smith #if defined(PETSC_USE_INFO) 53574e2b4712SSatish Balay { 5358329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5359ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5360ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5361ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5362ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5363335d9088SBarry Smith if (diagonal_fill) { 5364ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5365335d9088SBarry Smith } 53664e2b4712SSatish Balay } 536763ba0a88SBarry Smith #endif 53684e2b4712SSatish Balay 53694e2b4712SSatish Balay /* put together the new matrix */ 5370719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5371719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5372719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 5373e6b907acSBarry Smith b->free_a = PETSC_TRUE; 5374e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 53757c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 5376a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 53774e2b4712SSatish Balay b->j = ajnew; 53784e2b4712SSatish Balay b->i = ainew; 53794e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 53804e2b4712SSatish Balay b->diag = dloc; 53817f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 53824e2b4712SSatish Balay b->ilen = 0; 53834e2b4712SSatish Balay b->imax = 0; 53844e2b4712SSatish Balay b->row = isrow; 53854e2b4712SSatish Balay b->col = iscol; 5386bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5387c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5388c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5389e51c0b9cSSatish Balay b->icol = isicol; 539087828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 53914e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 53924e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 5393719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 53944e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 53954e2b4712SSatish Balay 5396719d5645SBarry Smith (fact)->info.factor_mallocs = reallocate; 5397719d5645SBarry Smith (fact)->info.fill_ratio_given = f; 5398719d5645SBarry Smith (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 53996bce7ff8SHong Zhang 540041df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 54018661488fSKris Buschelman PetscFunctionReturn(0); 54028661488fSKris Buschelman } 54038661488fSKris Buschelman 5404732ee342SKris Buschelman #undef __FUNCT__ 54057e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5406dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 54077e7071cdSKris Buschelman { 540812272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 540912272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 54105a9542e3SKris Buschelman PetscFunctionBegin; 54117cf1b8d3SKris Buschelman /* Undo Column scaling */ 54127cf1b8d3SKris Buschelman /* while (nz--) { */ 54137cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 54147cf1b8d3SKris Buschelman /* } */ 5415c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. */ 5416c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 54177cf1b8d3SKris Buschelman PetscFunctionReturn(0); 54187cf1b8d3SKris Buschelman } 54197cf1b8d3SKris Buschelman 54207cf1b8d3SKris Buschelman #undef __FUNCT__ 54217cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5422dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 54237cf1b8d3SKris Buschelman { 54247cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5425b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 54262aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 54275a9542e3SKris Buschelman PetscFunctionBegin; 54280b9da03eSKris Buschelman /* Is this really necessary? */ 542920235379SKris Buschelman while (nz--) { 54300b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 54317e7071cdSKris Buschelman } 5432c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 54337e7071cdSKris Buschelman PetscFunctionReturn(0); 54347e7071cdSKris Buschelman } 54357e7071cdSKris Buschelman 5436732ee342SKris Buschelman 5437