1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry Smith v = aa 
+ diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + 
diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120f1af5d2fSBarry Smith { 121f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122dfbe8321SBarry Smith PetscErrorCode ierr; 123690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 125f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12687828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12787828ca2SBarry Smith PetscScalar *x,*b; 
128f1af5d2fSBarry Smith 129f1af5d2fSBarry Smith PetscFunctionBegin; 130ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133f1af5d2fSBarry Smith 134f1af5d2fSBarry Smith /* forward solve the U^T */ 135f1af5d2fSBarry Smith idx = 0; 136f1af5d2fSBarry Smith for (i=0; i<n; i++) { 137f1af5d2fSBarry Smith 138f1af5d2fSBarry Smith v = aa + 9*diag[i]; 139f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 140ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144f1af5d2fSBarry Smith v += 9; 145f1af5d2fSBarry Smith 146f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 147f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 148f1af5d2fSBarry Smith while (nz--) { 149f1af5d2fSBarry Smith oidx = 3*(*vi++); 150f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153f1af5d2fSBarry Smith v += 9; 154f1af5d2fSBarry Smith } 155f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156f1af5d2fSBarry Smith idx += 3; 157f1af5d2fSBarry Smith } 158f1af5d2fSBarry Smith /* backward solve the L^T */ 159f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 160f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 161f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 162f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 163f1af5d2fSBarry Smith idt = 3*i; 164f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165f1af5d2fSBarry Smith while (nz--) { 166f1af5d2fSBarry Smith idx = 3*(*vi--); 167f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169f1af5d2fSBarry Smith x[idx+2] 
-= v[6]*s1 + v[7]*s2 + v[8]*s3; 170f1af5d2fSBarry Smith v -= 9; 171f1af5d2fSBarry Smith } 172f1af5d2fSBarry Smith } 1731ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176f1af5d2fSBarry Smith PetscFunctionReturn(0); 177f1af5d2fSBarry Smith } 178f1af5d2fSBarry Smith 1794a2ae208SSatish Balay #undef __FUNCT__ 1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182f1af5d2fSBarry Smith { 183f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184dfbe8321SBarry Smith PetscErrorCode ierr; 185690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 187f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18987828ca2SBarry Smith PetscScalar *x,*b; 190f1af5d2fSBarry Smith 191f1af5d2fSBarry Smith PetscFunctionBegin; 192ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1931ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195f1af5d2fSBarry Smith 196f1af5d2fSBarry Smith /* forward solve the U^T */ 197f1af5d2fSBarry Smith idx = 0; 198f1af5d2fSBarry Smith for (i=0; i<n; i++) { 199f1af5d2fSBarry Smith 200f1af5d2fSBarry Smith v = aa + 16*diag[i]; 201f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 202ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + 
v[14]*x3 + v[15]*x4; 207f1af5d2fSBarry Smith v += 16; 208f1af5d2fSBarry Smith 209f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 210f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 211f1af5d2fSBarry Smith while (nz--) { 212f1af5d2fSBarry Smith oidx = 4*(*vi++); 213f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217f1af5d2fSBarry Smith v += 16; 218f1af5d2fSBarry Smith } 219f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220f1af5d2fSBarry Smith idx += 4; 221f1af5d2fSBarry Smith } 222f1af5d2fSBarry Smith /* backward solve the L^T */ 223f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 224f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 225f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 226f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 227f1af5d2fSBarry Smith idt = 4*i; 228f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229f1af5d2fSBarry Smith while (nz--) { 230f1af5d2fSBarry Smith idx = 4*(*vi--); 231f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235f1af5d2fSBarry Smith v -= 16; 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith } 2381ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241f1af5d2fSBarry Smith PetscFunctionReturn(0); 242f1af5d2fSBarry Smith } 243f1af5d2fSBarry Smith 2444a2ae208SSatish Balay #undef __FUNCT__ 2454a2ae208SSatish Balay #define 
__FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247f1af5d2fSBarry Smith { 248f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249dfbe8321SBarry Smith PetscErrorCode ierr; 250690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 252f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 25387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25487828ca2SBarry Smith PetscScalar *x,*b; 255f1af5d2fSBarry Smith 256f1af5d2fSBarry Smith PetscFunctionBegin; 257ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2581ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2591ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260f1af5d2fSBarry Smith 261f1af5d2fSBarry Smith /* forward solve the U^T */ 262f1af5d2fSBarry Smith idx = 0; 263f1af5d2fSBarry Smith for (i=0; i<n; i++) { 264f1af5d2fSBarry Smith 265f1af5d2fSBarry Smith v = aa + 25*diag[i]; 266f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 267ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273f1af5d2fSBarry Smith v += 25; 274f1af5d2fSBarry Smith 275f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 276f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 277f1af5d2fSBarry Smith while (nz--) { 278f1af5d2fSBarry Smith oidx = 5*(*vi++); 279f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280f1af5d2fSBarry Smith x[oidx+1] 
-= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284f1af5d2fSBarry Smith v += 25; 285f1af5d2fSBarry Smith } 286f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287f1af5d2fSBarry Smith idx += 5; 288f1af5d2fSBarry Smith } 289f1af5d2fSBarry Smith /* backward solve the L^T */ 290f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 291f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 292f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 293f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 294f1af5d2fSBarry Smith idt = 5*i; 295f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296f1af5d2fSBarry Smith while (nz--) { 297f1af5d2fSBarry Smith idx = 5*(*vi--); 298f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303f1af5d2fSBarry Smith v -= 25; 304f1af5d2fSBarry Smith } 305f1af5d2fSBarry Smith } 3061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309f1af5d2fSBarry Smith PetscFunctionReturn(0); 310f1af5d2fSBarry Smith } 311f1af5d2fSBarry Smith 3124a2ae208SSatish Balay #undef __FUNCT__ 3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314dfbe8321SBarry Smith 
PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315f1af5d2fSBarry Smith { 316f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317dfbe8321SBarry Smith PetscErrorCode ierr; 318690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 320f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 32187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 32287828ca2SBarry Smith PetscScalar *x,*b; 323f1af5d2fSBarry Smith 324f1af5d2fSBarry Smith PetscFunctionBegin; 325ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328f1af5d2fSBarry Smith 329f1af5d2fSBarry Smith /* forward solve the U^T */ 330f1af5d2fSBarry Smith idx = 0; 331f1af5d2fSBarry Smith for (i=0; i<n; i++) { 332f1af5d2fSBarry Smith 333f1af5d2fSBarry Smith v = aa + 36*diag[i]; 334f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 335ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336ef66eb69SBarry Smith x6 = x[5+idx]; 337f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343f1af5d2fSBarry Smith v += 36; 344f1af5d2fSBarry Smith 345f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 346f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 347f1af5d2fSBarry Smith while (nz--) { 348f1af5d2fSBarry Smith oidx = 6*(*vi++); 
349f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355f1af5d2fSBarry Smith v += 36; 356f1af5d2fSBarry Smith } 357f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358f1af5d2fSBarry Smith x[5+idx] = s6; 359f1af5d2fSBarry Smith idx += 6; 360f1af5d2fSBarry Smith } 361f1af5d2fSBarry Smith /* backward solve the L^T */ 362f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 363f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 364f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 365f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 366f1af5d2fSBarry Smith idt = 6*i; 367f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368f1af5d2fSBarry Smith s6 = x[5+idt]; 369f1af5d2fSBarry Smith while (nz--) { 370f1af5d2fSBarry Smith idx = 6*(*vi--); 371f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 372f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377f1af5d2fSBarry Smith v -= 36; 378f1af5d2fSBarry Smith } 
379f1af5d2fSBarry Smith } 3801ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383f1af5d2fSBarry Smith PetscFunctionReturn(0); 384f1af5d2fSBarry Smith } 385f1af5d2fSBarry Smith 3864a2ae208SSatish Balay #undef __FUNCT__ 3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389f1af5d2fSBarry Smith { 390f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391dfbe8321SBarry Smith PetscErrorCode ierr; 392690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 394f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 39587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 39687828ca2SBarry Smith PetscScalar *x,*b; 397f1af5d2fSBarry Smith 398f1af5d2fSBarry Smith PetscFunctionBegin; 399ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4001ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402f1af5d2fSBarry Smith 403f1af5d2fSBarry Smith /* forward solve the U^T */ 404f1af5d2fSBarry Smith idx = 0; 405f1af5d2fSBarry Smith for (i=0; i<n; i++) { 406f1af5d2fSBarry Smith 407f1af5d2fSBarry Smith v = aa + 49*diag[i]; 408f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 409ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 411f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + 
v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 417f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418f1af5d2fSBarry Smith v += 49; 419f1af5d2fSBarry Smith 420f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 421f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 422f1af5d2fSBarry Smith while (nz--) { 423f1af5d2fSBarry Smith oidx = 7*(*vi++); 424f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431f1af5d2fSBarry Smith v += 49; 432f1af5d2fSBarry Smith } 433f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 435f1af5d2fSBarry Smith idx += 7; 436f1af5d2fSBarry Smith } 437f1af5d2fSBarry Smith /* backward solve the L^T */ 438f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 439f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 440f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 441f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 
442f1af5d2fSBarry Smith idt = 7*i; 443f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 445f1af5d2fSBarry Smith while (nz--) { 446f1af5d2fSBarry Smith idx = 7*(*vi--); 447f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454f1af5d2fSBarry Smith v -= 49; 455f1af5d2fSBarry Smith } 456f1af5d2fSBarry Smith } 4571ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4581ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460f1af5d2fSBarry Smith PetscFunctionReturn(0); 461f1af5d2fSBarry Smith } 462f1af5d2fSBarry Smith 463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4644a2ae208SSatish Balay #undef __FUNCT__ 4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467f1af5d2fSBarry Smith { 468f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 4706849ba73SBarry Smith PetscErrorCode ierr; 4715d0c19d7SBarry Smith const 
PetscInt *r,*c,*rout,*cout; 4725d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473690b6cddSBarry Smith PetscInt *diag = a->diag; 474f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 47587828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 476f1af5d2fSBarry Smith 477f1af5d2fSBarry Smith PetscFunctionBegin; 4781ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480f1af5d2fSBarry Smith t = a->solve_work; 481f1af5d2fSBarry Smith 482f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484f1af5d2fSBarry Smith 485f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 486f1af5d2fSBarry Smith for (i=0; i<n; i++) { 487f1af5d2fSBarry Smith t[i] = b[c[i]]; 488f1af5d2fSBarry Smith } 489f1af5d2fSBarry Smith 490f1af5d2fSBarry Smith /* forward solve the U^T */ 491f1af5d2fSBarry Smith for (i=0; i<n; i++) { 492f1af5d2fSBarry Smith 493f1af5d2fSBarry Smith v = aa + diag[i]; 494f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 495f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 496f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 497f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 498f1af5d2fSBarry Smith while (nz--) { 499f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 500f1af5d2fSBarry Smith } 501f1af5d2fSBarry Smith t[i] = s1; 502f1af5d2fSBarry Smith } 503f1af5d2fSBarry Smith /* backward solve the L^T */ 504f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 505f1af5d2fSBarry Smith v = aa + diag[i] - 1; 506f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 507f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 508f1af5d2fSBarry Smith s1 = t[i]; 509f1af5d2fSBarry Smith while (nz--) { 510f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 511f1af5d2fSBarry Smith } 512f1af5d2fSBarry Smith } 513f1af5d2fSBarry Smith 514f1af5d2fSBarry Smith /* copy t into x according to permutation */ 515f1af5d2fSBarry 
Smith for (i=0; i<n; i++) { 516f1af5d2fSBarry Smith x[r[i]] = t[i]; 517f1af5d2fSBarry Smith } 518f1af5d2fSBarry Smith 519f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5211ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5221ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 524f1af5d2fSBarry Smith PetscFunctionReturn(0); 525f1af5d2fSBarry Smith } 526f1af5d2fSBarry Smith 5274a2ae208SSatish Balay #undef __FUNCT__ 5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530f1af5d2fSBarry Smith { 531f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5336849ba73SBarry Smith PetscErrorCode ierr; 5345d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5355d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 537f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53887828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 53987828ca2SBarry Smith PetscScalar *x,*b,*t; 540f1af5d2fSBarry Smith 541f1af5d2fSBarry Smith PetscFunctionBegin; 5421ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5431ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544f1af5d2fSBarry Smith t = a->solve_work; 545f1af5d2fSBarry Smith 546f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548f1af5d2fSBarry Smith 549f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 550f1af5d2fSBarry Smith ii = 0; 551f1af5d2fSBarry Smith for (i=0; i<n; i++) { 552f1af5d2fSBarry Smith ic = 2*c[i]; 553f1af5d2fSBarry 
Smith t[ii] = b[ic]; 554f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 555f1af5d2fSBarry Smith ii += 2; 556f1af5d2fSBarry Smith } 557f1af5d2fSBarry Smith 558f1af5d2fSBarry Smith /* forward solve the U^T */ 559f1af5d2fSBarry Smith idx = 0; 560f1af5d2fSBarry Smith for (i=0; i<n; i++) { 561f1af5d2fSBarry Smith 562f1af5d2fSBarry Smith v = aa + 4*diag[i]; 563f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 564f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 565f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 566f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 567f1af5d2fSBarry Smith v += 4; 568f1af5d2fSBarry Smith 569f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 570f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 571f1af5d2fSBarry Smith while (nz--) { 572f1af5d2fSBarry Smith oidx = 2*(*vi++); 573f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 574f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 575f1af5d2fSBarry Smith v += 4; 576f1af5d2fSBarry Smith } 577f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 578f1af5d2fSBarry Smith idx += 2; 579f1af5d2fSBarry Smith } 580f1af5d2fSBarry Smith /* backward solve the L^T */ 581f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 582f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 583f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 584f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 585f1af5d2fSBarry Smith idt = 2*i; 586f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 587f1af5d2fSBarry Smith while (nz--) { 588f1af5d2fSBarry Smith idx = 2*(*vi--); 589f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 590f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 591f1af5d2fSBarry Smith v -= 4; 592f1af5d2fSBarry Smith } 593f1af5d2fSBarry Smith } 594f1af5d2fSBarry Smith 595f1af5d2fSBarry Smith /* copy t into x according to permutation */ 596f1af5d2fSBarry Smith ii = 0; 597f1af5d2fSBarry Smith for (i=0; i<n; i++) { 598f1af5d2fSBarry Smith ir = 2*r[i]; 599f1af5d2fSBarry Smith x[ir] = t[ii]; 600f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 601f1af5d2fSBarry Smith 
ii += 2; 602f1af5d2fSBarry Smith } 603f1af5d2fSBarry Smith 604f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609f1af5d2fSBarry Smith PetscFunctionReturn(0); 610f1af5d2fSBarry Smith } 611f1af5d2fSBarry Smith 6124a2ae208SSatish Balay #undef __FUNCT__ 6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615f1af5d2fSBarry Smith { 616f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6186849ba73SBarry Smith PetscErrorCode ierr; 6195d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 6205d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 622f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 62387828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 62487828ca2SBarry Smith PetscScalar *x,*b,*t; 625f1af5d2fSBarry Smith 626f1af5d2fSBarry Smith PetscFunctionBegin; 6271ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6281ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629f1af5d2fSBarry Smith t = a->solve_work; 630f1af5d2fSBarry Smith 631f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633f1af5d2fSBarry Smith 634f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 635f1af5d2fSBarry Smith ii = 0; 636f1af5d2fSBarry Smith for (i=0; i<n; i++) { 637f1af5d2fSBarry Smith ic = 3*c[i]; 638f1af5d2fSBarry Smith t[ii] = b[ic]; 639f1af5d2fSBarry Smith 
t[ii+1] = b[ic+1]; 640f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 641f1af5d2fSBarry Smith ii += 3; 642f1af5d2fSBarry Smith } 643f1af5d2fSBarry Smith 644f1af5d2fSBarry Smith /* forward solve the U^T */ 645f1af5d2fSBarry Smith idx = 0; 646f1af5d2fSBarry Smith for (i=0; i<n; i++) { 647f1af5d2fSBarry Smith 648f1af5d2fSBarry Smith v = aa + 9*diag[i]; 649f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 650f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654f1af5d2fSBarry Smith v += 9; 655f1af5d2fSBarry Smith 656f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 657f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 658f1af5d2fSBarry Smith while (nz--) { 659f1af5d2fSBarry Smith oidx = 3*(*vi++); 660f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663f1af5d2fSBarry Smith v += 9; 664f1af5d2fSBarry Smith } 665f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666f1af5d2fSBarry Smith idx += 3; 667f1af5d2fSBarry Smith } 668f1af5d2fSBarry Smith /* backward solve the L^T */ 669f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 670f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 671f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 672f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 673f1af5d2fSBarry Smith idt = 3*i; 674f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675f1af5d2fSBarry Smith while (nz--) { 676f1af5d2fSBarry Smith idx = 3*(*vi--); 677f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680f1af5d2fSBarry Smith v -= 9; 681f1af5d2fSBarry Smith } 682f1af5d2fSBarry Smith } 683f1af5d2fSBarry 
Smith 684f1af5d2fSBarry Smith /* copy t into x according to permutation */ 685f1af5d2fSBarry Smith ii = 0; 686f1af5d2fSBarry Smith for (i=0; i<n; i++) { 687f1af5d2fSBarry Smith ir = 3*r[i]; 688f1af5d2fSBarry Smith x[ir] = t[ii]; 689f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 690f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 691f1af5d2fSBarry Smith ii += 3; 692f1af5d2fSBarry Smith } 693f1af5d2fSBarry Smith 694f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6961ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699f1af5d2fSBarry Smith PetscFunctionReturn(0); 700f1af5d2fSBarry Smith } 701f1af5d2fSBarry Smith 7024a2ae208SSatish Balay #undef __FUNCT__ 7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705f1af5d2fSBarry Smith { 706f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7086849ba73SBarry Smith PetscErrorCode ierr; 7095d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 7105d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 712f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 71387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 71487828ca2SBarry Smith PetscScalar *x,*b,*t; 715f1af5d2fSBarry Smith 716f1af5d2fSBarry Smith PetscFunctionBegin; 7171ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719f1af5d2fSBarry Smith t = a->solve_work; 720f1af5d2fSBarry Smith 721f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722f1af5d2fSBarry Smith ierr = 
ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723f1af5d2fSBarry Smith 724f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 725f1af5d2fSBarry Smith ii = 0; 726f1af5d2fSBarry Smith for (i=0; i<n; i++) { 727f1af5d2fSBarry Smith ic = 4*c[i]; 728f1af5d2fSBarry Smith t[ii] = b[ic]; 729f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 730f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 731f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 732f1af5d2fSBarry Smith ii += 4; 733f1af5d2fSBarry Smith } 734f1af5d2fSBarry Smith 735f1af5d2fSBarry Smith /* forward solve the U^T */ 736f1af5d2fSBarry Smith idx = 0; 737f1af5d2fSBarry Smith for (i=0; i<n; i++) { 738f1af5d2fSBarry Smith 739f1af5d2fSBarry Smith v = aa + 16*diag[i]; 740f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 741f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746f1af5d2fSBarry Smith v += 16; 747f1af5d2fSBarry Smith 748f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 749f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 750f1af5d2fSBarry Smith while (nz--) { 751f1af5d2fSBarry Smith oidx = 4*(*vi++); 752f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756f1af5d2fSBarry Smith v += 16; 757f1af5d2fSBarry Smith } 758f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759f1af5d2fSBarry Smith idx += 4; 760f1af5d2fSBarry Smith } 761f1af5d2fSBarry Smith /* backward solve the L^T */ 762f1af5d2fSBarry Smith for (i=n-1; 
i>=0; i--){ 763f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 764f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 765f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 766f1af5d2fSBarry Smith idt = 4*i; 767f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768f1af5d2fSBarry Smith while (nz--) { 769f1af5d2fSBarry Smith idx = 4*(*vi--); 770f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774f1af5d2fSBarry Smith v -= 16; 775f1af5d2fSBarry Smith } 776f1af5d2fSBarry Smith } 777f1af5d2fSBarry Smith 778f1af5d2fSBarry Smith /* copy t into x according to permutation */ 779f1af5d2fSBarry Smith ii = 0; 780f1af5d2fSBarry Smith for (i=0; i<n; i++) { 781f1af5d2fSBarry Smith ir = 4*r[i]; 782f1af5d2fSBarry Smith x[ir] = t[ii]; 783f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 784f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 785f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 786f1af5d2fSBarry Smith ii += 4; 787f1af5d2fSBarry Smith } 788f1af5d2fSBarry Smith 789f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794f1af5d2fSBarry Smith PetscFunctionReturn(0); 795f1af5d2fSBarry Smith } 796f1af5d2fSBarry Smith 7974a2ae208SSatish Balay #undef __FUNCT__ 7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800f1af5d2fSBarry Smith { 801f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802f1af5d2fSBarry Smith IS 
iscol=a->col,isrow=a->row; 8036849ba73SBarry Smith PetscErrorCode ierr; 8045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 807f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 80887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 80987828ca2SBarry Smith PetscScalar *x,*b,*t; 810f1af5d2fSBarry Smith 811f1af5d2fSBarry Smith PetscFunctionBegin; 8121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814f1af5d2fSBarry Smith t = a->solve_work; 815f1af5d2fSBarry Smith 816f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818f1af5d2fSBarry Smith 819f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 820f1af5d2fSBarry Smith ii = 0; 821f1af5d2fSBarry Smith for (i=0; i<n; i++) { 822f1af5d2fSBarry Smith ic = 5*c[i]; 823f1af5d2fSBarry Smith t[ii] = b[ic]; 824f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 825f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 826f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 827f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 828f1af5d2fSBarry Smith ii += 5; 829f1af5d2fSBarry Smith } 830f1af5d2fSBarry Smith 831f1af5d2fSBarry Smith /* forward solve the U^T */ 832f1af5d2fSBarry Smith idx = 0; 833f1af5d2fSBarry Smith for (i=0; i<n; i++) { 834f1af5d2fSBarry Smith 835f1af5d2fSBarry Smith v = aa + 25*diag[i]; 836f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 837f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 
841f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843f1af5d2fSBarry Smith v += 25; 844f1af5d2fSBarry Smith 845f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 846f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 847f1af5d2fSBarry Smith while (nz--) { 848f1af5d2fSBarry Smith oidx = 5*(*vi++); 849f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854f1af5d2fSBarry Smith v += 25; 855f1af5d2fSBarry Smith } 856f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857f1af5d2fSBarry Smith idx += 5; 858f1af5d2fSBarry Smith } 859f1af5d2fSBarry Smith /* backward solve the L^T */ 860f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 861f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 862f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 863f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 864f1af5d2fSBarry Smith idt = 5*i; 865f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866f1af5d2fSBarry Smith while (nz--) { 867f1af5d2fSBarry Smith idx = 5*(*vi--); 868f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873f1af5d2fSBarry Smith v -= 25; 
874f1af5d2fSBarry Smith } 875f1af5d2fSBarry Smith } 876f1af5d2fSBarry Smith 877f1af5d2fSBarry Smith /* copy t into x according to permutation */ 878f1af5d2fSBarry Smith ii = 0; 879f1af5d2fSBarry Smith for (i=0; i<n; i++) { 880f1af5d2fSBarry Smith ir = 5*r[i]; 881f1af5d2fSBarry Smith x[ir] = t[ii]; 882f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 883f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 884f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 885f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 886f1af5d2fSBarry Smith ii += 5; 887f1af5d2fSBarry Smith } 888f1af5d2fSBarry Smith 889f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8911ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894f1af5d2fSBarry Smith PetscFunctionReturn(0); 895f1af5d2fSBarry Smith } 896f1af5d2fSBarry Smith 8974a2ae208SSatish Balay #undef __FUNCT__ 8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900f1af5d2fSBarry Smith { 901f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9036849ba73SBarry Smith PetscErrorCode ierr; 9045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9055d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 907f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 90887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 90987828ca2SBarry Smith PetscScalar *x,*b,*t; 910f1af5d2fSBarry Smith 911f1af5d2fSBarry Smith PetscFunctionBegin; 9121ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
914f1af5d2fSBarry Smith t = a->solve_work; 915f1af5d2fSBarry Smith 916f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918f1af5d2fSBarry Smith 919f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 920f1af5d2fSBarry Smith ii = 0; 921f1af5d2fSBarry Smith for (i=0; i<n; i++) { 922f1af5d2fSBarry Smith ic = 6*c[i]; 923f1af5d2fSBarry Smith t[ii] = b[ic]; 924f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 925f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 926f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 927f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 928f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 929f1af5d2fSBarry Smith ii += 6; 930f1af5d2fSBarry Smith } 931f1af5d2fSBarry Smith 932f1af5d2fSBarry Smith /* forward solve the U^T */ 933f1af5d2fSBarry Smith idx = 0; 934f1af5d2fSBarry Smith for (i=0; i<n; i++) { 935f1af5d2fSBarry Smith 936f1af5d2fSBarry Smith v = aa + 36*diag[i]; 937f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 938f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939f1af5d2fSBarry Smith x6 = t[5+idx]; 940f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946f1af5d2fSBarry Smith v += 36; 947f1af5d2fSBarry Smith 948f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 949f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 950f1af5d2fSBarry Smith while (nz--) { 951f1af5d2fSBarry Smith oidx = 6*(*vi++); 
952f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958f1af5d2fSBarry Smith v += 36; 959f1af5d2fSBarry Smith } 960f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961f1af5d2fSBarry Smith t[5+idx] = s6; 962f1af5d2fSBarry Smith idx += 6; 963f1af5d2fSBarry Smith } 964f1af5d2fSBarry Smith /* backward solve the L^T */ 965f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 966f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 967f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 968f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 969f1af5d2fSBarry Smith idt = 6*i; 970f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971f1af5d2fSBarry Smith s6 = t[5+idt]; 972f1af5d2fSBarry Smith while (nz--) { 973f1af5d2fSBarry Smith idx = 6*(*vi--); 974f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980f1af5d2fSBarry Smith v -= 36; 981f1af5d2fSBarry Smith } 
982f1af5d2fSBarry Smith } 983f1af5d2fSBarry Smith 984f1af5d2fSBarry Smith /* copy t into x according to permutation */ 985f1af5d2fSBarry Smith ii = 0; 986f1af5d2fSBarry Smith for (i=0; i<n; i++) { 987f1af5d2fSBarry Smith ir = 6*r[i]; 988f1af5d2fSBarry Smith x[ir] = t[ii]; 989f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 990f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 991f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 992f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 993f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 994f1af5d2fSBarry Smith ii += 6; 995f1af5d2fSBarry Smith } 996f1af5d2fSBarry Smith 997f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9991ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002f1af5d2fSBarry Smith PetscFunctionReturn(0); 1003f1af5d2fSBarry Smith } 1004f1af5d2fSBarry Smith 10054a2ae208SSatish Balay #undef __FUNCT__ 10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008f1af5d2fSBarry Smith { 1009f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10116849ba73SBarry Smith PetscErrorCode ierr; 10125d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10135d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 101687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 101787828ca2SBarry Smith PetscScalar *x,*b,*t; 1018f1af5d2fSBarry Smith 1019f1af5d2fSBarry Smith PetscFunctionBegin; 10201ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10211ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 1022f1af5d2fSBarry Smith t = a->solve_work; 1023f1af5d2fSBarry Smith 1024f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026f1af5d2fSBarry Smith 1027f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1028f1af5d2fSBarry Smith ii = 0; 1029f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1030f1af5d2fSBarry Smith ic = 7*c[i]; 1031f1af5d2fSBarry Smith t[ii] = b[ic]; 1032f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1033f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1034f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1035f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1036f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1037f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1038f1af5d2fSBarry Smith ii += 7; 1039f1af5d2fSBarry Smith } 1040f1af5d2fSBarry Smith 1041f1af5d2fSBarry Smith /* forward solve the U^T */ 1042f1af5d2fSBarry Smith idx = 0; 1043f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1044f1af5d2fSBarry Smith 1045f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1046f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1047f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1049f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055f1af5d2fSBarry Smith s7 = v[42]*x1 + 
v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056f1af5d2fSBarry Smith v += 49; 1057f1af5d2fSBarry Smith 1058f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1059f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1060f1af5d2fSBarry Smith while (nz--) { 1061f1af5d2fSBarry Smith oidx = 7*(*vi++); 1062f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069f1af5d2fSBarry Smith v += 49; 1070f1af5d2fSBarry Smith } 1071f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1073f1af5d2fSBarry Smith idx += 7; 1074f1af5d2fSBarry Smith } 1075f1af5d2fSBarry Smith /* backward solve the L^T */ 1076f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1077f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1078f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1079f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1080f1af5d2fSBarry Smith idt = 7*i; 1081f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1083f1af5d2fSBarry Smith while (nz--) { 1084f1af5d2fSBarry Smith idx = 7*(*vi--); 1085f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 
1086f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092f1af5d2fSBarry Smith v -= 49; 1093f1af5d2fSBarry Smith } 1094f1af5d2fSBarry Smith } 1095f1af5d2fSBarry Smith 1096f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1097f1af5d2fSBarry Smith ii = 0; 1098f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1099f1af5d2fSBarry Smith ir = 7*r[i]; 1100f1af5d2fSBarry Smith x[ir] = t[ii]; 1101f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1102f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1103f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1104f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1105f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1106f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1107f1af5d2fSBarry Smith ii += 7; 1108f1af5d2fSBarry Smith } 1109f1af5d2fSBarry Smith 1110f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11121ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115f1af5d2fSBarry Smith PetscFunctionReturn(0); 1116f1af5d2fSBarry Smith } 1117f1af5d2fSBarry Smith 11184e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11194a2ae208SSatish Balay #undef __FUNCT__ 
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11224e2b4712SSatish Balay { 11234e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11244e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11256849ba73SBarry Smith PetscErrorCode ierr; 11265d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11275d0c19d7SBarry Smith PetscInt i,n=a->mbs; 11285d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11293f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 113087828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11314e2b4712SSatish Balay 11324e2b4712SSatish Balay PetscFunctionBegin; 11331ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1135f1af5d2fSBarry Smith t = a->solve_work; 11364e2b4712SSatish Balay 11374e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11384e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11394e2b4712SSatish Balay 11404e2b4712SSatish Balay /* forward solve the lower triangular */ 114187828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11424e2b4712SSatish Balay for (i=1; i<n; i++) { 11434e2b4712SSatish Balay v = aa + bs2*ai[i]; 11444e2b4712SSatish Balay vi = aj + ai[i]; 11454e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1146f1af5d2fSBarry Smith s = t + bs*i; 114787828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11484e2b4712SSatish Balay while (nz--) { 1149f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11504e2b4712SSatish Balay v += bs2; 11514e2b4712SSatish Balay } 11524e2b4712SSatish Balay } 11534e2b4712SSatish Balay /* backward solve the upper triangular */ 1154d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 11554e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 
11564e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11574e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11584e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 115987828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11604e2b4712SSatish Balay while (nz--) { 1161f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11624e2b4712SSatish Balay v += bs2; 11634e2b4712SSatish Balay } 1164f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 116587828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11664e2b4712SSatish Balay } 11674e2b4712SSatish Balay 11684e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11694e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11701ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 11734e2b4712SSatish Balay PetscFunctionReturn(0); 11744e2b4712SSatish Balay } 11754e2b4712SSatish Balay 11764a2ae208SSatish Balay #undef __FUNCT__ 11774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1178dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11794e2b4712SSatish Balay { 11804e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11814e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11826849ba73SBarry Smith PetscErrorCode ierr; 11835d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 11845d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 11853f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 118687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 118787828ca2SBarry Smith PetscScalar *x,*b,*t; 11884e2b4712SSatish Balay 11894e2b4712SSatish Balay PetscFunctionBegin; 11901ebc52fbSHong 
Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1192f1af5d2fSBarry Smith t = a->solve_work; 11934e2b4712SSatish Balay 11944e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11954e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11964e2b4712SSatish Balay 11974e2b4712SSatish Balay /* forward solve the lower triangular */ 11984e2b4712SSatish Balay idx = 7*(*r++); 1199f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1200f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1201f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 12024e2b4712SSatish Balay 12034e2b4712SSatish Balay for (i=1; i<n; i++) { 12044e2b4712SSatish Balay v = aa + 49*ai[i]; 12054e2b4712SSatish Balay vi = aj + ai[i]; 12064e2b4712SSatish Balay nz = diag[i] - ai[i]; 12074e2b4712SSatish Balay idx = 7*(*r++); 1208f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1209f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 12104e2b4712SSatish Balay while (nz--) { 12114e2b4712SSatish Balay idx = 7*(*vi++); 1212f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1213f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1214f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1215f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1216f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1217f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1218f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1219f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1220f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1221f1af5d2fSBarry 
Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12224e2b4712SSatish Balay v += 49; 12234e2b4712SSatish Balay } 12244e2b4712SSatish Balay idx = 7*i; 1225f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1226f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1227f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12284e2b4712SSatish Balay } 12294e2b4712SSatish Balay /* backward solve the upper triangular */ 12304e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12314e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12324e2b4712SSatish Balay vi = aj + diag[i] + 1; 12334e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12344e2b4712SSatish Balay idt = 7*i; 1235f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1236f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1237f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12384e2b4712SSatish Balay while (nz--) { 12394e2b4712SSatish Balay idx = 7*(*vi++); 1240f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1241f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1243f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1244f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1245f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1246f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1247f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1248f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1249f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12504e2b4712SSatish Balay v += 49; 12514e2b4712SSatish Balay } 12524e2b4712SSatish Balay idc = 7*(*c--); 
12534e2b4712SSatish Balay v = aa + 49*diag[i]; 1254f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1255f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1256f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1257f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1258f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1259f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1260f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1261f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1262f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1263f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1264f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1265f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1266f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1267f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12684e2b4712SSatish Balay } 12694e2b4712SSatish Balay 12704e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12714e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12721ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12731ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1274dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 12754e2b4712SSatish Balay PetscFunctionReturn(0); 12764e2b4712SSatish Balay } 12774e2b4712SSatish Balay
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering - Forward/backward triangular solve with
   the factored SeqBAIJ matrix A for block size 7, with no row or column
   permutation applied (b and x are indexed directly by 7*i).

   Input:
     A  - matrix whose Mat_SeqBAIJ data (i, j, a, diag, mbs) holds the factors
     bb - right-hand side vector (only read)
   Output:
     xx - solution vector (overwritten)

   Each 7x7 block is addressed column-major: entry (r,c) is v[r + 7*c].
   The forward sweep consumes the blocks ai[i] .. diag[i]-1 of block row i,
   the backward sweep the blocks diag[i]+1 .. ai[i+1]-1, and the block stored
   at diag[i] is applied by multiplication (presumably it already holds the
   inverted diagonal block -- confirm against the factorization routine).

   Returns 0 on success, otherwise the PETSc error code raised via CHKERRQ.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscErrorCode    ierr;
  PetscInt          *diag = a->diag,jdx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  /* block row 0 has no lower-triangular part: copy b straight into x */
  idx  = 0;
  x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
  x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
  x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];          /* blocks strictly left of the diagonal */
    idx = 7*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
    s4  = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    s7  = b[6+idx];
    while (nz--) {
      jdx = 7*(*vi++);
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
      x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
      x7  = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 49*diag[i] + 49;     /* first block right of the diagonal */
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 7*i;
    s1  = x[idt];   s2 = x[1+idt];
    s3  = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    s7  = x[6+idt];
    while (nz--) {
      idx = 7*(*vi++);
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
      x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
      x7  = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    /* apply the block stored at diag[i] to the accumulated right-hand side */
    v        = aa + 49*diag[i];
    x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
                       + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
                       + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
                       + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
                       + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
                       + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
                       + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
                       + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* Block size 7: the constants are 49 (= 7*7) and 7, matching what
     MatSolve_SeqBAIJ_7 logs; the 36/6 used here before were the
     block-size-6 values and under-reported the flop count. */
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
137415091d37SBarry Smith 13754a2ae208SSatish Balay #undef __FUNCT__ 1376cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1377cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1378cee9d6f2SShri Abhyankar { 1379cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1380cee9d6f2SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1381cee9d6f2SShri Abhyankar PetscErrorCode ierr; 1382cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 1383cee9d6f2SShri
Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1384cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 1385cee9d6f2SShri Abhyankar PetscScalar *x; 1386cee9d6f2SShri Abhyankar const PetscScalar *b; 1387cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1388cee9d6f2SShri Abhyankar 1389cee9d6f2SShri Abhyankar PetscFunctionBegin; 1390cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1391cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1392cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 1393cee9d6f2SShri Abhyankar idx = 0; 1394cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1395cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1396cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 1397cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 1398cee9d6f2SShri Abhyankar vi = aj + ai[i]; 1399cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 1400cee9d6f2SShri Abhyankar idx = bs*i; 1401cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1402cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1403cee9d6f2SShri Abhyankar while (nz--) { 1404cee9d6f2SShri Abhyankar jdx = bs*(*vi++); 1405cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1406cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1407cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1408cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1409cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1410cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1411cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1412cee9d6f2SShri Abhyankar s6 -= 
v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1413cee9d6f2SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1414cee9d6f2SShri Abhyankar v += bs2; 1415cee9d6f2SShri Abhyankar } 1416cee9d6f2SShri Abhyankar 1417cee9d6f2SShri Abhyankar x[idx] = s1; 1418cee9d6f2SShri Abhyankar x[1+idx] = s2; 1419cee9d6f2SShri Abhyankar x[2+idx] = s3; 1420cee9d6f2SShri Abhyankar x[3+idx] = s4; 1421cee9d6f2SShri Abhyankar x[4+idx] = s5; 1422cee9d6f2SShri Abhyankar x[5+idx] = s6; 1423cee9d6f2SShri Abhyankar x[6+idx] = s7; 1424cee9d6f2SShri Abhyankar } 1425cee9d6f2SShri Abhyankar 1426cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 1427cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 1428cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 1429cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 1430cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 1431cee9d6f2SShri Abhyankar idt = bs*i; 1432cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1433cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1434cee9d6f2SShri Abhyankar while (nz--) { 1435cee9d6f2SShri Abhyankar idx = bs*(*vi++); 1436cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1437cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1438cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1439cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1440cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1441cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1442cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1443cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 
+ v[47]*x7; 1444cee9d6f2SShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1445cee9d6f2SShri Abhyankar v += bs2; 1446cee9d6f2SShri Abhyankar } 1447cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 1448cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1449cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1450cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1451cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1452cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1453cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1454cee9d6f2SShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1455cee9d6f2SShri Abhyankar } 1456cee9d6f2SShri Abhyankar 1457cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1458cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1459cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1460cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 1461cee9d6f2SShri Abhyankar } 1462cee9d6f2SShri Abhyankar
/*
   MatSolve_SeqBAIJ_6 - Forward/backward triangular solve with the factored
   SeqBAIJ matrix A for block size 6, using the row and column index sets
   stored in the matrix (a->row, a->col).

   Input:
     A  - matrix whose Mat_SeqBAIJ data holds the factors and index sets
     bb - right-hand side vector (only read)
   Output:
     xx - solution vector (overwritten)

   Block row i of the right-hand side is fetched from b at 6*rowidx[i]; the
   running result is kept in a->solve_work at 6*i and the final block is also
   stored into x at 6*colidx[i].  Each 6x6 block is addressed column-major:
   entry (r,c) is blk[r + 6*c].

   Returns 0 on success, otherwise the PETSc error code raised via CHKERRQ.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6"
PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
  IS                colperm = baij->col,rowperm = baij->row;
  PetscErrorCode    ierr;
  const PetscInt    *rowidx,*colidx;
  PetscInt          *diag = baij->diag,*ai = baij->i,*aj = baij->j,*cols;
  PetscInt          row,nrows = baij->mbs,left;
  const MatScalar   *aa = baij->a,*blk;
  const PetscScalar *rhs,*in;
  PetscScalar       *sol,*work,*cur,*out;
  PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = baij->solve_work;

  ierr = ISGetIndices(rowperm,&rowidx);CHKERRQ(ierr);
  ierr = ISGetIndices(colperm,&colidx);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  /* block row 0 has nothing to eliminate: gather it straight into work */
  in      = rhs + 6*rowidx[0];
  work[0] = in[0]; work[1] = in[1];
  work[2] = in[2]; work[3] = in[3];
  work[4] = in[4]; work[5] = in[5];
  for (row=1; row<nrows; row++) {
    blk  = aa + 36*ai[row];
    cols = aj + ai[row];
    in   = rhs + 6*rowidx[row];
    s1   = in[0]; s2 = in[1]; s3 = in[2];
    s4   = in[3]; s5 = in[4]; s6 = in[5];
    /* one pass per block strictly left of the diagonal */
    for (left = diag[row] - ai[row]; left; left--) {
      in  = work + 6*(*cols++);
      x1  = in[0]; x2 = in[1]; x3 = in[2];
      x4  = in[3]; x5 = in[4]; x6 = in[5];
      s1 -= blk[0]*x1 + blk[6]*x2  + blk[12]*x3 + blk[18]*x4 + blk[24]*x5 + blk[30]*x6;
      s2 -= blk[1]*x1 + blk[7]*x2  + blk[13]*x3 + blk[19]*x4 + blk[25]*x5 + blk[31]*x6;
      s3 -= blk[2]*x1 + blk[8]*x2  + blk[14]*x3 + blk[20]*x4 + blk[26]*x5 + blk[32]*x6;
      s4 -= blk[3]*x1 + blk[9]*x2  + blk[15]*x3 + blk[21]*x4 + blk[27]*x5 + blk[33]*x6;
      s5 -= blk[4]*x1 + blk[10]*x2 + blk[16]*x3 + blk[22]*x4 + blk[28]*x5 + blk[34]*x6;
      s6 -= blk[5]*x1 + blk[11]*x2 + blk[17]*x3 + blk[23]*x4 + blk[29]*x5 + blk[35]*x6;
      blk += 36;
    }
    cur    = work + 6*row;
    cur[0] = s1; cur[1] = s2;
    cur[2] = s3; cur[3] = s4;
    cur[4] = s5; cur[5] = s6;
  }

  /* backward solve the upper triangular */
  for (row=nrows-1; row>=0; row--) {
    blk  = aa + 36*diag[row] + 36;   /* first block right of the diagonal */
    cols = aj + diag[row] + 1;
    cur  = work + 6*row;
    s1   = cur[0]; s2 = cur[1];
    s3   = cur[2]; s4 = cur[3];
    s5   = cur[4]; s6 = cur[5];
    for (left = ai[row+1] - diag[row] - 1; left; left--) {
      in  = work + 6*(*cols++);
      x1  = in[0]; x2 = in[1];
      x3  = in[2]; x4 = in[3];
      x5  = in[4]; x6 = in[5];
      s1 -= blk[0]*x1 + blk[6]*x2  + blk[12]*x3 + blk[18]*x4 + blk[24]*x5 + blk[30]*x6;
      s2 -= blk[1]*x1 + blk[7]*x2  + blk[13]*x3 + blk[19]*x4 + blk[25]*x5 + blk[31]*x6;
      s3 -= blk[2]*x1 + blk[8]*x2  + blk[14]*x3 + blk[20]*x4 + blk[26]*x5 + blk[32]*x6;
      s4 -= blk[3]*x1 + blk[9]*x2  + blk[15]*x3 + blk[21]*x4 + blk[27]*x5 + blk[33]*x6;
      s5 -= blk[4]*x1 + blk[10]*x2 + blk[16]*x3 + blk[22]*x4 + blk[28]*x5 + blk[34]*x6;
      s6 -= blk[5]*x1 + blk[11]*x2 + blk[17]*x3 + blk[23]*x4 + blk[29]*x5 + blk[35]*x6;
      blk += 36;
    }
    /* apply the block stored at diag[row]; keep the result in work and
       also store it at the column-index position of the solution */
    out    = sol + 6*colidx[row];
    blk    = aa + 36*diag[row];
    out[0] = cur[0] = blk[0]*s1+blk[6]*s2+blk[12]*s3+
                      blk[18]*s4+blk[24]*s5+blk[30]*s6;
    out[1] = cur[1] = blk[1]*s1+blk[7]*s2+blk[13]*s3+
                      blk[19]*s4+blk[25]*s5+blk[31]*s6;
    out[2] = cur[2] = blk[2]*s1+blk[8]*s2+blk[14]*s3+
                      blk[20]*s4+blk[26]*s5+blk[32]*s6;
    out[3] = cur[3] = blk[3]*s1+blk[9]*s2+blk[15]*s3+
                      blk[21]*s4+blk[27]*s5+blk[33]*s6;
    out[4] = cur[4] = blk[4]*s1+blk[10]*s2+blk[16]*s3+
                      blk[22]*s4+blk[28]*s5+blk[34]*s6;
    out[5] = cur[5] = blk[5]*s1+blk[11]*s2+blk[17]*s3+
                      blk[23]*s4+blk[29]*s5+blk[35]*s6;
  }

  ierr = ISRestoreIndices(rowperm,&rowidx);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colperm,&colidx);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(baij->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
155715091d37SBarry Smith 15584a2ae208SSatish Balay #undef __FUNCT__ 15594a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1560dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 156115091d37SBarry Smith { 156215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1563690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1564dfbe8321SBarry Smith PetscErrorCode ierr; 1565690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1566d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1567d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1568d9fead3dSBarry Smith const PetscScalar *b; 156915091d37SBarry Smith 157015091d37SBarry Smith PetscFunctionBegin; 1571d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15721ebc52fbSHong Zhang
ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 157315091d37SBarry Smith /* forward solve the lower triangular */ 157415091d37SBarry Smith idx = 0; 157515091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 157615091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 157715091d37SBarry Smith for (i=1; i<n; i++) { 157815091d37SBarry Smith v = aa + 36*ai[i]; 157915091d37SBarry Smith vi = aj + ai[i]; 158015091d37SBarry Smith nz = diag[i] - ai[i]; 158115091d37SBarry Smith idx = 6*i; 1582f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1583f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 158415091d37SBarry Smith while (nz--) { 158515091d37SBarry Smith jdx = 6*(*vi++); 158615091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 158715091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1588f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1589f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1590f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1591f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1592f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1593f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 159415091d37SBarry Smith v += 36; 159515091d37SBarry Smith } 1596f1af5d2fSBarry Smith x[idx] = s1; 1597f1af5d2fSBarry Smith x[1+idx] = s2; 1598f1af5d2fSBarry Smith x[2+idx] = s3; 1599f1af5d2fSBarry Smith x[3+idx] = s4; 1600f1af5d2fSBarry Smith x[4+idx] = s5; 1601f1af5d2fSBarry Smith x[5+idx] = s6; 160215091d37SBarry Smith } 160315091d37SBarry Smith /* backward solve the upper triangular */ 160415091d37SBarry Smith for (i=n-1; i>=0; i--){ 160515091d37SBarry Smith v = aa + 36*diag[i] + 36; 160615091d37SBarry Smith vi = aj + diag[i] + 1; 160715091d37SBarry Smith nz 
= ai[i+1] - diag[i] - 1; 160815091d37SBarry Smith idt = 6*i; 1609f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1610f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1611f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 161215091d37SBarry Smith while (nz--) { 161315091d37SBarry Smith idx = 6*(*vi++); 161415091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 161515091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1616f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1617f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1618f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1619f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1620f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1621f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 162215091d37SBarry Smith v += 36; 162315091d37SBarry Smith } 162415091d37SBarry Smith v = aa + 36*diag[i]; 1625f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1626f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1627f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1628f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1629f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1630f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 163115091d37SBarry Smith } 163215091d37SBarry Smith 1633d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16341ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1635dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 
6.0*A->cmap->n);CHKERRQ(ierr); 163615091d37SBarry Smith PetscFunctionReturn(0); 163715091d37SBarry Smith } 163815091d37SBarry Smith 16394a2ae208SSatish Balay #undef __FUNCT__ 1640cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 1641cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1642cee9d6f2SShri Abhyankar { 1643cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1644cee9d6f2SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1645cee9d6f2SShri Abhyankar PetscErrorCode ierr; 1646cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 1647cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1648cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 1649cee9d6f2SShri Abhyankar PetscScalar *x; 1650cee9d6f2SShri Abhyankar const PetscScalar *b; 1651cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1652cee9d6f2SShri Abhyankar 1653cee9d6f2SShri Abhyankar PetscFunctionBegin; 1654cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1655cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1656cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 1657cee9d6f2SShri Abhyankar idx = 0; 1658cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1659cee9d6f2SShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 1660cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 1661cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 1662cee9d6f2SShri Abhyankar vi = aj + ai[i]; 1663cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 1664cee9d6f2SShri Abhyankar idx = bs*i; 1665cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1666cee9d6f2SShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 1667cee9d6f2SShri Abhyankar while (nz--) { 1668cee9d6f2SShri Abhyankar jdx = bs*(*vi++); 1669cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 
1670cee9d6f2SShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 1671cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1672cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1673cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1674cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1675cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1676cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1677cee9d6f2SShri Abhyankar v += bs2; 1678cee9d6f2SShri Abhyankar } 1679cee9d6f2SShri Abhyankar 1680cee9d6f2SShri Abhyankar x[idx] = s1; 1681cee9d6f2SShri Abhyankar x[1+idx] = s2; 1682cee9d6f2SShri Abhyankar x[2+idx] = s3; 1683cee9d6f2SShri Abhyankar x[3+idx] = s4; 1684cee9d6f2SShri Abhyankar x[4+idx] = s5; 1685cee9d6f2SShri Abhyankar x[5+idx] = s6; 1686cee9d6f2SShri Abhyankar } 1687cee9d6f2SShri Abhyankar 1688cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 1689cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 1690cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 1691cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 1692cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 1693cee9d6f2SShri Abhyankar idt = bs*i; 1694cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1695cee9d6f2SShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 1696cee9d6f2SShri Abhyankar while (nz--) { 1697cee9d6f2SShri Abhyankar idx = bs*(*vi++); 1698cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1699cee9d6f2SShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 1700cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1701cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 1702cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + 
v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1703cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1704cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1705cee9d6f2SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1706cee9d6f2SShri Abhyankar v += bs2; 1707cee9d6f2SShri Abhyankar } 1708cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 1709cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1710cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1711cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1712cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1713cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1714cee9d6f2SShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1715cee9d6f2SShri Abhyankar } 1716cee9d6f2SShri Abhyankar 1717cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1718cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1719cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1720cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 1721cee9d6f2SShri Abhyankar } 1722cee9d6f2SShri Abhyankar #undef __FUNCT__ 17234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1724dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 17254e2b4712SSatish Balay { 17264e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 17274e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 17286849ba73SBarry Smith PetscErrorCode ierr; 17295d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 17305d0c19d7SBarry Smith PetscInt 
i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1731d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1732d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1733d9fead3dSBarry Smith const PetscScalar *b; 17344e2b4712SSatish Balay 17354e2b4712SSatish Balay PetscFunctionBegin; 1736d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17371ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1738f1af5d2fSBarry Smith t = a->solve_work; 17394e2b4712SSatish Balay 17404e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 17414e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 17424e2b4712SSatish Balay 17434e2b4712SSatish Balay /* forward solve the lower triangular */ 17444e2b4712SSatish Balay idx = 5*(*r++); 1745f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1746f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 17474e2b4712SSatish Balay for (i=1; i<n; i++) { 17484e2b4712SSatish Balay v = aa + 25*ai[i]; 17494e2b4712SSatish Balay vi = aj + ai[i]; 17504e2b4712SSatish Balay nz = diag[i] - ai[i]; 17514e2b4712SSatish Balay idx = 5*(*r++); 1752f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1753f1af5d2fSBarry Smith s5 = b[4+idx]; 17544e2b4712SSatish Balay while (nz--) { 17554e2b4712SSatish Balay idx = 5*(*vi++); 1756f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1757f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1758f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1759f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1760f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1761f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1762f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 17634e2b4712SSatish Balay v += 25; 17644e2b4712SSatish Balay } 
17654e2b4712SSatish Balay idx = 5*i; 1766f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1767f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 17684e2b4712SSatish Balay } 17694e2b4712SSatish Balay /* backward solve the upper triangular */ 17704e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 17714e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 17724e2b4712SSatish Balay vi = aj + diag[i] + 1; 17734e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 17744e2b4712SSatish Balay idt = 5*i; 1775f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1776f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 17774e2b4712SSatish Balay while (nz--) { 17784e2b4712SSatish Balay idx = 5*(*vi++); 1779f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1780f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1781f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1782f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1783f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1784f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1785f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 17864e2b4712SSatish Balay v += 25; 17874e2b4712SSatish Balay } 17884e2b4712SSatish Balay idc = 5*(*c--); 17894e2b4712SSatish Balay v = aa + 25*diag[i]; 1790f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1791f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 1792f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1793f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 1794f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1795f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 1796f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1797f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 1798f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1799f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 18004e2b4712SSatish Balay 
} 18014e2b4712SSatish Balay 18024e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 18034e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1804d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18051ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1806dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 18074e2b4712SSatish Balay PetscFunctionReturn(0); 18084e2b4712SSatish Balay } 18094e2b4712SSatish Balay 18104a2ae208SSatish Balay #undef __FUNCT__ 18114a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1812dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 181315091d37SBarry Smith { 181415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1815690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1816dfbe8321SBarry Smith PetscErrorCode ierr; 1817690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1818d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1819d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1820d9fead3dSBarry Smith const PetscScalar *b; 182115091d37SBarry Smith 182215091d37SBarry Smith PetscFunctionBegin; 1823d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18241ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 182515091d37SBarry Smith /* forward solve the lower triangular */ 182615091d37SBarry Smith idx = 0; 182715091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 182815091d37SBarry Smith for (i=1; i<n; i++) { 182915091d37SBarry Smith v = aa + 25*ai[i]; 183015091d37SBarry Smith vi = aj + ai[i]; 183115091d37SBarry Smith nz = diag[i] - ai[i]; 183215091d37SBarry Smith idx = 5*i; 1833f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 183415091d37SBarry Smith while (nz--) { 183515091d37SBarry Smith 
jdx = 5*(*vi++); 183615091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1837f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1838f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1839f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1840f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1841f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 184215091d37SBarry Smith v += 25; 184315091d37SBarry Smith } 1844f1af5d2fSBarry Smith x[idx] = s1; 1845f1af5d2fSBarry Smith x[1+idx] = s2; 1846f1af5d2fSBarry Smith x[2+idx] = s3; 1847f1af5d2fSBarry Smith x[3+idx] = s4; 1848f1af5d2fSBarry Smith x[4+idx] = s5; 184915091d37SBarry Smith } 185015091d37SBarry Smith /* backward solve the upper triangular */ 185115091d37SBarry Smith for (i=n-1; i>=0; i--){ 185215091d37SBarry Smith v = aa + 25*diag[i] + 25; 185315091d37SBarry Smith vi = aj + diag[i] + 1; 185415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 185515091d37SBarry Smith idt = 5*i; 1856f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1857f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 185815091d37SBarry Smith while (nz--) { 185915091d37SBarry Smith idx = 5*(*vi++); 186015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1861f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1862f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1863f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1864f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1865f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 186615091d37SBarry Smith v += 25; 186715091d37SBarry Smith } 186815091d37SBarry Smith v = aa + 25*diag[i]; 1869f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + 
v[10]*s3 + v[15]*s4 + v[20]*s5; 1870f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1871f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1872f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1873f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 187415091d37SBarry Smith } 187515091d37SBarry Smith 1876d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18771ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1878dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 187915091d37SBarry Smith PetscFunctionReturn(0); 188015091d37SBarry Smith } 188115091d37SBarry Smith 18824a2ae208SSatish Balay #undef __FUNCT__ 1883cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 1884cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1885cee9d6f2SShri Abhyankar { 1886cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1887cee9d6f2SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1888cee9d6f2SShri Abhyankar PetscErrorCode ierr; 1889cee9d6f2SShri Abhyankar PetscInt jdx; 1890cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 1891cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1892cee9d6f2SShri Abhyankar const PetscScalar *b; 1893cee9d6f2SShri Abhyankar 1894cee9d6f2SShri Abhyankar PetscFunctionBegin; 1895cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1896cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1897cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 1898cee9d6f2SShri Abhyankar idx = 0; 1899cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1900cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 
1901cee9d6f2SShri Abhyankar v = aa + 25*ai[i]; 1902cee9d6f2SShri Abhyankar vi = aj + ai[i]; 1903cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 1904cee9d6f2SShri Abhyankar idx = 5*i; 1905cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1906cee9d6f2SShri Abhyankar while (nz--) { 1907cee9d6f2SShri Abhyankar jdx = 5*(*vi++); 1908cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1909cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1910cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1911cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1912cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1913cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1914cee9d6f2SShri Abhyankar v += 25; 1915cee9d6f2SShri Abhyankar } 1916cee9d6f2SShri Abhyankar x[idx] = s1; 1917cee9d6f2SShri Abhyankar x[1+idx] = s2; 1918cee9d6f2SShri Abhyankar x[2+idx] = s3; 1919cee9d6f2SShri Abhyankar x[3+idx] = s4; 1920cee9d6f2SShri Abhyankar x[4+idx] = s5; 1921cee9d6f2SShri Abhyankar } 1922cee9d6f2SShri Abhyankar 1923cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 1924cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 1925cee9d6f2SShri Abhyankar v = aa + 25*ai[2*n-i]; 1926cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 1927cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 1928cee9d6f2SShri Abhyankar idt = 5*i; 1929cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 1930cee9d6f2SShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1931cee9d6f2SShri Abhyankar while (nz--) { 1932cee9d6f2SShri Abhyankar idx = 5*(*vi++); 1933cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1934cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1935cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + 
v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1936cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1937cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1938cee9d6f2SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1939cee9d6f2SShri Abhyankar v += 25; 1940cee9d6f2SShri Abhyankar } 1941cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 1942cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1943cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1944cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1945cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1946cee9d6f2SShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1947cee9d6f2SShri Abhyankar } 1948cee9d6f2SShri Abhyankar 1949cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1950cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1951cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1952cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 1953cee9d6f2SShri Abhyankar } 1954cee9d6f2SShri Abhyankar 1955cee9d6f2SShri Abhyankar #undef __FUNCT__ 19564a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1957dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 19584e2b4712SSatish Balay { 19594e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 19604e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 19616849ba73SBarry Smith PetscErrorCode ierr; 19625d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 19635d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1964d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1965d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1966d9fead3dSBarry Smith const 
PetscScalar *b; 19674e2b4712SSatish Balay 19684e2b4712SSatish Balay PetscFunctionBegin; 1969d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1971f1af5d2fSBarry Smith t = a->solve_work; 19724e2b4712SSatish Balay 19734e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 19744e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 19754e2b4712SSatish Balay 19764e2b4712SSatish Balay /* forward solve the lower triangular */ 19774e2b4712SSatish Balay idx = 4*(*r++); 1978f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1979f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 19804e2b4712SSatish Balay for (i=1; i<n; i++) { 19814e2b4712SSatish Balay v = aa + 16*ai[i]; 19824e2b4712SSatish Balay vi = aj + ai[i]; 19834e2b4712SSatish Balay nz = diag[i] - ai[i]; 19844e2b4712SSatish Balay idx = 4*(*r++); 1985f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 19864e2b4712SSatish Balay while (nz--) { 19874e2b4712SSatish Balay idx = 4*(*vi++); 1988f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1989f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1990f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1991f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1992f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 19934e2b4712SSatish Balay v += 16; 19944e2b4712SSatish Balay } 19954e2b4712SSatish Balay idx = 4*i; 1996f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1997f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 19984e2b4712SSatish Balay } 19994e2b4712SSatish Balay /* backward solve the upper triangular */ 20004e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 20014e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 20024e2b4712SSatish Balay vi = aj + diag[i] + 1; 20034e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 
20044e2b4712SSatish Balay idt = 4*i; 2005f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2006f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 20074e2b4712SSatish Balay while (nz--) { 20084e2b4712SSatish Balay idx = 4*(*vi++); 2009f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2010f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2011f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2012f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2013f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2014f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 20154e2b4712SSatish Balay v += 16; 20164e2b4712SSatish Balay } 20174e2b4712SSatish Balay idc = 4*(*c--); 20184e2b4712SSatish Balay v = aa + 16*diag[i]; 2019f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2020f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2021f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2022f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 20234e2b4712SSatish Balay } 20244e2b4712SSatish Balay 20254e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 20264e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2027d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20281ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2029dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 20304e2b4712SSatish Balay PetscFunctionReturn(0); 20314e2b4712SSatish Balay } 2032f26ec98cSKris Buschelman 2033f26ec98cSKris Buschelman #undef __FUNCT__ 2034f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2035dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2036f26ec98cSKris Buschelman { 2037f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2038f26ec98cSKris 
Buschelman IS iscol=a->col,isrow=a->row; 20396849ba73SBarry Smith PetscErrorCode ierr; 20405d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 20415d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2042d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2043d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2044d9fead3dSBarry Smith PetscScalar *x; 2045d9fead3dSBarry Smith const PetscScalar *b; 2046f26ec98cSKris Buschelman 2047f26ec98cSKris Buschelman PetscFunctionBegin; 2048d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20491ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2050f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 2051f26ec98cSKris Buschelman 2052f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2053f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2054f26ec98cSKris Buschelman 2055f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2056f26ec98cSKris Buschelman idx = 4*(*r++); 2057f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 2058f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 2059f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 2060f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 2061f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2062f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2063f26ec98cSKris Buschelman vi = aj + ai[i]; 2064f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2065f26ec98cSKris Buschelman idx = 4*(*r++); 2066f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2067f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2068f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2069f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2070f26ec98cSKris Buschelman while (nz--) { 2071f26ec98cSKris Buschelman idx = 4*(*vi++); 2072f26ec98cSKris Buschelman x1 = t[idx]; 2073f26ec98cSKris Buschelman x2 = t[1+idx]; 2074f26ec98cSKris Buschelman x3 = t[2+idx]; 
2075f26ec98cSKris Buschelman x4 = t[3+idx]; 2076f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2077f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2078f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2079f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2080f26ec98cSKris Buschelman v += 16; 2081f26ec98cSKris Buschelman } 2082f26ec98cSKris Buschelman idx = 4*i; 2083f26ec98cSKris Buschelman t[idx] = s1; 2084f26ec98cSKris Buschelman t[1+idx] = s2; 2085f26ec98cSKris Buschelman t[2+idx] = s3; 2086f26ec98cSKris Buschelman t[3+idx] = s4; 2087f26ec98cSKris Buschelman } 2088f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2089f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2090f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 2091f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2092f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2093f26ec98cSKris Buschelman idt = 4*i; 2094f26ec98cSKris Buschelman s1 = t[idt]; 2095f26ec98cSKris Buschelman s2 = t[1+idt]; 2096f26ec98cSKris Buschelman s3 = t[2+idt]; 2097f26ec98cSKris Buschelman s4 = t[3+idt]; 2098f26ec98cSKris Buschelman while (nz--) { 2099f26ec98cSKris Buschelman idx = 4*(*vi++); 2100f26ec98cSKris Buschelman x1 = t[idx]; 2101f26ec98cSKris Buschelman x2 = t[1+idx]; 2102f26ec98cSKris Buschelman x3 = t[2+idx]; 2103f26ec98cSKris Buschelman x4 = t[3+idx]; 2104f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2105f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2106f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2107f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2108f26ec98cSKris Buschelman v += 16; 2109f26ec98cSKris Buschelman } 2110f26ec98cSKris Buschelman idc = 4*(*c--); 2111f26ec98cSKris Buschelman v = aa + 16*diag[i]; 2112f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2113f26ec98cSKris Buschelman 
t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2114f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2115f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2116f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 2117f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 2118f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 2119f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 2120f26ec98cSKris Buschelman } 2121f26ec98cSKris Buschelman 2122f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2123f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2124d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21251ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2126dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2127f26ec98cSKris Buschelman PetscFunctionReturn(0); 2128f26ec98cSKris Buschelman } 2129f26ec98cSKris Buschelman 213024c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 213124c233c2SKris Buschelman 213224c233c2SKris Buschelman #include PETSC_HAVE_SSE 213324c233c2SKris Buschelman 213424c233c2SKris Buschelman #undef __FUNCT__ 213524c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2136dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 213724c233c2SKris Buschelman { 213824c233c2SKris Buschelman /* 213924c233c2SKris Buschelman Note: This code uses demotion of double 214024c233c2SKris Buschelman to float when performing the mixed-mode computation. 214124c233c2SKris Buschelman This may not be numerically reasonable for all applications. 
214224c233c2SKris Buschelman */ 214324c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 214424c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 21456849ba73SBarry Smith PetscErrorCode ierr; 21465d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 21475d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 214824c233c2SKris Buschelman MatScalar *aa=a->a,*v; 214987828ca2SBarry Smith PetscScalar *x,*b,*t; 215024c233c2SKris Buschelman 215124c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 215224c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 215324c233c2SKris Buschelman unsigned long offset; 215424c233c2SKris Buschelman 215524c233c2SKris Buschelman PetscFunctionBegin; 215624c233c2SKris Buschelman SSE_SCOPE_BEGIN; 215724c233c2SKris Buschelman 215824c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 215924c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 216024c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 216124c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 216224c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 216324c233c2SKris Buschelman 21641ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 21651ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 216624c233c2SKris Buschelman t = a->solve_work; 216724c233c2SKris Buschelman 216824c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 216924c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 217024c233c2SKris Buschelman 217124c233c2SKris Buschelman /* forward solve the lower triangular */ 217224c233c2SKris Buschelman idx = 4*(*r++); 217324c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 217424c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 217524c233c2SKris Buschelman v = aa + 16*ai[1]; 217624c233c2SKris Buschelman 217724c233c2SKris Buschelman for (i=1; i<n;) { 
217824c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 217924c233c2SKris Buschelman vi = aj + ai[i]; 218024c233c2SKris Buschelman nz = diag[i] - ai[i]; 218124c233c2SKris Buschelman idx = 4*(*r++); 218224c233c2SKris Buschelman 218324c233c2SKris Buschelman /* Demote sum from double to float */ 218424c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 218524c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 218624c233c2SKris Buschelman 218724c233c2SKris Buschelman while (nz--) { 218824c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 218924c233c2SKris Buschelman idx = 4*(*vi++); 219024c233c2SKris Buschelman 219124c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 219224c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 219324c233c2SKris Buschelman 219424c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 219524c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 219624c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 219724c233c2SKris Buschelman 219824c233c2SKris Buschelman /* First Column */ 219924c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 220024c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 220124c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 220224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 220324c233c2SKris Buschelman 220424c233c2SKris Buschelman /* Second Column */ 220524c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 220624c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 220724c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 220824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 220924c233c2SKris Buschelman 221024c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 221124c233c2SKris Buschelman 221224c233c2SKris Buschelman /* Third Column */ 221324c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 221424c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 221524c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 221624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 
221724c233c2SKris Buschelman 221824c233c2SKris Buschelman /* Fourth Column */ 221924c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 222024c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 222124c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 222224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 222324c233c2SKris Buschelman SSE_INLINE_END_2 222424c233c2SKris Buschelman 222524c233c2SKris Buschelman v += 16; 222624c233c2SKris Buschelman } 222724c233c2SKris Buschelman idx = 4*i; 222824c233c2SKris Buschelman v = aa + 16*ai[++i]; 222924c233c2SKris Buschelman PREFETCH_NTA(v); 223024c233c2SKris Buschelman STORE_PS(tmps,XMM7); 223124c233c2SKris Buschelman 223224c233c2SKris Buschelman /* Promote result from float to double */ 223324c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 223424c233c2SKris Buschelman } 223524c233c2SKris Buschelman /* backward solve the upper triangular */ 223624c233c2SKris Buschelman idt = 4*(n-1); 223724c233c2SKris Buschelman ai16 = 16*diag[n-1]; 223824c233c2SKris Buschelman v = aa + ai16 + 16; 223924c233c2SKris Buschelman for (i=n-1; i>=0;){ 224024c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 224124c233c2SKris Buschelman vi = aj + diag[i] + 1; 224224c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 224324c233c2SKris Buschelman 224424c233c2SKris Buschelman /* Demote accumulator from double to float */ 224524c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 224624c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 224724c233c2SKris Buschelman 224824c233c2SKris Buschelman while (nz--) { 224924c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 225024c233c2SKris Buschelman idx = 4*(*vi++); 225124c233c2SKris Buschelman 225224c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 225324c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 225424c233c2SKris Buschelman 225524c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 225624c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 
225724c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 225824c233c2SKris Buschelman 225924c233c2SKris Buschelman /* First Column */ 226024c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 226124c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 226224c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 226324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 226424c233c2SKris Buschelman 226524c233c2SKris Buschelman /* Second Column */ 226624c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 226724c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 226824c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 226924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 227024c233c2SKris Buschelman 227124c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 227224c233c2SKris Buschelman 227324c233c2SKris Buschelman /* Third Column */ 227424c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 227524c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 227624c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 227724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 227824c233c2SKris Buschelman 227924c233c2SKris Buschelman /* Fourth Column */ 228024c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 228124c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 228224c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 228324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 228424c233c2SKris Buschelman SSE_INLINE_END_2 228524c233c2SKris Buschelman v += 16; 228624c233c2SKris Buschelman } 228724c233c2SKris Buschelman v = aa + ai16; 228824c233c2SKris Buschelman ai16 = 16*diag[--i]; 228924c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 229024c233c2SKris Buschelman /* 229124c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 229224c233c2SKris Buschelman which was inverted as part of the factorization 229324c233c2SKris Buschelman */ 229424c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 229524c233c2SKris Buschelman /* First Column */ 229624c233c2SKris Buschelman 
SSE_COPY_PS(XMM0,XMM7) 229724c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 229824c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 229924c233c2SKris Buschelman 230024c233c2SKris Buschelman /* Second Column */ 230124c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 230224c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 230324c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 230424c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 230524c233c2SKris Buschelman 230624c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 230724c233c2SKris Buschelman 230824c233c2SKris Buschelman /* Third Column */ 230924c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 231024c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 231124c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 231224c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 231324c233c2SKris Buschelman 231424c233c2SKris Buschelman /* Fourth Column */ 231524c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 231624c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 231724c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 231824c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 231924c233c2SKris Buschelman 232024c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 232124c233c2SKris Buschelman SSE_INLINE_END_3 232224c233c2SKris Buschelman 232324c233c2SKris Buschelman /* Promote solution from float to double */ 232424c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 232524c233c2SKris Buschelman 232624c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 232724c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 232824c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! 
*/ 232924c233c2SKris Buschelman idc = 4*(*c--); 233024c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 233124c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 233224c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 233324c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 233424c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 233524c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 233624c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 233724c233c2SKris Buschelman SSE_INLINE_END_2 233824c233c2SKris Buschelman v = aa + ai16 + 16; 233924c233c2SKris Buschelman idt -= 4; 234024c233c2SKris Buschelman } 234124c233c2SKris Buschelman 234224c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 234324c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 23441ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 23451ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2346dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 234724c233c2SKris Buschelman SSE_SCOPE_END; 234824c233c2SKris Buschelman PetscFunctionReturn(0); 234924c233c2SKris Buschelman } 235024c233c2SKris Buschelman 235124c233c2SKris Buschelman #endif 23520ef38995SBarry Smith 23530ef38995SBarry Smith 23544e2b4712SSatish Balay /* 23554e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 23564e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 
23574e2b4712SSatish Balay */ 23584a2ae208SSatish Balay #undef __FUNCT__ 23594a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2360dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 23614e2b4712SSatish Balay { 23624e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2363356650c2SBarry Smith PetscInt n=a->mbs; 2364356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 2365dfbe8321SBarry Smith PetscErrorCode ierr; 2366356650c2SBarry Smith const PetscInt *diag = a->diag; 2367d9fead3dSBarry Smith const MatScalar *aa=a->a; 2368d9fead3dSBarry Smith PetscScalar *x; 2369d9fead3dSBarry Smith const PetscScalar *b; 23704e2b4712SSatish Balay 23714e2b4712SSatish Balay PetscFunctionBegin; 2372d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23744e2b4712SSatish Balay 2375aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 23762853dc0eSBarry Smith { 237787828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 23782853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 23792853dc0eSBarry Smith } 2380aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 23812853dc0eSBarry Smith { 238287828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 23832853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 23842853dc0eSBarry Smith } 2385aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 23862853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2387e1293385SBarry Smith #else 238830d4dcafSBarry Smith { 238987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2390d9fead3dSBarry Smith const MatScalar *v; 2391356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 2392356650c2SBarry Smith const PetscInt *vi; 2393e1293385SBarry Smith 23944e2b4712SSatish Balay /* forward solve the lower 
triangular */ 23954e2b4712SSatish Balay idx = 0; 2396e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 23974e2b4712SSatish Balay for (i=1; i<n; i++) { 23984e2b4712SSatish Balay v = aa + 16*ai[i]; 23994e2b4712SSatish Balay vi = aj + ai[i]; 24004e2b4712SSatish Balay nz = diag[i] - ai[i]; 2401e1293385SBarry Smith idx += 4; 2402f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 24034e2b4712SSatish Balay while (nz--) { 24044e2b4712SSatish Balay jdx = 4*(*vi++); 24054e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2406f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2407f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2408f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2409f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 24104e2b4712SSatish Balay v += 16; 24114e2b4712SSatish Balay } 2412f1af5d2fSBarry Smith x[idx] = s1; 2413f1af5d2fSBarry Smith x[1+idx] = s2; 2414f1af5d2fSBarry Smith x[2+idx] = s3; 2415f1af5d2fSBarry Smith x[3+idx] = s4; 24164e2b4712SSatish Balay } 24174e2b4712SSatish Balay /* backward solve the upper triangular */ 24184e555682SBarry Smith idt = 4*(n-1); 24194e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 24204e555682SBarry Smith ai16 = 16*diag[i]; 24214e555682SBarry Smith v = aa + ai16 + 16; 24224e2b4712SSatish Balay vi = aj + diag[i] + 1; 24234e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2424f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2425f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 24264e2b4712SSatish Balay while (nz--) { 24274e2b4712SSatish Balay idx = 4*(*vi++); 24284e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2429f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2430f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2431f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2432f1af5d2fSBarry Smith s4 -= 
v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 24334e2b4712SSatish Balay v += 16; 24344e2b4712SSatish Balay } 24354e555682SBarry Smith v = aa + ai16; 2436f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2437f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2438f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2439f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2440329f5518SBarry Smith idt -= 4; 24414e2b4712SSatish Balay } 244230d4dcafSBarry Smith } 2443e1293385SBarry Smith #endif 24444e2b4712SSatish Balay 2445d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24461ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2447dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 24484e2b4712SSatish Balay PetscFunctionReturn(0); 24494e2b4712SSatish Balay } 24504e2b4712SSatish Balay 2451f26ec98cSKris Buschelman #undef __FUNCT__ 2452cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 2453cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2454cee9d6f2SShri Abhyankar { 2455cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2456cee9d6f2SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2457cee9d6f2SShri Abhyankar PetscErrorCode ierr; 2458cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 2459cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2460cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 2461cee9d6f2SShri Abhyankar PetscScalar *x; 2462cee9d6f2SShri Abhyankar const PetscScalar *b; 2463cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2464cee9d6f2SShri Abhyankar 2465cee9d6f2SShri Abhyankar PetscFunctionBegin; 2466cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2467cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
2468cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 2469cee9d6f2SShri Abhyankar idx = 0; 2470cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2471cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 2472cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 2473cee9d6f2SShri Abhyankar vi = aj + ai[i]; 2474cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 2475cee9d6f2SShri Abhyankar idx = bs*i; 2476cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2477cee9d6f2SShri Abhyankar while (nz--) { 2478cee9d6f2SShri Abhyankar jdx = bs*(*vi++); 2479cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2480cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2481cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2482cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2483cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2484cee9d6f2SShri Abhyankar 2485cee9d6f2SShri Abhyankar v += bs2; 2486cee9d6f2SShri Abhyankar } 2487cee9d6f2SShri Abhyankar 2488cee9d6f2SShri Abhyankar x[idx] = s1; 2489cee9d6f2SShri Abhyankar x[1+idx] = s2; 2490cee9d6f2SShri Abhyankar x[2+idx] = s3; 2491cee9d6f2SShri Abhyankar x[3+idx] = s4; 2492cee9d6f2SShri Abhyankar } 2493cee9d6f2SShri Abhyankar 2494cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 2495cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 2496cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 2497cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 2498cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 2499cee9d6f2SShri Abhyankar idt = bs*i; 2500cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2501cee9d6f2SShri Abhyankar 2502cee9d6f2SShri Abhyankar while (nz--) { 2503cee9d6f2SShri Abhyankar idx = bs*(*vi++); 2504cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2505cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + 
v[8]*x3 + v[12]*x4; 2506cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2507cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2508cee9d6f2SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2509cee9d6f2SShri Abhyankar 2510cee9d6f2SShri Abhyankar v += bs2; 2511cee9d6f2SShri Abhyankar } 2512cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 2513cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2514cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 2515cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2516cee9d6f2SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2517cee9d6f2SShri Abhyankar 2518cee9d6f2SShri Abhyankar } 2519cee9d6f2SShri Abhyankar 2520cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2521cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2522cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2523cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 2524cee9d6f2SShri Abhyankar } 2525cee9d6f2SShri Abhyankar 2526cee9d6f2SShri Abhyankar 2527cee9d6f2SShri Abhyankar 2528cee9d6f2SShri Abhyankar #undef __FUNCT__ 2529f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2530dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2531f26ec98cSKris Buschelman { 2532f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2533690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2534dfbe8321SBarry Smith PetscErrorCode ierr; 2535690b6cddSBarry Smith PetscInt *diag = a->diag; 2536f26ec98cSKris Buschelman MatScalar *aa=a->a; 2537f26ec98cSKris Buschelman PetscScalar *x,*b; 2538f26ec98cSKris Buschelman 2539f26ec98cSKris Buschelman PetscFunctionBegin; 25401ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25411ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 2542f26ec98cSKris Buschelman 2543f26ec98cSKris Buschelman { 2544f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2545f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 2546690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2547f26ec98cSKris Buschelman 2548f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2549f26ec98cSKris Buschelman idx = 0; 2550f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 2551f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 2552f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 2553f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 2554f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2555f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2556f26ec98cSKris Buschelman vi = aj + ai[i]; 2557f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2558f26ec98cSKris Buschelman idx += 4; 2559f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2560f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2561f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2562f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2563f26ec98cSKris Buschelman while (nz--) { 2564f26ec98cSKris Buschelman jdx = 4*(*vi++); 2565f26ec98cSKris Buschelman x1 = t[jdx]; 2566f26ec98cSKris Buschelman x2 = t[1+jdx]; 2567f26ec98cSKris Buschelman x3 = t[2+jdx]; 2568f26ec98cSKris Buschelman x4 = t[3+jdx]; 2569f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2570f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2571f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2572f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2573f26ec98cSKris Buschelman v += 16; 2574f26ec98cSKris Buschelman } 2575f26ec98cSKris Buschelman t[idx] = s1; 2576f26ec98cSKris Buschelman t[1+idx] = s2; 2577f26ec98cSKris Buschelman t[2+idx] = s3; 2578f26ec98cSKris Buschelman t[3+idx] = s4; 2579f26ec98cSKris Buschelman } 2580f26ec98cSKris Buschelman /* backward solve the upper triangular */ 
2581f26ec98cSKris Buschelman idt = 4*(n-1); 2582f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2583f26ec98cSKris Buschelman ai16 = 16*diag[i]; 2584f26ec98cSKris Buschelman v = aa + ai16 + 16; 2585f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2586f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2587f26ec98cSKris Buschelman s1 = t[idt]; 2588f26ec98cSKris Buschelman s2 = t[1+idt]; 2589f26ec98cSKris Buschelman s3 = t[2+idt]; 2590f26ec98cSKris Buschelman s4 = t[3+idt]; 2591f26ec98cSKris Buschelman while (nz--) { 2592f26ec98cSKris Buschelman idx = 4*(*vi++); 2593f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 2594f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 2595f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 2596f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 2597f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2598f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2599f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2600f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2601f26ec98cSKris Buschelman v += 16; 2602f26ec98cSKris Buschelman } 2603f26ec98cSKris Buschelman v = aa + ai16; 2604f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2605f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2606f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2607f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2608f26ec98cSKris Buschelman idt -= 4; 2609f26ec98cSKris Buschelman } 2610f26ec98cSKris Buschelman } 2611f26ec98cSKris Buschelman 26121ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 26131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2614dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2615f26ec98cSKris Buschelman PetscFunctionReturn(0); 
2616f26ec98cSKris Buschelman } 2617f26ec98cSKris Buschelman 26183660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 26193660e330SKris Buschelman 26203660e330SKris Buschelman #include PETSC_HAVE_SSE 26213660e330SKris Buschelman #undef __FUNCT__ 26227cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2623dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 26243660e330SKris Buschelman { 26253660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 26262aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 2627dfbe8321SBarry Smith PetscErrorCode ierr; 2628dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 26293660e330SKris Buschelman MatScalar *aa=a->a; 263087828ca2SBarry Smith PetscScalar *x,*b; 26313660e330SKris Buschelman 26323660e330SKris Buschelman PetscFunctionBegin; 26333660e330SKris Buschelman SSE_SCOPE_BEGIN; 26343660e330SKris Buschelman /* 26353660e330SKris Buschelman Note: This code currently uses demotion of double 26363660e330SKris Buschelman to float when performing the mixed-mode computation. 26373660e330SKris Buschelman This may not be numerically reasonable for all applications. 26383660e330SKris Buschelman */ 26393660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 26403660e330SKris Buschelman 26411ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 26421ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26433660e330SKris Buschelman { 2644eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 2645eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 26462aa5897fSKris Buschelman int nz,i,idt,ai16; 26472aa5897fSKris Buschelman unsigned int jdx,idx; 26482aa5897fSKris Buschelman unsigned short *vi; 2649eb05f457SKris Buschelman /* Forward solve the lower triangular factor. 
*/ 26503660e330SKris Buschelman 2651eb05f457SKris Buschelman /* First block is the identity. */ 26523660e330SKris Buschelman idx = 0; 2653eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 26542aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 26553660e330SKris Buschelman 26563660e330SKris Buschelman for (i=1; i<n;) { 26573660e330SKris Buschelman PREFETCH_NTA(&v[8]); 26583660e330SKris Buschelman vi = aj + ai[i]; 26593660e330SKris Buschelman nz = diag[i] - ai[i]; 26603660e330SKris Buschelman idx += 4; 26613660e330SKris Buschelman 2662eb05f457SKris Buschelman /* Demote RHS from double to float. */ 2663eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2664eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 26653660e330SKris Buschelman 26663660e330SKris Buschelman while (nz--) { 26673660e330SKris Buschelman PREFETCH_NTA(&v[16]); 26682aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 26693660e330SKris Buschelman 26703660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 2671eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 26723660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 26733660e330SKris Buschelman 26743660e330SKris Buschelman /* First Column */ 26753660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 26763660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 26773660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 26783660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 26793660e330SKris Buschelman 26803660e330SKris Buschelman /* Second Column */ 26813660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 26823660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26833660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 26843660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 26853660e330SKris Buschelman 26863660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 26873660e330SKris Buschelman 26883660e330SKris Buschelman /* Third Column */ 26893660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 
26903660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26913660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 26923660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 26933660e330SKris Buschelman 26943660e330SKris Buschelman /* Fourth Column */ 26953660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 26963660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26973660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 26983660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 26993660e330SKris Buschelman SSE_INLINE_END_2 27003660e330SKris Buschelman 27013660e330SKris Buschelman v += 16; 27023660e330SKris Buschelman } 27033660e330SKris Buschelman v = aa + 16*ai[++i]; 27043660e330SKris Buschelman PREFETCH_NTA(v); 2705eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 27063660e330SKris Buschelman } 2707eb05f457SKris Buschelman 2708eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 2709eb05f457SKris Buschelman 27103660e330SKris Buschelman idt = 4*(n-1); 27113660e330SKris Buschelman ai16 = 16*diag[n-1]; 27123660e330SKris Buschelman v = aa + ai16 + 16; 27133660e330SKris Buschelman for (i=n-1; i>=0;){ 27143660e330SKris Buschelman PREFETCH_NTA(&v[8]); 27153660e330SKris Buschelman vi = aj + diag[i] + 1; 27163660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 27173660e330SKris Buschelman 2718eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 27193660e330SKris Buschelman 27203660e330SKris Buschelman while (nz--) { 27213660e330SKris Buschelman PREFETCH_NTA(&v[16]); 27222aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 27233660e330SKris Buschelman 27243660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 2725eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 27263660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 27273660e330SKris Buschelman 27283660e330SKris Buschelman /* First Column */ 27293660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 27303660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 
27313660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 27323660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 27333660e330SKris Buschelman 27343660e330SKris Buschelman /* Second Column */ 27353660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 27363660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 27373660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 27383660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 27393660e330SKris Buschelman 27403660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 27413660e330SKris Buschelman 27423660e330SKris Buschelman /* Third Column */ 27433660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 27443660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 27453660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 27463660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 27473660e330SKris Buschelman 27483660e330SKris Buschelman /* Fourth Column */ 27493660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 27503660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 27513660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 27523660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 27533660e330SKris Buschelman SSE_INLINE_END_2 27543660e330SKris Buschelman v += 16; 27553660e330SKris Buschelman } 27563660e330SKris Buschelman v = aa + ai16; 27573660e330SKris Buschelman ai16 = 16*diag[--i]; 27583660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 27593660e330SKris Buschelman /* 27603660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 27613660e330SKris Buschelman which was inverted as part of the factorization 27623660e330SKris Buschelman */ 2763eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 27643660e330SKris Buschelman /* First Column */ 27653660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 27663660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 27673660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 27683660e330SKris Buschelman 27693660e330SKris Buschelman /* Second Column */ 27703660e330SKris 
Buschelman SSE_COPY_PS(XMM1,XMM7) 27713660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 27723660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 27733660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 27743660e330SKris Buschelman 27753660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 27763660e330SKris Buschelman 27773660e330SKris Buschelman /* Third Column */ 27783660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 27793660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 27803660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 27813660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 27823660e330SKris Buschelman 27833660e330SKris Buschelman /* Fourth Column */ 27843660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 27853660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 27863660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 27873660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 27883660e330SKris Buschelman 27893660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 27903660e330SKris Buschelman SSE_INLINE_END_3 27913660e330SKris Buschelman 27923660e330SKris Buschelman v = aa + ai16 + 16; 27933660e330SKris Buschelman idt -= 4; 27943660e330SKris Buschelman } 2795eb05f457SKris Buschelman 2796eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 2797eb05f457SKris Buschelman idt = 4*(n-1); 2798eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 2799eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2800eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ 2801eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 2802eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 2803eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 2804eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 2805eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 2806eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 280754693613SKris Buschelman idt -= 4; 28083660e330SKris Buschelman } 2809eb05f457SKris Buschelman 2810eb05f457SKris Buschelman } /* End of artificial scope. */ 28111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 28121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2813dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 28143660e330SKris Buschelman SSE_SCOPE_END; 28153660e330SKris Buschelman PetscFunctionReturn(0); 28163660e330SKris Buschelman } 28173660e330SKris Buschelman 28187cf1b8d3SKris Buschelman #undef __FUNCT__ 28197cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2820dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 28217cf1b8d3SKris Buschelman { 28227cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 28237cf1b8d3SKris Buschelman int *aj=a->j; 2824dfbe8321SBarry Smith PetscErrorCode ierr; 2825dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 28267cf1b8d3SKris Buschelman MatScalar *aa=a->a; 28277cf1b8d3SKris Buschelman PetscScalar *x,*b; 28287cf1b8d3SKris Buschelman 28297cf1b8d3SKris Buschelman PetscFunctionBegin; 28307cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 28317cf1b8d3SKris Buschelman /* 28327cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 28337cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 28347cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
28357cf1b8d3SKris Buschelman */ 28367cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 28377cf1b8d3SKris Buschelman 28381ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 28391ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28407cf1b8d3SKris Buschelman { 28417cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 28427cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 28437cf1b8d3SKris Buschelman int nz,i,idt,ai16; 28447cf1b8d3SKris Buschelman int jdx,idx; 28457cf1b8d3SKris Buschelman int *vi; 28467cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 28477cf1b8d3SKris Buschelman 28487cf1b8d3SKris Buschelman /* First block is the identity. */ 28497cf1b8d3SKris Buschelman idx = 0; 28507cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 28517cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 28527cf1b8d3SKris Buschelman 28537cf1b8d3SKris Buschelman for (i=1; i<n;) { 28547cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 28557cf1b8d3SKris Buschelman vi = aj + ai[i]; 28567cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 28577cf1b8d3SKris Buschelman idx += 4; 28587cf1b8d3SKris Buschelman 28597cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 28607cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 28617cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 28627cf1b8d3SKris Buschelman 28637cf1b8d3SKris Buschelman while (nz--) { 28647cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 28657cf1b8d3SKris Buschelman jdx = 4*(*vi++); 28667cf1b8d3SKris Buschelman /* jdx = *vi++; */ 28677cf1b8d3SKris Buschelman 28687cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 28697cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 28707cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 28717cf1b8d3SKris Buschelman 28727cf1b8d3SKris Buschelman /* First Column */ 28737cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 28747cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 28757cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 28767cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 28777cf1b8d3SKris Buschelman 28787cf1b8d3SKris Buschelman /* Second Column */ 28797cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 28807cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 28817cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 28827cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 28837cf1b8d3SKris Buschelman 28847cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 28857cf1b8d3SKris Buschelman 28867cf1b8d3SKris Buschelman /* Third Column */ 28877cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 28887cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 28897cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 28907cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 28917cf1b8d3SKris Buschelman 28927cf1b8d3SKris Buschelman /* Fourth Column */ 28937cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 28947cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 28957cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 28967cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 28977cf1b8d3SKris Buschelman SSE_INLINE_END_2 28987cf1b8d3SKris Buschelman 28997cf1b8d3SKris 
Buschelman v += 16; 29007cf1b8d3SKris Buschelman } 29017cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 29027cf1b8d3SKris Buschelman PREFETCH_NTA(v); 29037cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 29047cf1b8d3SKris Buschelman } 29057cf1b8d3SKris Buschelman 29067cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 29077cf1b8d3SKris Buschelman 29087cf1b8d3SKris Buschelman idt = 4*(n-1); 29097cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 29107cf1b8d3SKris Buschelman v = aa + ai16 + 16; 29117cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 29127cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 29137cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 29147cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 29157cf1b8d3SKris Buschelman 29167cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 29177cf1b8d3SKris Buschelman 29187cf1b8d3SKris Buschelman while (nz--) { 29197cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 29207cf1b8d3SKris Buschelman idx = 4*(*vi++); 29217cf1b8d3SKris Buschelman /* idx = *vi++; */ 29227cf1b8d3SKris Buschelman 29237cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 29247cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 29257cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 29267cf1b8d3SKris Buschelman 29277cf1b8d3SKris Buschelman /* First Column */ 29287cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 29297cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 29307cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 29317cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 29327cf1b8d3SKris Buschelman 29337cf1b8d3SKris Buschelman /* Second Column */ 29347cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 29357cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 29367cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 29377cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 29387cf1b8d3SKris Buschelman 29397cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 29407cf1b8d3SKris Buschelman 
29417cf1b8d3SKris Buschelman /* Third Column */ 29427cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 29437cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 29447cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 29457cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 29467cf1b8d3SKris Buschelman 29477cf1b8d3SKris Buschelman /* Fourth Column */ 29487cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 29497cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 29507cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 29517cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 29527cf1b8d3SKris Buschelman SSE_INLINE_END_2 29537cf1b8d3SKris Buschelman v += 16; 29547cf1b8d3SKris Buschelman } 29557cf1b8d3SKris Buschelman v = aa + ai16; 29567cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 29577cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 29587cf1b8d3SKris Buschelman /* 29597cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 29607cf1b8d3SKris Buschelman which was inverted as part of the factorization 29617cf1b8d3SKris Buschelman */ 29627cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 29637cf1b8d3SKris Buschelman /* First Column */ 29647cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 29657cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 29667cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 29677cf1b8d3SKris Buschelman 29687cf1b8d3SKris Buschelman /* Second Column */ 29697cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 29707cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 29717cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 29727cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 29737cf1b8d3SKris Buschelman 29747cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 29757cf1b8d3SKris Buschelman 29767cf1b8d3SKris Buschelman /* Third Column */ 29777cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 29787cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 29797cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 29807cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 29817cf1b8d3SKris Buschelman 29827cf1b8d3SKris Buschelman /* Fourth Column */ 29837cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 29847cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 29857cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 29867cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 29877cf1b8d3SKris Buschelman 29887cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 29897cf1b8d3SKris Buschelman SSE_INLINE_END_3 29907cf1b8d3SKris Buschelman 29917cf1b8d3SKris Buschelman v = aa + ai16 + 16; 29927cf1b8d3SKris Buschelman idt -= 4; 29937cf1b8d3SKris Buschelman } 29947cf1b8d3SKris Buschelman 29957cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 29967cf1b8d3SKris Buschelman idt = 4*(n-1); 29977cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 29987cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 29997cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 30007cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 30017cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 30027cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 30037cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 30047cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 30057cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 30067cf1b8d3SKris Buschelman idt -= 4; 30077cf1b8d3SKris Buschelman } 30087cf1b8d3SKris Buschelman 30097cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 30101ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 30111ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3012dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 30137cf1b8d3SKris Buschelman SSE_SCOPE_END; 30147cf1b8d3SKris Buschelman PetscFunctionReturn(0); 30157cf1b8d3SKris Buschelman } 30167cf1b8d3SKris Buschelman 30173660e330SKris Buschelman #endif 30184a2ae208SSatish Balay #undef __FUNCT__ 30194a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3020dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 30214e2b4712SSatish Balay { 30224e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 30234e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 30246849ba73SBarry Smith PetscErrorCode ierr; 30255d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 30265d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3027d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3028d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3029d9fead3dSBarry Smith const PetscScalar *b; 30304e2b4712SSatish Balay 30314e2b4712SSatish Balay PetscFunctionBegin; 3032d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30331ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3034f1af5d2fSBarry Smith t = a->solve_work; 30354e2b4712SSatish Balay 30364e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 30374e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 30384e2b4712SSatish Balay 30394e2b4712SSatish Balay /* forward solve the lower triangular */ 30404e2b4712SSatish Balay idx = 3*(*r++); 3041f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 30424e2b4712SSatish Balay for (i=1; i<n; i++) { 30434e2b4712SSatish Balay v = aa + 9*ai[i]; 30444e2b4712SSatish Balay vi = aj + ai[i]; 30454e2b4712SSatish Balay nz = diag[i] - ai[i]; 
30464e2b4712SSatish Balay idx = 3*(*r++); 3047f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 30484e2b4712SSatish Balay while (nz--) { 30494e2b4712SSatish Balay idx = 3*(*vi++); 3050f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3051f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3052f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3053f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 30544e2b4712SSatish Balay v += 9; 30554e2b4712SSatish Balay } 30564e2b4712SSatish Balay idx = 3*i; 3057f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 30584e2b4712SSatish Balay } 30594e2b4712SSatish Balay /* backward solve the upper triangular */ 30604e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 30614e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 30624e2b4712SSatish Balay vi = aj + diag[i] + 1; 30634e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 30644e2b4712SSatish Balay idt = 3*i; 3065f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 30664e2b4712SSatish Balay while (nz--) { 30674e2b4712SSatish Balay idx = 3*(*vi++); 3068f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3069f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3070f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3071f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 30724e2b4712SSatish Balay v += 9; 30734e2b4712SSatish Balay } 30744e2b4712SSatish Balay idc = 3*(*c--); 30754e2b4712SSatish Balay v = aa + 9*diag[i]; 3076f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3077f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3078f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 30794e2b4712SSatish Balay } 30804e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 30814e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3082d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 
30831ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3084dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 30854e2b4712SSatish Balay PetscFunctionReturn(0); 30864e2b4712SSatish Balay } 30874e2b4712SSatish Balay 308815091d37SBarry Smith /* 308915091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 309015091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 309115091d37SBarry Smith */ 30924a2ae208SSatish Balay #undef __FUNCT__ 30934a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 3094dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 309515091d37SBarry Smith { 309615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3097690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3098dfbe8321SBarry Smith PetscErrorCode ierr; 3099690b6cddSBarry Smith PetscInt *diag = a->diag; 3100d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3101d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 3102d9fead3dSBarry Smith const PetscScalar *b; 3103690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 310415091d37SBarry Smith 310515091d37SBarry Smith PetscFunctionBegin; 3106d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 31071ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 310815091d37SBarry Smith 310915091d37SBarry Smith /* forward solve the lower triangular */ 311015091d37SBarry Smith idx = 0; 311115091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 311215091d37SBarry Smith for (i=1; i<n; i++) { 311315091d37SBarry Smith v = aa + 9*ai[i]; 311415091d37SBarry Smith vi = aj + ai[i]; 311515091d37SBarry Smith nz = diag[i] - ai[i]; 311615091d37SBarry Smith idx += 3; 3117f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 311815091d37SBarry Smith while (nz--) { 311915091d37SBarry Smith jdx = 3*(*vi++); 312015091d37SBarry Smith x1 = 
x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 3121f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3122f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3123f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 312415091d37SBarry Smith v += 9; 312515091d37SBarry Smith } 3126f1af5d2fSBarry Smith x[idx] = s1; 3127f1af5d2fSBarry Smith x[1+idx] = s2; 3128f1af5d2fSBarry Smith x[2+idx] = s3; 312915091d37SBarry Smith } 313015091d37SBarry Smith /* backward solve the upper triangular */ 313115091d37SBarry Smith for (i=n-1; i>=0; i--){ 313215091d37SBarry Smith v = aa + 9*diag[i] + 9; 313315091d37SBarry Smith vi = aj + diag[i] + 1; 313415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 313515091d37SBarry Smith idt = 3*i; 3136f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3137f1af5d2fSBarry Smith s3 = x[2+idt]; 313815091d37SBarry Smith while (nz--) { 313915091d37SBarry Smith idx = 3*(*vi++); 314015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 3141f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3142f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3143f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 314415091d37SBarry Smith v += 9; 314515091d37SBarry Smith } 314615091d37SBarry Smith v = aa + 9*diag[i]; 3147f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3148f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3149f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 315015091d37SBarry Smith } 315115091d37SBarry Smith 3152d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 31531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3154dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 315515091d37SBarry Smith PetscFunctionReturn(0); 315615091d37SBarry Smith } 315715091d37SBarry Smith 31584a2ae208SSatish Balay #undef __FUNCT__ 3159cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 
3160cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3161cee9d6f2SShri Abhyankar { 3162cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3163cee9d6f2SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3164cee9d6f2SShri Abhyankar PetscErrorCode ierr; 3165cee9d6f2SShri Abhyankar PetscInt idx,jdx,idt; 3166cee9d6f2SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3167cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 3168cee9d6f2SShri Abhyankar PetscScalar *x; 3169cee9d6f2SShri Abhyankar const PetscScalar *b; 3170cee9d6f2SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 3171cee9d6f2SShri Abhyankar 3172cee9d6f2SShri Abhyankar PetscFunctionBegin; 3173cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3174cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3175cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 3176cee9d6f2SShri Abhyankar idx = 0; 3177cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 3178cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 3179cee9d6f2SShri Abhyankar v = aa + bs2*ai[i]; 3180cee9d6f2SShri Abhyankar vi = aj + ai[i]; 3181cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 3182cee9d6f2SShri Abhyankar idx = bs*i; 3183cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3184cee9d6f2SShri Abhyankar while (nz--) { 3185cee9d6f2SShri Abhyankar jdx = bs*(*vi++); 3186cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 3187cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3188cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3189cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3190cee9d6f2SShri Abhyankar 3191cee9d6f2SShri Abhyankar v += bs2; 3192cee9d6f2SShri Abhyankar } 3193cee9d6f2SShri Abhyankar 3194cee9d6f2SShri Abhyankar x[idx] = s1; 3195cee9d6f2SShri Abhyankar x[1+idx] = s2; 3196cee9d6f2SShri Abhyankar x[2+idx] = s3; 3197cee9d6f2SShri 
Abhyankar } 3198cee9d6f2SShri Abhyankar 3199cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 3200cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 3201cee9d6f2SShri Abhyankar v = aa + bs2*ai[2*n-i]; 3202cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 3203cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 3204cee9d6f2SShri Abhyankar idt = bs*i; 3205cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 3206cee9d6f2SShri Abhyankar 3207cee9d6f2SShri Abhyankar while (nz--) { 3208cee9d6f2SShri Abhyankar idx = bs*(*vi++); 3209cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3210cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3211cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3212cee9d6f2SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3213cee9d6f2SShri Abhyankar 3214cee9d6f2SShri Abhyankar v += bs2; 3215cee9d6f2SShri Abhyankar } 3216cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 3217cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3218cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3219cee9d6f2SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3220cee9d6f2SShri Abhyankar 3221cee9d6f2SShri Abhyankar } 3222cee9d6f2SShri Abhyankar 3223cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3224cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3225cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3226cee9d6f2SShri Abhyankar PetscFunctionReturn(0); 3227cee9d6f2SShri Abhyankar } 3228cee9d6f2SShri Abhyankar 3229cee9d6f2SShri Abhyankar #undef __FUNCT__ 32304a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 3231dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 32324e2b4712SSatish Balay { 32334e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 32344e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 32356849ba73SBarry Smith PetscErrorCode 
ierr; 32365d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 32375d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3238d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3239d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 3240d9fead3dSBarry Smith const PetscScalar *b; 32414e2b4712SSatish Balay 32424e2b4712SSatish Balay PetscFunctionBegin; 3243d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 32441ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3245f1af5d2fSBarry Smith t = a->solve_work; 32464e2b4712SSatish Balay 32474e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 32484e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 32494e2b4712SSatish Balay 32504e2b4712SSatish Balay /* forward solve the lower triangular */ 32514e2b4712SSatish Balay idx = 2*(*r++); 3252f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 32534e2b4712SSatish Balay for (i=1; i<n; i++) { 32544e2b4712SSatish Balay v = aa + 4*ai[i]; 32554e2b4712SSatish Balay vi = aj + ai[i]; 32564e2b4712SSatish Balay nz = diag[i] - ai[i]; 32574e2b4712SSatish Balay idx = 2*(*r++); 3258f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 32594e2b4712SSatish Balay while (nz--) { 32604e2b4712SSatish Balay idx = 2*(*vi++); 3261f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3262f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3263f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 32644e2b4712SSatish Balay v += 4; 32654e2b4712SSatish Balay } 32664e2b4712SSatish Balay idx = 2*i; 3267f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 32684e2b4712SSatish Balay } 32694e2b4712SSatish Balay /* backward solve the upper triangular */ 32704e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 32714e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 32724e2b4712SSatish Balay vi = aj + diag[i] + 1; 32734e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 32744e2b4712SSatish Balay idt = 2*i; 3275f1af5d2fSBarry 
Smith s1 = t[idt]; s2 = t[1+idt]; 32764e2b4712SSatish Balay while (nz--) { 32774e2b4712SSatish Balay idx = 2*(*vi++); 3278f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3279f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3280f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 32814e2b4712SSatish Balay v += 4; 32824e2b4712SSatish Balay } 32834e2b4712SSatish Balay idc = 2*(*c--); 32844e2b4712SSatish Balay v = aa + 4*diag[i]; 3285f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3286f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 32874e2b4712SSatish Balay } 32884e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 32894e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3290d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 32911ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3292dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 32934e2b4712SSatish Balay PetscFunctionReturn(0); 32944e2b4712SSatish Balay } 32954e2b4712SSatish Balay 329615091d37SBarry Smith /* 329715091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 329815091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
329915091d37SBarry Smith */ 33004a2ae208SSatish Balay #undef __FUNCT__ 33014a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 3302dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 330315091d37SBarry Smith { 330415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3305690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3306dfbe8321SBarry Smith PetscErrorCode ierr; 3307690b6cddSBarry Smith PetscInt *diag = a->diag; 3308d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3309d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 3310d9fead3dSBarry Smith const PetscScalar *b; 3311690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 331215091d37SBarry Smith 331315091d37SBarry Smith PetscFunctionBegin; 3314d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 33151ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 331615091d37SBarry Smith 331715091d37SBarry Smith /* forward solve the lower triangular */ 331815091d37SBarry Smith idx = 0; 331915091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 332015091d37SBarry Smith for (i=1; i<n; i++) { 332115091d37SBarry Smith v = aa + 4*ai[i]; 332215091d37SBarry Smith vi = aj + ai[i]; 332315091d37SBarry Smith nz = diag[i] - ai[i]; 332415091d37SBarry Smith idx += 2; 3325f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 332615091d37SBarry Smith while (nz--) { 332715091d37SBarry Smith jdx = 2*(*vi++); 332815091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 3329f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3330f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 333115091d37SBarry Smith v += 4; 333215091d37SBarry Smith } 3333f1af5d2fSBarry Smith x[idx] = s1; 3334f1af5d2fSBarry Smith x[1+idx] = s2; 333515091d37SBarry Smith } 333615091d37SBarry Smith /* backward solve the upper triangular */ 333715091d37SBarry Smith for (i=n-1; i>=0; i--){ 333815091d37SBarry Smith v = aa + 4*diag[i] + 4; 333915091d37SBarry Smith vi = aj + diag[i] + 1; 334015091d37SBarry 
Smith nz = ai[i+1] - diag[i] - 1; 334115091d37SBarry Smith idt = 2*i; 3342f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 334315091d37SBarry Smith while (nz--) { 334415091d37SBarry Smith idx = 2*(*vi++); 334515091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 3346f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3347f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 334815091d37SBarry Smith v += 4; 334915091d37SBarry Smith } 335015091d37SBarry Smith v = aa + 4*diag[i]; 3351f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 3352f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 335315091d37SBarry Smith } 335415091d37SBarry Smith 3355d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 33561ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3357dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 335815091d37SBarry Smith PetscFunctionReturn(0); 335915091d37SBarry Smith } 336015091d37SBarry Smith 33614a2ae208SSatish Balay #undef __FUNCT__ 3362cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 3363cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3364cee9d6f2SShri Abhyankar { 3365cee9d6f2SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3366cee9d6f2SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 3367cee9d6f2SShri Abhyankar PetscErrorCode ierr; 3368cee9d6f2SShri Abhyankar PetscInt jdx; 3369cee9d6f2SShri Abhyankar const MatScalar *aa=a->a,*v; 3370cee9d6f2SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 3371cee9d6f2SShri Abhyankar const PetscScalar *b; 3372cee9d6f2SShri Abhyankar 3373cee9d6f2SShri Abhyankar PetscFunctionBegin; 3374cee9d6f2SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3375cee9d6f2SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3376cee9d6f2SShri Abhyankar /* forward solve the lower triangular */ 3377cee9d6f2SShri Abhyankar idx = 0; 
3378cee9d6f2SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 3379cee9d6f2SShri Abhyankar for (i=1; i<n; i++) { 3380cee9d6f2SShri Abhyankar v = aa + 4*ai[i]; 3381cee9d6f2SShri Abhyankar vi = aj + ai[i]; 3382cee9d6f2SShri Abhyankar nz = ai[i+1] - ai[i]; 3383cee9d6f2SShri Abhyankar idx = 2*i; 3384cee9d6f2SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 3385cee9d6f2SShri Abhyankar while (nz--) { 3386cee9d6f2SShri Abhyankar jdx = 2*(*vi++); 3387cee9d6f2SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 3388cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 3389cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 3390cee9d6f2SShri Abhyankar v += 4; 3391cee9d6f2SShri Abhyankar } 3392cee9d6f2SShri Abhyankar x[idx] = s1; 3393cee9d6f2SShri Abhyankar x[1+idx] = s2; 3394cee9d6f2SShri Abhyankar } 3395cee9d6f2SShri Abhyankar 3396cee9d6f2SShri Abhyankar /* backward solve the upper triangular */ 3397cee9d6f2SShri Abhyankar for (i=n-1; i>=0; i--){ 3398cee9d6f2SShri Abhyankar v = aa + 4*ai[2*n-i]; 3399cee9d6f2SShri Abhyankar vi = aj + ai[2*n-i]; 3400cee9d6f2SShri Abhyankar nz = ai[2*n-i +1] - ai[2*n-i]-1; 3401cee9d6f2SShri Abhyankar idt = 2*i; 3402cee9d6f2SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 3403cee9d6f2SShri Abhyankar while (nz--) { 3404cee9d6f2SShri Abhyankar idx = 2*(*vi++); 3405cee9d6f2SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 3406cee9d6f2SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 3407cee9d6f2SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 3408cee9d6f2SShri Abhyankar v += 4; 3409cee9d6f2SShri Abhyankar } 3410cee9d6f2SShri Abhyankar /* x = inv_diagonal*x */ 3411cee9d6f2SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 3412cee9d6f2SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 3413cee9d6f2SShri Abhyankar } 3414cee9d6f2SShri Abhyankar 3415cee9d6f2SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3416cee9d6f2SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3417cee9d6f2SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3418cee9d6f2SShri 
Abhyankar PetscFunctionReturn(0); 3419cee9d6f2SShri Abhyankar } 3420cee9d6f2SShri Abhyankar 3421cee9d6f2SShri Abhyankar #undef __FUNCT__ 34224a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 3423dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 34244e2b4712SSatish Balay { 34254e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 34264e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 34276849ba73SBarry Smith PetscErrorCode ierr; 34285d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 34295d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 34303f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 343187828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 34324e2b4712SSatish Balay 34334e2b4712SSatish Balay PetscFunctionBegin; 34344e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 34354e2b4712SSatish Balay 34361ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 34371ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3438f1af5d2fSBarry Smith t = a->solve_work; 34394e2b4712SSatish Balay 34404e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 34414e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 34424e2b4712SSatish Balay 34434e2b4712SSatish Balay /* forward solve the lower triangular */ 3444f1af5d2fSBarry Smith t[0] = b[*r++]; 34454e2b4712SSatish Balay for (i=1; i<n; i++) { 34464e2b4712SSatish Balay v = aa + ai[i]; 34474e2b4712SSatish Balay vi = aj + ai[i]; 34484e2b4712SSatish Balay nz = diag[i] - ai[i]; 3449f1af5d2fSBarry Smith s1 = b[*r++]; 34504e2b4712SSatish Balay while (nz--) { 3451f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 34524e2b4712SSatish Balay } 3453f1af5d2fSBarry Smith t[i] = s1; 34544e2b4712SSatish Balay } 34554e2b4712SSatish Balay /* backward solve the upper triangular */ 34564e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 34574e2b4712SSatish Balay v = aa + diag[i] + 1; 34584e2b4712SSatish Balay vi = aj + diag[i] + 
1; 34594e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3460f1af5d2fSBarry Smith s1 = t[i]; 34614e2b4712SSatish Balay while (nz--) { 3462f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 34634e2b4712SSatish Balay } 3464f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 34654e2b4712SSatish Balay } 34664e2b4712SSatish Balay 34674e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 34684e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 34691ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 34701ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3471dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 34724e2b4712SSatish Balay PetscFunctionReturn(0); 34734e2b4712SSatish Balay } 347415091d37SBarry Smith /* 347515091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 347615091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 347715091d37SBarry Smith */ 34784a2ae208SSatish Balay #undef __FUNCT__ 34794a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3480dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 348115091d37SBarry Smith { 348215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3483690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3484dfbe8321SBarry Smith PetscErrorCode ierr; 3485690b6cddSBarry Smith PetscInt *diag = a->diag; 348615091d37SBarry Smith MatScalar *aa=a->a; 348787828ca2SBarry Smith PetscScalar *x,*b; 348887828ca2SBarry Smith PetscScalar s1,x1; 348915091d37SBarry Smith MatScalar *v; 3490690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 349115091d37SBarry Smith 349215091d37SBarry Smith PetscFunctionBegin; 34931ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 34941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 349515091d37SBarry Smith 349615091d37SBarry Smith /* forward solve the 
lower triangular */ 349715091d37SBarry Smith idx = 0; 349815091d37SBarry Smith x[0] = b[0]; 349915091d37SBarry Smith for (i=1; i<n; i++) { 350015091d37SBarry Smith v = aa + ai[i]; 350115091d37SBarry Smith vi = aj + ai[i]; 350215091d37SBarry Smith nz = diag[i] - ai[i]; 350315091d37SBarry Smith idx += 1; 3504f1af5d2fSBarry Smith s1 = b[idx]; 350515091d37SBarry Smith while (nz--) { 350615091d37SBarry Smith jdx = *vi++; 350715091d37SBarry Smith x1 = x[jdx]; 3508f1af5d2fSBarry Smith s1 -= v[0]*x1; 350915091d37SBarry Smith v += 1; 351015091d37SBarry Smith } 3511f1af5d2fSBarry Smith x[idx] = s1; 351215091d37SBarry Smith } 351315091d37SBarry Smith /* backward solve the upper triangular */ 351415091d37SBarry Smith for (i=n-1; i>=0; i--){ 351515091d37SBarry Smith v = aa + diag[i] + 1; 351615091d37SBarry Smith vi = aj + diag[i] + 1; 351715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 351815091d37SBarry Smith idt = i; 3519f1af5d2fSBarry Smith s1 = x[idt]; 352015091d37SBarry Smith while (nz--) { 352115091d37SBarry Smith idx = *vi++; 352215091d37SBarry Smith x1 = x[idx]; 3523f1af5d2fSBarry Smith s1 -= v[0]*x1; 352415091d37SBarry Smith v += 1; 352515091d37SBarry Smith } 352615091d37SBarry Smith v = aa + diag[i]; 3527f1af5d2fSBarry Smith x[idt] = v[0]*s1; 352815091d37SBarry Smith } 35291ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 35301ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3531dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 353215091d37SBarry Smith PetscFunctionReturn(0); 353315091d37SBarry Smith } 35344e2b4712SSatish Balay 35354e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 353616a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 35376bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 35386bce7ff8SHong Zhang 353984a281e5SHong Zhang extern PetscErrorCode 
MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 35406bce7ff8SHong Zhang #undef __FUNCT__ 35416bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 35426bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 35436bce7ff8SHong Zhang { 35446bce7ff8SHong Zhang Mat C=B; 35456bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 35466bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 35476bce7ff8SHong Zhang PetscErrorCode ierr; 35486bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 35496bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 35506bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 3551914a18a2SHong Zhang MatScalar *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a; 3552914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 3553914a18a2SHong Zhang MatScalar *v_work; 35546bce7ff8SHong Zhang 35556bce7ff8SHong Zhang PetscFunctionBegin; 35566bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 35576bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3558914a18a2SHong Zhang ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 3559914a18a2SHong Zhang ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 35606bce7ff8SHong Zhang ics = ic; 35616bce7ff8SHong Zhang 3562914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 3563914a18a2SHong Zhang ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 3564914a18a2SHong Zhang multiplier = v_work + bs; 3565914a18a2SHong Zhang v_pivots = (PetscInt*)(multiplier + bs2); 3566914a18a2SHong Zhang 35676bce7ff8SHong Zhang for (i=0; i<n; i++){ 35686bce7ff8SHong Zhang /* zero rtmp */ 35696bce7ff8SHong Zhang /* L part */ 35706bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 35716bce7ff8SHong Zhang bjtmp = bj + bi[i]; 3572914a18a2SHong Zhang 
for (j=0; j<nz; j++){ 3573914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3574914a18a2SHong Zhang } 35756bce7ff8SHong Zhang 35766bce7ff8SHong Zhang /* U part */ 35776bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i]; 35786bce7ff8SHong Zhang bjtmp = bj + bi[2*n-i]; 3579914a18a2SHong Zhang for (j=0; j<nz; j++){ 3580914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3581914a18a2SHong Zhang } 35826bce7ff8SHong Zhang 35836bce7ff8SHong Zhang /* load in initial (unfactored row) */ 35846bce7ff8SHong Zhang nz = ai[r[i]+1] - ai[r[i]]; 35856bce7ff8SHong Zhang ajtmp = aj + ai[r[i]]; 3586914a18a2SHong Zhang v = aa + bs2*ai[r[i]]; 35876bce7ff8SHong Zhang for (j=0; j<nz; j++) { 3588914a18a2SHong Zhang ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 35896bce7ff8SHong Zhang } 35906bce7ff8SHong Zhang 35916bce7ff8SHong Zhang /* elimination */ 35926bce7ff8SHong Zhang bjtmp = bj + bi[i]; 35936bce7ff8SHong Zhang row = *bjtmp++; 35946bce7ff8SHong Zhang nzL = bi[i+1] - bi[i]; 35956bce7ff8SHong Zhang k = 0; 35966bce7ff8SHong Zhang while (k < nzL) { 3597914a18a2SHong Zhang pc = rtmp + bs2*row; 3598914a18a2SHong Zhang for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 3599914a18a2SHong Zhang if (flg) { 3600914a18a2SHong Zhang pv = b->a + bs2*bdiag[row]; 3601914a18a2SHong Zhang Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */ 36026bce7ff8SHong Zhang pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 3603914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-row]; 36046bce7ff8SHong Zhang nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 3605914a18a2SHong Zhang for (j=0; j<nz; j++) { 3606914a18a2SHong Zhang Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 3607914a18a2SHong Zhang } 36086bce7ff8SHong Zhang ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 36096bce7ff8SHong Zhang } 36106bce7ff8SHong 
Zhang row = *bjtmp++; k++; 36116bce7ff8SHong Zhang } 36126bce7ff8SHong Zhang 36136bce7ff8SHong Zhang /* finished row so stick it into b->a */ 36146bce7ff8SHong Zhang /* L part */ 3615914a18a2SHong Zhang pv = b->a + bs2*bi[i] ; 36166bce7ff8SHong Zhang pj = b->j + bi[i] ; 36176bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 36186bce7ff8SHong Zhang for (j=0; j<nz; j++) { 3619914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 36206bce7ff8SHong Zhang } 36216bce7ff8SHong Zhang 36226bce7ff8SHong Zhang /* Mark diagonal and invert diagonal for simplier triangular solves */ 3623914a18a2SHong Zhang pv = b->a + bs2*bdiag[i]; 36246bce7ff8SHong Zhang pj = b->j + bdiag[i]; 3625914a18a2SHong Zhang /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 3626914a18a2SHong Zhang ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3627914a18a2SHong Zhang ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 36286bce7ff8SHong Zhang 36296bce7ff8SHong Zhang /* U part */ 3630914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-i]; 36316bce7ff8SHong Zhang pj = b->j + bi[2*n-i]; 36326bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i] - 1; 3633914a18a2SHong Zhang for (j=0; j<nz; j++){ 3634914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3635914a18a2SHong Zhang } 36366bce7ff8SHong Zhang } 36376bce7ff8SHong Zhang 36386bce7ff8SHong Zhang ierr = PetscFree(rtmp);CHKERRQ(ierr); 36396bce7ff8SHong Zhang ierr = PetscFree(v_work);CHKERRQ(ierr); 36406bce7ff8SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 36416bce7ff8SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 364227019359SHong Zhang 36436bce7ff8SHong Zhang C->assembled = PETSC_TRUE; 3644914a18a2SHong Zhang ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 36456bce7ff8SHong Zhang PetscFunctionReturn(0); 36466bce7ff8SHong Zhang } 36476bce7ff8SHong 
Zhang 36486bce7ff8SHong Zhang /* 36496bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 365016a2bf60SHong Zhang See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 365116a2bf60SHong Zhang because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 36526bce7ff8SHong Zhang */ 36536bce7ff8SHong Zhang #undef __FUNCT__ 36546bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 36556bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 36566bce7ff8SHong Zhang { 36576bce7ff8SHong Zhang 36586bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 36596bce7ff8SHong Zhang PetscErrorCode ierr; 366016a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 366116a2bf60SHong Zhang PetscInt i,j,nz,*bi,*bj,*bdiag; 36626bce7ff8SHong Zhang 36636bce7ff8SHong Zhang PetscFunctionBegin; 366416a2bf60SHong Zhang /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */ 366516a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 36666bce7ff8SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 366716a2bf60SHong Zhang 366816a2bf60SHong Zhang /* allocate matrix arrays for new data structure */ 366916a2bf60SHong Zhang ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr); 367016a2bf60SHong Zhang ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr); 367116a2bf60SHong Zhang b->singlemalloc = PETSC_TRUE; 367216a2bf60SHong Zhang if (!b->diag){ 367316a2bf60SHong Zhang ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 367416a2bf60SHong Zhang } 3675914a18a2SHong Zhang bdiag = b->diag; 36766bce7ff8SHong Zhang 367716a2bf60SHong Zhang if (n > 0) { 367816a2bf60SHong Zhang ierr = 
PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 36796bce7ff8SHong Zhang } 36806bce7ff8SHong Zhang 36816bce7ff8SHong Zhang /* set bi and bj with new data structure */ 36826bce7ff8SHong Zhang bi = b->i; 36836bce7ff8SHong Zhang bj = b->j; 36846bce7ff8SHong Zhang 36856bce7ff8SHong Zhang /* L part */ 36866bce7ff8SHong Zhang bi[0] = 0; 368716a2bf60SHong Zhang for (i=0; i<n; i++){ 36886bce7ff8SHong Zhang nz = adiag[i] - ai[i]; 3689914a18a2SHong Zhang bi[i+1] = bi[i] + nz; 36906bce7ff8SHong Zhang aj = a->j + ai[i]; 36916bce7ff8SHong Zhang for (j=0; j<nz; j++){ 36926bce7ff8SHong Zhang *bj = aj[j]; bj++; 36936bce7ff8SHong Zhang } 36946bce7ff8SHong Zhang } 36956bce7ff8SHong Zhang 36966bce7ff8SHong Zhang /* U part */ 369716a2bf60SHong Zhang bi[n+1] = bi[n]; 369816a2bf60SHong Zhang for (i=n-1; i>=0; i--){ 36996bce7ff8SHong Zhang nz = ai[i+1] - adiag[i] - 1; 370016a2bf60SHong Zhang bi[2*n-i+1] = bi[2*n-i] + nz + 1; 37016bce7ff8SHong Zhang aj = a->j + adiag[i] + 1; 37026bce7ff8SHong Zhang for (j=0; j<nz; j++){ 37036bce7ff8SHong Zhang *bj = aj[j]; bj++; 37046bce7ff8SHong Zhang } 37056bce7ff8SHong Zhang /* diag[i] */ 37066bce7ff8SHong Zhang *bj = i; bj++; 370716a2bf60SHong Zhang bdiag[i] = bi[2*n-i+1]-1; 37086bce7ff8SHong Zhang } 37096bce7ff8SHong Zhang PetscFunctionReturn(0); 37106bce7ff8SHong Zhang } 37116bce7ff8SHong Zhang 371216a2bf60SHong Zhang extern PetscErrorCode PetscFreeSpaceContiguous_newdatastruct(PetscFreeSpaceList *,PetscInt *,PetscInt,PetscInt *,PetscInt *); 371316a2bf60SHong Zhang #undef __FUNCT__ 371416a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 371516a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 371616a2bf60SHong Zhang { 371716a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 371816a2bf60SHong Zhang IS isicol; 371916a2bf60SHong Zhang PetscErrorCode ierr; 372016a2bf60SHong Zhang const PetscInt *r,*ic; 
3721*7fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 372216a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 372316a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 372416a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 3725*7fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 372616a2bf60SHong Zhang PetscReal f; 372716a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 372816a2bf60SHong Zhang PetscBT lnkbt; 372916a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 373016a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 373116a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 373216a2bf60SHong Zhang PetscTruth missing; 3733*7fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 373416a2bf60SHong Zhang 373516a2bf60SHong Zhang PetscFunctionBegin; 373616a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 373716a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 373816a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 373916a2bf60SHong Zhang 374016a2bf60SHong Zhang f = info->fill; 374116a2bf60SHong Zhang levels = (PetscInt)info->levels; 374216a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 374316a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 374416a2bf60SHong Zhang 374516a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 374616a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3747*7fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 374816a2bf60SHong Zhang 3749*7fa3a6a0SHong Zhang if (!levels && both_identity) { 375016a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 375116a2bf60SHong Zhang ierr = 
MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 375216a2bf60SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 3753*7fa3a6a0SHong Zhang /* set MatSolve routines */ 3754*7fa3a6a0SHong Zhang switch (bs){ 3755*7fa3a6a0SHong Zhang case 2: 3756*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 3757*7fa3a6a0SHong Zhang break; 3758*7fa3a6a0SHong Zhang case 3: 3759*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 3760*7fa3a6a0SHong Zhang break; 3761*7fa3a6a0SHong Zhang case 4: 3762*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 3763*7fa3a6a0SHong Zhang break; 3764*7fa3a6a0SHong Zhang case 5: 3765*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 3766*7fa3a6a0SHong Zhang break; 3767*7fa3a6a0SHong Zhang case 6: 3768*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 3769*7fa3a6a0SHong Zhang break; 3770*7fa3a6a0SHong Zhang case 7: 3771*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 3772*7fa3a6a0SHong Zhang break; 3773*7fa3a6a0SHong Zhang default: 3774*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 3775*7fa3a6a0SHong Zhang break; 3776*7fa3a6a0SHong Zhang } 377716a2bf60SHong Zhang 377816a2bf60SHong Zhang fact->factor = MAT_FACTOR_ILU; 377916a2bf60SHong Zhang (fact)->info.factor_mallocs = 0; 378016a2bf60SHong Zhang (fact)->info.fill_ratio_given = info->fill; 378116a2bf60SHong Zhang (fact)->info.fill_ratio_needed = 1.0; 378216a2bf60SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 378316a2bf60SHong Zhang b->row = isrow; 378416a2bf60SHong Zhang b->col = iscol; 378516a2bf60SHong Zhang b->icol = isicol; 378616a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 378716a2bf60SHong Zhang ierr = 
PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 378816a2bf60SHong Zhang b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 378916a2bf60SHong Zhang ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 379016a2bf60SHong Zhang PetscFunctionReturn(0); 379116a2bf60SHong Zhang } 379216a2bf60SHong Zhang 379316a2bf60SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 379416a2bf60SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 379516a2bf60SHong Zhang 379616a2bf60SHong Zhang /* get new row pointers */ 379716a2bf60SHong Zhang ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 379816a2bf60SHong Zhang bi[0] = 0; 379916a2bf60SHong Zhang /* bdiag is location of diagonal in factor */ 380016a2bf60SHong Zhang ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 380116a2bf60SHong Zhang bdiag[0] = 0; 380216a2bf60SHong Zhang 380316a2bf60SHong Zhang ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr); 380416a2bf60SHong Zhang bjlvl_ptr = (PetscInt**)(bj_ptr + n); 380516a2bf60SHong Zhang 380616a2bf60SHong Zhang /* create a linked list for storing column indices of the active row */ 380716a2bf60SHong Zhang nlnk = n + 1; 380816a2bf60SHong Zhang ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 380916a2bf60SHong Zhang 381016a2bf60SHong Zhang /* initial FreeSpace size is f*(ai[n]+1) */ 381116a2bf60SHong Zhang ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 381216a2bf60SHong Zhang current_space = free_space; 381316a2bf60SHong Zhang ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 381416a2bf60SHong Zhang current_space_lvl = free_space_lvl; 381516a2bf60SHong Zhang 381616a2bf60SHong Zhang for (i=0; i<n; i++) { 381716a2bf60SHong Zhang nzi = 0; 381816a2bf60SHong Zhang /* copy current row into linked list */ 381916a2bf60SHong Zhang nnz = ai[r[i]+1] - ai[r[i]]; 382016a2bf60SHong 
Zhang if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 382116a2bf60SHong Zhang cols = aj + ai[r[i]]; 382216a2bf60SHong Zhang lnk[i] = -1; /* marker to indicate if diagonal exists */ 382316a2bf60SHong Zhang ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 382416a2bf60SHong Zhang nzi += nlnk; 382516a2bf60SHong Zhang 382616a2bf60SHong Zhang /* make sure diagonal entry is included */ 382716a2bf60SHong Zhang if (diagonal_fill && lnk[i] == -1) { 382816a2bf60SHong Zhang fm = n; 382916a2bf60SHong Zhang while (lnk[fm] < i) fm = lnk[fm]; 383016a2bf60SHong Zhang lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 383116a2bf60SHong Zhang lnk[fm] = i; 383216a2bf60SHong Zhang lnk_lvl[i] = 0; 383316a2bf60SHong Zhang nzi++; dcount++; 383416a2bf60SHong Zhang } 383516a2bf60SHong Zhang 383616a2bf60SHong Zhang /* add pivot rows into the active row */ 383716a2bf60SHong Zhang nzbd = 0; 383816a2bf60SHong Zhang prow = lnk[n]; 383916a2bf60SHong Zhang while (prow < i) { 384016a2bf60SHong Zhang nnz = bdiag[prow]; 384116a2bf60SHong Zhang cols = bj_ptr[prow] + nnz + 1; 384216a2bf60SHong Zhang cols_lvl = bjlvl_ptr[prow] + nnz + 1; 384316a2bf60SHong Zhang nnz = bi[prow+1] - bi[prow] - nnz - 1; 384416a2bf60SHong Zhang ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 384516a2bf60SHong Zhang nzi += nlnk; 384616a2bf60SHong Zhang prow = lnk[prow]; 384716a2bf60SHong Zhang nzbd++; 384816a2bf60SHong Zhang } 384916a2bf60SHong Zhang bdiag[i] = nzbd; 385016a2bf60SHong Zhang bi[i+1] = bi[i] + nzi; 385116a2bf60SHong Zhang 385216a2bf60SHong Zhang /* if free space is not available, make more free space */ 385316a2bf60SHong Zhang if (current_space->local_remaining<nzi) { 385416a2bf60SHong Zhang nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 385516a2bf60SHong Zhang ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 
385616a2bf60SHong Zhang ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 385716a2bf60SHong Zhang reallocs++; 385816a2bf60SHong Zhang } 385916a2bf60SHong Zhang 386016a2bf60SHong Zhang /* copy data into free_space and free_space_lvl, then initialize lnk */ 386116a2bf60SHong Zhang ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 386216a2bf60SHong Zhang bj_ptr[i] = current_space->array; 386316a2bf60SHong Zhang bjlvl_ptr[i] = current_space_lvl->array; 386416a2bf60SHong Zhang 386516a2bf60SHong Zhang /* make sure the active row i has diagonal entry */ 386616a2bf60SHong Zhang if (*(bj_ptr[i]+bdiag[i]) != i) { 386716a2bf60SHong Zhang SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 386816a2bf60SHong Zhang try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 386916a2bf60SHong Zhang } 387016a2bf60SHong Zhang 387116a2bf60SHong Zhang current_space->array += nzi; 387216a2bf60SHong Zhang current_space->local_used += nzi; 387316a2bf60SHong Zhang current_space->local_remaining -= nzi; 387416a2bf60SHong Zhang current_space_lvl->array += nzi; 387516a2bf60SHong Zhang current_space_lvl->local_used += nzi; 387616a2bf60SHong Zhang current_space_lvl->local_remaining -= nzi; 387716a2bf60SHong Zhang } 387816a2bf60SHong Zhang 387916a2bf60SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 388016a2bf60SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 388116a2bf60SHong Zhang 388216a2bf60SHong Zhang /* destroy list of free space and other temporary arrays */ 388316a2bf60SHong Zhang ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 388416a2bf60SHong Zhang 388516a2bf60SHong Zhang /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 388616a2bf60SHong Zhang ierr = PetscFreeSpaceContiguous_newdatastruct(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 388716a2bf60SHong Zhang 
388816a2bf60SHong Zhang ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 388916a2bf60SHong Zhang ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 389016a2bf60SHong Zhang ierr = PetscFree(bj_ptr);CHKERRQ(ierr); 389116a2bf60SHong Zhang 389216a2bf60SHong Zhang #if defined(PETSC_USE_INFO) 389316a2bf60SHong Zhang { 389416a2bf60SHong Zhang PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 389516a2bf60SHong Zhang ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 389616a2bf60SHong Zhang ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 389716a2bf60SHong Zhang ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 389816a2bf60SHong Zhang ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 389916a2bf60SHong Zhang if (diagonal_fill) { 390016a2bf60SHong Zhang ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 390116a2bf60SHong Zhang } 390216a2bf60SHong Zhang } 390316a2bf60SHong Zhang #endif 390416a2bf60SHong Zhang 390516a2bf60SHong Zhang /* put together the new matrix */ 390616a2bf60SHong Zhang ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 390716a2bf60SHong Zhang ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 390816a2bf60SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 390916a2bf60SHong Zhang b->free_a = PETSC_TRUE; 391016a2bf60SHong Zhang b->free_ij = PETSC_TRUE; 391116a2bf60SHong Zhang b->singlemalloc = PETSC_FALSE; 3912*7fa3a6a0SHong Zhang ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 391316a2bf60SHong Zhang b->j = bj; 391416a2bf60SHong Zhang b->i = bi; 391516a2bf60SHong Zhang b->diag = bdiag; 391616a2bf60SHong Zhang b->ilen = 0; 391716a2bf60SHong Zhang b->imax = 0; 391816a2bf60SHong Zhang b->row = isrow; 391916a2bf60SHong Zhang b->col = iscol; 392016a2bf60SHong Zhang ierr = 
PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 392116a2bf60SHong Zhang ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 392216a2bf60SHong Zhang b->icol = isicol; 3923*7fa3a6a0SHong Zhang ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 392416a2bf60SHong Zhang /* In b structure: Free imax, ilen, old a, old j. 392516a2bf60SHong Zhang Allocate bdiag, solve_work, new a, new j */ 3926*7fa3a6a0SHong Zhang ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 392716a2bf60SHong Zhang b->maxnz = b->nz = bi[2*n+1] ; 392816a2bf60SHong Zhang (fact)->info.factor_mallocs = reallocs; 392916a2bf60SHong Zhang (fact)->info.fill_ratio_given = f; 393016a2bf60SHong Zhang (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]); 393116a2bf60SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 3932*7fa3a6a0SHong Zhang /* set MatSolve routines */ 3933*7fa3a6a0SHong Zhang if (both_identity){ 3934*7fa3a6a0SHong Zhang switch (bs){ 3935*7fa3a6a0SHong Zhang case 2: 3936*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 3937*7fa3a6a0SHong Zhang break; 3938*7fa3a6a0SHong Zhang case 3: 3939*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct; 3940*7fa3a6a0SHong Zhang break; 3941*7fa3a6a0SHong Zhang case 4: 3942*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct; 3943*7fa3a6a0SHong Zhang break; 3944*7fa3a6a0SHong Zhang case 5: 3945*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 3946*7fa3a6a0SHong Zhang break; 3947*7fa3a6a0SHong Zhang case 6: 3948*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct; 3949*7fa3a6a0SHong Zhang break; 3950*7fa3a6a0SHong Zhang case 7: 3951*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct; 
3952*7fa3a6a0SHong Zhang break; 3953*7fa3a6a0SHong Zhang default: 3954*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 3955*7fa3a6a0SHong Zhang break; 3956*7fa3a6a0SHong Zhang } 3957*7fa3a6a0SHong Zhang } else { 3958*7fa3a6a0SHong Zhang switch (bs){ 3959*7fa3a6a0SHong Zhang /* not implemented yet! 3960*7fa3a6a0SHong Zhang case 2: 3961*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct; 3962*7fa3a6a0SHong Zhang break; 3963*7fa3a6a0SHong Zhang case 3: 3964*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct; 3965*7fa3a6a0SHong Zhang break; 3966*7fa3a6a0SHong Zhang case 4: 3967*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct; 3968*7fa3a6a0SHong Zhang break; 3969*7fa3a6a0SHong Zhang case 5: 3970*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct; 3971*7fa3a6a0SHong Zhang break; 3972*7fa3a6a0SHong Zhang case 6: 3973*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct; 3974*7fa3a6a0SHong Zhang break; 3975*7fa3a6a0SHong Zhang case 7: 3976*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct; 3977*7fa3a6a0SHong Zhang break; 3978*7fa3a6a0SHong Zhang default: 3979*7fa3a6a0SHong Zhang fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 3980*7fa3a6a0SHong Zhang break; 3981*7fa3a6a0SHong Zhang */ 3982*7fa3a6a0SHong Zhang } 3983*7fa3a6a0SHong Zhang } 398416a2bf60SHong Zhang PetscFunctionReturn(0); 398516a2bf60SHong Zhang } 398616a2bf60SHong Zhang 39874e2b4712SSatish Balay /* 39884e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 39894e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 39904e2b4712SSatish Balay Not a good example of code reuse. 
39914e2b4712SSatish Balay */ 39924a2ae208SSatish Balay #undef __FUNCT__ 39934a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 39940481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 39954e2b4712SSatish Balay { 39964e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 39974e2b4712SSatish Balay IS isicol; 39986849ba73SBarry Smith PetscErrorCode ierr; 39995d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 40005d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 4001a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 4002d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 400341df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 4004329f5518SBarry Smith PetscReal f; 400516a2bf60SHong Zhang PetscTruth newdatastruct=PETSC_FALSE; 40064e2b4712SSatish Balay 40074e2b4712SSatish Balay PetscFunctionBegin; 400816a2bf60SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 400916a2bf60SHong Zhang if (newdatastruct){ 401016a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 401116a2bf60SHong Zhang PetscFunctionReturn(0); 401216a2bf60SHong Zhang } 401316a2bf60SHong Zhang 40146bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 40156bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 40166bce7ff8SHong Zhang 4017435faa5fSBarry Smith f = info->fill; 4018690b6cddSBarry Smith levels = (PetscInt)info->levels; 4019690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 40204c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 402116a2bf60SHong Zhang 4022667159a5SBarry Smith 
ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4023667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 40247d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 4025309c388cSBarry Smith 402641df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 402716a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 40286bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 40296bce7ff8SHong Zhang 4030719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 4031719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 4032bb3d539aSBarry Smith b->row = isrow; 4033bb3d539aSBarry Smith b->col = iscol; 4034bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4035bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4036bb3d539aSBarry Smith b->icol = isicol; 4037bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 4038719d5645SBarry Smith ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 40396bce7ff8SHong Zhang PetscFunctionReturn(0); 40406bce7ff8SHong Zhang } 40416bce7ff8SHong Zhang 40426bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 40434e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 40444e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 40454e2b4712SSatish Balay 40464e2b4712SSatish Balay /* get new row pointers */ 4047690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 40484e2b4712SSatish Balay ainew[0] = 0; 40494e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 4050690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 4051690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 40524e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 4053690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 40544e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 4055690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 40564e2b4712SSatish Balay /* im is level for each filled value */ 4057690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 40584e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 4059690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 40604e2b4712SSatish Balay dloc[0] = 0; 40614e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 4062435faa5fSBarry Smith 4063435faa5fSBarry Smith /* copy prow into linked list */ 40644e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 40653b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 40664e2b4712SSatish 
Balay xi = aj + ai[r[prow]]; 40674e2b4712SSatish Balay fill[n] = n; 4068435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 40694e2b4712SSatish Balay while (nz--) { 40704e2b4712SSatish Balay fm = n; 40714e2b4712SSatish Balay idx = ic[*xi++]; 40724e2b4712SSatish Balay do { 40734e2b4712SSatish Balay m = fm; 40744e2b4712SSatish Balay fm = fill[m]; 40754e2b4712SSatish Balay } while (fm < idx); 40764e2b4712SSatish Balay fill[m] = idx; 40774e2b4712SSatish Balay fill[idx] = fm; 40784e2b4712SSatish Balay im[idx] = 0; 40794e2b4712SSatish Balay } 4080435faa5fSBarry Smith 4081435faa5fSBarry Smith /* make sure diagonal entry is included */ 4082435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 4083435faa5fSBarry Smith fm = n; 4084435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 4085435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 4086435faa5fSBarry Smith fill[fm] = prow; 4087435faa5fSBarry Smith im[prow] = 0; 4088435faa5fSBarry Smith nzf++; 4089335d9088SBarry Smith dcount++; 4090435faa5fSBarry Smith } 4091435faa5fSBarry Smith 40924e2b4712SSatish Balay nzi = 0; 40934e2b4712SSatish Balay row = fill[n]; 40944e2b4712SSatish Balay while (row < prow) { 40954e2b4712SSatish Balay incrlev = im[row] + 1; 40964e2b4712SSatish Balay nz = dloc[row]; 4097435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 40984e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 40994e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 41004e2b4712SSatish Balay fm = row; 41014e2b4712SSatish Balay while (nnz-- > 0) { 41024e2b4712SSatish Balay idx = *xi++; 41034e2b4712SSatish Balay if (*flev + incrlev > levels) { 41044e2b4712SSatish Balay flev++; 41054e2b4712SSatish Balay continue; 41064e2b4712SSatish Balay } 41074e2b4712SSatish Balay do { 41084e2b4712SSatish Balay m = fm; 41094e2b4712SSatish Balay fm = fill[m]; 41104e2b4712SSatish Balay } while (fm < idx); 41114e2b4712SSatish Balay if (fm != idx) { 41124e2b4712SSatish 
Balay im[idx] = *flev + incrlev; 41134e2b4712SSatish Balay fill[m] = idx; 41144e2b4712SSatish Balay fill[idx] = fm; 41154e2b4712SSatish Balay fm = idx; 41164e2b4712SSatish Balay nzf++; 4117ecf371e4SBarry Smith } else { 41184e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 41194e2b4712SSatish Balay } 41204e2b4712SSatish Balay flev++; 41214e2b4712SSatish Balay } 41224e2b4712SSatish Balay row = fill[row]; 41234e2b4712SSatish Balay nzi++; 41244e2b4712SSatish Balay } 41254e2b4712SSatish Balay /* copy new filled row into permanent storage */ 41264e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 41274e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 4128ecf371e4SBarry Smith 4129ecf371e4SBarry Smith /* estimate how much additional space we will need */ 4130ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 4131ecf371e4SBarry Smith /* just double the memory each time */ 4132690b6cddSBarry Smith PetscInt maxadd = jmax; 4133ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 41344e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 41354e2b4712SSatish Balay jmax += maxadd; 4136ecf371e4SBarry Smith 4137ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 41385d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 41395d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4140606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 41415d0c19d7SBarry Smith ajnew = xitmp; 41425d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 41435d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4144606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 41455d0c19d7SBarry Smith ajfill = xitmp; 4146eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 41474e2b4712SSatish Balay } 
41485d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 41494e2b4712SSatish Balay flev = ajfill + ainew[prow]; 41504e2b4712SSatish Balay dloc[prow] = nzi; 41514e2b4712SSatish Balay fm = fill[n]; 41524e2b4712SSatish Balay while (nzf--) { 41535d0c19d7SBarry Smith *xitmp++ = fm; 41544e2b4712SSatish Balay *flev++ = im[fm]; 41554e2b4712SSatish Balay fm = fill[fm]; 41564e2b4712SSatish Balay } 4157435faa5fSBarry Smith /* make sure row has diagonal entry */ 4158435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 415977431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 41602401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 4161435faa5fSBarry Smith } 41624e2b4712SSatish Balay } 4163606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 41644e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 41654e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4166606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 4167606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 41684e2b4712SSatish Balay 41696cf91177SBarry Smith #if defined(PETSC_USE_INFO) 41704e2b4712SSatish Balay { 4171329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 4172ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 4173ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 4174ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 4175ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 4176335d9088SBarry Smith if (diagonal_fill) { 4177ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 4178335d9088SBarry Smith } 41794e2b4712SSatish Balay } 418063ba0a88SBarry Smith #endif 
41814e2b4712SSatish Balay 41824e2b4712SSatish Balay /* put together the new matrix */ 4183719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 4184719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 4185719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 4186e6b907acSBarry Smith b->free_a = PETSC_TRUE; 4187e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 41887c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 4189a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 41904e2b4712SSatish Balay b->j = ajnew; 41914e2b4712SSatish Balay b->i = ainew; 41924e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 41934e2b4712SSatish Balay b->diag = dloc; 41944e2b4712SSatish Balay b->ilen = 0; 41954e2b4712SSatish Balay b->imax = 0; 41964e2b4712SSatish Balay b->row = isrow; 41974e2b4712SSatish Balay b->col = iscol; 4198bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4199c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4200c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4201e51c0b9cSSatish Balay b->icol = isicol; 420287828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 42034e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 
42044e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 4205719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 42064e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 42074e2b4712SSatish Balay 4208719d5645SBarry Smith (fact)->info.factor_mallocs = reallocate; 4209719d5645SBarry Smith (fact)->info.fill_ratio_given = f; 4210719d5645SBarry Smith (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 42116bce7ff8SHong Zhang 421241df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 42138661488fSKris Buschelman PetscFunctionReturn(0); 42148661488fSKris Buschelman } 42158661488fSKris Buschelman 4216732ee342SKris Buschelman #undef __FUNCT__ 42177e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 4218dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 42197e7071cdSKris Buschelman { 422012272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 422112272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 42225a9542e3SKris Buschelman PetscFunctionBegin; 42237cf1b8d3SKris Buschelman /* Undo Column scaling */ 42247cf1b8d3SKris Buschelman /* while (nz--) { */ 42257cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 42267cf1b8d3SKris Buschelman /* } */ 4227c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 4228c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 42297cf1b8d3SKris Buschelman PetscFunctionReturn(0); 42307cf1b8d3SKris Buschelman } 42317cf1b8d3SKris Buschelman 42327cf1b8d3SKris Buschelman #undef __FUNCT__ 42337cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 4234dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 42357cf1b8d3SKris Buschelman { 42367cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4237b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 42382aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 42395a9542e3SKris Buschelman PetscFunctionBegin; 42400b9da03eSKris Buschelman /* Is this really necessary? */ 424120235379SKris Buschelman while (nz--) { 42420b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 42437e7071cdSKris Buschelman } 4244c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 42457e7071cdSKris Buschelman PetscFunctionReturn(0); 42467e7071cdSKris Buschelman } 42477e7071cdSKris Buschelman 4248732ee342SKris Buschelman 4249