1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry Smith v = aa 
+ diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + 
diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 118*6929473cSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct" 119*6929473cSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 120*6929473cSShri Abhyankar { 121*6929473cSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122*6929473cSShri Abhyankar PetscErrorCode ierr; 123*6929473cSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 124*6929473cSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 125*6929473cSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 126*6929473cSShri Abhyankar 
MatScalar *aa=a->a,*v; 127*6929473cSShri Abhyankar PetscScalar s1,s2,x1,x2; 128*6929473cSShri Abhyankar PetscScalar *x,*b; 129*6929473cSShri Abhyankar 130*6929473cSShri Abhyankar PetscFunctionBegin; 131*6929473cSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 132*6929473cSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 133*6929473cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 134*6929473cSShri Abhyankar 135*6929473cSShri Abhyankar /* forward solve the U^T */ 136*6929473cSShri Abhyankar idx = 0; 137*6929473cSShri Abhyankar for (i=0; i<n; i++) { 138*6929473cSShri Abhyankar v = aa + bs2*diag[i]; 139*6929473cSShri Abhyankar /* multiply by the inverse of the block diagonal */ 140*6929473cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 141*6929473cSShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 142*6929473cSShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 143*6929473cSShri Abhyankar v -= bs2; 144*6929473cSShri Abhyankar 145*6929473cSShri Abhyankar vi = aj + diag[i] - 1; 146*6929473cSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 147*6929473cSShri Abhyankar for(j=0;j>-nz;j--){ 148*6929473cSShri Abhyankar oidx = bs*vi[j]; 149*6929473cSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2; 150*6929473cSShri Abhyankar x[oidx+1] -= v[2]*s1 + v[3]*s2; 151*6929473cSShri Abhyankar v -= bs2; 152*6929473cSShri Abhyankar } 153*6929473cSShri Abhyankar x[idx] = s1;x[1+idx] = s2; 154*6929473cSShri Abhyankar idx += bs; 155*6929473cSShri Abhyankar } 156*6929473cSShri Abhyankar /* backward solve the L^T */ 157*6929473cSShri Abhyankar for (i=n-1; i>=0; i--){ 158*6929473cSShri Abhyankar v = aa + bs2*ai[i]; 159*6929473cSShri Abhyankar vi = aj + ai[i]; 160*6929473cSShri Abhyankar nz = ai[i+1] - ai[i]; 161*6929473cSShri Abhyankar idt = bs*i; 162*6929473cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 163*6929473cSShri Abhyankar for(j=0;j<nz;j++){ 164*6929473cSShri Abhyankar idx = bs*vi[j]; 165*6929473cSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2; 166*6929473cSShri Abhyankar x[idx+1] -= v[2]*s1 + v[3]*s2; 
167*6929473cSShri Abhyankar v += bs2; 168*6929473cSShri Abhyankar } 169*6929473cSShri Abhyankar } 170*6929473cSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 171*6929473cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 172*6929473cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 173*6929473cSShri Abhyankar PetscFunctionReturn(0); 174*6929473cSShri Abhyankar } 175*6929473cSShri Abhyankar 176*6929473cSShri Abhyankar #undef __FUNCT__ 1774a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 178dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 179f1af5d2fSBarry Smith { 180f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181dfbe8321SBarry Smith PetscErrorCode ierr; 182690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 183690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 184f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18587828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 18687828ca2SBarry Smith PetscScalar *x,*b; 187f1af5d2fSBarry Smith 188f1af5d2fSBarry Smith PetscFunctionBegin; 189ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1901ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192f1af5d2fSBarry Smith 193f1af5d2fSBarry Smith /* forward solve the U^T */ 194f1af5d2fSBarry Smith idx = 0; 195f1af5d2fSBarry Smith for (i=0; i<n; i++) { 196f1af5d2fSBarry Smith 197f1af5d2fSBarry Smith v = aa + 9*diag[i]; 198f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 199ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 200f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 201f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 202f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 203f1af5d2fSBarry Smith v += 9; 204f1af5d2fSBarry Smith 205f1af5d2fSBarry Smith vi = aj + diag[i] + 
1; 206f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 207f1af5d2fSBarry Smith while (nz--) { 208f1af5d2fSBarry Smith oidx = 3*(*vi++); 209f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 210f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 211f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 212f1af5d2fSBarry Smith v += 9; 213f1af5d2fSBarry Smith } 214f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 215f1af5d2fSBarry Smith idx += 3; 216f1af5d2fSBarry Smith } 217f1af5d2fSBarry Smith /* backward solve the L^T */ 218f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 219f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 220f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 221f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 222f1af5d2fSBarry Smith idt = 3*i; 223f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 224f1af5d2fSBarry Smith while (nz--) { 225f1af5d2fSBarry Smith idx = 3*(*vi--); 226f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 227f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 228f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 229f1af5d2fSBarry Smith v -= 9; 230f1af5d2fSBarry Smith } 231f1af5d2fSBarry Smith } 2321ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 235f1af5d2fSBarry Smith PetscFunctionReturn(0); 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith 2384a2ae208SSatish Balay #undef __FUNCT__ 2394a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 240dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 241f1af5d2fSBarry Smith { 242f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243dfbe8321SBarry Smith PetscErrorCode ierr; 244690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 
245690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 246f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 24787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 24887828ca2SBarry Smith PetscScalar *x,*b; 249f1af5d2fSBarry Smith 250f1af5d2fSBarry Smith PetscFunctionBegin; 251ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2521ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2531ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 254f1af5d2fSBarry Smith 255f1af5d2fSBarry Smith /* forward solve the U^T */ 256f1af5d2fSBarry Smith idx = 0; 257f1af5d2fSBarry Smith for (i=0; i<n; i++) { 258f1af5d2fSBarry Smith 259f1af5d2fSBarry Smith v = aa + 16*diag[i]; 260f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 261ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 262f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 263f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 264f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 265f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 266f1af5d2fSBarry Smith v += 16; 267f1af5d2fSBarry Smith 268f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 269f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 270f1af5d2fSBarry Smith while (nz--) { 271f1af5d2fSBarry Smith oidx = 4*(*vi++); 272f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 273f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 274f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 275f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 276f1af5d2fSBarry Smith v += 16; 277f1af5d2fSBarry Smith } 278f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 279f1af5d2fSBarry Smith idx += 4; 280f1af5d2fSBarry Smith } 281f1af5d2fSBarry Smith /* backward solve the L^T */ 282f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 283f1af5d2fSBarry Smith v = aa 
+ 16*diag[i] - 16; 284f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 285f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 286f1af5d2fSBarry Smith idt = 4*i; 287f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 288f1af5d2fSBarry Smith while (nz--) { 289f1af5d2fSBarry Smith idx = 4*(*vi--); 290f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 291f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 292f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 293f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 294f1af5d2fSBarry Smith v -= 16; 295f1af5d2fSBarry Smith } 296f1af5d2fSBarry Smith } 2971ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2981ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 299dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 300f1af5d2fSBarry Smith PetscFunctionReturn(0); 301f1af5d2fSBarry Smith } 302f1af5d2fSBarry Smith 3034a2ae208SSatish Balay #undef __FUNCT__ 3044a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 305dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 306f1af5d2fSBarry Smith { 307f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 308dfbe8321SBarry Smith PetscErrorCode ierr; 309690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 310690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 311f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 31287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 31387828ca2SBarry Smith PetscScalar *x,*b; 314f1af5d2fSBarry Smith 315f1af5d2fSBarry Smith PetscFunctionBegin; 316ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3171ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 319f1af5d2fSBarry Smith 320f1af5d2fSBarry Smith /* forward 
solve the U^T */ 321f1af5d2fSBarry Smith idx = 0; 322f1af5d2fSBarry Smith for (i=0; i<n; i++) { 323f1af5d2fSBarry Smith 324f1af5d2fSBarry Smith v = aa + 25*diag[i]; 325f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 326ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 327f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 328f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 329f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 330f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 331f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 332f1af5d2fSBarry Smith v += 25; 333f1af5d2fSBarry Smith 334f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 335f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 336f1af5d2fSBarry Smith while (nz--) { 337f1af5d2fSBarry Smith oidx = 5*(*vi++); 338f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 339f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 340f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 341f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 342f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 343f1af5d2fSBarry Smith v += 25; 344f1af5d2fSBarry Smith } 345f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 346f1af5d2fSBarry Smith idx += 5; 347f1af5d2fSBarry Smith } 348f1af5d2fSBarry Smith /* backward solve the L^T */ 349f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 350f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 351f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 352f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 353f1af5d2fSBarry Smith idt = 5*i; 354f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 
= x[4+idt]; 355f1af5d2fSBarry Smith while (nz--) { 356f1af5d2fSBarry Smith idx = 5*(*vi--); 357f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 358f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 359f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 360f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 361f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 362f1af5d2fSBarry Smith v -= 25; 363f1af5d2fSBarry Smith } 364f1af5d2fSBarry Smith } 3651ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3661ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 367dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 368f1af5d2fSBarry Smith PetscFunctionReturn(0); 369f1af5d2fSBarry Smith } 370f1af5d2fSBarry Smith 3714a2ae208SSatish Balay #undef __FUNCT__ 3724a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 373dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 374f1af5d2fSBarry Smith { 375f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 376dfbe8321SBarry Smith PetscErrorCode ierr; 377690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 378690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 379f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 38087828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 38187828ca2SBarry Smith PetscScalar *x,*b; 382f1af5d2fSBarry Smith 383f1af5d2fSBarry Smith PetscFunctionBegin; 384ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3851ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3861ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 387f1af5d2fSBarry Smith 388f1af5d2fSBarry Smith /* forward solve the U^T */ 389f1af5d2fSBarry Smith idx = 0; 390f1af5d2fSBarry Smith 
for (i=0; i<n; i++) { 391f1af5d2fSBarry Smith 392f1af5d2fSBarry Smith v = aa + 36*diag[i]; 393f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 394ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 395ef66eb69SBarry Smith x6 = x[5+idx]; 396f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 397f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 398f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 399f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 400f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 401f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 402f1af5d2fSBarry Smith v += 36; 403f1af5d2fSBarry Smith 404f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 405f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 406f1af5d2fSBarry Smith while (nz--) { 407f1af5d2fSBarry Smith oidx = 6*(*vi++); 408f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 409f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 410f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 411f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 412f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 413f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 414f1af5d2fSBarry Smith v += 36; 415f1af5d2fSBarry Smith } 416f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 417f1af5d2fSBarry Smith x[5+idx] = s6; 418f1af5d2fSBarry Smith idx += 6; 419f1af5d2fSBarry Smith } 420f1af5d2fSBarry Smith /* backward solve the L^T */ 
421f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 422f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 423f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 424f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 425f1af5d2fSBarry Smith idt = 6*i; 426f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 427f1af5d2fSBarry Smith s6 = x[5+idt]; 428f1af5d2fSBarry Smith while (nz--) { 429f1af5d2fSBarry Smith idx = 6*(*vi--); 430f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 431f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 432f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 433f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 434f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 435f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 436f1af5d2fSBarry Smith v -= 36; 437f1af5d2fSBarry Smith } 438f1af5d2fSBarry Smith } 4391ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4401ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 441dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 442f1af5d2fSBarry Smith PetscFunctionReturn(0); 443f1af5d2fSBarry Smith } 444f1af5d2fSBarry Smith 4454a2ae208SSatish Balay #undef __FUNCT__ 4464a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 447dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 448f1af5d2fSBarry Smith { 449f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 450dfbe8321SBarry Smith PetscErrorCode ierr; 451690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 452690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 453f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 
45487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 45587828ca2SBarry Smith PetscScalar *x,*b; 456f1af5d2fSBarry Smith 457f1af5d2fSBarry Smith PetscFunctionBegin; 458ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4591ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4601ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 461f1af5d2fSBarry Smith 462f1af5d2fSBarry Smith /* forward solve the U^T */ 463f1af5d2fSBarry Smith idx = 0; 464f1af5d2fSBarry Smith for (i=0; i<n; i++) { 465f1af5d2fSBarry Smith 466f1af5d2fSBarry Smith v = aa + 49*diag[i]; 467f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 468ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 469ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 470f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 471f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 472f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 473f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 474f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 475f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 476f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 477f1af5d2fSBarry Smith v += 49; 478f1af5d2fSBarry Smith 479f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 480f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 481f1af5d2fSBarry Smith while (nz--) { 482f1af5d2fSBarry Smith oidx = 7*(*vi++); 483f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 484f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + 
v[13]*s7; 485f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 486f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 487f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 488f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 489f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 490f1af5d2fSBarry Smith v += 49; 491f1af5d2fSBarry Smith } 492f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 493f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 494f1af5d2fSBarry Smith idx += 7; 495f1af5d2fSBarry Smith } 496f1af5d2fSBarry Smith /* backward solve the L^T */ 497f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 498f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 499f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 500f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 501f1af5d2fSBarry Smith idt = 7*i; 502f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 503f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 504f1af5d2fSBarry Smith while (nz--) { 505f1af5d2fSBarry Smith idx = 7*(*vi--); 506f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 507f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 508f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 509f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 510f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 511f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + 
v[41]*s7; 512f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 513f1af5d2fSBarry Smith v -= 49; 514f1af5d2fSBarry Smith } 515f1af5d2fSBarry Smith } 5161ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5171ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 518dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 519f1af5d2fSBarry Smith PetscFunctionReturn(0); 520f1af5d2fSBarry Smith } 521f1af5d2fSBarry Smith 522f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 5234a2ae208SSatish Balay #undef __FUNCT__ 5244a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 525dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 526f1af5d2fSBarry Smith { 527f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 528f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5296849ba73SBarry Smith PetscErrorCode ierr; 5305d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5315d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 532690b6cddSBarry Smith PetscInt *diag = a->diag; 533f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53487828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 535f1af5d2fSBarry Smith 536f1af5d2fSBarry Smith PetscFunctionBegin; 5371ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5381ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 539f1af5d2fSBarry Smith t = a->solve_work; 540f1af5d2fSBarry Smith 541f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 542f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 543f1af5d2fSBarry Smith 544f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 545f1af5d2fSBarry Smith for (i=0; i<n; i++) { 546f1af5d2fSBarry Smith t[i] = b[c[i]]; 547f1af5d2fSBarry Smith } 548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith /* forward solve the U^T */ 550f1af5d2fSBarry Smith for (i=0; i<n; i++) { 551f1af5d2fSBarry Smith 552f1af5d2fSBarry Smith v = aa + diag[i]; 553f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 554f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 555f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 556f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 557f1af5d2fSBarry Smith while (nz--) { 558f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 559f1af5d2fSBarry Smith } 560f1af5d2fSBarry Smith t[i] = s1; 561f1af5d2fSBarry Smith } 562f1af5d2fSBarry Smith /* backward solve the L^T */ 563f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 564f1af5d2fSBarry Smith v = aa + diag[i] - 1; 565f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 566f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 567f1af5d2fSBarry Smith s1 = t[i]; 568f1af5d2fSBarry Smith while (nz--) { 569f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 570f1af5d2fSBarry Smith } 571f1af5d2fSBarry Smith } 572f1af5d2fSBarry Smith 573f1af5d2fSBarry Smith /* copy t into x according to permutation */ 574f1af5d2fSBarry Smith for (i=0; i<n; i++) { 575f1af5d2fSBarry Smith x[r[i]] = t[i]; 576f1af5d2fSBarry Smith } 577f1af5d2fSBarry Smith 578f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 579f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5801ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 582dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 583f1af5d2fSBarry Smith PetscFunctionReturn(0); 584f1af5d2fSBarry Smith } 585f1af5d2fSBarry Smith 5864a2ae208SSatish Balay #undef __FUNCT__ 5874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 588dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 589f1af5d2fSBarry Smith { 590f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 591f1af5d2fSBarry Smith IS 
iscol=a->col,isrow=a->row; 5926849ba73SBarry Smith PetscErrorCode ierr; 5935d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5945d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 595690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 596f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 59787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 59887828ca2SBarry Smith PetscScalar *x,*b,*t; 599f1af5d2fSBarry Smith 600f1af5d2fSBarry Smith PetscFunctionBegin; 6011ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6021ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 603f1af5d2fSBarry Smith t = a->solve_work; 604f1af5d2fSBarry Smith 605f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 606f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 607f1af5d2fSBarry Smith 608f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 609f1af5d2fSBarry Smith ii = 0; 610f1af5d2fSBarry Smith for (i=0; i<n; i++) { 611f1af5d2fSBarry Smith ic = 2*c[i]; 612f1af5d2fSBarry Smith t[ii] = b[ic]; 613f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 614f1af5d2fSBarry Smith ii += 2; 615f1af5d2fSBarry Smith } 616f1af5d2fSBarry Smith 617f1af5d2fSBarry Smith /* forward solve the U^T */ 618f1af5d2fSBarry Smith idx = 0; 619f1af5d2fSBarry Smith for (i=0; i<n; i++) { 620f1af5d2fSBarry Smith 621f1af5d2fSBarry Smith v = aa + 4*diag[i]; 622f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 623f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 624f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 625f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 626f1af5d2fSBarry Smith v += 4; 627f1af5d2fSBarry Smith 628f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 629f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 630f1af5d2fSBarry Smith while (nz--) { 631f1af5d2fSBarry Smith oidx = 2*(*vi++); 632f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 633f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + 
v[3]*s2; 634f1af5d2fSBarry Smith v += 4; 635f1af5d2fSBarry Smith } 636f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 637f1af5d2fSBarry Smith idx += 2; 638f1af5d2fSBarry Smith } 639f1af5d2fSBarry Smith /* backward solve the L^T */ 640f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 641f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 642f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 643f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 644f1af5d2fSBarry Smith idt = 2*i; 645f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 646f1af5d2fSBarry Smith while (nz--) { 647f1af5d2fSBarry Smith idx = 2*(*vi--); 648f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 649f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 650f1af5d2fSBarry Smith v -= 4; 651f1af5d2fSBarry Smith } 652f1af5d2fSBarry Smith } 653f1af5d2fSBarry Smith 654f1af5d2fSBarry Smith /* copy t into x according to permutation */ 655f1af5d2fSBarry Smith ii = 0; 656f1af5d2fSBarry Smith for (i=0; i<n; i++) { 657f1af5d2fSBarry Smith ir = 2*r[i]; 658f1af5d2fSBarry Smith x[ir] = t[ii]; 659f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 660f1af5d2fSBarry Smith ii += 2; 661f1af5d2fSBarry Smith } 662f1af5d2fSBarry Smith 663f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 664f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6651ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6661ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 667dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 668f1af5d2fSBarry Smith PetscFunctionReturn(0); 669f1af5d2fSBarry Smith } 670f1af5d2fSBarry Smith 6714a2ae208SSatish Balay #undef __FUNCT__ 6724a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 673dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 674f1af5d2fSBarry Smith { 675f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 676f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6776849ba73SBarry 
Smith PetscErrorCode ierr; 6785d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 6795d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 680690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 681f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 68287828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 68387828ca2SBarry Smith PetscScalar *x,*b,*t; 684f1af5d2fSBarry Smith 685f1af5d2fSBarry Smith PetscFunctionBegin; 6861ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6871ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 688f1af5d2fSBarry Smith t = a->solve_work; 689f1af5d2fSBarry Smith 690f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 691f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 692f1af5d2fSBarry Smith 693f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 694f1af5d2fSBarry Smith ii = 0; 695f1af5d2fSBarry Smith for (i=0; i<n; i++) { 696f1af5d2fSBarry Smith ic = 3*c[i]; 697f1af5d2fSBarry Smith t[ii] = b[ic]; 698f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 699f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 700f1af5d2fSBarry Smith ii += 3; 701f1af5d2fSBarry Smith } 702f1af5d2fSBarry Smith 703f1af5d2fSBarry Smith /* forward solve the U^T */ 704f1af5d2fSBarry Smith idx = 0; 705f1af5d2fSBarry Smith for (i=0; i<n; i++) { 706f1af5d2fSBarry Smith 707f1af5d2fSBarry Smith v = aa + 9*diag[i]; 708f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 709f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 710f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 711f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 712f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 713f1af5d2fSBarry Smith v += 9; 714f1af5d2fSBarry Smith 715f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 716f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 717f1af5d2fSBarry Smith while (nz--) { 718f1af5d2fSBarry Smith oidx = 3*(*vi++); 
719f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 720f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 721f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 722f1af5d2fSBarry Smith v += 9; 723f1af5d2fSBarry Smith } 724f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 725f1af5d2fSBarry Smith idx += 3; 726f1af5d2fSBarry Smith } 727f1af5d2fSBarry Smith /* backward solve the L^T */ 728f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 729f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 730f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 731f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 732f1af5d2fSBarry Smith idt = 3*i; 733f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 734f1af5d2fSBarry Smith while (nz--) { 735f1af5d2fSBarry Smith idx = 3*(*vi--); 736f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 737f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 738f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 739f1af5d2fSBarry Smith v -= 9; 740f1af5d2fSBarry Smith } 741f1af5d2fSBarry Smith } 742f1af5d2fSBarry Smith 743f1af5d2fSBarry Smith /* copy t into x according to permutation */ 744f1af5d2fSBarry Smith ii = 0; 745f1af5d2fSBarry Smith for (i=0; i<n; i++) { 746f1af5d2fSBarry Smith ir = 3*r[i]; 747f1af5d2fSBarry Smith x[ir] = t[ii]; 748f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 749f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 750f1af5d2fSBarry Smith ii += 3; 751f1af5d2fSBarry Smith } 752f1af5d2fSBarry Smith 753f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 754f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7551ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7561ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 757dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 758f1af5d2fSBarry Smith PetscFunctionReturn(0); 759f1af5d2fSBarry Smith } 760f1af5d2fSBarry Smith 7614a2ae208SSatish 
Balay #undef __FUNCT__ 7624a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 763dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 764f1af5d2fSBarry Smith { 765f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 766f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7676849ba73SBarry Smith PetscErrorCode ierr; 7685d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 7695d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 770690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 771f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 77287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 77387828ca2SBarry Smith PetscScalar *x,*b,*t; 774f1af5d2fSBarry Smith 775f1af5d2fSBarry Smith PetscFunctionBegin; 7761ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7771ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 778f1af5d2fSBarry Smith t = a->solve_work; 779f1af5d2fSBarry Smith 780f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 781f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 782f1af5d2fSBarry Smith 783f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 784f1af5d2fSBarry Smith ii = 0; 785f1af5d2fSBarry Smith for (i=0; i<n; i++) { 786f1af5d2fSBarry Smith ic = 4*c[i]; 787f1af5d2fSBarry Smith t[ii] = b[ic]; 788f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 789f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 790f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 791f1af5d2fSBarry Smith ii += 4; 792f1af5d2fSBarry Smith } 793f1af5d2fSBarry Smith 794f1af5d2fSBarry Smith /* forward solve the U^T */ 795f1af5d2fSBarry Smith idx = 0; 796f1af5d2fSBarry Smith for (i=0; i<n; i++) { 797f1af5d2fSBarry Smith 798f1af5d2fSBarry Smith v = aa + 16*diag[i]; 799f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 800f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 
801f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 802f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 803f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 804f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 805f1af5d2fSBarry Smith v += 16; 806f1af5d2fSBarry Smith 807f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 808f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 809f1af5d2fSBarry Smith while (nz--) { 810f1af5d2fSBarry Smith oidx = 4*(*vi++); 811f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 812f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 813f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 814f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 815f1af5d2fSBarry Smith v += 16; 816f1af5d2fSBarry Smith } 817f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 818f1af5d2fSBarry Smith idx += 4; 819f1af5d2fSBarry Smith } 820f1af5d2fSBarry Smith /* backward solve the L^T */ 821f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 822f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 823f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 824f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 825f1af5d2fSBarry Smith idt = 4*i; 826f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 827f1af5d2fSBarry Smith while (nz--) { 828f1af5d2fSBarry Smith idx = 4*(*vi--); 829f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 830f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 831f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 832f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 833f1af5d2fSBarry Smith v -= 16; 834f1af5d2fSBarry Smith } 835f1af5d2fSBarry Smith } 836f1af5d2fSBarry Smith 837f1af5d2fSBarry Smith /* copy t into x according to permutation */ 838f1af5d2fSBarry Smith ii = 0; 839f1af5d2fSBarry Smith 
for (i=0; i<n; i++) { 840f1af5d2fSBarry Smith ir = 4*r[i]; 841f1af5d2fSBarry Smith x[ir] = t[ii]; 842f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 843f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 844f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 845f1af5d2fSBarry Smith ii += 4; 846f1af5d2fSBarry Smith } 847f1af5d2fSBarry Smith 848f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 849f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8501ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8511ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 852dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 853f1af5d2fSBarry Smith PetscFunctionReturn(0); 854f1af5d2fSBarry Smith } 855f1af5d2fSBarry Smith 8564a2ae208SSatish Balay #undef __FUNCT__ 8574a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 858dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 859f1af5d2fSBarry Smith { 860f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 861f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 8626849ba73SBarry Smith PetscErrorCode ierr; 8635d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8645d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 865690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 866f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 86787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 86887828ca2SBarry Smith PetscScalar *x,*b,*t; 869f1af5d2fSBarry Smith 870f1af5d2fSBarry Smith PetscFunctionBegin; 8711ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8721ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 873f1af5d2fSBarry Smith t = a->solve_work; 874f1af5d2fSBarry Smith 875f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 876f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 877f1af5d2fSBarry Smith 
878f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 879f1af5d2fSBarry Smith ii = 0; 880f1af5d2fSBarry Smith for (i=0; i<n; i++) { 881f1af5d2fSBarry Smith ic = 5*c[i]; 882f1af5d2fSBarry Smith t[ii] = b[ic]; 883f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 884f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 885f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 886f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 887f1af5d2fSBarry Smith ii += 5; 888f1af5d2fSBarry Smith } 889f1af5d2fSBarry Smith 890f1af5d2fSBarry Smith /* forward solve the U^T */ 891f1af5d2fSBarry Smith idx = 0; 892f1af5d2fSBarry Smith for (i=0; i<n; i++) { 893f1af5d2fSBarry Smith 894f1af5d2fSBarry Smith v = aa + 25*diag[i]; 895f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 896f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 897f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 898f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 899f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 900f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 901f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 902f1af5d2fSBarry Smith v += 25; 903f1af5d2fSBarry Smith 904f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 905f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 906f1af5d2fSBarry Smith while (nz--) { 907f1af5d2fSBarry Smith oidx = 5*(*vi++); 908f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 909f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 910f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 911f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 912f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 913f1af5d2fSBarry Smith v += 25; 
914f1af5d2fSBarry Smith } 915f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 916f1af5d2fSBarry Smith idx += 5; 917f1af5d2fSBarry Smith } 918f1af5d2fSBarry Smith /* backward solve the L^T */ 919f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 920f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 921f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 922f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 923f1af5d2fSBarry Smith idt = 5*i; 924f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 925f1af5d2fSBarry Smith while (nz--) { 926f1af5d2fSBarry Smith idx = 5*(*vi--); 927f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 928f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 929f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 930f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 931f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 932f1af5d2fSBarry Smith v -= 25; 933f1af5d2fSBarry Smith } 934f1af5d2fSBarry Smith } 935f1af5d2fSBarry Smith 936f1af5d2fSBarry Smith /* copy t into x according to permutation */ 937f1af5d2fSBarry Smith ii = 0; 938f1af5d2fSBarry Smith for (i=0; i<n; i++) { 939f1af5d2fSBarry Smith ir = 5*r[i]; 940f1af5d2fSBarry Smith x[ir] = t[ii]; 941f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 942f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 943f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 944f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 945f1af5d2fSBarry Smith ii += 5; 946f1af5d2fSBarry Smith } 947f1af5d2fSBarry Smith 948f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 949f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9501ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 9511ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 952dc0b31edSSatish Balay ierr = 
PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 953f1af5d2fSBarry Smith PetscFunctionReturn(0); 954f1af5d2fSBarry Smith } 955f1af5d2fSBarry Smith 9564a2ae208SSatish Balay #undef __FUNCT__ 9574a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 958dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 959f1af5d2fSBarry Smith { 960f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 961f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9626849ba73SBarry Smith PetscErrorCode ierr; 9635d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9645d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 965690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 966f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 96787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 96887828ca2SBarry Smith PetscScalar *x,*b,*t; 969f1af5d2fSBarry Smith 970f1af5d2fSBarry Smith PetscFunctionBegin; 9711ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9721ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 973f1af5d2fSBarry Smith t = a->solve_work; 974f1af5d2fSBarry Smith 975f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 976f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 977f1af5d2fSBarry Smith 978f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 979f1af5d2fSBarry Smith ii = 0; 980f1af5d2fSBarry Smith for (i=0; i<n; i++) { 981f1af5d2fSBarry Smith ic = 6*c[i]; 982f1af5d2fSBarry Smith t[ii] = b[ic]; 983f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 984f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 985f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 986f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 987f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 988f1af5d2fSBarry Smith ii += 6; 989f1af5d2fSBarry Smith } 990f1af5d2fSBarry Smith 991f1af5d2fSBarry Smith /* forward solve the U^T */ 992f1af5d2fSBarry Smith idx = 0; 
993f1af5d2fSBarry Smith for (i=0; i<n; i++) { 994f1af5d2fSBarry Smith 995f1af5d2fSBarry Smith v = aa + 36*diag[i]; 996f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 997f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 998f1af5d2fSBarry Smith x6 = t[5+idx]; 999f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1000f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1001f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1002f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1003f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1004f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1005f1af5d2fSBarry Smith v += 36; 1006f1af5d2fSBarry Smith 1007f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1008f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1009f1af5d2fSBarry Smith while (nz--) { 1010f1af5d2fSBarry Smith oidx = 6*(*vi++); 1011f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1012f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1013f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1014f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1015f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1016f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1017f1af5d2fSBarry Smith v += 36; 1018f1af5d2fSBarry Smith } 1019f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1020f1af5d2fSBarry Smith t[5+idx] = s6; 1021f1af5d2fSBarry Smith idx += 6; 1022f1af5d2fSBarry Smith } 
1023f1af5d2fSBarry Smith /* backward solve the L^T */ 1024f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1025f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 1026f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1027f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1028f1af5d2fSBarry Smith idt = 6*i; 1029f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1030f1af5d2fSBarry Smith s6 = t[5+idt]; 1031f1af5d2fSBarry Smith while (nz--) { 1032f1af5d2fSBarry Smith idx = 6*(*vi--); 1033f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1034f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1035f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1036f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1037f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1038f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1039f1af5d2fSBarry Smith v -= 36; 1040f1af5d2fSBarry Smith } 1041f1af5d2fSBarry Smith } 1042f1af5d2fSBarry Smith 1043f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1044f1af5d2fSBarry Smith ii = 0; 1045f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1046f1af5d2fSBarry Smith ir = 6*r[i]; 1047f1af5d2fSBarry Smith x[ir] = t[ii]; 1048f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1049f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1050f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1051f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1052f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1053f1af5d2fSBarry Smith ii += 6; 1054f1af5d2fSBarry Smith } 1055f1af5d2fSBarry Smith 1056f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1057f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 10581ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10591ebc52fbSHong Zhang 
ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1060dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1061f1af5d2fSBarry Smith PetscFunctionReturn(0); 1062f1af5d2fSBarry Smith } 1063f1af5d2fSBarry Smith 10644a2ae208SSatish Balay #undef __FUNCT__ 10654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1066dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1067f1af5d2fSBarry Smith { 1068f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1069f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10706849ba73SBarry Smith PetscErrorCode ierr; 10715d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10725d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1073690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1074f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 107587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 107687828ca2SBarry Smith PetscScalar *x,*b,*t; 1077f1af5d2fSBarry Smith 1078f1af5d2fSBarry Smith PetscFunctionBegin; 10791ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10801ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1081f1af5d2fSBarry Smith t = a->solve_work; 1082f1af5d2fSBarry Smith 1083f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1084f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1085f1af5d2fSBarry Smith 1086f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1087f1af5d2fSBarry Smith ii = 0; 1088f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1089f1af5d2fSBarry Smith ic = 7*c[i]; 1090f1af5d2fSBarry Smith t[ii] = b[ic]; 1091f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1092f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1093f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1094f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1095f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1096f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 
1097f1af5d2fSBarry Smith ii += 7; 1098f1af5d2fSBarry Smith } 1099f1af5d2fSBarry Smith 1100f1af5d2fSBarry Smith /* forward solve the U^T */ 1101f1af5d2fSBarry Smith idx = 0; 1102f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1103f1af5d2fSBarry Smith 1104f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1105f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1106f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1107f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1108f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1109f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1110f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1111f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1112f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1113f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1114f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1115f1af5d2fSBarry Smith v += 49; 1116f1af5d2fSBarry Smith 1117f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1118f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1119f1af5d2fSBarry Smith while (nz--) { 1120f1af5d2fSBarry Smith oidx = 7*(*vi++); 1121f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1122f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1123f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1124f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1125f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + 
v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1126f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1127f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1128f1af5d2fSBarry Smith v += 49; 1129f1af5d2fSBarry Smith } 1130f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1131f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1132f1af5d2fSBarry Smith idx += 7; 1133f1af5d2fSBarry Smith } 1134f1af5d2fSBarry Smith /* backward solve the L^T */ 1135f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1136f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1137f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1138f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1139f1af5d2fSBarry Smith idt = 7*i; 1140f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1141f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1142f1af5d2fSBarry Smith while (nz--) { 1143f1af5d2fSBarry Smith idx = 7*(*vi--); 1144f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1145f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1146f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1147f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1148f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1149f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1150f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1151f1af5d2fSBarry Smith v -= 49; 1152f1af5d2fSBarry Smith } 1153f1af5d2fSBarry Smith } 1154f1af5d2fSBarry Smith 1155f1af5d2fSBarry Smith /* copy t 
into x according to permutation */ 1156f1af5d2fSBarry Smith ii = 0; 1157f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1158f1af5d2fSBarry Smith ir = 7*r[i]; 1159f1af5d2fSBarry Smith x[ir] = t[ii]; 1160f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1161f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1162f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1163f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1164f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1165f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1166f1af5d2fSBarry Smith ii += 7; 1167f1af5d2fSBarry Smith } 1168f1af5d2fSBarry Smith 1169f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1170f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11711ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11721ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1173dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1174f1af5d2fSBarry Smith PetscFunctionReturn(0); 1175f1af5d2fSBarry Smith } 1176f1af5d2fSBarry Smith 11774e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11784a2ae208SSatish Balay #undef __FUNCT__ 11794a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1180dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11814e2b4712SSatish Balay { 11824e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11834e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11846849ba73SBarry Smith PetscErrorCode ierr; 11855d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11865d0c19d7SBarry Smith PetscInt i,n=a->mbs; 11875d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11883f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 118987828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11904e2b4712SSatish Balay 11914e2b4712SSatish Balay PetscFunctionBegin; 11921ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11931ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 1194f1af5d2fSBarry Smith t = a->solve_work; 11954e2b4712SSatish Balay 11964e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11974e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11984e2b4712SSatish Balay 11994e2b4712SSatish Balay /* forward solve the lower triangular */ 120087828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 12014e2b4712SSatish Balay for (i=1; i<n; i++) { 12024e2b4712SSatish Balay v = aa + bs2*ai[i]; 12034e2b4712SSatish Balay vi = aj + ai[i]; 12044e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1205f1af5d2fSBarry Smith s = t + bs*i; 120687828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 12074e2b4712SSatish Balay while (nz--) { 1208f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 12094e2b4712SSatish Balay v += bs2; 12104e2b4712SSatish Balay } 12114e2b4712SSatish Balay } 12124e2b4712SSatish Balay /* backward solve the upper triangular */ 1213d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 12144e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12154e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 12164e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 12174e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 121887828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 12194e2b4712SSatish Balay while (nz--) { 1220f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 12214e2b4712SSatish Balay v += bs2; 12224e2b4712SSatish Balay } 1223f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 122487828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 12254e2b4712SSatish Balay } 12264e2b4712SSatish Balay 12274e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12284e2b4712SSatish Balay ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12291ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12301ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1231dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 12324e2b4712SSatish Balay PetscFunctionReturn(0); 12334e2b4712SSatish Balay } 12344e2b4712SSatish Balay 12355c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 12365c42ef9dSBarry Smith #undef __FUNCT__ 12375c42ef9dSBarry Smith #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 12385c42ef9dSBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 12395c42ef9dSBarry Smith { 12405c42ef9dSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 12415c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 12425c42ef9dSBarry Smith PetscErrorCode ierr; 12435c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 12445c42ef9dSBarry Smith PetscInt i,n=a->mbs,j; 12455c42ef9dSBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 12465c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 12475c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 12485c42ef9dSBarry Smith const PetscScalar *b; 12495c42ef9dSBarry Smith PetscFunctionBegin; 12505c42ef9dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 12515c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 12525c42ef9dSBarry Smith t = a->solve_work; 12535c42ef9dSBarry Smith 12545c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 12555c42ef9dSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 12565c42ef9dSBarry Smith 12575c42ef9dSBarry Smith /* copy the b into temp work space according to permutation */ 12585c42ef9dSBarry Smith for (i=0; i<n; i++) { 12595c42ef9dSBarry Smith for (j=0; j<bs; j++) { 12605c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 12615c42ef9dSBarry Smith } 12625c42ef9dSBarry Smith } 12635c42ef9dSBarry Smith 
12645c42ef9dSBarry Smith 12655c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 12665c42ef9dSBarry Smith ls = a->solve_work + A->cmap->n; 12675c42ef9dSBarry Smith for (i=0; i<n; i++){ 12685c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 12695c42ef9dSBarry Smith Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 12705c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 12715c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 12725c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 12735c42ef9dSBarry Smith while (nz--) { 12745c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 12755c42ef9dSBarry Smith v += bs2; 12765c42ef9dSBarry Smith } 12775c42ef9dSBarry Smith } 12785c42ef9dSBarry Smith 12795c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 12805c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 12815c42ef9dSBarry Smith v = aa + bs2*ai[i]; 12825c42ef9dSBarry Smith vi = aj + ai[i]; 12835c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 12845c42ef9dSBarry Smith while (nz--) { 12855c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 12865c42ef9dSBarry Smith v += bs2; 12875c42ef9dSBarry Smith } 12885c42ef9dSBarry Smith } 12895c42ef9dSBarry Smith 12905c42ef9dSBarry Smith /* copy t into x according to permutation */ 12915c42ef9dSBarry Smith for (i=0; i<n; i++) { 12925c42ef9dSBarry Smith for (j=0; j<bs; j++) { 12935c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 12945c42ef9dSBarry Smith } 12955c42ef9dSBarry Smith } 12965c42ef9dSBarry Smith 12975c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12985c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12995c42ef9dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 13005c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 13015c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - 
A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 13025c42ef9dSBarry Smith PetscFunctionReturn(0); 13035c42ef9dSBarry Smith } 13045c42ef9dSBarry Smith 13054a2ae208SSatish Balay #undef __FUNCT__ 13064a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1307dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 13084e2b4712SSatish Balay { 13094e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 13104e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 13116849ba73SBarry Smith PetscErrorCode ierr; 13125d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 13135d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 13143f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 131587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 131687828ca2SBarry Smith PetscScalar *x,*b,*t; 13174e2b4712SSatish Balay 13184e2b4712SSatish Balay PetscFunctionBegin; 13191ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 13201ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1321f1af5d2fSBarry Smith t = a->solve_work; 13224e2b4712SSatish Balay 13234e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 13244e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 13254e2b4712SSatish Balay 13264e2b4712SSatish Balay /* forward solve the lower triangular */ 13274e2b4712SSatish Balay idx = 7*(*r++); 1328f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1329f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1330f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 13314e2b4712SSatish Balay 13324e2b4712SSatish Balay for (i=1; i<n; i++) { 13334e2b4712SSatish Balay v = aa + 49*ai[i]; 13344e2b4712SSatish Balay vi = aj + ai[i]; 13354e2b4712SSatish Balay nz = diag[i] - ai[i]; 13364e2b4712SSatish Balay idx = 7*(*r++); 1337f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1338f1af5d2fSBarry Smith s5 = 
b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 13394e2b4712SSatish Balay while (nz--) { 13404e2b4712SSatish Balay idx = 7*(*vi++); 1341f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1342f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1343f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1344f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1345f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1346f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1347f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1348f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1349f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1350f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13514e2b4712SSatish Balay v += 49; 13524e2b4712SSatish Balay } 13534e2b4712SSatish Balay idx = 7*i; 1354f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1355f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1356f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 13574e2b4712SSatish Balay } 13584e2b4712SSatish Balay /* backward solve the upper triangular */ 13594e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 13604e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 13614e2b4712SSatish Balay vi = aj + diag[i] + 1; 13624e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 13634e2b4712SSatish Balay idt = 7*i; 1364f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1365f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1366f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 13674e2b4712SSatish Balay while (nz--) { 13684e2b4712SSatish Balay idx = 7*(*vi++); 1369f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1370f1af5d2fSBarry Smith x3 = 
t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1371f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1372f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1373f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1374f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1375f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1376f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1377f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1378f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13794e2b4712SSatish Balay v += 49; 13804e2b4712SSatish Balay } 13814e2b4712SSatish Balay idc = 7*(*c--); 13824e2b4712SSatish Balay v = aa + 49*diag[i]; 1383f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1384f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1385f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1386f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1387f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1388f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1389f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1390f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1391f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1392f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1393f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1394f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1395f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1396f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 13974e2b4712SSatish Balay } 13984e2b4712SSatish Balay 
13994e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 14004e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 14011ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 14021ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1403dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 14044e2b4712SSatish Balay PetscFunctionReturn(0); 14054e2b4712SSatish Balay } 14064e2b4712SSatish Balay 14078f690400SShri Abhyankar #undef __FUNCT__ 1408a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1409a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 141035aa4fcfSShri Abhyankar { 141135aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 141235aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 141335aa4fcfSShri Abhyankar PetscErrorCode ierr; 141435aa4fcfSShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 141535aa4fcfSShri Abhyankar PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 141635aa4fcfSShri Abhyankar MatScalar *aa=a->a,*v; 141735aa4fcfSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 141835aa4fcfSShri Abhyankar PetscScalar *x,*b,*t; 141935aa4fcfSShri Abhyankar 142035aa4fcfSShri Abhyankar PetscFunctionBegin; 142135aa4fcfSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 142235aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 142335aa4fcfSShri Abhyankar t = a->solve_work; 142435aa4fcfSShri Abhyankar 142535aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 142635aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 142735aa4fcfSShri Abhyankar 142835aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 142935aa4fcfSShri Abhyankar idx = 7*r[0]; 143035aa4fcfSShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 143135aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = 
b[4+idx]; 143235aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 143335aa4fcfSShri Abhyankar 143435aa4fcfSShri Abhyankar for (i=1; i<n; i++) { 143535aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 143635aa4fcfSShri Abhyankar vi = aj + ai[i]; 143735aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 143835aa4fcfSShri Abhyankar idx = 7*r[i]; 143935aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 144035aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 144135aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 144235aa4fcfSShri Abhyankar idx = 7*vi[m]; 144335aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 144435aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 144535aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 144635aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 144735aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 144835aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 144935aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 145035aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 145135aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 145235aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 145335aa4fcfSShri Abhyankar v += 49; 145435aa4fcfSShri Abhyankar } 145535aa4fcfSShri Abhyankar idx = 7*i; 145635aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 145735aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 145835aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 145935aa4fcfSShri Abhyankar } 146035aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 146135aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 146235aa4fcfSShri Abhyankar v = aa 
+ 49*(adiag[i+1]+1); 146335aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 146435aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 146535aa4fcfSShri Abhyankar idt = 7*i; 146635aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 146735aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 146835aa4fcfSShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 146935aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 147035aa4fcfSShri Abhyankar idx = 7*vi[m]; 147135aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 147235aa4fcfSShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 147335aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 147435aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 147535aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 147635aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 147735aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 147835aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 147935aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 148035aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 148135aa4fcfSShri Abhyankar v += 49; 148235aa4fcfSShri Abhyankar } 148335aa4fcfSShri Abhyankar idc = 7*c[i]; 148435aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 148535aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 148635aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 148735aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 148835aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 148935aa4fcfSShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 149035aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 
149135aa4fcfSShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 149235aa4fcfSShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 149335aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 149435aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 149535aa4fcfSShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 149635aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 149735aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 149835aa4fcfSShri Abhyankar } 149935aa4fcfSShri Abhyankar 150035aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 150135aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 150235aa4fcfSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 150335aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 150435aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 150535aa4fcfSShri Abhyankar PetscFunctionReturn(0); 150635aa4fcfSShri Abhyankar } 150735aa4fcfSShri Abhyankar 150835aa4fcfSShri Abhyankar #undef __FUNCT__ 15094a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1510dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 151115091d37SBarry Smith { 151215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1513690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1514dfbe8321SBarry Smith PetscErrorCode ierr; 1515690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1516d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1517d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1518d9fead3dSBarry Smith const PetscScalar *b; 151915091d37SBarry Smith 152015091d37SBarry Smith PetscFunctionBegin; 1521d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15221ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 152315091d37SBarry Smith /* forward solve 
the lower triangular */ 152415091d37SBarry Smith idx = 0; 152515091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 152615091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 152715091d37SBarry Smith x[6] = b[6+idx]; 152815091d37SBarry Smith for (i=1; i<n; i++) { 152915091d37SBarry Smith v = aa + 49*ai[i]; 153015091d37SBarry Smith vi = aj + ai[i]; 153115091d37SBarry Smith nz = diag[i] - ai[i]; 153215091d37SBarry Smith idx = 7*i; 1533f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1534f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1535f1af5d2fSBarry Smith s7 = b[6+idx]; 153615091d37SBarry Smith while (nz--) { 153715091d37SBarry Smith jdx = 7*(*vi++); 153815091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 153915091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 154015091d37SBarry Smith x7 = x[6+jdx]; 1541f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1542f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1543f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1544f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1545f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1546f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1547f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 154815091d37SBarry Smith v += 49; 154915091d37SBarry Smith } 1550f1af5d2fSBarry Smith x[idx] = s1; 1551f1af5d2fSBarry Smith x[1+idx] = s2; 1552f1af5d2fSBarry Smith x[2+idx] = s3; 1553f1af5d2fSBarry Smith x[3+idx] = s4; 1554f1af5d2fSBarry Smith x[4+idx] = s5; 1555f1af5d2fSBarry Smith x[5+idx] = s6; 1556f1af5d2fSBarry Smith x[6+idx] = s7; 
155715091d37SBarry Smith } 155815091d37SBarry Smith /* backward solve the upper triangular */ 155915091d37SBarry Smith for (i=n-1; i>=0; i--){ 156015091d37SBarry Smith v = aa + 49*diag[i] + 49; 156115091d37SBarry Smith vi = aj + diag[i] + 1; 156215091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 156315091d37SBarry Smith idt = 7*i; 1564f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1565f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1566f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1567f1af5d2fSBarry Smith s7 = x[6+idt]; 156815091d37SBarry Smith while (nz--) { 156915091d37SBarry Smith idx = 7*(*vi++); 157015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 157115091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 157215091d37SBarry Smith x7 = x[6+idx]; 1573f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1574f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1575f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1576f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1577f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1578f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1579f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 158015091d37SBarry Smith v += 49; 158115091d37SBarry Smith } 158215091d37SBarry Smith v = aa + 49*diag[i]; 1583f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1584f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1585f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1586f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1587f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 
1588f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1589f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1590f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1591f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1592f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1593f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1594f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1595f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1596f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 159715091d37SBarry Smith } 159815091d37SBarry Smith 1599d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1601dc0b31edSSatish Balay /* block size is 7 here: log with the 49/7.0 constants that MatSolve_SeqBAIJ_7 uses, not the 36/6.0 of the bs=6 solver */ ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 160215091d37SBarry Smith PetscFunctionReturn(0); 160315091d37SBarry Smith } 160415091d37SBarry Smith 1605cee9d6f2SShri Abhyankar #undef __FUNCT__ 1606a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1607a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 160853cca76cSShri Abhyankar { 160953cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 161053cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 161153cca76cSShri Abhyankar PetscErrorCode ierr; 161253cca76cSShri Abhyankar PetscInt idx,jdx,idt; 161353cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 161453cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 161553cca76cSShri Abhyankar PetscScalar *x; 161653cca76cSShri Abhyankar const PetscScalar *b; 161753cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 161853cca76cSShri Abhyankar 161953cca76cSShri Abhyankar PetscFunctionBegin; 162053cca76cSShri Abhyankar ierr = 
VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 162153cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 162253cca76cSShri Abhyankar /* forward solve the lower triangular */ 162353cca76cSShri Abhyankar idx = 0; 162453cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 162553cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 162653cca76cSShri Abhyankar for (i=1; i<n; i++) { 162753cca76cSShri Abhyankar v = aa + bs2*ai[i]; 162853cca76cSShri Abhyankar vi = aj + ai[i]; 162953cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 163053cca76cSShri Abhyankar idx = bs*i; 163153cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 163253cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 163353cca76cSShri Abhyankar for(k=0;k<nz;k++) { 163453cca76cSShri Abhyankar jdx = bs*vi[k]; 163553cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 163653cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 163753cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 163853cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 163953cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 164053cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 164153cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 164253cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 164353cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 164453cca76cSShri Abhyankar v += bs2; 164553cca76cSShri Abhyankar } 164653cca76cSShri Abhyankar 164753cca76cSShri Abhyankar x[idx] = s1; 164853cca76cSShri Abhyankar x[1+idx] = s2; 164953cca76cSShri Abhyankar 
x[2+idx] = s3; 165053cca76cSShri Abhyankar x[3+idx] = s4; 165153cca76cSShri Abhyankar x[4+idx] = s5; 165253cca76cSShri Abhyankar x[5+idx] = s6; 165353cca76cSShri Abhyankar x[6+idx] = s7; 165453cca76cSShri Abhyankar } 165553cca76cSShri Abhyankar 165653cca76cSShri Abhyankar /* backward solve the upper triangular */ 165753cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 165853cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 165953cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 166053cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 166153cca76cSShri Abhyankar idt = bs*i; 166253cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 166353cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 166453cca76cSShri Abhyankar for(k=0;k<nz;k++) { 166553cca76cSShri Abhyankar idx = bs*vi[k]; 166653cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 166753cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 166853cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 166953cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 167053cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 167153cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 167253cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 167353cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 167453cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 167553cca76cSShri Abhyankar v += bs2; 167653cca76cSShri Abhyankar } 167753cca76cSShri Abhyankar /* x = inv_diagonal*x */ 167853cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 167953cca76cSShri Abhyankar 
x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 168053cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 168153cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 168253cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 168353cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 168453cca76cSShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 168553cca76cSShri Abhyankar } 168653cca76cSShri Abhyankar 168753cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 168853cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 168953cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 169053cca76cSShri Abhyankar PetscFunctionReturn(0); 169153cca76cSShri Abhyankar } 169253cca76cSShri Abhyankar 169353cca76cSShri Abhyankar #undef __FUNCT__ 16944a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1695dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 169615091d37SBarry Smith { 169715091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 169815091d37SBarry Smith IS iscol=a->col,isrow=a->row; 16996849ba73SBarry Smith PetscErrorCode ierr; 17005d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 17015d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1702d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1703d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1704d9fead3dSBarry Smith const PetscScalar *b; 170515091d37SBarry Smith PetscFunctionBegin; 1706d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17071ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
1708f1af5d2fSBarry Smith t = a->solve_work; 170915091d37SBarry Smith 171015091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 171115091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 171215091d37SBarry Smith 171315091d37SBarry Smith /* forward solve the lower triangular */ 171415091d37SBarry Smith idx = 6*(*r++); 1715f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1716f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1717f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 171815091d37SBarry Smith for (i=1; i<n; i++) { 171915091d37SBarry Smith v = aa + 36*ai[i]; 172015091d37SBarry Smith vi = aj + ai[i]; 172115091d37SBarry Smith nz = diag[i] - ai[i]; 172215091d37SBarry Smith idx = 6*(*r++); 1723f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1724f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 172515091d37SBarry Smith while (nz--) { 172615091d37SBarry Smith idx = 6*(*vi++); 1727f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1728f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1729f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1730f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1731f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1732f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1733f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1734f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 173515091d37SBarry Smith v += 36; 173615091d37SBarry Smith } 173715091d37SBarry Smith idx = 6*i; 1738f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1739f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1740f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 174115091d37SBarry Smith } 174215091d37SBarry 
Smith /* backward solve the upper triangular */ 174315091d37SBarry Smith for (i=n-1; i>=0; i--){ 174415091d37SBarry Smith v = aa + 36*diag[i] + 36; 174515091d37SBarry Smith vi = aj + diag[i] + 1; 174615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 174715091d37SBarry Smith idt = 6*i; 1748f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1749f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1750f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 175115091d37SBarry Smith while (nz--) { 175215091d37SBarry Smith idx = 6*(*vi++); 1753f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1754f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1755f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1756f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1757f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1758f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1759f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1760f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1761f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 176215091d37SBarry Smith v += 36; 176315091d37SBarry Smith } 176415091d37SBarry Smith idc = 6*(*c--); 176515091d37SBarry Smith v = aa + 36*diag[i]; 1766f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1767f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1768f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1769f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1770f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1771f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1772f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1773f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1774f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1775f1af5d2fSBarry 
Smith v[22]*s4+v[28]*s5+v[34]*s6; 1776f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1777f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 177815091d37SBarry Smith } 177915091d37SBarry Smith 178015091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 178115091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1782d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17831ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1784dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 178515091d37SBarry Smith PetscFunctionReturn(0); 178615091d37SBarry Smith } 178715091d37SBarry Smith 17886506fda5SShri Abhyankar #undef __FUNCT__ 1789a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 1790a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 17916506fda5SShri Abhyankar { 17926506fda5SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 17936506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 17946506fda5SShri Abhyankar PetscErrorCode ierr; 17956506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 17966506fda5SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 17976506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 17986506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 17996506fda5SShri Abhyankar const PetscScalar *b; 18006506fda5SShri Abhyankar PetscFunctionBegin; 18016506fda5SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18026506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 18036506fda5SShri Abhyankar t = a->solve_work; 18046506fda5SShri Abhyankar 18056506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 18066506fda5SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 18076506fda5SShri Abhyankar 18086506fda5SShri 
Abhyankar /* forward solve the lower triangular */ 18096506fda5SShri Abhyankar idx = 6*r[0]; 18106506fda5SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 18116506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 18126506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 18136506fda5SShri Abhyankar for (i=1; i<n; i++) { 18146506fda5SShri Abhyankar v = aa + 36*ai[i]; 18156506fda5SShri Abhyankar vi = aj + ai[i]; 18166506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 18176506fda5SShri Abhyankar idx = 6*r[i]; 18186506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 18196506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 18206506fda5SShri Abhyankar for(m=0;m<nz;m++){ 18216506fda5SShri Abhyankar idx = 6*vi[m]; 18226506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 18236506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 18246506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 18256506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 18266506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 18276506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 18286506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 18296506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 18306506fda5SShri Abhyankar v += 36; 18316506fda5SShri Abhyankar } 18326506fda5SShri Abhyankar idx = 6*i; 18336506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 18346506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 18356506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 18366506fda5SShri Abhyankar } 18376506fda5SShri Abhyankar /* backward solve the upper triangular */ 18386506fda5SShri Abhyankar for (i=n-1; i>=0; i--){ 18396506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 18406506fda5SShri Abhyankar vi = aj + 
adiag[i+1]+1; 18416506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 18426506fda5SShri Abhyankar idt = 6*i; 18436506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 18446506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 18456506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 18466506fda5SShri Abhyankar for(m=0;m<nz;m++){ 18476506fda5SShri Abhyankar idx = 6*vi[m]; 18486506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 18496506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 18506506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 18516506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 18526506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 18536506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 18546506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 18556506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 18566506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 18576506fda5SShri Abhyankar v += 36; 18586506fda5SShri Abhyankar } 18596506fda5SShri Abhyankar idc = 6*c[i]; 18606506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 18616506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 18626506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 18636506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 18646506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 18656506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 18666506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 18676506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 18686506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 18696506fda5SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 18706506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 18716506fda5SShri Abhyankar 
v[23]*s4+v[29]*s5+v[35]*s6; 18726506fda5SShri Abhyankar } 18736506fda5SShri Abhyankar 18746506fda5SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 18756506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 18766506fda5SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18776506fda5SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 18786506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 18796506fda5SShri Abhyankar PetscFunctionReturn(0); 18806506fda5SShri Abhyankar } 18818f690400SShri Abhyankar 18828f690400SShri Abhyankar #undef __FUNCT__ 18834a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1884dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 188515091d37SBarry Smith { 188615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1887690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1888dfbe8321SBarry Smith PetscErrorCode ierr; 1889690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1890d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1891d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1892d9fead3dSBarry Smith const PetscScalar *b; 189315091d37SBarry Smith 189415091d37SBarry Smith PetscFunctionBegin; 1895d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18961ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 189715091d37SBarry Smith /* forward solve the lower triangular */ 189815091d37SBarry Smith idx = 0; 189915091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 190015091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 190115091d37SBarry Smith for (i=1; i<n; i++) { 190215091d37SBarry Smith v = aa + 36*ai[i]; 190315091d37SBarry Smith vi = aj + ai[i]; 190415091d37SBarry Smith nz = diag[i] - ai[i]; 190515091d37SBarry Smith idx = 6*i; 1906f1af5d2fSBarry Smith s1 = 
b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1907f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 190815091d37SBarry Smith while (nz--) { 190915091d37SBarry Smith jdx = 6*(*vi++); 191015091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 191115091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1912f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1913f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1914f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1915f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1916f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1917f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 191815091d37SBarry Smith v += 36; 191915091d37SBarry Smith } 1920f1af5d2fSBarry Smith x[idx] = s1; 1921f1af5d2fSBarry Smith x[1+idx] = s2; 1922f1af5d2fSBarry Smith x[2+idx] = s3; 1923f1af5d2fSBarry Smith x[3+idx] = s4; 1924f1af5d2fSBarry Smith x[4+idx] = s5; 1925f1af5d2fSBarry Smith x[5+idx] = s6; 192615091d37SBarry Smith } 192715091d37SBarry Smith /* backward solve the upper triangular */ 192815091d37SBarry Smith for (i=n-1; i>=0; i--){ 192915091d37SBarry Smith v = aa + 36*diag[i] + 36; 193015091d37SBarry Smith vi = aj + diag[i] + 1; 193115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 193215091d37SBarry Smith idt = 6*i; 1933f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1934f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1935f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 193615091d37SBarry Smith while (nz--) { 193715091d37SBarry Smith idx = 6*(*vi++); 193815091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 193915091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1940f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + 
v[30]*x6; 1941f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1942f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1943f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1944f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1945f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 194615091d37SBarry Smith v += 36; 194715091d37SBarry Smith } 194815091d37SBarry Smith v = aa + 36*diag[i]; 1949f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1950f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1951f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1952f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1953f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1954f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 195515091d37SBarry Smith } 195615091d37SBarry Smith 1957d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19581ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1959dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 196015091d37SBarry Smith PetscFunctionReturn(0); 196115091d37SBarry Smith } 196215091d37SBarry Smith 1963cee9d6f2SShri Abhyankar #undef __FUNCT__ 1964a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 1965a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 196653cca76cSShri Abhyankar { 196753cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 196853cca76cSShri Abhyankar PetscInt 
i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 196953cca76cSShri Abhyankar PetscErrorCode ierr; 197053cca76cSShri Abhyankar PetscInt idx,jdx,idt; 197153cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 197253cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 197353cca76cSShri Abhyankar PetscScalar *x; 197453cca76cSShri Abhyankar const PetscScalar *b; 197553cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 197653cca76cSShri Abhyankar 197753cca76cSShri Abhyankar PetscFunctionBegin; 197853cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 197953cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 198053cca76cSShri Abhyankar /* forward solve the lower triangular */ 198153cca76cSShri Abhyankar idx = 0; 198253cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 198353cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 198453cca76cSShri Abhyankar for (i=1; i<n; i++) { 198553cca76cSShri Abhyankar v = aa + bs2*ai[i]; 198653cca76cSShri Abhyankar vi = aj + ai[i]; 198753cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 198853cca76cSShri Abhyankar idx = bs*i; 198953cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 199053cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 199153cca76cSShri Abhyankar for(k=0;k<nz;k++){ 199253cca76cSShri Abhyankar jdx = bs*vi[k]; 199353cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 199453cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 199553cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 199653cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 199753cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 199853cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 199953cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + 
v[22]*x4 + v[28]*x5 + v[34]*x6; 200053cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 200153cca76cSShri Abhyankar v += bs2; 200253cca76cSShri Abhyankar } 200353cca76cSShri Abhyankar 200453cca76cSShri Abhyankar x[idx] = s1; 200553cca76cSShri Abhyankar x[1+idx] = s2; 200653cca76cSShri Abhyankar x[2+idx] = s3; 200753cca76cSShri Abhyankar x[3+idx] = s4; 200853cca76cSShri Abhyankar x[4+idx] = s5; 200953cca76cSShri Abhyankar x[5+idx] = s6; 201053cca76cSShri Abhyankar } 201153cca76cSShri Abhyankar 201253cca76cSShri Abhyankar /* backward solve the upper triangular */ 201353cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 201453cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 201553cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 201653cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 201753cca76cSShri Abhyankar idt = bs*i; 201853cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 201953cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 202053cca76cSShri Abhyankar for(k=0;k<nz;k++){ 202153cca76cSShri Abhyankar idx = bs*vi[k]; 202253cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 202353cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 202453cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 202553cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 202653cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 202753cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 202853cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 202953cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 203053cca76cSShri Abhyankar v += bs2; 203153cca76cSShri Abhyankar } 203253cca76cSShri Abhyankar /* x = inv_diagonal*x */ 203353cca76cSShri Abhyankar x[idt] = v[0]*s1 + 
v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 203453cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 203553cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 203653cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 203753cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 203853cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 203953cca76cSShri Abhyankar } 204053cca76cSShri Abhyankar 204153cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 204253cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 204353cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 204453cca76cSShri Abhyankar PetscFunctionReturn(0); 204553cca76cSShri Abhyankar } 204653cca76cSShri Abhyankar 204753cca76cSShri Abhyankar #undef __FUNCT__ 20484a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2049dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 20504e2b4712SSatish Balay { 20514e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 20524e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 20536849ba73SBarry Smith PetscErrorCode ierr; 20545d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 20555d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2056d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2057d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2058d9fead3dSBarry Smith const PetscScalar *b; 20594e2b4712SSatish Balay 20604e2b4712SSatish Balay PetscFunctionBegin; 2061d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20621ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2063f1af5d2fSBarry Smith t = a->solve_work; 20644e2b4712SSatish Balay 
20654e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 20664e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 20674e2b4712SSatish Balay 20684e2b4712SSatish Balay /* forward solve the lower triangular */ 20694e2b4712SSatish Balay idx = 5*(*r++); 2070f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2071f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 20724e2b4712SSatish Balay for (i=1; i<n; i++) { 20734e2b4712SSatish Balay v = aa + 25*ai[i]; 20744e2b4712SSatish Balay vi = aj + ai[i]; 20754e2b4712SSatish Balay nz = diag[i] - ai[i]; 20764e2b4712SSatish Balay idx = 5*(*r++); 2077f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2078f1af5d2fSBarry Smith s5 = b[4+idx]; 20794e2b4712SSatish Balay while (nz--) { 20804e2b4712SSatish Balay idx = 5*(*vi++); 2081f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2082f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2083f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2084f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2085f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2086f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2087f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 20884e2b4712SSatish Balay v += 25; 20894e2b4712SSatish Balay } 20904e2b4712SSatish Balay idx = 5*i; 2091f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2092f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 20934e2b4712SSatish Balay } 20944e2b4712SSatish Balay /* backward solve the upper triangular */ 20954e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 20964e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 20974e2b4712SSatish Balay vi = aj + diag[i] + 1; 20984e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 20994e2b4712SSatish Balay idt = 5*i; 2100f1af5d2fSBarry Smith s1 = 
t[idt]; s2 = t[1+idt]; 2101f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 21024e2b4712SSatish Balay while (nz--) { 21034e2b4712SSatish Balay idx = 5*(*vi++); 2104f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2105f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2106f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2107f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2108f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2109f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2110f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 21114e2b4712SSatish Balay v += 25; 21124e2b4712SSatish Balay } 21134e2b4712SSatish Balay idc = 5*(*c--); 21144e2b4712SSatish Balay v = aa + 25*diag[i]; 2115f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2116f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 2117f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2118f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 2119f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2120f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 2121f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2122f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 2123f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2124f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 21254e2b4712SSatish Balay } 21264e2b4712SSatish Balay 21274e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21284e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2129d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21301ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2131dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 21324e2b4712SSatish Balay PetscFunctionReturn(0); 21334e2b4712SSatish Balay } 
21344e2b4712SSatish Balay 213578bb4007SShri Abhyankar #undef __FUNCT__ 2136a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2137a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 213878bb4007SShri Abhyankar { 213978bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 214078bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 214178bb4007SShri Abhyankar PetscErrorCode ierr; 214278bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 214378bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 214478bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 214578bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 214678bb4007SShri Abhyankar const PetscScalar *b; 214778bb4007SShri Abhyankar 214878bb4007SShri Abhyankar PetscFunctionBegin; 214978bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 215078bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 215178bb4007SShri Abhyankar t = a->solve_work; 215278bb4007SShri Abhyankar 215378bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 215478bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 215578bb4007SShri Abhyankar 215678bb4007SShri Abhyankar /* forward solve the lower triangular */ 215778bb4007SShri Abhyankar idx = 5*r[0]; 215878bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 215978bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 216078bb4007SShri Abhyankar for (i=1; i<n; i++) { 216178bb4007SShri Abhyankar v = aa + 25*ai[i]; 216278bb4007SShri Abhyankar vi = aj + ai[i]; 216378bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 216478bb4007SShri Abhyankar idx = 5*r[i]; 216578bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 216678bb4007SShri Abhyankar s5 = b[4+idx]; 216778bb4007SShri Abhyankar for(m=0;m<nz;m++){ 216878bb4007SShri Abhyankar idx = 
5*vi[m]; 216978bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 217078bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 217178bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 217278bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 217378bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 217478bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 217578bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 217678bb4007SShri Abhyankar v += 25; 217778bb4007SShri Abhyankar } 217878bb4007SShri Abhyankar idx = 5*i; 217978bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 218078bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 218178bb4007SShri Abhyankar } 218278bb4007SShri Abhyankar /* backward solve the upper triangular */ 218378bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 218478bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 218578bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 218678bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 218778bb4007SShri Abhyankar idt = 5*i; 218878bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 218978bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 219078bb4007SShri Abhyankar for(m=0;m<nz;m++){ 219178bb4007SShri Abhyankar idx = 5*vi[m]; 219278bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 219378bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 219478bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 219578bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 219678bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 219778bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 219878bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 219978bb4007SShri Abhyankar v += 25; 220078bb4007SShri Abhyankar 
} 220178bb4007SShri Abhyankar idc = 5*c[i]; 220278bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 220378bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 220478bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 220578bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 220678bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 220778bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 220878bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 220978bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 221078bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 221178bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 221278bb4007SShri Abhyankar } 221378bb4007SShri Abhyankar 221478bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 221578bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 221678bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 221778bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 221878bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 221978bb4007SShri Abhyankar PetscFunctionReturn(0); 222078bb4007SShri Abhyankar } 222178bb4007SShri Abhyankar 22228f690400SShri Abhyankar #undef __FUNCT__ 22234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2224dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 222515091d37SBarry Smith { 222615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2227690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2228dfbe8321SBarry Smith PetscErrorCode ierr; 2229690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2230d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2231d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2232d9fead3dSBarry Smith const PetscScalar *b; 223315091d37SBarry Smith 223415091d37SBarry Smith PetscFunctionBegin; 2235d9fead3dSBarry 
Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22361ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 223715091d37SBarry Smith /* forward solve the lower triangular */ 223815091d37SBarry Smith idx = 0; 223915091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 224015091d37SBarry Smith for (i=1; i<n; i++) { 224115091d37SBarry Smith v = aa + 25*ai[i]; 224215091d37SBarry Smith vi = aj + ai[i]; 224315091d37SBarry Smith nz = diag[i] - ai[i]; 224415091d37SBarry Smith idx = 5*i; 2245f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 224615091d37SBarry Smith while (nz--) { 224715091d37SBarry Smith jdx = 5*(*vi++); 224815091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2249f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2250f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2251f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2252f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2253f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 225415091d37SBarry Smith v += 25; 225515091d37SBarry Smith } 2256f1af5d2fSBarry Smith x[idx] = s1; 2257f1af5d2fSBarry Smith x[1+idx] = s2; 2258f1af5d2fSBarry Smith x[2+idx] = s3; 2259f1af5d2fSBarry Smith x[3+idx] = s4; 2260f1af5d2fSBarry Smith x[4+idx] = s5; 226115091d37SBarry Smith } 226215091d37SBarry Smith /* backward solve the upper triangular */ 226315091d37SBarry Smith for (i=n-1; i>=0; i--){ 226415091d37SBarry Smith v = aa + 25*diag[i] + 25; 226515091d37SBarry Smith vi = aj + diag[i] + 1; 226615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 226715091d37SBarry Smith idt = 5*i; 2268f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2269f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 227015091d37SBarry Smith while (nz--) { 
227115091d37SBarry Smith idx = 5*(*vi++); 227215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2273f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2274f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2275f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2276f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2277f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 227815091d37SBarry Smith v += 25; 227915091d37SBarry Smith } 228015091d37SBarry Smith v = aa + 25*diag[i]; 2281f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2282f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2283f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2284f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2285f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 228615091d37SBarry Smith } 228715091d37SBarry Smith 2288d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22891ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2290dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 229115091d37SBarry Smith PetscFunctionReturn(0); 229215091d37SBarry Smith } 229315091d37SBarry Smith 2294cee9d6f2SShri Abhyankar #undef __FUNCT__ 2295a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2296a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 229753cca76cSShri Abhyankar { 229853cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 229953cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 230053cca76cSShri Abhyankar PetscErrorCode 
ierr; 230153cca76cSShri Abhyankar PetscInt jdx; 230253cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 230353cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 230453cca76cSShri Abhyankar const PetscScalar *b; 230553cca76cSShri Abhyankar 230653cca76cSShri Abhyankar PetscFunctionBegin; 230753cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 230853cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 230953cca76cSShri Abhyankar /* forward solve the lower triangular */ 231053cca76cSShri Abhyankar idx = 0; 231153cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 231253cca76cSShri Abhyankar for (i=1; i<n; i++) { 231353cca76cSShri Abhyankar v = aa + 25*ai[i]; 231453cca76cSShri Abhyankar vi = aj + ai[i]; 231553cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 231653cca76cSShri Abhyankar idx = 5*i; 231753cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 231853cca76cSShri Abhyankar for(k=0;k<nz;k++) { 231953cca76cSShri Abhyankar jdx = 5*vi[k]; 232053cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 232153cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 232253cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 232353cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 232453cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 232553cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 232653cca76cSShri Abhyankar v += 25; 232753cca76cSShri Abhyankar } 232853cca76cSShri Abhyankar x[idx] = s1; 232953cca76cSShri Abhyankar x[1+idx] = s2; 233053cca76cSShri Abhyankar x[2+idx] = s3; 233153cca76cSShri Abhyankar x[3+idx] = s4; 233253cca76cSShri Abhyankar x[4+idx] = s5; 233353cca76cSShri Abhyankar } 233453cca76cSShri Abhyankar 233553cca76cSShri Abhyankar /* backward 
solve the upper triangular */ 233653cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 233753cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 233853cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 233953cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 234053cca76cSShri Abhyankar idt = 5*i; 234153cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 234253cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 234353cca76cSShri Abhyankar for(k=0;k<nz;k++){ 234453cca76cSShri Abhyankar idx = 5*vi[k]; 234553cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 234653cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 234753cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 234853cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 234953cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 235053cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 235153cca76cSShri Abhyankar v += 25; 235253cca76cSShri Abhyankar } 235353cca76cSShri Abhyankar /* x = inv_diagonal*x */ 235453cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 235553cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 235653cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 235753cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 235853cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 235953cca76cSShri Abhyankar } 236053cca76cSShri Abhyankar 236153cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 236253cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 236353cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 236453cca76cSShri Abhyankar PetscFunctionReturn(0); 236553cca76cSShri 
Abhyankar } 236653cca76cSShri Abhyankar 236753cca76cSShri Abhyankar #undef __FUNCT__ 23684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2369dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 23704e2b4712SSatish Balay { 23714e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 23724e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 23736849ba73SBarry Smith PetscErrorCode ierr; 23745d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 23755d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2376d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2377d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2378d9fead3dSBarry Smith const PetscScalar *b; 23794e2b4712SSatish Balay 23804e2b4712SSatish Balay PetscFunctionBegin; 2381d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2383f1af5d2fSBarry Smith t = a->solve_work; 23844e2b4712SSatish Balay 23854e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 23864e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 23874e2b4712SSatish Balay 23884e2b4712SSatish Balay /* forward solve the lower triangular */ 23894e2b4712SSatish Balay idx = 4*(*r++); 2390f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2391f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 23924e2b4712SSatish Balay for (i=1; i<n; i++) { 23934e2b4712SSatish Balay v = aa + 16*ai[i]; 23944e2b4712SSatish Balay vi = aj + ai[i]; 23954e2b4712SSatish Balay nz = diag[i] - ai[i]; 23964e2b4712SSatish Balay idx = 4*(*r++); 2397f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 23984e2b4712SSatish Balay while (nz--) { 23994e2b4712SSatish Balay idx = 4*(*vi++); 2400f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2401f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + 
v[12]*x4; 2402f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2403f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2404f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 24054e2b4712SSatish Balay v += 16; 24064e2b4712SSatish Balay } 24074e2b4712SSatish Balay idx = 4*i; 2408f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2409f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 24104e2b4712SSatish Balay } 24114e2b4712SSatish Balay /* backward solve the upper triangular */ 24124e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 24134e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 24144e2b4712SSatish Balay vi = aj + diag[i] + 1; 24154e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 24164e2b4712SSatish Balay idt = 4*i; 2417f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2418f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 24194e2b4712SSatish Balay while (nz--) { 24204e2b4712SSatish Balay idx = 4*(*vi++); 2421f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2422f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2423f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2424f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2425f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2426f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 24274e2b4712SSatish Balay v += 16; 24284e2b4712SSatish Balay } 24294e2b4712SSatish Balay idc = 4*(*c--); 24304e2b4712SSatish Balay v = aa + 16*diag[i]; 2431f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2432f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2433f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2434f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 24354e2b4712SSatish Balay } 24364e2b4712SSatish Balay 24374e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 24384e2b4712SSatish Balay ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2439d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24401ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2441dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 24424e2b4712SSatish Balay PetscFunctionReturn(0); 24434e2b4712SSatish Balay } 2444f26ec98cSKris Buschelman 24458f690400SShri Abhyankar #undef __FUNCT__ 2446a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 2447a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 244878bb4007SShri Abhyankar { 244978bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 245078bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 245178bb4007SShri Abhyankar PetscErrorCode ierr; 245278bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 245378bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 245478bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 245578bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 245678bb4007SShri Abhyankar const PetscScalar *b; 245778bb4007SShri Abhyankar 245878bb4007SShri Abhyankar PetscFunctionBegin; 245978bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 246078bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 246178bb4007SShri Abhyankar t = a->solve_work; 246278bb4007SShri Abhyankar 246378bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 246478bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 246578bb4007SShri Abhyankar 246678bb4007SShri Abhyankar /* forward solve the lower triangular */ 246778bb4007SShri Abhyankar idx = 4*r[0]; 246878bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 246978bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 247078bb4007SShri Abhyankar for (i=1; i<n; i++) { 247178bb4007SShri Abhyankar v = aa + 
16*ai[i]; 247278bb4007SShri Abhyankar vi = aj + ai[i]; 247378bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 247478bb4007SShri Abhyankar idx = 4*r[i]; 247578bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 247678bb4007SShri Abhyankar for(m=0;m<nz;m++){ 247778bb4007SShri Abhyankar idx = 4*vi[m]; 247878bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 247978bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 248078bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 248178bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 248278bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 248378bb4007SShri Abhyankar v += 16; 248478bb4007SShri Abhyankar } 248578bb4007SShri Abhyankar idx = 4*i; 248678bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 248778bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 248878bb4007SShri Abhyankar } 248978bb4007SShri Abhyankar /* backward solve the upper triangular */ 249078bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 249178bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 249278bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 249378bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 249478bb4007SShri Abhyankar idt = 4*i; 249578bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 249678bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 249778bb4007SShri Abhyankar for(m=0;m<nz;m++){ 249878bb4007SShri Abhyankar idx = 4*vi[m]; 249978bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 250078bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 250178bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 250278bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 250378bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 250478bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 250578bb4007SShri Abhyankar v += 16; 250678bb4007SShri Abhyankar } 250778bb4007SShri Abhyankar idc = 
4*c[i]; 250878bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 250978bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 251078bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 251178bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 251278bb4007SShri Abhyankar } 251378bb4007SShri Abhyankar 251478bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 251578bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 251678bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 251778bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 251878bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 251978bb4007SShri Abhyankar PetscFunctionReturn(0); 252078bb4007SShri Abhyankar } 252178bb4007SShri Abhyankar 252278bb4007SShri Abhyankar #undef __FUNCT__ 2523f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2524dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2525f26ec98cSKris Buschelman { 2526f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2527f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 25286849ba73SBarry Smith PetscErrorCode ierr; 25295d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 25305d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2531d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2532d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2533d9fead3dSBarry Smith PetscScalar *x; 2534d9fead3dSBarry Smith const PetscScalar *b; 2535f26ec98cSKris Buschelman 2536f26ec98cSKris Buschelman PetscFunctionBegin; 2537d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25381ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2539f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 
2540f26ec98cSKris Buschelman 2541f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2542f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2543f26ec98cSKris Buschelman 2544f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2545f26ec98cSKris Buschelman idx = 4*(*r++); 2546f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 2547f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 2548f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 2549f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 2550f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2551f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2552f26ec98cSKris Buschelman vi = aj + ai[i]; 2553f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2554f26ec98cSKris Buschelman idx = 4*(*r++); 2555f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2556f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2557f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2558f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2559f26ec98cSKris Buschelman while (nz--) { 2560f26ec98cSKris Buschelman idx = 4*(*vi++); 2561f26ec98cSKris Buschelman x1 = t[idx]; 2562f26ec98cSKris Buschelman x2 = t[1+idx]; 2563f26ec98cSKris Buschelman x3 = t[2+idx]; 2564f26ec98cSKris Buschelman x4 = t[3+idx]; 2565f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2566f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2567f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2568f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2569f26ec98cSKris Buschelman v += 16; 2570f26ec98cSKris Buschelman } 2571f26ec98cSKris Buschelman idx = 4*i; 2572f26ec98cSKris Buschelman t[idx] = s1; 2573f26ec98cSKris Buschelman t[1+idx] = s2; 2574f26ec98cSKris Buschelman t[2+idx] = s3; 2575f26ec98cSKris Buschelman t[3+idx] = s4; 2576f26ec98cSKris Buschelman } 2577f26ec98cSKris Buschelman /* backward solve the upper triangular */ 
2578f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2579f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 2580f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2581f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2582f26ec98cSKris Buschelman idt = 4*i; 2583f26ec98cSKris Buschelman s1 = t[idt]; 2584f26ec98cSKris Buschelman s2 = t[1+idt]; 2585f26ec98cSKris Buschelman s3 = t[2+idt]; 2586f26ec98cSKris Buschelman s4 = t[3+idt]; 2587f26ec98cSKris Buschelman while (nz--) { 2588f26ec98cSKris Buschelman idx = 4*(*vi++); 2589f26ec98cSKris Buschelman x1 = t[idx]; 2590f26ec98cSKris Buschelman x2 = t[1+idx]; 2591f26ec98cSKris Buschelman x3 = t[2+idx]; 2592f26ec98cSKris Buschelman x4 = t[3+idx]; 2593f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2594f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2595f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2596f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2597f26ec98cSKris Buschelman v += 16; 2598f26ec98cSKris Buschelman } 2599f26ec98cSKris Buschelman idc = 4*(*c--); 2600f26ec98cSKris Buschelman v = aa + 16*diag[i]; 2601f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2602f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2603f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2604f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2605f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 2606f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 2607f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 2608f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 2609f26ec98cSKris Buschelman } 2610f26ec98cSKris Buschelman 2611f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2612f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2613d9fead3dSBarry Smith ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26141ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2615dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2616f26ec98cSKris Buschelman PetscFunctionReturn(0); 2617f26ec98cSKris Buschelman } 2618f26ec98cSKris Buschelman 261924c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 262024c233c2SKris Buschelman 262124c233c2SKris Buschelman #include PETSC_HAVE_SSE 262224c233c2SKris Buschelman 262324c233c2SKris Buschelman #undef __FUNCT__ 262424c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2625dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 262624c233c2SKris Buschelman { 262724c233c2SKris Buschelman /* 262824c233c2SKris Buschelman Note: This code uses demotion of double 262924c233c2SKris Buschelman to float when performing the mixed-mode computation. 263024c233c2SKris Buschelman This may not be numerically reasonable for all applications. 
263124c233c2SKris Buschelman */ 263224c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 263324c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 26346849ba73SBarry Smith PetscErrorCode ierr; 26355d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 26365d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 263724c233c2SKris Buschelman MatScalar *aa=a->a,*v; 263887828ca2SBarry Smith PetscScalar *x,*b,*t; 263924c233c2SKris Buschelman 264024c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 264124c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 264224c233c2SKris Buschelman unsigned long offset; 264324c233c2SKris Buschelman 264424c233c2SKris Buschelman PetscFunctionBegin; 264524c233c2SKris Buschelman SSE_SCOPE_BEGIN; 264624c233c2SKris Buschelman 264724c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 264824c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 264924c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 265024c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 265124c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 265224c233c2SKris Buschelman 26531ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 26541ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 265524c233c2SKris Buschelman t = a->solve_work; 265624c233c2SKris Buschelman 265724c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 265824c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 265924c233c2SKris Buschelman 266024c233c2SKris Buschelman /* forward solve the lower triangular */ 266124c233c2SKris Buschelman idx = 4*(*r++); 266224c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 266324c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 266424c233c2SKris Buschelman v = aa + 16*ai[1]; 266524c233c2SKris Buschelman 266624c233c2SKris Buschelman for (i=1; i<n;) { 
266724c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 266824c233c2SKris Buschelman vi = aj + ai[i]; 266924c233c2SKris Buschelman nz = diag[i] - ai[i]; 267024c233c2SKris Buschelman idx = 4*(*r++); 267124c233c2SKris Buschelman 267224c233c2SKris Buschelman /* Demote sum from double to float */ 267324c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 267424c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 267524c233c2SKris Buschelman 267624c233c2SKris Buschelman while (nz--) { 267724c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 267824c233c2SKris Buschelman idx = 4*(*vi++); 267924c233c2SKris Buschelman 268024c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 268124c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 268224c233c2SKris Buschelman 268324c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 268424c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 268524c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 268624c233c2SKris Buschelman 268724c233c2SKris Buschelman /* First Column */ 268824c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 268924c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 269024c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 269124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 269224c233c2SKris Buschelman 269324c233c2SKris Buschelman /* Second Column */ 269424c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 269524c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 269624c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 269724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 269824c233c2SKris Buschelman 269924c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 270024c233c2SKris Buschelman 270124c233c2SKris Buschelman /* Third Column */ 270224c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 270324c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 270424c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 270524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 
270624c233c2SKris Buschelman 270724c233c2SKris Buschelman /* Fourth Column */ 270824c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 270924c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 271024c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 271124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 271224c233c2SKris Buschelman SSE_INLINE_END_2 271324c233c2SKris Buschelman 271424c233c2SKris Buschelman v += 16; 271524c233c2SKris Buschelman } 271624c233c2SKris Buschelman idx = 4*i; 271724c233c2SKris Buschelman v = aa + 16*ai[++i]; 271824c233c2SKris Buschelman PREFETCH_NTA(v); 271924c233c2SKris Buschelman STORE_PS(tmps,XMM7); 272024c233c2SKris Buschelman 272124c233c2SKris Buschelman /* Promote result from float to double */ 272224c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 272324c233c2SKris Buschelman } 272424c233c2SKris Buschelman /* backward solve the upper triangular */ 272524c233c2SKris Buschelman idt = 4*(n-1); 272624c233c2SKris Buschelman ai16 = 16*diag[n-1]; 272724c233c2SKris Buschelman v = aa + ai16 + 16; 272824c233c2SKris Buschelman for (i=n-1; i>=0;){ 272924c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 273024c233c2SKris Buschelman vi = aj + diag[i] + 1; 273124c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 273224c233c2SKris Buschelman 273324c233c2SKris Buschelman /* Demote accumulator from double to float */ 273424c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 273524c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 273624c233c2SKris Buschelman 273724c233c2SKris Buschelman while (nz--) { 273824c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 273924c233c2SKris Buschelman idx = 4*(*vi++); 274024c233c2SKris Buschelman 274124c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 274224c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 274324c233c2SKris Buschelman 274424c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 274524c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 
274624c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 274724c233c2SKris Buschelman 274824c233c2SKris Buschelman /* First Column */ 274924c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 275024c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 275124c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 275224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 275324c233c2SKris Buschelman 275424c233c2SKris Buschelman /* Second Column */ 275524c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 275624c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 275724c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 275824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 275924c233c2SKris Buschelman 276024c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 276124c233c2SKris Buschelman 276224c233c2SKris Buschelman /* Third Column */ 276324c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 276424c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 276524c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 276624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 276724c233c2SKris Buschelman 276824c233c2SKris Buschelman /* Fourth Column */ 276924c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 277024c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 277124c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 277224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 277324c233c2SKris Buschelman SSE_INLINE_END_2 277424c233c2SKris Buschelman v += 16; 277524c233c2SKris Buschelman } 277624c233c2SKris Buschelman v = aa + ai16; 277724c233c2SKris Buschelman ai16 = 16*diag[--i]; 277824c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 277924c233c2SKris Buschelman /* 278024c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 278124c233c2SKris Buschelman which was inverted as part of the factorization 278224c233c2SKris Buschelman */ 278324c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 278424c233c2SKris Buschelman /* First Column */ 278524c233c2SKris Buschelman 
SSE_COPY_PS(XMM0,XMM7) 278624c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 278724c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 278824c233c2SKris Buschelman 278924c233c2SKris Buschelman /* Second Column */ 279024c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 279124c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 279224c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 279324c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 279424c233c2SKris Buschelman 279524c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 279624c233c2SKris Buschelman 279724c233c2SKris Buschelman /* Third Column */ 279824c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 279924c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 280024c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 280124c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 280224c233c2SKris Buschelman 280324c233c2SKris Buschelman /* Fourth Column */ 280424c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 280524c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 280624c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 280724c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 280824c233c2SKris Buschelman 280924c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 281024c233c2SKris Buschelman SSE_INLINE_END_3 281124c233c2SKris Buschelman 281224c233c2SKris Buschelman /* Promote solution from float to double */ 281324c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 281424c233c2SKris Buschelman 281524c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 281624c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 281724c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! 
*/ 281824c233c2SKris Buschelman idc = 4*(*c--); 281924c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 282024c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 282124c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 282224c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 282324c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 282424c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 282524c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 282624c233c2SKris Buschelman SSE_INLINE_END_2 282724c233c2SKris Buschelman v = aa + ai16 + 16; 282824c233c2SKris Buschelman idt -= 4; 282924c233c2SKris Buschelman } 283024c233c2SKris Buschelman 283124c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 283224c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 28331ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 28341ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2835dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 283624c233c2SKris Buschelman SSE_SCOPE_END; 283724c233c2SKris Buschelman PetscFunctionReturn(0); 283824c233c2SKris Buschelman } 283924c233c2SKris Buschelman 284024c233c2SKris Buschelman #endif 28410ef38995SBarry Smith 28420ef38995SBarry Smith 28434e2b4712SSatish Balay /* 28444e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 28454e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 
28464e2b4712SSatish Balay */ 28474a2ae208SSatish Balay #undef __FUNCT__ 28484a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2849dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 28504e2b4712SSatish Balay { 28514e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2852356650c2SBarry Smith PetscInt n=a->mbs; 2853356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 2854dfbe8321SBarry Smith PetscErrorCode ierr; 2855356650c2SBarry Smith const PetscInt *diag = a->diag; 2856d9fead3dSBarry Smith const MatScalar *aa=a->a; 2857d9fead3dSBarry Smith PetscScalar *x; 2858d9fead3dSBarry Smith const PetscScalar *b; 28594e2b4712SSatish Balay 28604e2b4712SSatish Balay PetscFunctionBegin; 2861d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28621ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28634e2b4712SSatish Balay 2864aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 28652853dc0eSBarry Smith { 286687828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 28672853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 28682853dc0eSBarry Smith } 2869aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 28702853dc0eSBarry Smith { 287187828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 28722853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 28732853dc0eSBarry Smith } 2874aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 28752853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2876e1293385SBarry Smith #else 287730d4dcafSBarry Smith { 287887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2879d9fead3dSBarry Smith const MatScalar *v; 2880356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 2881356650c2SBarry Smith const PetscInt *vi; 2882e1293385SBarry Smith 28834e2b4712SSatish Balay /* forward solve the lower 
triangular */ 28844e2b4712SSatish Balay idx = 0; 2885e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 28864e2b4712SSatish Balay for (i=1; i<n; i++) { 28874e2b4712SSatish Balay v = aa + 16*ai[i]; 28884e2b4712SSatish Balay vi = aj + ai[i]; 28894e2b4712SSatish Balay nz = diag[i] - ai[i]; 2890e1293385SBarry Smith idx += 4; 2891f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 28924e2b4712SSatish Balay while (nz--) { 28934e2b4712SSatish Balay jdx = 4*(*vi++); 28944e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2895f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2896f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2897f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2898f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 28994e2b4712SSatish Balay v += 16; 29004e2b4712SSatish Balay } 2901f1af5d2fSBarry Smith x[idx] = s1; 2902f1af5d2fSBarry Smith x[1+idx] = s2; 2903f1af5d2fSBarry Smith x[2+idx] = s3; 2904f1af5d2fSBarry Smith x[3+idx] = s4; 29054e2b4712SSatish Balay } 29064e2b4712SSatish Balay /* backward solve the upper triangular */ 29074e555682SBarry Smith idt = 4*(n-1); 29084e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 29094e555682SBarry Smith ai16 = 16*diag[i]; 29104e555682SBarry Smith v = aa + ai16 + 16; 29114e2b4712SSatish Balay vi = aj + diag[i] + 1; 29124e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2913f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2914f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 29154e2b4712SSatish Balay while (nz--) { 29164e2b4712SSatish Balay idx = 4*(*vi++); 29174e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2918f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2919f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2920f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2921f1af5d2fSBarry Smith s4 -= 
v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 29224e2b4712SSatish Balay v += 16; 29234e2b4712SSatish Balay } 29244e555682SBarry Smith v = aa + ai16; 2925f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2926f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2927f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2928f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2929329f5518SBarry Smith idt -= 4; 29304e2b4712SSatish Balay } 293130d4dcafSBarry Smith } 2932e1293385SBarry Smith #endif 29334e2b4712SSatish Balay 2934d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29351ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2936dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 29374e2b4712SSatish Balay PetscFunctionReturn(0); 29384e2b4712SSatish Balay } 29394e2b4712SSatish Balay 2940b2b2dd24SShri Abhyankar #undef __FUNCT__ 2941a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 2942a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2943b2b2dd24SShri Abhyankar { 2944b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2945b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2946b2b2dd24SShri Abhyankar PetscErrorCode ierr; 2947b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 2948b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2949b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 2950b2b2dd24SShri Abhyankar PetscScalar *x; 2951b2b2dd24SShri Abhyankar const PetscScalar *b; 2952b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2953cee9d6f2SShri Abhyankar 2954b2b2dd24SShri Abhyankar PetscFunctionBegin; 2955b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2956b2b2dd24SShri Abhyankar ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 2957b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 2958b2b2dd24SShri Abhyankar idx = 0; 2959b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2960b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 2961b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 2962b2b2dd24SShri Abhyankar vi = aj + ai[i]; 2963b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 2964b2b2dd24SShri Abhyankar idx = bs*i; 2965b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2966b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 2967b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 2968b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2969b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2970b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2971b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2972b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2973b2b2dd24SShri Abhyankar 2974b2b2dd24SShri Abhyankar v += bs2; 2975b2b2dd24SShri Abhyankar } 2976b2b2dd24SShri Abhyankar 2977b2b2dd24SShri Abhyankar x[idx] = s1; 2978b2b2dd24SShri Abhyankar x[1+idx] = s2; 2979b2b2dd24SShri Abhyankar x[2+idx] = s3; 2980b2b2dd24SShri Abhyankar x[3+idx] = s4; 2981b2b2dd24SShri Abhyankar } 2982b2b2dd24SShri Abhyankar 2983b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 2984b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 2985b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 2986b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 2987b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 2988b2b2dd24SShri Abhyankar idt = bs*i; 2989b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2990b2b2dd24SShri Abhyankar 2991b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 2992b2b2dd24SShri Abhyankar idx = bs*vi[k]; 2993b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 
2994b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2995b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2996b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2997b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2998b2b2dd24SShri Abhyankar 2999b2b2dd24SShri Abhyankar v += bs2; 3000b2b2dd24SShri Abhyankar } 3001b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 3002b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3003b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3004b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3005b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3006b2b2dd24SShri Abhyankar 3007b2b2dd24SShri Abhyankar } 3008b2b2dd24SShri Abhyankar 3009b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3010b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3011b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3012b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 3013b2b2dd24SShri Abhyankar } 3014cee9d6f2SShri Abhyankar 3015cee9d6f2SShri Abhyankar #undef __FUNCT__ 3016f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3017dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3018f26ec98cSKris Buschelman { 3019f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3020690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3021dfbe8321SBarry Smith PetscErrorCode ierr; 3022690b6cddSBarry Smith PetscInt *diag = a->diag; 3023f26ec98cSKris Buschelman MatScalar *aa=a->a; 3024f26ec98cSKris Buschelman PetscScalar *x,*b; 3025f26ec98cSKris Buschelman 3026f26ec98cSKris Buschelman PetscFunctionBegin; 30271ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 30281ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); /* Block-size-4 triangular solves without permutation, carried out in MatScalar arithmetic: entries of b and x are cast to MatScalar when read and results are cast back to PetscScalar when stored into x. */ 3029f26ec98cSKris Buschelman 3030f26ec98cSKris Buschelman { 3031f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3032f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; /* t is the storage of x viewed as MatScalar; the forward sweep stores its results through t */ 3033690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3034f26ec98cSKris Buschelman 3035f26ec98cSKris Buschelman /* forward solve the lower triangular */ /* block row 0 is taken from b as is; NOTE(review): this copy is unconditional, so n is presumably expected to be at least 1 - confirm */ 3036f26ec98cSKris Buschelman idx = 0; 3037f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 3038f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 3039f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 3040f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 3041f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3042f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3043f26ec98cSKris Buschelman vi = aj + ai[i]; 3044f26ec98cSKris Buschelman nz = diag[i] - ai[i]; /* number of blocks stored before the diagonal block of row i */ 3045f26ec98cSKris Buschelman idx += 4; 3046f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3047f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3048f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3049f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3050f26ec98cSKris Buschelman while (nz--) { 3051f26ec98cSKris Buschelman jdx = 4*(*vi++); 3052f26ec98cSKris Buschelman x1 = t[jdx]; 3053f26ec98cSKris Buschelman x2 = t[1+jdx]; 3054f26ec98cSKris Buschelman x3 = t[2+jdx]; 3055f26ec98cSKris Buschelman x4 = t[3+jdx]; /* subtract block*(x1..x4); the 4x4 block is read column by column */ 3056f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3057f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3058f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3059f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3060f26ec98cSKris Buschelman v += 16; 3061f26ec98cSKris Buschelman } 3062f26ec98cSKris Buschelman t[idx] = s1; 3063f26ec98cSKris Buschelman t[1+idx] = s2; 3064f26ec98cSKris Buschelman t[2+idx] = s3; 3065f26ec98cSKris Buschelman t[3+idx] = s4; 3066f26ec98cSKris Buschelman } 3067f26ec98cSKris Buschelman /* backward solve the upper triangular */ 
3068f26ec98cSKris Buschelman idt = 4*(n-1); 3069f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3070f26ec98cSKris Buschelman ai16 = 16*diag[i]; 3071f26ec98cSKris Buschelman v = aa + ai16 + 16; /* first block after the diagonal block of row i */ 3072f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3073f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3074f26ec98cSKris Buschelman s1 = t[idt]; 3075f26ec98cSKris Buschelman s2 = t[1+idt]; 3076f26ec98cSKris Buschelman s3 = t[2+idt]; 3077f26ec98cSKris Buschelman s4 = t[3+idt]; /* NOTE(review): s1..s4 are read through t while x1..x4 below are read through x; this looks like it relies on sizeof(MatScalar) <= sizeof(PetscScalar) and on rows being processed from last to first, so that stores into x never overwrite t entries that are still needed - confirm before reordering anything here */ 3078f26ec98cSKris Buschelman while (nz--) { 3079f26ec98cSKris Buschelman idx = 4*(*vi++); 3080f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 3081f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 3082f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 3083f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 3084f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3085f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3086f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3087f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3088f26ec98cSKris Buschelman v += 16; 3089f26ec98cSKris Buschelman } 3090f26ec98cSKris Buschelman v = aa + ai16; /* diagonal block of row i; it is applied as a multiplier and the product is stored into x as PetscScalar */ 3091f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3092f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3093f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3094f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3095f26ec98cSKris Buschelman idt -= 4; 3096f26ec98cSKris Buschelman } 3097f26ec98cSKris Buschelman } 3098f26ec98cSKris Buschelman 30991ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 31001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3101dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3102f26ec98cSKris Buschelman PetscFunctionReturn(0); 
3103f26ec98cSKris Buschelman } 3104f26ec98cSKris Buschelman 31053660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 31063660e330SKris Buschelman 31073660e330SKris Buschelman #include PETSC_HAVE_SSE 31083660e330SKris Buschelman #undef __FUNCT__ 31097cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3110dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 31113660e330SKris Buschelman { 31123660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 31132aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 3114dfbe8321SBarry Smith PetscErrorCode ierr; 3115dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 31163660e330SKris Buschelman MatScalar *aa=a->a; 311787828ca2SBarry Smith PetscScalar *x,*b; 31183660e330SKris Buschelman 31193660e330SKris Buschelman PetscFunctionBegin; 31203660e330SKris Buschelman SSE_SCOPE_BEGIN; 31213660e330SKris Buschelman /* 31223660e330SKris Buschelman Note: This code currently uses demotion of double 31233660e330SKris Buschelman to float when performing the mixed-mode computation. 31243660e330SKris Buschelman This may not be numerically reasonable for all applications. 31253660e330SKris Buschelman */ 31263660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 31273660e330SKris Buschelman 31281ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 31291ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 31303660e330SKris Buschelman { 3131eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 3132eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 31332aa5897fSKris Buschelman int nz,i,idt,ai16; 31342aa5897fSKris Buschelman unsigned int jdx,idx; 31352aa5897fSKris Buschelman unsigned short *vi; 3136eb05f457SKris Buschelman /* Forward solve the lower triangular factor. 
*/ 31373660e330SKris Buschelman 3138eb05f457SKris Buschelman /* First block is the identity. */ 31393660e330SKris Buschelman idx = 0; 3140eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 31412aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 31423660e330SKris Buschelman 31433660e330SKris Buschelman for (i=1; i<n;) { 31443660e330SKris Buschelman PREFETCH_NTA(&v[8]); 31453660e330SKris Buschelman vi = aj + ai[i]; 31463660e330SKris Buschelman nz = diag[i] - ai[i]; 31473660e330SKris Buschelman idx += 4; 31483660e330SKris Buschelman 3149eb05f457SKris Buschelman /* Demote RHS from double to float. */ 3150eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3151eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 31523660e330SKris Buschelman 31533660e330SKris Buschelman while (nz--) { 31543660e330SKris Buschelman PREFETCH_NTA(&v[16]); 31552aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 31563660e330SKris Buschelman 31573660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 3158eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 31593660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 31603660e330SKris Buschelman 31613660e330SKris Buschelman /* First Column */ 31623660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 31633660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 31643660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 31653660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 31663660e330SKris Buschelman 31673660e330SKris Buschelman /* Second Column */ 31683660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 31693660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 31703660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 31713660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 31723660e330SKris Buschelman 31733660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 31743660e330SKris Buschelman 31753660e330SKris Buschelman /* Third Column */ 31763660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 
31773660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 31783660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 31793660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 31803660e330SKris Buschelman 31813660e330SKris Buschelman /* Fourth Column */ 31823660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 31833660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 31843660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 31853660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 31863660e330SKris Buschelman SSE_INLINE_END_2 31873660e330SKris Buschelman 31883660e330SKris Buschelman v += 16; 31893660e330SKris Buschelman } 31903660e330SKris Buschelman v = aa + 16*ai[++i]; 31913660e330SKris Buschelman PREFETCH_NTA(v); 3192eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 31933660e330SKris Buschelman } 3194eb05f457SKris Buschelman 3195eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 3196eb05f457SKris Buschelman 31973660e330SKris Buschelman idt = 4*(n-1); 31983660e330SKris Buschelman ai16 = 16*diag[n-1]; 31993660e330SKris Buschelman v = aa + ai16 + 16; 32003660e330SKris Buschelman for (i=n-1; i>=0;){ 32013660e330SKris Buschelman PREFETCH_NTA(&v[8]); 32023660e330SKris Buschelman vi = aj + diag[i] + 1; 32033660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 32043660e330SKris Buschelman 3205eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 32063660e330SKris Buschelman 32073660e330SKris Buschelman while (nz--) { 32083660e330SKris Buschelman PREFETCH_NTA(&v[16]); 32092aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 32103660e330SKris Buschelman 32113660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 3212eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 32133660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 32143660e330SKris Buschelman 32153660e330SKris Buschelman /* First Column */ 32163660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 32173660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 
32183660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 32193660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 32203660e330SKris Buschelman 32213660e330SKris Buschelman /* Second Column */ 32223660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 32233660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 32243660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 32253660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 32263660e330SKris Buschelman 32273660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 32283660e330SKris Buschelman 32293660e330SKris Buschelman /* Third Column */ 32303660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 32313660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 32323660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 32333660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 32343660e330SKris Buschelman 32353660e330SKris Buschelman /* Fourth Column */ 32363660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 32373660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 32383660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 32393660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 32403660e330SKris Buschelman SSE_INLINE_END_2 32413660e330SKris Buschelman v += 16; 32423660e330SKris Buschelman } 32433660e330SKris Buschelman v = aa + ai16; 32443660e330SKris Buschelman ai16 = 16*diag[--i]; 32453660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 32463660e330SKris Buschelman /* 32473660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 32483660e330SKris Buschelman which was inverted as part of the factorization 32493660e330SKris Buschelman */ 3250eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 32513660e330SKris Buschelman /* First Column */ 32523660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 32533660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 32543660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 32553660e330SKris Buschelman 32563660e330SKris Buschelman /* Second Column */ 32573660e330SKris 
Buschelman SSE_COPY_PS(XMM1,XMM7) 32583660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 32593660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 32603660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 32613660e330SKris Buschelman 32623660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 32633660e330SKris Buschelman 32643660e330SKris Buschelman /* Third Column */ 32653660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 32663660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 32673660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 32683660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 32693660e330SKris Buschelman 32703660e330SKris Buschelman /* Fourth Column */ 32713660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 32723660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 32733660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 32743660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 32753660e330SKris Buschelman 32763660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 32773660e330SKris Buschelman SSE_INLINE_END_3 32783660e330SKris Buschelman 32793660e330SKris Buschelman v = aa + ai16 + 16; 32803660e330SKris Buschelman idt -= 4; 32813660e330SKris Buschelman } 3282eb05f457SKris Buschelman 3283eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 3284eb05f457SKris Buschelman idt = 4*(n-1); 3285eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 3286eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3287eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ 3288eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 3289eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 3290eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 3291eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 3292eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 3293eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 329454693613SKris Buschelman idt -= 4; 32953660e330SKris Buschelman } 3296eb05f457SKris Buschelman 3297eb05f457SKris Buschelman } /* End of artificial scope. */ 32981ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 32991ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3300dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 33013660e330SKris Buschelman SSE_SCOPE_END; 33023660e330SKris Buschelman PetscFunctionReturn(0); 33033660e330SKris Buschelman } 33043660e330SKris Buschelman 33057cf1b8d3SKris Buschelman #undef __FUNCT__ 33067cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3307dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 33087cf1b8d3SKris Buschelman { 33097cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 33107cf1b8d3SKris Buschelman int *aj=a->j; 3311dfbe8321SBarry Smith PetscErrorCode ierr; 3312dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 33137cf1b8d3SKris Buschelman MatScalar *aa=a->a; 33147cf1b8d3SKris Buschelman PetscScalar *x,*b; 33157cf1b8d3SKris Buschelman 33167cf1b8d3SKris Buschelman PetscFunctionBegin; 33177cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 33187cf1b8d3SKris Buschelman /* 33197cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 33207cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 33217cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
33227cf1b8d3SKris Buschelman */ 33237cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 33247cf1b8d3SKris Buschelman 33251ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 33261ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 33277cf1b8d3SKris Buschelman { 33287cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 33297cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 33307cf1b8d3SKris Buschelman int nz,i,idt,ai16; 33317cf1b8d3SKris Buschelman int jdx,idx; 33327cf1b8d3SKris Buschelman int *vi; 33337cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 33347cf1b8d3SKris Buschelman 33357cf1b8d3SKris Buschelman /* First block is the identity. */ 33367cf1b8d3SKris Buschelman idx = 0; 33377cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 33387cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 33397cf1b8d3SKris Buschelman 33407cf1b8d3SKris Buschelman for (i=1; i<n;) { 33417cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 33427cf1b8d3SKris Buschelman vi = aj + ai[i]; 33437cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 33447cf1b8d3SKris Buschelman idx += 4; 33457cf1b8d3SKris Buschelman 33467cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 33477cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 33487cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 33497cf1b8d3SKris Buschelman 33507cf1b8d3SKris Buschelman while (nz--) { 33517cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 33527cf1b8d3SKris Buschelman jdx = 4*(*vi++); 33537cf1b8d3SKris Buschelman /* jdx = *vi++; */ 33547cf1b8d3SKris Buschelman 33557cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 33567cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 33577cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 33587cf1b8d3SKris Buschelman 33597cf1b8d3SKris Buschelman /* First Column */ 33607cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 33617cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 33627cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 33637cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 33647cf1b8d3SKris Buschelman 33657cf1b8d3SKris Buschelman /* Second Column */ 33667cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 33677cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 33687cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 33697cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 33707cf1b8d3SKris Buschelman 33717cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 33727cf1b8d3SKris Buschelman 33737cf1b8d3SKris Buschelman /* Third Column */ 33747cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 33757cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 33767cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 33777cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 33787cf1b8d3SKris Buschelman 33797cf1b8d3SKris Buschelman /* Fourth Column */ 33807cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 33817cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 33827cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 33837cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 33847cf1b8d3SKris Buschelman SSE_INLINE_END_2 33857cf1b8d3SKris Buschelman 33867cf1b8d3SKris 
Buschelman v += 16; 33877cf1b8d3SKris Buschelman } 33887cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 33897cf1b8d3SKris Buschelman PREFETCH_NTA(v); 33907cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 33917cf1b8d3SKris Buschelman } 33927cf1b8d3SKris Buschelman 33937cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 33947cf1b8d3SKris Buschelman 33957cf1b8d3SKris Buschelman idt = 4*(n-1); 33967cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 33977cf1b8d3SKris Buschelman v = aa + ai16 + 16; 33987cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 33997cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 34007cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 34017cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 34027cf1b8d3SKris Buschelman 34037cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 34047cf1b8d3SKris Buschelman 34057cf1b8d3SKris Buschelman while (nz--) { 34067cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 34077cf1b8d3SKris Buschelman idx = 4*(*vi++); 34087cf1b8d3SKris Buschelman /* idx = *vi++; */ 34097cf1b8d3SKris Buschelman 34107cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 34117cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 34127cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 34137cf1b8d3SKris Buschelman 34147cf1b8d3SKris Buschelman /* First Column */ 34157cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 34167cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 34177cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 34187cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 34197cf1b8d3SKris Buschelman 34207cf1b8d3SKris Buschelman /* Second Column */ 34217cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 34227cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 34237cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 34247cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 34257cf1b8d3SKris Buschelman 34267cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 34277cf1b8d3SKris Buschelman 
34287cf1b8d3SKris Buschelman /* Third Column */ 34297cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 34307cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 34317cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 34327cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 34337cf1b8d3SKris Buschelman 34347cf1b8d3SKris Buschelman /* Fourth Column */ 34357cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 34367cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 34377cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 34387cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 34397cf1b8d3SKris Buschelman SSE_INLINE_END_2 34407cf1b8d3SKris Buschelman v += 16; 34417cf1b8d3SKris Buschelman } 34427cf1b8d3SKris Buschelman v = aa + ai16; /* v: diagonal block of the row just accumulated in XMM7 */ 34437cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; /* NOTE(review): on the last pass i goes from 0 to -1, so this evaluates diag[-1], an out-of-bounds read; the value then only feeds the prefetch addresses and the final v update before the loop exits - confirm and guard the lookup with i >= 0 */ 34447cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 34457cf1b8d3SKris Buschelman /* 34467cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 34477cf1b8d3SKris Buschelman which was inverted as part of the factorization 34487cf1b8d3SKris Buschelman */ 34497cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 34507cf1b8d3SKris Buschelman /* First Column */ 34517cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 34527cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 34537cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 34547cf1b8d3SKris Buschelman 34557cf1b8d3SKris Buschelman /* Second Column */ 34567cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 34577cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 34587cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 34597cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 34607cf1b8d3SKris Buschelman 34617cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) /* third macro argument (aa+ai16) is used for prefetching only */ 34627cf1b8d3SKris Buschelman 34637cf1b8d3SKris Buschelman /* Third Column */ 34647cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 34657cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 34667cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 34677cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 34687cf1b8d3SKris Buschelman 34697cf1b8d3SKris Buschelman /* Fourth Column */ 34707cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 34717cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 34727cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 34737cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 34747cf1b8d3SKris Buschelman 34757cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 34767cf1b8d3SKris Buschelman SSE_INLINE_END_3 34777cf1b8d3SKris Buschelman 34787cf1b8d3SKris Buschelman v = aa + ai16 + 16; 34797cf1b8d3SKris Buschelman idt -= 4; 34807cf1b8d3SKris Buschelman } 34817cf1b8d3SKris Buschelman 34827cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 34837cf1b8d3SKris Buschelman idt = 4*(n-1); 34847cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 34857cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 34867cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 34877cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 34887cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 34897cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 34907cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 34917cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 34927cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 34937cf1b8d3SKris Buschelman idt -= 4; 34947cf1b8d3SKris Buschelman } 34957cf1b8d3SKris Buschelman 34967cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 34971ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 34981ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3499dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 35007cf1b8d3SKris Buschelman SSE_SCOPE_END; 35017cf1b8d3SKris Buschelman PetscFunctionReturn(0); 35027cf1b8d3SKris Buschelman } 35037cf1b8d3SKris Buschelman 35043660e330SKris Buschelman #endif 35058f690400SShri Abhyankar 35064a2ae208SSatish Balay #undef __FUNCT__ 35074a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3508dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 35094e2b4712SSatish Balay { 35104e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 35114e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 35126849ba73SBarry Smith PetscErrorCode ierr; 35135d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 35145d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3515d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3516d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3517d9fead3dSBarry Smith const PetscScalar *b; 35184e2b4712SSatish Balay 35194e2b4712SSatish Balay PetscFunctionBegin; 3520d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 35211ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3522f1af5d2fSBarry Smith t = a->solve_work; 35234e2b4712SSatish Balay 35244e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 35254e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 35264e2b4712SSatish Balay 35274e2b4712SSatish Balay /* forward solve the lower triangular */ 35284e2b4712SSatish Balay idx = 3*(*r++); 3529f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 35304e2b4712SSatish Balay for (i=1; i<n; i++) { 35314e2b4712SSatish Balay v = aa + 9*ai[i]; 35324e2b4712SSatish Balay vi = aj + ai[i]; 35334e2b4712SSatish Balay nz = 
diag[i] - ai[i]; 35344e2b4712SSatish Balay idx = 3*(*r++); 3535f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 35364e2b4712SSatish Balay while (nz--) { 35374e2b4712SSatish Balay idx = 3*(*vi++); 3538f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3539f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3540f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3541f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 35424e2b4712SSatish Balay v += 9; 35434e2b4712SSatish Balay } 35444e2b4712SSatish Balay idx = 3*i; 3545f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 35464e2b4712SSatish Balay } 35474e2b4712SSatish Balay /* backward solve the upper triangular */ 35484e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 35494e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 35504e2b4712SSatish Balay vi = aj + diag[i] + 1; 35514e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 35524e2b4712SSatish Balay idt = 3*i; 3553f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 35544e2b4712SSatish Balay while (nz--) { 35554e2b4712SSatish Balay idx = 3*(*vi++); 3556f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3557f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3558f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3559f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 35604e2b4712SSatish Balay v += 9; 35614e2b4712SSatish Balay } 35624e2b4712SSatish Balay idc = 3*(*c--); 35634e2b4712SSatish Balay v = aa + 9*diag[i]; 3564f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3565f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3566f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 35674e2b4712SSatish Balay } 35684e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 35694e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3570d9fead3dSBarry Smith ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 35711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3572dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 35734e2b4712SSatish Balay PetscFunctionReturn(0); 35744e2b4712SSatish Balay } 35754e2b4712SSatish Balay 35760c4413a7SShri Abhyankar #undef __FUNCT__ 3577a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 3578a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 35790c4413a7SShri Abhyankar { 35800c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 35810c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 35820c4413a7SShri Abhyankar PetscErrorCode ierr; 35830c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 35840c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 35850c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 35860c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 35870c4413a7SShri Abhyankar const PetscScalar *b; 35880c4413a7SShri Abhyankar 35890c4413a7SShri Abhyankar PetscFunctionBegin; 35900c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 35910c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 35920c4413a7SShri Abhyankar t = a->solve_work; 35930c4413a7SShri Abhyankar 35940c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 35950c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 35960c4413a7SShri Abhyankar 35970c4413a7SShri Abhyankar /* forward solve the lower triangular */ 35980c4413a7SShri Abhyankar idx = 3*r[0]; 35990c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 36000c4413a7SShri Abhyankar for (i=1; i<n; i++) { 36010c4413a7SShri Abhyankar v = aa + 9*ai[i]; 36020c4413a7SShri Abhyankar vi = aj + ai[i]; 36030c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 36040c4413a7SShri Abhyankar idx 
= 3*r[i]; 36050c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 36060c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 36070c4413a7SShri Abhyankar idx = 3*vi[m]; 36080c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 36090c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 36100c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 36110c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 36120c4413a7SShri Abhyankar v += 9; 36130c4413a7SShri Abhyankar } 36140c4413a7SShri Abhyankar idx = 3*i; 36150c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 36160c4413a7SShri Abhyankar } 36170c4413a7SShri Abhyankar /* backward solve the upper triangular */ 36180c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 36190c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 36200c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 36210c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 36220c4413a7SShri Abhyankar idt = 3*i; 36230c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 36240c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 36250c4413a7SShri Abhyankar idx = 3*vi[m]; 36260c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 36270c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 36280c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 36290c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 36300c4413a7SShri Abhyankar v += 9; 36310c4413a7SShri Abhyankar } 36320c4413a7SShri Abhyankar idc = 3*c[i]; 36330c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 36340c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 36350c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 36360c4413a7SShri Abhyankar } 36370c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 36380c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 36390c4413a7SShri Abhyankar ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 36400c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 36410c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 36420c4413a7SShri Abhyankar PetscFunctionReturn(0); 36430c4413a7SShri Abhyankar } 36440c4413a7SShri Abhyankar 364515091d37SBarry Smith /* 364615091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 364715091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 364815091d37SBarry Smith */ 36494a2ae208SSatish Balay #undef __FUNCT__ 36504a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 3651dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 365215091d37SBarry Smith { 365315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3654690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3655dfbe8321SBarry Smith PetscErrorCode ierr; 3656690b6cddSBarry Smith PetscInt *diag = a->diag; 3657d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3658d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 3659d9fead3dSBarry Smith const PetscScalar *b; 3660690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 366115091d37SBarry Smith 366215091d37SBarry Smith PetscFunctionBegin; 3663d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 36641ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 366515091d37SBarry Smith 366615091d37SBarry Smith /* forward solve the lower triangular */ 366715091d37SBarry Smith idx = 0; 366815091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 366915091d37SBarry Smith for (i=1; i<n; i++) { 367015091d37SBarry Smith v = aa + 9*ai[i]; 367115091d37SBarry Smith vi = aj + ai[i]; 367215091d37SBarry Smith nz = diag[i] - ai[i]; 367315091d37SBarry Smith idx += 3; 3674f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 367515091d37SBarry Smith while (nz--) { 
367615091d37SBarry Smith jdx = 3*(*vi++); 367715091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 3678f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3679f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3680f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 368115091d37SBarry Smith v += 9; 368215091d37SBarry Smith } 3683f1af5d2fSBarry Smith x[idx] = s1; 3684f1af5d2fSBarry Smith x[1+idx] = s2; 3685f1af5d2fSBarry Smith x[2+idx] = s3; 368615091d37SBarry Smith } 368715091d37SBarry Smith /* backward solve the upper triangular */ 368815091d37SBarry Smith for (i=n-1; i>=0; i--){ 368915091d37SBarry Smith v = aa + 9*diag[i] + 9; 369015091d37SBarry Smith vi = aj + diag[i] + 1; 369115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 369215091d37SBarry Smith idt = 3*i; 3693f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3694f1af5d2fSBarry Smith s3 = x[2+idt]; 369515091d37SBarry Smith while (nz--) { 369615091d37SBarry Smith idx = 3*(*vi++); 369715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 3698f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3699f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3700f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 370115091d37SBarry Smith v += 9; 370215091d37SBarry Smith } 370315091d37SBarry Smith v = aa + 9*diag[i]; 3704f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3705f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3706f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 370715091d37SBarry Smith } 370815091d37SBarry Smith 3709d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 37101ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3711dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 371215091d37SBarry Smith PetscFunctionReturn(0); 371315091d37SBarry Smith } 371415091d37SBarry Smith 3715cee9d6f2SShri Abhyankar #undef __FUNCT__ 3716a2d6a19aSShri Abhyankar #define 
__FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 3717a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3718b2b2dd24SShri Abhyankar { 3719b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3720b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3721b2b2dd24SShri Abhyankar PetscErrorCode ierr; 3722b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 3723b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3724b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 3725b2b2dd24SShri Abhyankar PetscScalar *x; 3726b2b2dd24SShri Abhyankar const PetscScalar *b; 3727b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 3728b2b2dd24SShri Abhyankar 3729b2b2dd24SShri Abhyankar PetscFunctionBegin; 3730b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3731b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3732b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 3733b2b2dd24SShri Abhyankar idx = 0; 3734b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 3735b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 3736b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 3737b2b2dd24SShri Abhyankar vi = aj + ai[i]; 3738b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 3739b2b2dd24SShri Abhyankar idx = bs*i; 3740b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 3741b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 3742b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 3743b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 3744b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3745b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3746b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3747b2b2dd24SShri Abhyankar 3748b2b2dd24SShri Abhyankar v += bs2; 3749b2b2dd24SShri Abhyankar } 3750b2b2dd24SShri Abhyankar 3751b2b2dd24SShri Abhyankar x[idx] = s1; 3752b2b2dd24SShri Abhyankar 
x[1+idx] = s2; 3753b2b2dd24SShri Abhyankar x[2+idx] = s3; 3754b2b2dd24SShri Abhyankar } 3755b2b2dd24SShri Abhyankar 3756b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 3757b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 3758b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 3759b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 3760b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 3761b2b2dd24SShri Abhyankar idt = bs*i; 3762b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 3763b2b2dd24SShri Abhyankar 3764b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 3765b2b2dd24SShri Abhyankar idx = bs*vi[k]; 3766b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3767b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3768b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3769b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3770b2b2dd24SShri Abhyankar 3771b2b2dd24SShri Abhyankar v += bs2; 3772b2b2dd24SShri Abhyankar } 3773b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 3774b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3775b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3776b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3777b2b2dd24SShri Abhyankar 3778b2b2dd24SShri Abhyankar } 3779b2b2dd24SShri Abhyankar 3780b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3781b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3782b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3783b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 3784b2b2dd24SShri Abhyankar } 3785b2b2dd24SShri Abhyankar 3786b2b2dd24SShri Abhyankar #undef __FUNCT__ 37874a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 3788dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 37894e2b4712SSatish Balay { 37904e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 
37914e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 37926849ba73SBarry Smith PetscErrorCode ierr; 37935d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 37945d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3795d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3796d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 3797d9fead3dSBarry Smith const PetscScalar *b; 37984e2b4712SSatish Balay 37994e2b4712SSatish Balay PetscFunctionBegin; 3800d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 38011ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3802f1af5d2fSBarry Smith t = a->solve_work; 38034e2b4712SSatish Balay 38044e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 38054e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 38064e2b4712SSatish Balay 38074e2b4712SSatish Balay /* forward solve the lower triangular */ 38084e2b4712SSatish Balay idx = 2*(*r++); 3809f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 38104e2b4712SSatish Balay for (i=1; i<n; i++) { 38114e2b4712SSatish Balay v = aa + 4*ai[i]; 38124e2b4712SSatish Balay vi = aj + ai[i]; 38134e2b4712SSatish Balay nz = diag[i] - ai[i]; 38144e2b4712SSatish Balay idx = 2*(*r++); 3815f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 38164e2b4712SSatish Balay while (nz--) { 38174e2b4712SSatish Balay idx = 2*(*vi++); 3818f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3819f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3820f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 38214e2b4712SSatish Balay v += 4; 38224e2b4712SSatish Balay } 38234e2b4712SSatish Balay idx = 2*i; 3824f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 38254e2b4712SSatish Balay } 38264e2b4712SSatish Balay /* backward solve the upper triangular */ 38274e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 38284e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 38294e2b4712SSatish Balay vi = aj + diag[i] + 1; 
38304e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 38314e2b4712SSatish Balay idt = 2*i; 3832f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 38334e2b4712SSatish Balay while (nz--) { 38344e2b4712SSatish Balay idx = 2*(*vi++); 3835f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3836f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3837f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 38384e2b4712SSatish Balay v += 4; 38394e2b4712SSatish Balay } 38404e2b4712SSatish Balay idc = 2*(*c--); 38414e2b4712SSatish Balay v = aa + 4*diag[i]; 3842f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3843f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 38444e2b4712SSatish Balay } 38454e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 38464e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3847d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 38481ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3849dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 38504e2b4712SSatish Balay PetscFunctionReturn(0); 38514e2b4712SSatish Balay } 38524e2b4712SSatish Balay 38530c4413a7SShri Abhyankar #undef __FUNCT__ 3854a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 3855a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 38560c4413a7SShri Abhyankar { 38570c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 38580c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 38590c4413a7SShri Abhyankar PetscErrorCode ierr; 38600c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 38610c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 38620c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 38630c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 38640c4413a7SShri Abhyankar const PetscScalar *b; 38650c4413a7SShri Abhyankar 
38660c4413a7SShri Abhyankar PetscFunctionBegin; 38670c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 38680c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 38690c4413a7SShri Abhyankar t = a->solve_work; 38700c4413a7SShri Abhyankar 38710c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 38720c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 38730c4413a7SShri Abhyankar 38740c4413a7SShri Abhyankar /* forward solve the lower triangular */ 38750c4413a7SShri Abhyankar idx = 2*r[0]; 38760c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 38770c4413a7SShri Abhyankar for (i=1; i<n; i++) { 38780c4413a7SShri Abhyankar v = aa + 4*ai[i]; 38790c4413a7SShri Abhyankar vi = aj + ai[i]; 38800c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 38810c4413a7SShri Abhyankar idx = 2*r[i]; 38820c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 38830c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 38840c4413a7SShri Abhyankar jdx = 2*vi[m]; 38850c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 38860c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 38870c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 38880c4413a7SShri Abhyankar v += 4; 38890c4413a7SShri Abhyankar } 38900c4413a7SShri Abhyankar idx = 2*i; 38910c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 38920c4413a7SShri Abhyankar } 38930c4413a7SShri Abhyankar /* backward solve the upper triangular */ 38940c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 38950c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 38960c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 38970c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 38980c4413a7SShri Abhyankar idt = 2*i; 38990c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 39000c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 39010c4413a7SShri Abhyankar idx = 2*vi[m]; 39020c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 39030c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 39040c4413a7SShri Abhyankar s2 -= 
v[1]*x1 + v[3]*x2; 39050c4413a7SShri Abhyankar v += 4; 39060c4413a7SShri Abhyankar } 39070c4413a7SShri Abhyankar idc = 2*c[i]; 39080c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 39090c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 39100c4413a7SShri Abhyankar } 39110c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 39120c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 39130c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39140c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 39150c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 39160c4413a7SShri Abhyankar PetscFunctionReturn(0); 39170c4413a7SShri Abhyankar } 39188f690400SShri Abhyankar 391915091d37SBarry Smith /* 392015091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 392115091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
392215091d37SBarry Smith */ 39234a2ae208SSatish Balay #undef __FUNCT__ 39244a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 3925dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 392615091d37SBarry Smith { 392715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3928690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3929dfbe8321SBarry Smith PetscErrorCode ierr; 3930690b6cddSBarry Smith PetscInt *diag = a->diag; 3931d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3932d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 3933d9fead3dSBarry Smith const PetscScalar *b; 3934690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 393515091d37SBarry Smith 393615091d37SBarry Smith PetscFunctionBegin; 3937d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39381ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 393915091d37SBarry Smith 394015091d37SBarry Smith /* forward solve the lower triangular */ 394115091d37SBarry Smith idx = 0; 394215091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 394315091d37SBarry Smith for (i=1; i<n; i++) { 394415091d37SBarry Smith v = aa + 4*ai[i]; 394515091d37SBarry Smith vi = aj + ai[i]; 394615091d37SBarry Smith nz = diag[i] - ai[i]; 394715091d37SBarry Smith idx += 2; 3948f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 394915091d37SBarry Smith while (nz--) { 395015091d37SBarry Smith jdx = 2*(*vi++); 395115091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 3952f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3953f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 395415091d37SBarry Smith v += 4; 395515091d37SBarry Smith } 3956f1af5d2fSBarry Smith x[idx] = s1; 3957f1af5d2fSBarry Smith x[1+idx] = s2; 395815091d37SBarry Smith } 395915091d37SBarry Smith /* backward solve the upper triangular */ 396015091d37SBarry Smith for (i=n-1; i>=0; i--){ 396115091d37SBarry Smith v = aa + 4*diag[i] + 4; 396215091d37SBarry Smith vi = aj + diag[i] + 1; 396315091d37SBarry 
Smith nz = ai[i+1] - diag[i] - 1; 396415091d37SBarry Smith idt = 2*i; 3965f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 396615091d37SBarry Smith while (nz--) { 396715091d37SBarry Smith idx = 2*(*vi++); 396815091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 3969f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3970f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 397115091d37SBarry Smith v += 4; 397215091d37SBarry Smith } 397315091d37SBarry Smith v = aa + 4*diag[i]; 3974f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 3975f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 397615091d37SBarry Smith } 397715091d37SBarry Smith 3978d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39791ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3980dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 398115091d37SBarry Smith PetscFunctionReturn(0); 398215091d37SBarry Smith } 398315091d37SBarry Smith 3984cee9d6f2SShri Abhyankar #undef __FUNCT__ 3985a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 3986a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3987b2b2dd24SShri Abhyankar { 3988b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3989b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 3990b2b2dd24SShri Abhyankar PetscErrorCode ierr; 3991b2b2dd24SShri Abhyankar PetscInt jdx; 3992b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 3993b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 3994b2b2dd24SShri Abhyankar const PetscScalar *b; 3995b2b2dd24SShri Abhyankar 3996b2b2dd24SShri Abhyankar PetscFunctionBegin; 3997b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3998b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3999b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4000b2b2dd24SShri 
Abhyankar idx = 0; 4001b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4002b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4003b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 4004b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4005b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4006b2b2dd24SShri Abhyankar idx = 2*i; 4007b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4008b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4009b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 4010b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4011b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4012b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4013b2b2dd24SShri Abhyankar v += 4; 4014b2b2dd24SShri Abhyankar } 4015b2b2dd24SShri Abhyankar x[idx] = s1; 4016b2b2dd24SShri Abhyankar x[1+idx] = s2; 4017b2b2dd24SShri Abhyankar } 4018b2b2dd24SShri Abhyankar 4019b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4020b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4021b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 4022b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4023b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4024b2b2dd24SShri Abhyankar idt = 2*i; 4025b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4026b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4027b2b2dd24SShri Abhyankar idx = 2*vi[k]; 4028b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4029b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4030b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4031b2b2dd24SShri Abhyankar v += 4; 4032b2b2dd24SShri Abhyankar } 4033b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4034b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4035b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4036b2b2dd24SShri Abhyankar } 4037b2b2dd24SShri Abhyankar 4038b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4039b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4040b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 
2.0*A->cmap->n);CHKERRQ(ierr); 4041b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4042b2b2dd24SShri Abhyankar } 4043b2b2dd24SShri Abhyankar 4044b2b2dd24SShri Abhyankar #undef __FUNCT__ 40454a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4046dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 40474e2b4712SSatish Balay { 40484e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 40494e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 40506849ba73SBarry Smith PetscErrorCode ierr; 40515d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 40525d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 40533f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 405487828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 40554e2b4712SSatish Balay 40564e2b4712SSatish Balay PetscFunctionBegin; 40574e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 40584e2b4712SSatish Balay 40591ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 40601ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4061f1af5d2fSBarry Smith t = a->solve_work; 40624e2b4712SSatish Balay 40634e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 40644e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 40654e2b4712SSatish Balay 40664e2b4712SSatish Balay /* forward solve the lower triangular */ 4067f1af5d2fSBarry Smith t[0] = b[*r++]; 40684e2b4712SSatish Balay for (i=1; i<n; i++) { 40694e2b4712SSatish Balay v = aa + ai[i]; 40704e2b4712SSatish Balay vi = aj + ai[i]; 40714e2b4712SSatish Balay nz = diag[i] - ai[i]; 4072f1af5d2fSBarry Smith s1 = b[*r++]; 40734e2b4712SSatish Balay while (nz--) { 4074f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 40754e2b4712SSatish Balay } 4076f1af5d2fSBarry Smith t[i] = s1; 40774e2b4712SSatish Balay } 40784e2b4712SSatish Balay /* backward solve the upper triangular */ 40794e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 40804e2b4712SSatish Balay v = aa + diag[i] + 
1; 40814e2b4712SSatish Balay vi = aj + diag[i] + 1; 40824e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4083f1af5d2fSBarry Smith s1 = t[i]; 40844e2b4712SSatish Balay while (nz--) { 4085f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 40864e2b4712SSatish Balay } 4087f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 40884e2b4712SSatish Balay } 40894e2b4712SSatish Balay 40904e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 40914e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 40921ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 40931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4094dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 40954e2b4712SSatish Balay PetscFunctionReturn(0); 40964e2b4712SSatish Balay } 409715091d37SBarry Smith /* 409815091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 409915091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
410015091d37SBarry Smith */ 41014a2ae208SSatish Balay #undef __FUNCT__ 41024a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4103dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 410415091d37SBarry Smith { 410515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4106690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4107dfbe8321SBarry Smith PetscErrorCode ierr; 4108690b6cddSBarry Smith PetscInt *diag = a->diag; 410915091d37SBarry Smith MatScalar *aa=a->a; 411087828ca2SBarry Smith PetscScalar *x,*b; 411187828ca2SBarry Smith PetscScalar s1,x1; 411215091d37SBarry Smith MatScalar *v; 4113690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 411415091d37SBarry Smith 411515091d37SBarry Smith PetscFunctionBegin; 41161ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 41171ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 411815091d37SBarry Smith 411915091d37SBarry Smith /* forward solve the lower triangular */ 412015091d37SBarry Smith idx = 0; 412115091d37SBarry Smith x[0] = b[0]; 412215091d37SBarry Smith for (i=1; i<n; i++) { 412315091d37SBarry Smith v = aa + ai[i]; 412415091d37SBarry Smith vi = aj + ai[i]; 412515091d37SBarry Smith nz = diag[i] - ai[i]; 412615091d37SBarry Smith idx += 1; 4127f1af5d2fSBarry Smith s1 = b[idx]; 412815091d37SBarry Smith while (nz--) { 412915091d37SBarry Smith jdx = *vi++; 413015091d37SBarry Smith x1 = x[jdx]; 4131f1af5d2fSBarry Smith s1 -= v[0]*x1; 413215091d37SBarry Smith v += 1; 413315091d37SBarry Smith } 4134f1af5d2fSBarry Smith x[idx] = s1; 413515091d37SBarry Smith } 413615091d37SBarry Smith /* backward solve the upper triangular */ 413715091d37SBarry Smith for (i=n-1; i>=0; i--){ 413815091d37SBarry Smith v = aa + diag[i] + 1; 413915091d37SBarry Smith vi = aj + diag[i] + 1; 414015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 414115091d37SBarry Smith idt = i; 4142f1af5d2fSBarry Smith s1 = x[idt]; 414315091d37SBarry Smith while (nz--) { 
414415091d37SBarry Smith idx = *vi++; 414515091d37SBarry Smith x1 = x[idx]; 4146f1af5d2fSBarry Smith s1 -= v[0]*x1; 414715091d37SBarry Smith v += 1; 414815091d37SBarry Smith } 414915091d37SBarry Smith v = aa + diag[i]; 4150f1af5d2fSBarry Smith x[idt] = v[0]*s1; 415115091d37SBarry Smith } 41521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 41531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4154dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 415515091d37SBarry Smith PetscFunctionReturn(0); 415615091d37SBarry Smith } 41574e2b4712SSatish Balay 41584e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 415916a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 41606bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 4161ae3d28f0SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth); 41626bce7ff8SHong Zhang 41636bce7ff8SHong Zhang #undef __FUNCT__ 41646bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 41656bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 41666bce7ff8SHong Zhang { 41676bce7ff8SHong Zhang Mat C=B; 41686bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 41696bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 41706bce7ff8SHong Zhang PetscErrorCode ierr; 41716bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 41726bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 41736bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4174b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4175914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4176914a18a2SHong Zhang MatScalar *v_work; 4177ae3d28f0SHong Zhang PetscTruth 
col_identity,row_identity,both_identity; 41786bce7ff8SHong Zhang 41796bce7ff8SHong Zhang PetscFunctionBegin; 41806bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 41816bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4182ae3d28f0SHong Zhang 4183fca92195SBarry Smith ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4184fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 41856bce7ff8SHong Zhang ics = ic; 41866bce7ff8SHong Zhang 4187914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 4188fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 4189914a18a2SHong Zhang 41906bce7ff8SHong Zhang for (i=0; i<n; i++){ 41916bce7ff8SHong Zhang /* zero rtmp */ 41926bce7ff8SHong Zhang /* L part */ 41936bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 41946bce7ff8SHong Zhang bjtmp = bj + bi[i]; 4195914a18a2SHong Zhang for (j=0; j<nz; j++){ 4196914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4197914a18a2SHong Zhang } 41986bce7ff8SHong Zhang 41996bce7ff8SHong Zhang /* U part */ 42001a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 42011a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 42021a83e813SShri Abhyankar for (j=0; j<nz; j++){ 42031a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 42041a83e813SShri Abhyankar } 42051a83e813SShri Abhyankar 42061a83e813SShri Abhyankar /* load in initial (unfactored row) */ 42071a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 42081a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 42091a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 42101a83e813SShri Abhyankar for (j=0; j<nz; j++) { 42111a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 42121a83e813SShri Abhyankar } 42131a83e813SShri Abhyankar 42141a83e813SShri Abhyankar /* 
elimination */ 42151a83e813SShri Abhyankar bjtmp = bj + bi[i]; 42161a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 42171a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 42181a83e813SShri Abhyankar row = bjtmp[k]; 42191a83e813SShri Abhyankar pc = rtmp + bs2*row; 42201a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 42211a83e813SShri Abhyankar if (flg) { 42221a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 42231a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 42241a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 42251a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 42261a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 42271a83e813SShri Abhyankar for (j=0; j<nz; j++) { 42281a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 42291a83e813SShri Abhyankar } 42301a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 42311a83e813SShri Abhyankar } 42321a83e813SShri Abhyankar } 42331a83e813SShri Abhyankar 42341a83e813SShri Abhyankar /* finished row so stick it into b->a */ 42351a83e813SShri Abhyankar /* L part */ 42361a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 42371a83e813SShri Abhyankar pj = b->j + bi[i] ; 42381a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 42391a83e813SShri Abhyankar for (j=0; j<nz; j++) { 42401a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 42411a83e813SShri Abhyankar } 42421a83e813SShri Abhyankar 42431a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 42441a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 42451a83e813SShri Abhyankar pj = b->j + bdiag[i]; 42461a83e813SShri Abhyankar /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 42471a83e813SShri Abhyankar ierr = 
PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 42481a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 42491a83e813SShri Abhyankar 42501a83e813SShri Abhyankar /* U part */ 42511a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 42521a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 42531a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 42541a83e813SShri Abhyankar for (j=0; j<nz; j++){ 42551a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 42561a83e813SShri Abhyankar } 42571a83e813SShri Abhyankar } 42581a83e813SShri Abhyankar 42591a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 4260fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 42611a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 42621a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 42631a83e813SShri Abhyankar 4264ae3d28f0SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4265ae3d28f0SHong Zhang ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 4266ae3d28f0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 4267ae3d28f0SHong Zhang if (both_identity){ 4268a2d6a19aSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 4269ae3d28f0SHong Zhang } else { 4270a2d6a19aSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 4271ae3d28f0SHong Zhang } 4272ae3d28f0SHong Zhang 42731a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 42741a83e813SShri Abhyankar ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 42751a83e813SShri Abhyankar PetscFunctionReturn(0); 42761a83e813SShri Abhyankar } 42771a83e813SShri Abhyankar 42786bce7ff8SHong Zhang /* 42796bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 
428016a2bf60SHong Zhang See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 428116a2bf60SHong Zhang because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 42826bce7ff8SHong Zhang */ 4283c0c7eb62SShri Abhyankar 42846bce7ff8SHong Zhang #undef __FUNCT__ 42856bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 42866bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 42876bce7ff8SHong Zhang { 42886bce7ff8SHong Zhang 42896bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 42906bce7ff8SHong Zhang PetscErrorCode ierr; 429116a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 429235aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 429335aa4fcfSShri Abhyankar 429435aa4fcfSShri Abhyankar PetscFunctionBegin; 429535aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 429635aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 429735aa4fcfSShri Abhyankar 429835aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 429935aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 430035aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 430135aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 430235aa4fcfSShri Abhyankar if (!b->diag){ 430335aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 430435aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 430535aa4fcfSShri Abhyankar } 430635aa4fcfSShri Abhyankar bdiag = b->diag; 430735aa4fcfSShri Abhyankar 430835aa4fcfSShri Abhyankar if (n > 0) { 430935aa4fcfSShri Abhyankar ierr = 
PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 431035aa4fcfSShri Abhyankar } 431135aa4fcfSShri Abhyankar 431235aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 431335aa4fcfSShri Abhyankar bi = b->i; 431435aa4fcfSShri Abhyankar bj = b->j; 431535aa4fcfSShri Abhyankar 431635aa4fcfSShri Abhyankar /* L part */ 431735aa4fcfSShri Abhyankar bi[0] = 0; 431835aa4fcfSShri Abhyankar for (i=0; i<n; i++){ 431935aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 432035aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 432135aa4fcfSShri Abhyankar aj = a->j + ai[i]; 432235aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 432335aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 432435aa4fcfSShri Abhyankar } 432535aa4fcfSShri Abhyankar } 432635aa4fcfSShri Abhyankar 432735aa4fcfSShri Abhyankar /* U part */ 432835aa4fcfSShri Abhyankar bi_temp = bi[n]; 432935aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 433035aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 433135aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 433235aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 433335aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 433435aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 433535aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 433635aa4fcfSShri Abhyankar } 433735aa4fcfSShri Abhyankar /* diag[i] */ 433835aa4fcfSShri Abhyankar *bj = i; bj++; 433935aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 434035aa4fcfSShri Abhyankar } 434135aa4fcfSShri Abhyankar PetscFunctionReturn(0); 434235aa4fcfSShri Abhyankar } 434335aa4fcfSShri Abhyankar 434435aa4fcfSShri Abhyankar #undef __FUNCT__ 434516a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 434616a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 434716a2bf60SHong Zhang { 434816a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 434916a2bf60SHong Zhang IS isicol; 435016a2bf60SHong Zhang PetscErrorCode ierr; 435116a2bf60SHong Zhang const 
PetscInt *r,*ic; 43527fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 435316a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 435416a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 435516a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 43567fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 435716a2bf60SHong Zhang PetscReal f; 435816a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 435916a2bf60SHong Zhang PetscBT lnkbt; 436016a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 436116a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 436216a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 436316a2bf60SHong Zhang PetscTruth missing; 43647fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 436516a2bf60SHong Zhang 436616a2bf60SHong Zhang PetscFunctionBegin; 436716a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 436816a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 436916a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 437016a2bf60SHong Zhang 437116a2bf60SHong Zhang f = info->fill; 437216a2bf60SHong Zhang levels = (PetscInt)info->levels; 437316a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 437416a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 437516a2bf60SHong Zhang 437616a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 437716a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 43787fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 437916a2bf60SHong Zhang 43807fa3a6a0SHong Zhang if (!levels && both_identity) { 438116a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 438216a2bf60SHong Zhang ierr = 
MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 4383ae3d28f0SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 438435aa4fcfSShri Abhyankar 438535aa4fcfSShri Abhyankar fact->factor = MAT_FACTOR_ILU; 438635aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 438735aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 438835aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 438935aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 439035aa4fcfSShri Abhyankar b->row = isrow; 439135aa4fcfSShri Abhyankar b->col = iscol; 439235aa4fcfSShri Abhyankar b->icol = isicol; 439335aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 439435aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 439535aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 439635aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 439735aa4fcfSShri Abhyankar PetscFunctionReturn(0); 439835aa4fcfSShri Abhyankar } 439935aa4fcfSShri Abhyankar 440035aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 440135aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 440235aa4fcfSShri Abhyankar 440335aa4fcfSShri Abhyankar /* get new row pointers */ 440435aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 440535aa4fcfSShri Abhyankar bi[0] = 0; 440635aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 440735aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 440835aa4fcfSShri Abhyankar bdiag[0] = 0; 440935aa4fcfSShri Abhyankar 4410fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 441135aa4fcfSShri Abhyankar 441235aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active 
row */ 441335aa4fcfSShri Abhyankar nlnk = n + 1; 441435aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 441535aa4fcfSShri Abhyankar 441635aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 441735aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 441835aa4fcfSShri Abhyankar current_space = free_space; 441935aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 442035aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 442135aa4fcfSShri Abhyankar 442235aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 442335aa4fcfSShri Abhyankar nzi = 0; 442435aa4fcfSShri Abhyankar /* copy current row into linked list */ 442535aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 442635aa4fcfSShri Abhyankar if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 442735aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 442835aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 442935aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 443035aa4fcfSShri Abhyankar nzi += nlnk; 443135aa4fcfSShri Abhyankar 443235aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 443335aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 443435aa4fcfSShri Abhyankar fm = n; 443535aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 443635aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 443735aa4fcfSShri Abhyankar lnk[fm] = i; 443835aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 443935aa4fcfSShri Abhyankar nzi++; dcount++; 444035aa4fcfSShri Abhyankar } 444135aa4fcfSShri Abhyankar 444235aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 444335aa4fcfSShri Abhyankar nzbd = 0; 444435aa4fcfSShri Abhyankar prow = lnk[n]; 444535aa4fcfSShri Abhyankar while (prow < i) { 
444635aa4fcfSShri Abhyankar nnz = bdiag[prow]; 444735aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 444835aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 444935aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 445035aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 445135aa4fcfSShri Abhyankar nzi += nlnk; 445235aa4fcfSShri Abhyankar prow = lnk[prow]; 445335aa4fcfSShri Abhyankar nzbd++; 445435aa4fcfSShri Abhyankar } 445535aa4fcfSShri Abhyankar bdiag[i] = nzbd; 445635aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 445735aa4fcfSShri Abhyankar 445835aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 445935aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 446035aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 446135aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 446235aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 446335aa4fcfSShri Abhyankar reallocs++; 446435aa4fcfSShri Abhyankar } 446535aa4fcfSShri Abhyankar 446635aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 446735aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 446835aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 446935aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 447035aa4fcfSShri Abhyankar 447135aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 447235aa4fcfSShri Abhyankar if (*(bj_ptr[i]+bdiag[i]) != i) { 447335aa4fcfSShri Abhyankar SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 447435aa4fcfSShri Abhyankar try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 447535aa4fcfSShri Abhyankar } 447635aa4fcfSShri Abhyankar 
447735aa4fcfSShri Abhyankar current_space->array += nzi; 447835aa4fcfSShri Abhyankar current_space->local_used += nzi; 447935aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 448035aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 448135aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 448235aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 448335aa4fcfSShri Abhyankar } 448435aa4fcfSShri Abhyankar 448535aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 448635aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 448735aa4fcfSShri Abhyankar 448835aa4fcfSShri Abhyankar /* destroy list of free space and other temporary arrays */ 448935aa4fcfSShri Abhyankar ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 449035aa4fcfSShri Abhyankar 449135aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 449235aa4fcfSShri Abhyankar ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 449335aa4fcfSShri Abhyankar 449435aa4fcfSShri Abhyankar ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 449535aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 4496fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 449735aa4fcfSShri Abhyankar 449835aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 449935aa4fcfSShri Abhyankar { 450035aa4fcfSShri Abhyankar PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 450135aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 450235aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 450335aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 450435aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 450535aa4fcfSShri Abhyankar if 
(diagonal_fill) { 450635aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 450735aa4fcfSShri Abhyankar } 450835aa4fcfSShri Abhyankar } 450935aa4fcfSShri Abhyankar #endif 451035aa4fcfSShri Abhyankar 451135aa4fcfSShri Abhyankar /* put together the new matrix */ 451235aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 451335aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 451435aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 451535aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 451635aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 451735aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 451835aa4fcfSShri Abhyankar ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 451935aa4fcfSShri Abhyankar b->j = bj; 452035aa4fcfSShri Abhyankar b->i = bi; 452135aa4fcfSShri Abhyankar b->diag = bdiag; 452235aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 452335aa4fcfSShri Abhyankar b->ilen = 0; 452435aa4fcfSShri Abhyankar b->imax = 0; 452535aa4fcfSShri Abhyankar b->row = isrow; 452635aa4fcfSShri Abhyankar b->col = iscol; 452735aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 452835aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 452935aa4fcfSShri Abhyankar b->icol = isicol; 453035aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 453135aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 
453235aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 453335aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 453435aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 4535ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 4536ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 4537ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 4538ae3d28f0SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 453935aa4fcfSShri Abhyankar PetscFunctionReturn(0); 454035aa4fcfSShri Abhyankar } 454135aa4fcfSShri Abhyankar 454235aa4fcfSShri Abhyankar 45434e2b4712SSatish Balay /* 45444e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 45454e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 45464e2b4712SSatish Balay Not a good example of code reuse. 
45474e2b4712SSatish Balay */ 45484a2ae208SSatish Balay #undef __FUNCT__ 45494a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 45500481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 45514e2b4712SSatish Balay { 45524e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 45534e2b4712SSatish Balay IS isicol; 45546849ba73SBarry Smith PetscErrorCode ierr; 45555d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 45565d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 4557a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 4558d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 455941df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 4560329f5518SBarry Smith PetscReal f; 4561c0c7eb62SShri Abhyankar PetscTruth newdatastruct = PETSC_FALSE; 45624e2b4712SSatish Balay 45634e2b4712SSatish Balay PetscFunctionBegin; 456416a2bf60SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 456516a2bf60SHong Zhang if (newdatastruct){ 456616a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 456716a2bf60SHong Zhang PetscFunctionReturn(0); 456816a2bf60SHong Zhang } 456916a2bf60SHong Zhang 45706bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 45716bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 45726bce7ff8SHong Zhang 4573435faa5fSBarry Smith f = info->fill; 4574690b6cddSBarry Smith levels = (PetscInt)info->levels; 4575690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 45764c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 457716a2bf60SHong Zhang 4578667159a5SBarry 
Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4579667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 45807d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 4581309c388cSBarry Smith 458241df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 458316a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 45846bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 45856bce7ff8SHong Zhang 4586719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 4587ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 4588bb3d539aSBarry Smith b->row = isrow; 4589bb3d539aSBarry Smith b->col = iscol; 4590bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4591bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4592bb3d539aSBarry Smith b->icol = isicol; 4593bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 4594b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 45956bce7ff8SHong Zhang PetscFunctionReturn(0); 45966bce7ff8SHong Zhang } 45976bce7ff8SHong Zhang 45986bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 45994e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 46004e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 46014e2b4712SSatish Balay 46024e2b4712SSatish Balay /* get new row pointers */ 4603690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 46044e2b4712SSatish Balay ainew[0] = 0; 46054e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 4606690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 4607690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 46084e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 4609690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 46104e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 4611690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 46124e2b4712SSatish Balay /* im is level for each filled value */ 4613690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 46144e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 4615690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 46164e2b4712SSatish Balay dloc[0] = 0; 46174e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 4618435faa5fSBarry Smith 4619435faa5fSBarry Smith /* copy prow into linked list */ 46204e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 46213b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 46224e2b4712SSatish Balay xi = aj + ai[r[prow]]; 
46234e2b4712SSatish Balay fill[n] = n; 4624435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 46254e2b4712SSatish Balay while (nz--) { 46264e2b4712SSatish Balay fm = n; 46274e2b4712SSatish Balay idx = ic[*xi++]; 46284e2b4712SSatish Balay do { 46294e2b4712SSatish Balay m = fm; 46304e2b4712SSatish Balay fm = fill[m]; 46314e2b4712SSatish Balay } while (fm < idx); 46324e2b4712SSatish Balay fill[m] = idx; 46334e2b4712SSatish Balay fill[idx] = fm; 46344e2b4712SSatish Balay im[idx] = 0; 46354e2b4712SSatish Balay } 4636435faa5fSBarry Smith 4637435faa5fSBarry Smith /* make sure diagonal entry is included */ 4638435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 4639435faa5fSBarry Smith fm = n; 4640435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 4641435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 4642435faa5fSBarry Smith fill[fm] = prow; 4643435faa5fSBarry Smith im[prow] = 0; 4644435faa5fSBarry Smith nzf++; 4645335d9088SBarry Smith dcount++; 4646435faa5fSBarry Smith } 4647435faa5fSBarry Smith 46484e2b4712SSatish Balay nzi = 0; 46494e2b4712SSatish Balay row = fill[n]; 46504e2b4712SSatish Balay while (row < prow) { 46514e2b4712SSatish Balay incrlev = im[row] + 1; 46524e2b4712SSatish Balay nz = dloc[row]; 4653435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 46544e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 46554e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 46564e2b4712SSatish Balay fm = row; 46574e2b4712SSatish Balay while (nnz-- > 0) { 46584e2b4712SSatish Balay idx = *xi++; 46594e2b4712SSatish Balay if (*flev + incrlev > levels) { 46604e2b4712SSatish Balay flev++; 46614e2b4712SSatish Balay continue; 46624e2b4712SSatish Balay } 46634e2b4712SSatish Balay do { 46644e2b4712SSatish Balay m = fm; 46654e2b4712SSatish Balay fm = fill[m]; 46664e2b4712SSatish Balay } while (fm < idx); 46674e2b4712SSatish Balay if (fm != idx) { 46684e2b4712SSatish Balay im[idx] = *flev + 
incrlev; 46694e2b4712SSatish Balay fill[m] = idx; 46704e2b4712SSatish Balay fill[idx] = fm; 46714e2b4712SSatish Balay fm = idx; 46724e2b4712SSatish Balay nzf++; 4673ecf371e4SBarry Smith } else { 46744e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 46754e2b4712SSatish Balay } 46764e2b4712SSatish Balay flev++; 46774e2b4712SSatish Balay } 46784e2b4712SSatish Balay row = fill[row]; 46794e2b4712SSatish Balay nzi++; 46804e2b4712SSatish Balay } 46814e2b4712SSatish Balay /* copy new filled row into permanent storage */ 46824e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 46834e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 4684ecf371e4SBarry Smith 4685ecf371e4SBarry Smith /* estimate how much additional space we will need */ 4686ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 4687ecf371e4SBarry Smith /* just double the memory each time */ 4688690b6cddSBarry Smith PetscInt maxadd = jmax; 4689ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 46904e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 46914e2b4712SSatish Balay jmax += maxadd; 4692ecf371e4SBarry Smith 4693ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 46945d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 46955d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4696606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 46975d0c19d7SBarry Smith ajnew = xitmp; 46985d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 46995d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 4700606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 47015d0c19d7SBarry Smith ajfill = xitmp; 4702eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 47034e2b4712SSatish Balay } 47045d0c19d7SBarry Smith xitmp = 
ajnew + ainew[prow]; 47054e2b4712SSatish Balay flev = ajfill + ainew[prow]; 47064e2b4712SSatish Balay dloc[prow] = nzi; 47074e2b4712SSatish Balay fm = fill[n]; 47084e2b4712SSatish Balay while (nzf--) { 47095d0c19d7SBarry Smith *xitmp++ = fm; 47104e2b4712SSatish Balay *flev++ = im[fm]; 47114e2b4712SSatish Balay fm = fill[fm]; 47124e2b4712SSatish Balay } 4713435faa5fSBarry Smith /* make sure row has diagonal entry */ 4714435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 471577431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 47162401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 4717435faa5fSBarry Smith } 47184e2b4712SSatish Balay } 4719606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 47204e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 47214e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4722606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 4723606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 47244e2b4712SSatish Balay 47256cf91177SBarry Smith #if defined(PETSC_USE_INFO) 47264e2b4712SSatish Balay { 4727329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 4728ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 4729ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 4730ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 4731ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 4732335d9088SBarry Smith if (diagonal_fill) { 4733ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 4734335d9088SBarry Smith } 47354e2b4712SSatish Balay } 473663ba0a88SBarry Smith #endif 47374e2b4712SSatish Balay 47384e2b4712SSatish 
Balay /* put together the new matrix */ 4739719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 4740719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 4741ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 4742e6b907acSBarry Smith b->free_a = PETSC_TRUE; 4743e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 47447c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 4745a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 47464e2b4712SSatish Balay b->j = ajnew; 47474e2b4712SSatish Balay b->i = ainew; 47484e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 47494e2b4712SSatish Balay b->diag = dloc; 47507f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 47514e2b4712SSatish Balay b->ilen = 0; 47524e2b4712SSatish Balay b->imax = 0; 47534e2b4712SSatish Balay b->row = isrow; 47544e2b4712SSatish Balay b->col = iscol; 4755bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4756c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4757c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4758e51c0b9cSSatish Balay b->icol = isicol; 475987828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 47604e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 
47614e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 4762719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 47634e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 47644e2b4712SSatish Balay 4765ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 4766ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 4767ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 47686bce7ff8SHong Zhang 476941df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 47708661488fSKris Buschelman PetscFunctionReturn(0); 47718661488fSKris Buschelman } 47728661488fSKris Buschelman 4773732ee342SKris Buschelman #undef __FUNCT__ 47747e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 4775dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 47767e7071cdSKris Buschelman { 477712272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 477812272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 47795a9542e3SKris Buschelman PetscFunctionBegin; 47807cf1b8d3SKris Buschelman /* Undo Column scaling */ 47817cf1b8d3SKris Buschelman /* while (nz--) { */ 47827cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 47837cf1b8d3SKris Buschelman /* } */ 4784c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 4785c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 47867cf1b8d3SKris Buschelman PetscFunctionReturn(0); 47877cf1b8d3SKris Buschelman } 47887cf1b8d3SKris Buschelman 47897cf1b8d3SKris Buschelman #undef __FUNCT__ 47907cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 4791dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 47927cf1b8d3SKris Buschelman { 47937cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4794b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 47952aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 47965a9542e3SKris Buschelman PetscFunctionBegin; 47970b9da03eSKris Buschelman /* Is this really necessary? */ 479820235379SKris Buschelman while (nz--) { 47990b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 48007e7071cdSKris Buschelman } 4801c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 48027e7071cdSKris Buschelman PetscFunctionReturn(0); 48037e7071cdSKris Buschelman } 48047e7071cdSKris Buschelman 4805732ee342SKris Buschelman 4806